Series comparison

-[PULL 00/41] tcg patch queue
+[PULL 00/56] tcg patch queue
-The following changes since commit 035eed4c0d257c905a556fa0f4865a0c077b4e7f:
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
-  Merge remote-tracking branch 'remotes/vivier/tags/q800-for-5.0-pull-request' into staging (2020-01-07 17:08:21 +0000)
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
 are available in the Git repository at:
-  https://github.com/rth7680/qemu.git tags/pull-tcg-20200108
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
-for you to fetch changes up to 5e7ef51cbe47e726f76bfbc208e167085cf398c4:
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-  MAINTAINERS: Replace Claudio Fontana for tcg/aarch64 (2020-01-08 11:54:12 +1100)
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
 ----------------------------------------------------------------
-Improve -static and -pie linking
+Improvements to qemu/int128
-Add cpu_{ld,st}*_mmuidx_ra
+Fixes for 128/64 division.
-Remove MMU_MODE*_SUFFIX
+Cleanup tcg/optimize.c
-Move tcg headers under include/
+Optimize redundant sign extensions
 ----------------------------------------------------------------
-Philippe Mathieu-Daudé (4):
+Frédéric Pétrot (1):
-      tcg: Search includes from the project root source directory
+      qemu/int128: Add int128_{not,xor}
       tcg: Search includes in the parent source directory
       tcg: Move TCG headers to include/tcg/
       configure: Remove tcg/ from the preprocessor include search list
-Richard Henderson (37):
+Luis Pires (4):
-      configure: Drop adjustment of textseg
+      host-utils: move checks out of divu128/divs128
-      tcg: Remove softmmu code_gen_buffer fixed address
+      host-utils: move udiv_qrnnd() to host-utils
-      configure: Do not force pie=no for non-x86
+      host-utils: add 128-bit quotient support to divu128/divs128
-      configure: Always detect -no-pie toolchain support
+      host-utils: add unit tests for divu128/divs128
       configure: Unnest detection of -z,relro and -z,now
       configure: Override the os default with --disable-pie
       configure: Support -static-pie if requested
       target/xtensa: Use probe_access for itlb_hit_test
       cputlb: Use trace_mem_get_info instead of trace_mem_build_info
       trace: Remove trace_mem_build_info_no_se_[bl]e
       target/s390x: Include tcg.h in mem_helper.c
       target/arm: Include tcg.h in sve_helper.c
       accel/tcg: Include tcg.h in tcg-runtime.c
       linux-user: Include tcg.h in syscall.c
       linux-user: Include trace-root.h in syscall-trace.h
       plugins: Include trace/mem.h in api.c
       cputlb: Move body of cpu_ldst_template.h out of line
       translator: Use cpu_ld*_code instead of open-coding
       cputlb: Rename helper_ret_ld*_cmmu to cpu_ld*_code
       cputlb: Provide cpu_(ld,st}*_mmuidx_ra for user-only
       target/i386: Use cpu_*_mmuidx_ra instead of templates
       cputlb: Expand cpu_ldst_useronly_template.h in user-exec.c
       target/nios2: Remove MMU_MODE{0,1}_SUFFIX
       target/alpha: Remove MMU_MODE{0,1}_SUFFIX
       target/cris: Remove MMU_MODE{0,1}_SUFFIX
       target/i386: Remove MMU_MODE{0,1,2}_SUFFIX
       target/microblaze: Remove MMU_MODE{0,1,2}_SUFFIX
       target/sh4: Remove MMU_MODE{0,1}_SUFFIX
       target/unicore32: Remove MMU_MODE{0,1}_SUFFIX
       target/xtensa: Remove MMU_MODE{0,1,2,3}_SUFFIX
       target/m68k: Use cpu_*_mmuidx_ra instead of MMU_MODE{0,1}_SUFFIX
       target/mips: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
       target/s390x: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
       target/ppc: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
       cputlb: Remove support for MMU_MODE*_SUFFIX
       cputlb: Expand cpu_ldst_template.h in cputlb.c
       MAINTAINERS: Replace Claudio Fontana for tcg/aarch64
- Makefile                                  |   2 +-
+Richard Henderson (51):
- accel/tcg/atomic_template.h               |  67 ++---
+      tcg/optimize: Rename "mask" to "z_mask"
- include/exec/cpu_ldst.h                   | 446 +++++++++---------------------
+      tcg/optimize: Split out OptContext
- include/exec/cpu_ldst_template.h          | 211 --------------
+      tcg/optimize: Remove do_default label
- include/exec/cpu_ldst_useronly_template.h | 159 -----------
+      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
- include/exec/translator.h                 |  48 +---
+      tcg/optimize: Move prev_mb into OptContext
- {tcg => include/tcg}/tcg-gvec-desc.h      |   0
+      tcg/optimize: Split out init_arguments
- {tcg => include/tcg}/tcg-mo.h             |   0
+      tcg/optimize: Split out copy_propagate
- {tcg => include/tcg}/tcg-op-gvec.h        |   0
+      tcg/optimize: Split out fold_call
- {tcg => include/tcg}/tcg-op.h             |   2 +-
+      tcg/optimize: Drop nb_oargs, nb_iargs locals
- {tcg => include/tcg}/tcg-opc.h            |   0
+      tcg/optimize: Change fail return for do_constant_folding_cond*
- {tcg => include/tcg}/tcg.h                |  33 +--
+      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
- include/user/syscall-trace.h              |   2 +
+      tcg/optimize: Split out finish_folding
- target/alpha/cpu.h                        |   2 -
+      tcg/optimize: Use a boolean to avoid a mass of continues
- target/cris/cpu.h                         |   2 -
+      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
- target/i386/cpu.h                         |   3 -
+      tcg/optimize: Split out fold_const{1,2}
- target/m68k/cpu.h                         |   2 -
+      tcg/optimize: Split out fold_setcond2
- target/microblaze/cpu.h                   |   3 -
+      tcg/optimize: Split out fold_brcond2
- target/mips/cpu.h                         |   4 -
+      tcg/optimize: Split out fold_brcond
- target/nios2/cpu.h                        |   2 -
+      tcg/optimize: Split out fold_setcond
- target/ppc/cpu.h                          |   2 -
+      tcg/optimize: Split out fold_mulu2_i32
- target/s390x/cpu.h                        |   5 -
+      tcg/optimize: Split out fold_addsub2_i32
- target/sh4/cpu.h                          |   2 -
+      tcg/optimize: Split out fold_movcond
- target/unicore32/cpu.h                    |   2 -
+      tcg/optimize: Split out fold_extract2
- target/xtensa/cpu.h                       |   4 -
+      tcg/optimize: Split out fold_extract, fold_sextract
- tcg/i386/tcg-target.h                     |   2 +-
+      tcg/optimize: Split out fold_deposit
- trace/mem-internal.h                      |  17 --
+      tcg/optimize: Split out fold_count_zeros
- accel/tcg/cpu-exec.c                      |   2 +-
+      tcg/optimize: Split out fold_bswap
- accel/tcg/cputlb.c                        | 315 ++++++++++++++++-----
+      tcg/optimize: Split out fold_dup, fold_dup2
- accel/tcg/tcg-runtime-gvec.c              |   2 +-
+      tcg/optimize: Split out fold_mov
- accel/tcg/tcg-runtime.c                   |   1 +
+      tcg/optimize: Split out fold_xx_to_i
- accel/tcg/translate-all.c                 |  39 +--
+      tcg/optimize: Split out fold_xx_to_x
- accel/tcg/user-exec.c                     | 238 +++++++++++++++-
+      tcg/optimize: Split out fold_xi_to_i
- bsd-user/main.c                           |   2 +-
+      tcg/optimize: Add type to OptContext
- cpus.c                                    |   2 +-
+      tcg/optimize: Split out fold_to_not
- exec.c                                    |   2 +-
+      tcg/optimize: Split out fold_sub_to_neg
- linux-user/main.c                         |   2 +-
+      tcg/optimize: Split out fold_xi_to_x
- linux-user/syscall.c                      |   1 +
+      tcg/optimize: Split out fold_ix_to_i
- plugins/api.c                             |   1 +
+      tcg/optimize: Split out fold_masks
- target/alpha/translate.c                  |   2 +-
+      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
- target/arm/helper-a64.c                   |   2 +-
+      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
- target/arm/sve_helper.c                   |   1 +
+      tcg/optimize: Sink commutative operand swapping into fold functions
- target/arm/translate-a64.c                |   4 +-
+      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
- target/arm/translate-sve.c                |   6 +-
+      tcg/optimize: Use fold_xx_to_i for orc
- target/arm/translate.c                    |   4 +-
+      tcg/optimize: Use fold_xi_to_x for mul
- target/cris/translate.c                   |   2 +-
+      tcg/optimize: Use fold_xi_to_x for div
- target/hppa/translate.c                   |   2 +-
+      tcg/optimize: Use fold_xx_to_i for rem
- target/i386/mem_helper.c                  |   2 +-
+      tcg/optimize: Optimize sign extensions
- target/i386/seg_helper.c                  |  56 ++--
+      tcg/optimize: Propagate sign info for logical operations
- target/i386/translate.c                   |   2 +-
+      tcg/optimize: Propagate sign info for setcond
- target/lm32/translate.c                   |   2 +-
+      tcg/optimize: Propagate sign info for bit counting
- target/m68k/op_helper.c                   |  77 ++++--
+      tcg/optimize: Propagate sign info for shifting
  target/m68k/translate.c                   |   2 +-
  target/microblaze/translate.c             |   2 +-
  target/mips/op_helper.c                   | 182 ++++--------
  target/mips/translate.c                   |   2 +-
  target/moxie/translate.c                  |   2 +-
  target/nios2/translate.c                  |   2 +-
  target/openrisc/translate.c               |   2 +-
  target/ppc/mem_helper.c                   |  13 +-
  target/ppc/translate.c                    |   4 +-
  target/riscv/cpu_helper.c                 |   2 +-
  target/riscv/translate.c                  |   2 +-
  target/s390x/mem_helper.c                 |  11 +-
  target/s390x/translate.c                  |   4 +-
  target/sh4/translate.c                    |   2 +-
  target/sparc/ldst_helper.c                |   2 +-
  target/sparc/translate.c                  |   2 +-
  target/tilegx/translate.c                 |   2 +-
  target/tricore/translate.c                |   2 +-
  target/unicore32/translate.c              |   2 +-
  target/xtensa/mmu_helper.c                |   5 +-
  target/xtensa/translate.c                 |   2 +-
  tcg/aarch64/tcg-target.inc.c              |   4 +-
  tcg/arm/tcg-target.inc.c                  |   4 +-
  tcg/i386/tcg-target.inc.c                 |   4 +-
  tcg/mips/tcg-target.inc.c                 |   2 +-
  tcg/optimize.c                            |   2 +-
  tcg/ppc/tcg-target.inc.c                  |   4 +-
  tcg/riscv/tcg-target.inc.c                |   4 +-
  tcg/s390/tcg-target.inc.c                 |   4 +-
  tcg/sparc/tcg-target.inc.c                |   2 +-
  tcg/tcg-common.c                          |   2 +-
  tcg/tcg-op-gvec.c                         |   8 +-
  tcg/tcg-op-vec.c                          |   6 +-
  tcg/tcg-op.c                              |   6 +-
  tcg/tcg.c                                 |   2 +-
  tcg/tci.c                                 |   2 +-
  MAINTAINERS                               |   4 +-
  configure                                 | 117 +++-----
  docs/devel/loads-stores.rst               | 215 ++++++++++----
 files changed, 1075 insertions(+), 1357 deletions(-)
  delete mode 100644 include/exec/cpu_ldst_template.h
  delete mode 100644 include/exec/cpu_ldst_useronly_template.h
  rename {tcg => include/tcg}/tcg-gvec-desc.h (100%)
  rename {tcg => include/tcg}/tcg-mo.h (100%)
  rename {tcg => include/tcg}/tcg-op-gvec.h (100%)
  rename {tcg => include/tcg}/tcg-op.h (99%)
  rename {tcg => include/tcg}/tcg-opc.h (100%)
  rename {tcg => include/tcg}/tcg.h (96%)
+ include/fpu/softfloat-macros.h |   82 --
+ include/hw/clock.h             |    5 +-
+ include/qemu/host-utils.h      |  121 +-
+ include/qemu/int128.h          |   20 +
+ target/ppc/int_helper.c        |   23 +-
+ tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
+ tests/unit/test-div128.c       |  197 +++
+ util/host-utils.c              |  147 ++-
+ tests/unit/meson.build         |    1 +
+files changed, 2053 insertions(+), 1187 deletions(-)
+ create mode 100644 tests/unit/test-div128.c

-[PULL 22/41] cputlb: Expand cpu_ldst_useronly_template.h in user-exec.c
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
-With the tracing hooks, the inline functions are no longer
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
 so simple.  Reduce the amount of preprocessor obfuscation
 by expanding the text of each of the functions generated.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Addition of not and xor on 128-bit integers.
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
 Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
 Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
 [rth: Split out logical operations.]
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h                   |  54 +++--
+ include/qemu/int128.h | 20 ++++++++++++++++++++
- include/exec/cpu_ldst_useronly_template.h | 159 ---------------
+file changed, 20 insertions(+)
  accel/tcg/user-exec.c                     | 236 ++++++++++++++++++++++
 files changed, 262 insertions(+), 187 deletions(-)
  delete mode 100644 include/exec/cpu_ldst_useronly_template.h
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/include/qemu/int128.h
-+++ b/include/exec/cpu_ldst.h
++++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return a;
  /* In user-only mode we provide only the _code and _data accessors. */
 -#define MEMSUFFIX _data
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_useronly_template.h"
 +uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr);
 +uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr);
 +uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr);
 +uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr);
 +int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr);
 +int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr);
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_useronly_template.h"
 +uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 +uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 +uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 +uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 +int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 +int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_useronly_template.h"
 +void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
 +void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
 +void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
 +void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val);
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_useronly_template.h"
 -#undef MEMSUFFIX
 -
 -#define MEMSUFFIX _code
 -#define CODE_ACCESS
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_useronly_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_useronly_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_useronly_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_useronly_template.h"
 -#undef MEMSUFFIX
 -#undef CODE_ACCESS
 +void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr);
 +void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr);
 +void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr);
 +void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint64_t val, uintptr_t retaddr);
  /*
   * Provide the same *_mmuidx_ra interface as for softmmu.
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
  #undef CPU_MMU_INDEX
  #undef MEMSUFFIX
 +#endif /* defined(CONFIG_USER_ONLY) */
 +
  uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
  uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
  uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
@@ -XXX,XX +XXX,XX @@ static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr)
      return (int16_t)cpu_lduw_code(env, addr);
  }
--#endif /* defined(CONFIG_USER_ONLY) */
++static inline Int128 int128_not(Int128 a)
 -
  /**
   * tlb_vaddr_to_host:
   * @env: CPUArchState
 diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
 --- a/include/exec/cpu_ldst_useronly_template.h
 +++ /dev/null
@@ -XXX,XX +XXX,XX @@
 -/*
 - *  User-only accessor function support
 - *
 - * Generate inline load/store functions for one data size.
 - *
 - * Generate a store function as well as signed and unsigned loads.
 - *
 - * Not used directly but included from cpu_ldst.h.
 - *
 - *  Copyright (c) 2015 Linaro Limited
 - *
 - * This library is free software; you can redistribute it and/or
 - * modify it under the terms of the GNU Lesser General Public
 - * License as published by the Free Software Foundation; either
 - * version 2 of the License, or (at your option) any later version.
 - *
 - * This library is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 - * Lesser General Public License for more details.
 - *
 - * You should have received a copy of the GNU Lesser General Public
 - * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 - */
 -
 -#if !defined(CODE_ACCESS)
 -#include "trace-root.h"
 -#endif
 -
 -#include "trace/mem.h"
 -
 -#if DATA_SIZE == 8
 -#define SUFFIX q
 -#define USUFFIX q
 -#define DATA_TYPE uint64_t
 -#define SHIFT 3
 -#elif DATA_SIZE == 4
 -#define SUFFIX l
 -#define USUFFIX l
 -#define DATA_TYPE uint32_t
 -#define SHIFT 2
 -#elif DATA_SIZE == 2
 -#define SUFFIX w
 -#define USUFFIX uw
 -#define DATA_TYPE uint16_t
 -#define DATA_STYPE int16_t
 -#define SHIFT 1
 -#elif DATA_SIZE == 1
 -#define SUFFIX b
 -#define USUFFIX ub
 -#define DATA_TYPE uint8_t
 -#define DATA_STYPE int8_t
 -#define SHIFT 0
 -#else
 -#error unsupported data size
 -#endif
 -
 -#if DATA_SIZE == 8
 -#define RES_TYPE uint64_t
 -#else
 -#define RES_TYPE uint32_t
 -#endif
 -
 -static inline RES_TYPE
 -glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
 -{
 -    RES_TYPE ret;
 -#ifdef CODE_ACCESS
 -    set_helper_retaddr(1);
 -    ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
 -    clear_helper_retaddr();
 -#else
 -    MemOp op = MO_TE | SHIFT;
 -    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -    ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
 -#endif
 -    return ret;
 -}
 -
 -#ifndef CODE_ACCESS
 -static inline RES_TYPE
 -glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                  abi_ptr ptr,
 -                                                  uintptr_t retaddr)
 -{
 -    RES_TYPE ret;
 -    set_helper_retaddr(retaddr);
 -    ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
 -    clear_helper_retaddr();
 -    return ret;
 -}
 -#endif
 -
 -#if DATA_SIZE <= 2
 -static inline int
 -glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
 -{
 -    int ret;
 -#ifdef CODE_ACCESS
 -    set_helper_retaddr(1);
 -    ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
 -    clear_helper_retaddr();
 -#else
 -    MemOp op = MO_TE | MO_SIGN | SHIFT;
 -    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -    ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
 -    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 -#endif
 -    return ret;
 -}
 -
 -#ifndef CODE_ACCESS
 -static inline int
 -glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                  abi_ptr ptr,
 -                                                  uintptr_t retaddr)
 -{
 -    int ret;
 -    set_helper_retaddr(retaddr);
 -    ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
 -    clear_helper_retaddr();
 -    return ret;
 -}
 -#endif /* CODE_ACCESS */
 -#endif /* DATA_SIZE <= 2 */
 -
 -#ifndef CODE_ACCESS
 -static inline void
 -glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr,
 -                                      RES_TYPE v)
 -{
 -    MemOp op = MO_TE | SHIFT;
 -    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, true);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -    glue(glue(st, SUFFIX), _p)(g2h(ptr), v);
 -    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 -}
 -
 -static inline void
 -glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                  abi_ptr ptr,
 -                                                  RES_TYPE v,
 -                                                  uintptr_t retaddr)
 -{
 -    set_helper_retaddr(retaddr);
 -    glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v);
 -    clear_helper_retaddr();
 -}
 -#endif
 -
 -#undef RES_TYPE
 -#undef DATA_TYPE
 -#undef DATA_STYPE
 -#undef SUFFIX
 -#undef USUFFIX
 -#undef DATA_SIZE
 -#undef SHIFT
 diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/user-exec.c
 +++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@
  #include "translate-all.h"
  #include "exec/helper-proto.h"
  #include "qemu/atomic128.h"
 +#include "trace-root.h"
 +#include "trace/mem.h"
  #undef EAX
  #undef ECX
@@ -XXX,XX +XXX,XX @@ int cpu_signal_handler(int host_signum, void *pinfo,
  /* The softmmu versions of these helpers are in cputlb.c.  */
 +uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr)
 +{
-+    uint32_t ret;
++    return ~a;
 +    uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = ldub_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
-+int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr)
+ static inline Int128 int128_and(Int128 a, Int128 b)
  {
      return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
      return a | b;
  }
 +static inline Int128 int128_xor(Int128 a, Int128 b)
 +{
-+    int ret;
++    return a ^ b;
 +    uint16_t meminfo = trace_mem_get_info(MO_SB, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = ldsb_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
-+uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr)
+ static inline Int128 int128_rshift(Int128 a, int n)
  {
      return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
      return int128_make128(a, (a < 0) ? -1 : 0);
  }
 +static inline Int128 int128_not(Int128 a)
 +{
-+    uint32_t ret;
++    return int128_make128(~a.lo, ~a.hi);
 +    uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = lduw_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
-+int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr)
+ static inline Int128 int128_and(Int128 a, Int128 b)
  {
      return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
      return int128_make128(a.lo | b.lo, a.hi | b.hi);
  }
 +static inline Int128 int128_xor(Int128 a, Int128 b)
 +{
-+    int ret;
++    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
 +    uint16_t meminfo = trace_mem_get_info(MO_TESW, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = ldsw_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
-+uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr)
+ static inline Int128 int128_rshift(Int128 a, int n)
-+{
+ {
-+    uint32_t ret;
+     int64_t h;
 +    uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = ldl_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
 +uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr)
 +{
 +    uint64_t ret;
 +    uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, false);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    ret = ldq_p(g2h(ptr));
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +    return ret;
 +}
 +
 +uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_ldub_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    int ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_ldsb_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_lduw_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    int ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_ldsw_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_ldl_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
 +{
 +    uint64_t ret;
 +
 +    set_helper_retaddr(retaddr);
 +    ret = cpu_ldq_data(env, ptr);
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
 +{
 +    uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, true);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    stb_p(g2h(ptr), val);
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +}
 +
 +void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
 +{
 +    uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, true);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    stw_p(g2h(ptr), val);
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +}
 +
 +void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
 +{
 +    uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, true);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    stl_p(g2h(ptr), val);
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +}
 +
 +void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val)
 +{
 +    uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, true);
 +
 +    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 +    stq_p(g2h(ptr), val);
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 +}
 +
 +void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    set_helper_retaddr(retaddr);
 +    cpu_stb_data(env, ptr, val);
 +    clear_helper_retaddr();
 +}
 +
 +void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    set_helper_retaddr(retaddr);
 +    cpu_stw_data(env, ptr, val);
 +    clear_helper_retaddr();
 +}
 +
 +void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    set_helper_retaddr(retaddr);
 +    cpu_stl_data(env, ptr, val);
 +    clear_helper_retaddr();
 +}
 +
 +void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
 +                     uint64_t val, uintptr_t retaddr)
 +{
 +    set_helper_retaddr(retaddr);
 +    cpu_stq_data(env, ptr, val);
 +    clear_helper_retaddr();
 +}
 +
 +uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(1);
 +    ret = ldub_p(g2h(ptr));
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr ptr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(1);
 +    ret = lduw_p(g2h(ptr));
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr ptr)
 +{
 +    uint32_t ret;
 +
 +    set_helper_retaddr(1);
 +    ret = ldl_p(g2h(ptr));
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
 +uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
 +{
 +    uint64_t ret;
 +
 +    set_helper_retaddr(1);
 +    ret = ldq_p(g2h(ptr));
 +    clear_helper_retaddr();
 +    return ret;
 +}
 +
  /* Do not allow unaligned operations to proceed.  Return the host address.  */
  static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
                                 int size, uintptr_t retaddr)
 --
-.20.1
+.25.1

-New patch
+[PULL 02/56] host-utils: move checks out of divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |  5 +++--
  include/qemu/host-utils.h | 34 ++++++++++++---------------------
  target/ppc/int_helper.c   | 14 +++++++++-----
  util/host-utils.c         | 40 ++++++++++++++++++---------------------
 files changed, 42 insertions(+), 51 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
 --
 .25.1

-New patch
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
+From: Luis Pires <luis.pires@eldorado.org.br>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/fpu/softfloat-macros.h | 82 ----------------------------------
  include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 files changed, 81 insertions(+), 82 deletions(-)
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/fpu/softfloat-macros.h
 +++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
   * so some portions are provided under:
   *  the SoftFloat-2a license
   *  the BSD license
 - *  GPL-v2-or-later
   *
   * Any future contributions to this file after December 1st 2014 will be
   * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
   * THE POSSIBILITY OF SUCH DAMAGE.
   */
 -/* Portions of this work are licensed under the terms of the GNU GPL,
 - * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-New patch
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 These will be used to implement new decimal floating point
 instructions from Power ISA 3.1.
 The remainder is now returned directly by divu128/divs128,
 freeing up phigh to receive the high 64 bits of the quotient.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |   6 +-
  include/qemu/host-utils.h |  20 ++++--
  target/ppc/int_helper.c   |   9 +--
  util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 files changed, 108 insertions(+), 60 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
      if (clk->period == 0) {
          return 0;
      }
 -    /*
 -     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns the remainder.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-New patch
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
  tests/unit/meson.build   |   1 +
 files changed, 198 insertions(+)
  create mode 100644 tests/unit/test-div128.c
 diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Test 128-bit division functions
 + *
 + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/host-utils.h"
 +
 +typedef struct {
 +    uint64_t high;
 +    uint64_t low;
 +    uint64_t rhigh;
 +    uint64_t rlow;
 +    uint64_t divisor;
 +    uint64_t remainder;
 +} test_data_unsigned;
 +
 +typedef struct {
 +    int64_t high;
 +    uint64_t low;
 +    int64_t rhigh;
 +    uint64_t rlow;
 +    int64_t divisor;
 +    int64_t remainder;
 +} test_data_signed;
 +
 +static const test_data_unsigned test_table_unsigned[] = {
 +    /* Dividend fits in 64 bits */
 +    { 0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000003ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000002ULL, 0x0000000000000001ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0xa000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000002ULL,
 +      0x4000000000000000ULL, 0x2000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x8000000000000000ULL, 0x0000000000000000ULL},
 +
 +    /* Dividend > 64 bits, with MSB 0 */
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x000000000000000dULL,
 +      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
 +      0x0000000000000010ULL, 0x0000000000000001ULL},
 +
 +    /* Dividend > 64 bits, with MSB 1 */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
 +      0x0000000000000010ULL, 0x000000000000000fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
 +      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
 +
 +    /**
 +     * Divisor == 64 bits, with MSB 1
 +     * and high 64 bits of dividend >= divisor
 +     * (for testing normalization)
 +     */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0xfddbb9977553310aULL,
 +      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
 +
 +    /* Dividend > 64 bits, divisor almost as big */
 +    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
 +      0x0000000000000000ULL, 0x000000000000000fULL,
 +      0x123456789abcdefeULL, 0x123456789abcde1fULL},
 +};
 +
 +static const test_data_signed test_table_signed[] = {
 +    /* Positive dividend, positive/negative divisors */
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
 +    int i;
 +    uint64_t rem;
 +    test_data_unsigned tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +static void test_divs128(void)
 +{
 +    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/host-utils/test_divu128", test_divu128);
 +    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/unit/meson.build
 +++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
    # all code tested by test-x86-cpuid is inside topology.h
    'test-x86-cpuid': [],
    'test-cutils': [],
 +  'test-div128': [],
    'test-shift128': [],
    'test-mul64': [],
    # all code tested by test-int128 is inside int128.h
 --
 .25.1

-[PULL 32/41] target/mips: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
-The separate suffixed functions were used to construct
+Prepare for tracking different masks by renaming this one.
 some do_##insn function switched on mmu_idx.  The interface
 is exactly identical to the *_mmuidx_ra functions.  Replace
 them directly and remove the constructions.
-Cc: Aurelien Jarno <aurelien@aurel32.net>
-Cc: Aleksandar Rikalo <aleksandar.rikalo@rt-rk.com>
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/mips/cpu.h       |   4 -
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
- target/mips/op_helper.c | 182 +++++++++++++---------------------------
+file changed, 72 insertions(+), 70 deletions(-)
 files changed, 60 insertions(+), 126 deletions(-)
-diff --git a/target/mips/cpu.h b/target/mips/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/mips/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/mips/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ extern uint32_t cpu_rddsp(uint32_t mask_num, CPUMIPSState *env);
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-  * MMU modes definitions. We carefully match the indices with our
+     TCGTemp *prev_copy;
-  * hflags layout.
+     TCGTemp *next_copy;
-  */
+     uint64_t val;
--#define MMU_MODE0_SUFFIX _kernel
+-    uint64_t mask;
--#define MMU_MODE1_SUFFIX _super
++    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
--#define MMU_MODE2_SUFFIX _user
+ } TempOptInfo;
--#define MMU_MODE3_SUFFIX _error
- #define MMU_USER_IDX 2
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
- static inline int hflags_mmu_index(uint32_t hflags)
+     ti->next_copy = ts;
-diff --git a/target/mips/op_helper.c b/target/mips/op_helper.c
+     ti->prev_copy = ts;
-index XXXXXXX..XXXXXXX 100644
+     ti->is_const = false;
---- a/target/mips/op_helper.c
+-    ti->mask = -1;
-+++ b/target/mips/op_helper.c
++    ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static void raise_exception(CPUMIPSState *env, uint32_t exception)
      do_raise_exception(env, exception, 0);
  }
--#if defined(CONFIG_USER_ONLY)
+ static void reset_temp(TCGArg arg)
--#define HELPER_LD(name, insn, type)                                     \
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--static inline type do_##name(CPUMIPSState *env, target_ulong addr,      \
+     if (ts->kind == TEMP_CONST) {
--                             int mem_idx, uintptr_t retaddr)            \
+         ti->is_const = true;
--{                                                                       \
+         ti->val = ts->val;
--    return (type) cpu_##insn##_data_ra(env, addr, retaddr);             \
+-        ti->mask = ts->val;
--}
++        ti->z_mask = ts->val;
--#else
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
--#define HELPER_LD(name, insn, type)                                     \
+             /* High bits of a 32-bit quantity are garbage.  */
--static inline type do_##name(CPUMIPSState *env, target_ulong addr,      \
+-            ti->mask |= ~0xffffffffull;
--                             int mem_idx, uintptr_t retaddr)            \
++            ti->z_mask |= ~0xffffffffull;
--{                                                                       \
+         }
--    switch (mem_idx) {                                                  \
+     } else {
--    case 0: return (type) cpu_##insn##_kernel_ra(env, addr, retaddr);   \
+         ti->is_const = false;
--    case 1: return (type) cpu_##insn##_super_ra(env, addr, retaddr);    \
+-        ti->mask = -1;
--    default:                                                            \
++        ti->z_mask = -1;
 -    case 2: return (type) cpu_##insn##_user_ra(env, addr, retaddr);     \
 -    case 3: return (type) cpu_##insn##_error_ra(env, addr, retaddr);    \
 -    }                                                                   \
 -}
 -#endif
 -HELPER_LD(lw, ldl, int32_t)
 -#if defined(TARGET_MIPS64)
 -HELPER_LD(ld, ldq, int64_t)
 -#endif
 -#undef HELPER_LD
 -
 -#if defined(CONFIG_USER_ONLY)
 -#define HELPER_ST(name, insn, type)                                     \
 -static inline void do_##name(CPUMIPSState *env, target_ulong addr,      \
 -                             type val, int mem_idx, uintptr_t retaddr)  \
 -{                                                                       \
 -    cpu_##insn##_data_ra(env, addr, val, retaddr);                      \
 -}
 -#else
 -#define HELPER_ST(name, insn, type)                                     \
 -static inline void do_##name(CPUMIPSState *env, target_ulong addr,      \
 -                             type val, int mem_idx, uintptr_t retaddr)  \
 -{                                                                       \
 -    switch (mem_idx) {                                                  \
 -    case 0:                                                             \
 -        cpu_##insn##_kernel_ra(env, addr, val, retaddr);                \
 -        break;                                                          \
 -    case 1:                                                             \
 -        cpu_##insn##_super_ra(env, addr, val, retaddr);                 \
 -        break;                                                          \
 -    default:                                                            \
 -    case 2:                                                             \
 -        cpu_##insn##_user_ra(env, addr, val, retaddr);                  \
 -        break;                                                          \
 -    case 3:                                                             \
 -        cpu_##insn##_error_ra(env, addr, val, retaddr);                 \
 -        break;                                                          \
 -    }                                                                   \
 -}
 -#endif
 -HELPER_ST(sb, stb, uint8_t)
 -HELPER_ST(sw, stl, uint32_t)
 -#if defined(TARGET_MIPS64)
 -HELPER_ST(sd, stq, uint64_t)
 -#endif
 -#undef HELPER_ST
 -
  /* 64 bits arithmetic for 32 bits hosts */
  static inline uint64_t get_HILO(CPUMIPSState *env)
  {
@@ -XXX,XX +XXX,XX @@ target_ulong helper_##name(CPUMIPSState *env, target_ulong arg, int mem_idx)  \
      }                                                                         \
      env->CP0_LLAddr = do_translate_address(env, arg, 0, GETPC());             \
      env->lladdr = arg;                                                        \
 -    env->llval = do_##insn(env, arg, mem_idx, GETPC());                       \
 +    env->llval = cpu_##insn##_mmuidx_ra(env, arg, mem_idx, GETPC());          \
      return env->llval;                                                        \
  }
 -HELPER_LD_ATOMIC(ll, lw, 0x3)
 +HELPER_LD_ATOMIC(ll, ldl, 0x3)
  #ifdef TARGET_MIPS64
 -HELPER_LD_ATOMIC(lld, ld, 0x7)
 +HELPER_LD_ATOMIC(lld, ldq, 0x7)
  #endif
  #undef HELPER_LD_ATOMIC
  #endif
@@ -XXX,XX +XXX,XX @@ HELPER_LD_ATOMIC(lld, ld, 0x7)
  void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                  int mem_idx)
  {
 -    do_sb(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
 +    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
      if (GET_LMASK(arg2) <= 2) {
 -        do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16), mem_idx,
 -              GETPC());
 +        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16),
 +                          mem_idx, GETPC());
      }
      if (GET_LMASK(arg2) <= 1) {
 -        do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8), mem_idx,
 -              GETPC());
 +        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8),
 +                          mem_idx, GETPC());
      }
      if (GET_LMASK(arg2) == 0) {
 -        do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)arg1, mem_idx,
 -              GETPC());
 +        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)arg1,
 +                          mem_idx, GETPC());
      }
  }
- void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-                 int mem_idx)
+     const TCGOpDef *def;
- {
+     TempOptInfo *di;
--    do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+     TempOptInfo *si;
-+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+-    uint64_t mask;
++    uint64_t z_mask;
-     if (GET_LMASK(arg2) >= 1) {
+     TCGOpcode new_op;
--        do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
--              GETPC());
+     if (ts_are_copies(dst_ts, src_ts)) {
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-+                          mem_idx, GETPC());
+     op->args[0] = dst;
      op->args[1] = src;
 -    mask = si->mask;
 +    z_mask = si->z_mask;
      if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
          /* High bits of the destination are now garbage.  */
 -        mask |= ~0xffffffffull;
 +        z_mask |= ~0xffffffffull;
      }
+-    di->mask = mask;
-     if (GET_LMASK(arg2) >= 2) {
++    di->z_mask = z_mask;
--        do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
--              GETPC());
+     if (src_ts->type == dst_ts->type) {
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
+         TempOptInfo *ni = ts_info(si->next_copy);
-+                          mem_idx, GETPC());
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
-     if (GET_LMASK(arg2) == 3) {
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
--        do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
+-        uint64_t mask, partmask, affected, tmp;
--              GETPC());
++        uint64_t z_mask, partmask, affected, tmp;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+         int nb_oargs, nb_iargs;
-+                          mem_idx, GETPC());
+         TCGOpcode opc = op->opc;
-     }
+         const TCGOpDef *def = &tcg_op_defs[opc];
- }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
+         /* Simplify using known-zero bits. Currently only ops with a single
- void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
+            output argument is supported. */
-                 int mem_idx)
+-        mask = -1;
- {
++        z_mask = -1;
--    do_sb(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
+         affected = -1;
-+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
+         switch (opc) {
+         CASE_OP_32_64(ext8s):
-     if (GET_LMASK64(arg2) <= 6) {
+-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
--        do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48), mem_idx,
++            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
--              GETPC());
+                 break;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48),
+             }
-+                          mem_idx, GETPC());
+             QEMU_FALLTHROUGH;
-     }
+         CASE_OP_32_64(ext8u):
+-            mask = 0xff;
-     if (GET_LMASK64(arg2) <= 5) {
++            z_mask = 0xff;
--        do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40), mem_idx,
+             goto and_const;
--              GETPC());
+         CASE_OP_32_64(ext16s):
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40),
+-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
-+                          mem_idx, GETPC());
++            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-     }
+                 break;
+             }
-     if (GET_LMASK64(arg2) <= 4) {
+             QEMU_FALLTHROUGH;
--        do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32), mem_idx,
+         CASE_OP_32_64(ext16u):
--              GETPC());
+-            mask = 0xffff;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32),
++            z_mask = 0xffff;
-+                          mem_idx, GETPC());
+             goto and_const;
-     }
+         case INDEX_op_ext32s_i64:
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
-     if (GET_LMASK64(arg2) <= 3) {
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
--        do_sb(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24), mem_idx,
+                 break;
--              GETPC());
+             }
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24),
+             QEMU_FALLTHROUGH;
-+                          mem_idx, GETPC());
+         case INDEX_op_ext32u_i64:
-     }
+-            mask = 0xffffffffU;
++            z_mask = 0xffffffffU;
-     if (GET_LMASK64(arg2) <= 2) {
+             goto and_const;
--        do_sb(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16), mem_idx,
--              GETPC());
+         CASE_OP_32_64(and):
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16),
+-            mask = arg_info(op->args[2])->mask;
-+                          mem_idx, GETPC());
++            z_mask = arg_info(op->args[2])->z_mask;
-     }
+             if (arg_is_const(op->args[2])) {
+         and_const:
-     if (GET_LMASK64(arg2) <= 1) {
+-                affected = arg_info(op->args[1])->mask & ~mask;
--        do_sb(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8), mem_idx,
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
--              GETPC());
+             }
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8),
+-            mask = arg_info(op->args[1])->mask & mask;
-+                          mem_idx, GETPC());
++            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-     }
+             break;
-     if (GET_LMASK64(arg2) <= 0) {
+         case INDEX_op_ext_i32_i64:
--        do_sb(env, GET_OFFSET(arg2, 7), (uint8_t)arg1, mem_idx,
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
--              GETPC());
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 7), (uint8_t)arg1,
+                 break;
-+                          mem_idx, GETPC());
+             }
-     }
+             QEMU_FALLTHROUGH;
- }
+         case INDEX_op_extu_i32_i64:
+             /* We do not compute affected as it is a size changing op.  */
- void helper_sdr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
-                 int mem_idx)
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
- {
+             break;
--    do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
-+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+         CASE_OP_32_64(andc):
+             /* Known-zeros does not imply known-ones.  Therefore unless
-     if (GET_LMASK64(arg2) >= 1) {
+                op->args[2] is constant, we can't infer anything from it.  */
--        do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
+             if (arg_is_const(op->args[2])) {
--              GETPC());
+-                mask = ~arg_info(op->args[2])->mask;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
++                z_mask = ~arg_info(op->args[2])->z_mask;
-+                          mem_idx, GETPC());
+                 goto and_const;
-     }
+             }
+             /* But we certainly know nothing outside args[1] may be set. */
-     if (GET_LMASK64(arg2) >= 2) {
+-            mask = arg_info(op->args[1])->mask;
--        do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
++            z_mask = arg_info(op->args[1])->z_mask;
--              GETPC());
+             break;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
-+                          mem_idx, GETPC());
+         case INDEX_op_sar_i32:
-     }
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 31;
-     if (GET_LMASK64(arg2) >= 3) {
+-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
--        do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
++                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
--              GETPC());
+             }
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+             break;
-+                          mem_idx, GETPC());
+         case INDEX_op_sar_i64:
-     }
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 63;
-     if (GET_LMASK64(arg2) >= 4) {
+-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
--        do_sb(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32), mem_idx,
++                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
--              GETPC());
+             }
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32),
+             break;
-+                          mem_idx, GETPC());
-     }
+         case INDEX_op_shr_i32:
+             if (arg_is_const(op->args[2])) {
-     if (GET_LMASK64(arg2) >= 5) {
+                 tmp = arg_info(op->args[2])->val & 31;
--        do_sb(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40), mem_idx,
+-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
--              GETPC());
++                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40),
+             }
-+                          mem_idx, GETPC());
+             break;
-     }
+         case INDEX_op_shr_i64:
+             if (arg_is_const(op->args[2])) {
-     if (GET_LMASK64(arg2) >= 6) {
+                 tmp = arg_info(op->args[2])->val & 63;
--        do_sb(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48), mem_idx,
+-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
--              GETPC());
++                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48),
+             }
-+                          mem_idx, GETPC());
+             break;
-     }
+         case INDEX_op_extrl_i64_i32:
-     if (GET_LMASK64(arg2) == 7) {
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
--        do_sb(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56), mem_idx,
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
--              GETPC());
+             break;
-+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56),
+         case INDEX_op_extrh_i64_i32:
-+                          mem_idx, GETPC());
+-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
-     }
++            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
- }
+             break;
- #endif /* TARGET_MIPS64 */
-@@ -XXX,XX +XXX,XX @@ void helper_lwm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
+         CASE_OP_32_64(shl):
+             if (arg_is_const(op->args[2])) {
-         for (i = 0; i < base_reglist; i++) {
+                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-             env->active_tc.gpr[multiple_regs[i]] =
+-                mask = arg_info(op->args[1])->mask << tmp;
--                (target_long)do_lw(env, addr, mem_idx, GETPC());
++                z_mask = arg_info(op->args[1])->z_mask << tmp;
-+                (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
+             }
-             addr += 4;
+             break;
          CASE_OP_32_64(neg):
              /* Set to 1 all bits to the left of the rightmost.  */
 -            mask = -(arg_info(op->args[1])->mask
 -                     & -arg_info(op->args[1])->mask);
 +            z_mask = -(arg_info(op->args[1])->z_mask
 +                       & -arg_info(op->args[1])->z_mask);
              break;
          CASE_OP_32_64(deposit):
 -            mask = deposit64(arg_info(op->args[1])->mask,
 -                             op->args[3], op->args[4],
 -                             arg_info(op->args[2])->mask);
 +            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                               op->args[3], op->args[4],
 +                               arg_info(op->args[2])->z_mask);
              break;
          CASE_OP_32_64(extract):
 -            mask = extract64(arg_info(op->args[1])->mask,
 -                             op->args[2], op->args[3]);
 +            z_mask = extract64(arg_info(op->args[1])->z_mask,
 +                               op->args[2], op->args[3]);
              if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(sextract):
 -            mask = sextract64(arg_info(op->args[1])->mask,
 -                              op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 +                                op->args[2], op->args[3]);
 +            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(or):
          CASE_OP_32_64(xor):
 -            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask
 +                   | arg_info(op->args[2])->z_mask;
              break;
          case INDEX_op_clz_i32:
          case INDEX_op_ctz_i32:
 -            mask = arg_info(op->args[2])->mask | 31;
 +            z_mask = arg_info(op->args[2])->z_mask | 31;
              break;
          case INDEX_op_clz_i64:
          case INDEX_op_ctz_i64:
 -            mask = arg_info(op->args[2])->mask | 63;
 +            z_mask = arg_info(op->args[2])->z_mask | 63;
              break;
          case INDEX_op_ctpop_i32:
 -            mask = 32 | 31;
 +            z_mask = 32 | 31;
              break;
          case INDEX_op_ctpop_i64:
 -            mask = 64 | 63;
 +            z_mask = 64 | 63;
              break;
          CASE_OP_32_64(setcond):
          case INDEX_op_setcond2_i32:
 -            mask = 1;
 +            z_mask = 1;
              break;
          CASE_OP_32_64(movcond):
 -            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
 +            z_mask = arg_info(op->args[3])->z_mask
 +                   | arg_info(op->args[4])->z_mask;
              break;
          CASE_OP_32_64(ld8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              break;
          CASE_OP_32_64(ld16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              break;
          case INDEX_op_ld32u_i64:
 -            mask = 0xffffffffu;
 +            z_mask = 0xffffffffu;
              break;
          CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
 -                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 +                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                  }
              }
              break;
          CASE_OP_32_64(bswap16):
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffff) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffff) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap16(mask);
 +            z_mask = bswap16(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int16_t)mask;
 +                z_mask = (int16_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(16, 48);
 +                z_mask |= MAKE_64BIT_MASK(16, 48);
                  break;
              }
              break;
          case INDEX_op_bswap32_i64:
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffffffffu) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffffffffu) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap32(mask);
 +            z_mask = bswap32(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int32_t)mask;
 +                z_mask = (int32_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(32, 32);
 +                z_mask |= MAKE_64BIT_MASK(32, 32);
                  break;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          /* 32-bit ops generate 32-bit results.  For the result is zero test
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
 -        partmask = mask;
 +        partmask = z_mask;
          if (!(def->flags & TCG_OPF_64BIT)) {
 -            mask |= ~(tcg_target_ulong)0xffffffffu;
 +            z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
-     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                    vs the high word of the input.  */
-     if (do_r31) {
+             do_setcond_high:
--        env->active_tc.gpr[31] = (target_long)do_lw(env, addr, mem_idx,
+                 reset_temp(op->args[0]);
--                                                    GETPC());
+-                arg_info(op->args[0])->mask = 1;
-+        env->active_tc.gpr[31] =
++                arg_info(op->args[0])->z_mask = 1;
-+            (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
+                 op->opc = INDEX_op_setcond_i32;
-     }
+                 op->args[1] = op->args[2];
- }
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ void helper_swm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
+                 }
-         target_ulong i;
+             do_setcond_low:
+                 reset_temp(op->args[0]);
-         for (i = 0; i < base_reglist; i++) {
+-                arg_info(op->args[0])->mask = 1;
--            do_sw(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
++                arg_info(op->args[0])->z_mask = 1;
--                  GETPC());
+                 op->opc = INDEX_op_setcond_i32;
-+            cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
+                 op->args[2] = op->args[3];
-+                              mem_idx, GETPC());
+                 op->args[3] = op->args[5];
-             addr += 4;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         }
+             /* Default case: we know nothing about operation (or were unable
-     }
+                to compute the operation result) so no propagation is done.
+                We trash everything if the operation is the end of a basic
-     if (do_r31) {
+-               block, otherwise we only trash the output args.  "mask" is
--        do_sw(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
++               block, otherwise we only trash the output args.  "z_mask" is
-+        cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
+                the non-zero bits mask for the first output arg.  */
-     }
+             if (def->flags & TCG_OPF_BB_END) {
- }
+                 memset(&temps_used, 0, sizeof(temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ void helper_ldm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
+                     /* Save the corresponding known-zero bits mask for the
-         target_ulong i;
+                        first output argument (only one supported so far). */
+                     if (i == 0) {
-         for (i = 0; i < base_reglist; i++) {
+-                        arg_info(op->args[i])->mask = mask;
--            env->active_tc.gpr[multiple_regs[i]] = do_ld(env, addr, mem_idx,
++                        arg_info(op->args[i])->z_mask = z_mask;
--                                                         GETPC());
+                     }
-+            env->active_tc.gpr[multiple_regs[i]] =
+                 }
-+                cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
+             }
              addr += 8;
          }
      }
      if (do_r31) {
 -        env->active_tc.gpr[31] = do_ld(env, addr, mem_idx, GETPC());
 +        env->active_tc.gpr[31] =
 +            cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
      }
  }
@@ -XXX,XX +XXX,XX @@ void helper_sdm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
          target_ulong i;
          for (i = 0; i < base_reglist; i++) {
 -            do_sd(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
 -                  GETPC());
 +            cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
 +                              mem_idx, GETPC());
              addr += 8;
          }
      }
      if (do_r31) {
 -        do_sd(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
 +        cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
      }
  }
  #endif
 --
-.20.1
+.25.1

-[PULL 33/41] target/s390x: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
+[PULL 07/56] tcg/optimize: Split out OptContext
-The generated functions aside from *_real are unused.
+Provide what will become a larger context for splitting
-The *_real functions have a couple of users in mem_helper.c;
+the very large tcg_optimize function.
 use *_mmuidx_ra instead, with MMU_REAL_IDX.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: David Hildenbrand <david@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
-v2: Use *_mmuidx_ra directly, without intermediate macros.
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
----
+file changed, 40 insertions(+), 37 deletions(-)
  target/s390x/cpu.h        |  5 -----
  target/s390x/mem_helper.c | 10 +++++-----
 files changed, 5 insertions(+), 10 deletions(-)
-diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/s390x/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
- #define TARGET_INSN_START_EXTRA_WORDS 2
+ } TempOptInfo;
--#define MMU_MODE0_SUFFIX _primary
++typedef struct OptContext {
--#define MMU_MODE1_SUFFIX _secondary
++    TCGTempSet temps_used;
--#define MMU_MODE2_SUFFIX _home
++} OptContext;
--#define MMU_MODE3_SUFFIX _real
++
--
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
- #define MMU_USER_IDX 0
+ {
+     return ts->state_ptr;
- #define S390_MAX_CPUS 248
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
-diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
+ }
-index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/mem_helper.c
+ /* Initialize and activate a temporary.  */
-+++ b/target/s390x/mem_helper.c
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
-@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(testblock)(CPUS390XState *env, uint64_t real_addr)
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-     real_addr = wrap_address(env, real_addr) & TARGET_PAGE_MASK;
+ {
+     size_t idx = temp_idx(ts);
-     for (i = 0; i < TARGET_PAGE_SIZE; i += 8) {
+     TempOptInfo *ti;
--        cpu_stq_real_ra(env, real_addr + i, 0, ra);
-+        cpu_stq_mmuidx_ra(env, real_addr + i, 0, MMU_REAL_IDX, ra);
+-    if (test_bit(idx, temps_used->l)) {
 +    if (test_bit(idx, ctx->temps_used.l)) {
          return;
      }
+-    set_bit(idx, temps_used->l);
-     return 0;
++    set_bit(idx, ctx->temps_used.l);
-@@ -XXX,XX +XXX,XX @@ void HELPER(idte)(CPUS390XState *env, uint64_t r1, uint64_t r2, uint32_t m4)
-         for (i = 0; i < entries; i++) {
+     ti = ts->state_ptr;
-             /* addresses are not wrapped in 24/31bit mode but table index is */
+     if (ti == NULL) {
-             raddr = table + ((index + i) & 0x7ff) * sizeof(entry);
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--            entry = cpu_ldq_real_ra(env, raddr, ra);
+     }
-+            entry = cpu_ldq_mmuidx_ra(env, raddr, MMU_REAL_IDX, ra);
+ }
-             if (!(entry & REGION_ENTRY_I)) {
-                 /* we are allowed to not store if already invalid */
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
-                 entry |= REGION_ENTRY_I;
++static void init_arg_info(OptContext *ctx, TCGArg arg)
--                cpu_stq_real_ra(env, raddr, entry, ra);
+ {
-+                cpu_stq_mmuidx_ra(env, raddr, entry, MMU_REAL_IDX, ra);
+-    init_ts_info(temps_used, arg_temp(arg));
 +    init_ts_info(ctx, arg_temp(arg));
  }
  static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
 -static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 +static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                               TCGOp *op, TCGArg dst, uint64_t val)
  {
      const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
 -    init_ts_info(temps_used, tv);
 +    init_ts_info(ctx, tv);
      tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
  {
      int nb_temps, nb_globals, i;
      TCGOp *op, *op_next, *prev_mb = NULL;
 -    TCGTempSet temps_used;
 +    OptContext ctx = {};
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      nb_temps = s->nb_temps;
      nb_globals = s->nb_globals;
 -    memset(&temps_used, 0, sizeof(temps_used));
      for (i = 0; i < nb_temps; ++i) {
          s->temps[i].state_ptr = NULL;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              for (i = 0; i < nb_oargs + nb_iargs; i++) {
                  TCGTemp *ts = arg_temp(op->args[i]);
                  if (ts) {
 -                    init_ts_info(&temps_used, ts);
 +                    init_ts_info(&ctx, ts);
                  }
              }
          } else {
              nb_oargs = def->nb_oargs;
              nb_iargs = def->nb_iargs;
              for (i = 0; i < nb_oargs + nb_iargs; i++) {
 -                init_arg_info(&temps_used, op->args[i]);
 +                init_arg_info(&ctx, op->args[i]);
              }
          }
-     }
-@@ -XXX,XX +XXX,XX @@ void HELPER(ipte)(CPUS390XState *env, uint64_t pto, uint64_t vaddr,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     pte_addr += VADDR_PAGE_TX(vaddr) * 8;
+         CASE_OP_32_64(rotr):
+             if (arg_is_const(op->args[1])
-     /* Mark the page table entry as invalid */
+                 && arg_info(op->args[1])->val == 0) {
--    pte = cpu_ldq_real_ra(env, pte_addr, ra);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
-+    pte = cpu_ldq_mmuidx_ra(env, pte_addr, MMU_REAL_IDX, ra);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
-     pte |= PAGE_ENTRY_I;
+                 continue;
--    cpu_stq_real_ra(env, pte_addr, pte, ra);
+             }
-+    cpu_stq_mmuidx_ra(env, pte_addr, pte, MMU_REAL_IDX, ra);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     /* XXX we exploit the fact that Linux passes the exact virtual
-        address here - it's not obliged to! */
+         if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
 +            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  } else {
                      tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                  }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[1], op->args[2]);
              if (tmp != 2) {
                  if (tmp) {
 -                    memset(&temps_used, 0, sizeof(temps_used));
 +                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                      op->opc = INDEX_op_br;
                      op->args[0] = op->args[3];
                  } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (tmp != 2) {
                  if (tmp) {
              do_brcond_true:
 -                    memset(&temps_used, 0, sizeof(temps_used));
 +                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                      op->opc = INDEX_op_br;
                      op->args[0] = op->args[5];
                  } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_brcond_high:
 -                memset(&temps_used, 0, sizeof(temps_used));
 +                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = INDEX_op_brcond_i32;
                  op->args[0] = op->args[1];
                  op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      goto do_default;
                  }
              do_brcond_low:
 -                memset(&temps_used, 0, sizeof(temps_used));
 +                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = INDEX_op_brcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
              } else if ((op->args[5] == TCG_COND_LT
                          || op->args[5] == TCG_COND_GE)
                         && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!(tcg_call_flags(op)
                    & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                  for (i = 0; i < nb_globals; i++) {
 -                    if (test_bit(i, temps_used.l)) {
 +                    if (test_bit(i, ctx.temps_used.l)) {
                          reset_ts(&s->temps[i]);
                      }
                  }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
 -                memset(&temps_used, 0, sizeof(temps_used));
 +                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
              } else {
          do_reset_output:
                  for (i = 0; i < nb_oargs; i++) {
 --
-.20.1
+.25.1

-[PULL 16/41] plugins: Include trace/mem.h in api.c
+[PULL 08/56] tcg/optimize: Remove do_default label
-Code movement in an upcoming patch will show that this file
+Break the final cleanup clause out of the main switch
-was implicitly depending on trace/mem.h being included beforehand.
+statement.  When fully folding an opcode to mov/movi,
 use "continue" to process the next opcode, else break
 to fall into the final cleanup.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reported-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- plugins/api.c | 1 +
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
-file changed, 1 insertion(+)
+file changed, 94 insertions(+), 96 deletions(-)
-diff --git a/plugins/api.c b/plugins/api.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/plugins/api.c
+--- a/tcg/optimize.c
-+++ b/plugins/api.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #include "qemu/plugin-memory.h"
+         switch (opc) {
- #include "hw/boards.h"
+         CASE_OP_32_64_VEC(mov):
- #endif
+             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-+#include "trace/mem.h"
+-            break;
++            continue;
- /* Uninstall and Reset handlers */
+         case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
 -                break;
 +                continue;
              } else if (args_are_copies(op->args[1], op->args[2])) {
                  op->opc = INDEX_op_dup_vec;
                  TCGOP_VECE(op) = MO_32;
                  nb_iargs = 1;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(not):
          CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(bswap16):
          CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(add):
          CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  } else {
                      tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                  }
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(deposit):
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(extract):
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(sextract):
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(extract2):
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                      ((uint32_t)v2 << (32 - shr)));
                  }
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(setcond):
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(brcond):
              tmp = do_constant_folding_cond(opc, op->args[0],
                                             op->args[1], op->args[2]);
 -            if (tmp != 2) {
 -                if (tmp) {
 -                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                    op->opc = INDEX_op_br;
 -                    op->args[0] = op->args[3];
 -                } else {
 -                    tcg_op_remove(s, op);
 -                }
 +            switch (tmp) {
 +            case 0:
 +                tcg_op_remove(s, op);
 +                continue;
 +            case 1:
 +                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 +                op->opc = opc = INDEX_op_br;
 +                op->args[0] = op->args[3];
                  break;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(movcond):
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
                  tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 -                break;
 +                continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                  uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (fv == 1 && tv == 0) {
                      cond = tcg_invert_cond(cond);
                  } else if (!(tv == 1 && fv == 0)) {
 -                    goto do_default;
 +                    break;
                  }
                  op->args[3] = cond;
                  op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                   : INDEX_op_setcond_i64);
                  nb_iargs = 2;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_add2_i32:
          case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rh = op->args[1];
                  tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                  tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_mulu2_i32:
              if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rh = op->args[1];
                  tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                  tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_brcond2_i32:
              tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                              op->args[4]);
 -            if (tmp != 2) {
 -                if (tmp) {
 -            do_brcond_true:
 -                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                    op->opc = INDEX_op_br;
 -                    op->args[0] = op->args[5];
 -                } else {
 +            if (tmp == 0) {
              do_brcond_false:
 -                    tcg_op_remove(s, op);
 -                }
 -            } else if ((op->args[4] == TCG_COND_LT
 -                        || op->args[4] == TCG_COND_GE)
 -                       && arg_is_const(op->args[2])
 -                       && arg_info(op->args[2])->val == 0
 -                       && arg_is_const(op->args[3])
 -                       && arg_info(op->args[3])->val == 0) {
 +                tcg_op_remove(s, op);
 +                continue;
 +            }
 +            if (tmp == 1) {
 +            do_brcond_true:
 +                op->opc = opc = INDEX_op_br;
 +                op->args[0] = op->args[5];
 +                break;
 +            }
 +            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
 +                 && arg_is_const(op->args[2])
 +                 && arg_info(op->args[2])->val == 0
 +                 && arg_is_const(op->args[3])
 +                 && arg_info(op->args[3])->val == 0) {
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_brcond_high:
 -                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                op->opc = INDEX_op_brcond_i32;
 +                op->opc = opc = INDEX_op_brcond_i32;
                  op->args[0] = op->args[1];
                  op->args[1] = op->args[3];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[4] == TCG_COND_EQ) {
 +                break;
 +            }
 +            if (op->args[4] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (tmp == 0) {
                      goto do_brcond_false;
                  } else if (tmp != 1) {
 -                    goto do_default;
 +                    break;
                  }
              do_brcond_low:
                  memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[4] == TCG_COND_NE) {
 +                break;
 +            }
 +            if (op->args[4] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  } else if (tmp == 1) {
                      goto do_brcond_true;
                  }
 -                goto do_default;
 -            } else {
 -                goto do_default;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (tmp != 2) {
              do_setcond_const:
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -            } else if ((op->args[5] == TCG_COND_LT
 -                        || op->args[5] == TCG_COND_GE)
 -                       && arg_is_const(op->args[3])
 -                       && arg_info(op->args[3])->val == 0
 -                       && arg_is_const(op->args[4])
 -                       && arg_info(op->args[4])->val == 0) {
 +                continue;
 +            }
 +            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 +                 && arg_is_const(op->args[3])
 +                 && arg_info(op->args[3])->val == 0
 +                 && arg_is_const(op->args[4])
 +                 && arg_info(op->args[4])->val == 0) {
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[5] == TCG_COND_EQ) {
 +                break;
 +            }
 +            if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (tmp == 0) {
                      goto do_setcond_high;
                  } else if (tmp != 1) {
 -                    goto do_default;
 +                    break;
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
 -            } else if (op->args[5] == TCG_COND_NE) {
 +                break;
 +            }
 +            if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  } else if (tmp == 1) {
                      goto do_setcond_const;
                  }
 -                goto do_default;
 -            } else {
 -                goto do_default;
              }
              break;
 -        case INDEX_op_call:
 -            if (!(tcg_call_flags(op)
 +        default:
 +            break;
 +        }
 +
 +        /* Some of the folding above can change opc. */
 +        opc = op->opc;
 +        def = &tcg_op_defs[opc];
 +        if (def->flags & TCG_OPF_BB_END) {
 +            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 +        } else {
 +            if (opc == INDEX_op_call &&
 +                !(tcg_call_flags(op)
                    & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                  for (i = 0; i < nb_globals; i++) {
                      if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      }
                  }
              }
 -            goto do_reset_output;
 -        default:
 -        do_default:
 -            /* Default case: we know nothing about operation (or were unable
 -               to compute the operation result) so no propagation is done.
 -               We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "z_mask" is
 -               the non-zero bits mask for the first output arg.  */
 -            if (def->flags & TCG_OPF_BB_END) {
 -                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -            } else {
 -        do_reset_output:
 -                for (i = 0; i < nb_oargs; i++) {
 -                    reset_temp(op->args[i]);
 -                    /* Save the corresponding known-zero bits mask for the
 -                       first output argument (only one supported so far). */
 -                    if (i == 0) {
 -                        arg_info(op->args[i])->z_mask = z_mask;
 -                    }
 +            for (i = 0; i < nb_oargs; i++) {
 +                reset_temp(op->args[i]);
 +                /* Save the corresponding known-zero bits mask for the
 +                   first output argument (only one supported so far). */
 +                if (i == 0) {
 +                    arg_info(op->args[i])->z_mask = z_mask;
                  }
              }
 -            break;
          }
          /* Eliminate duplicate and redundant fence instructions.  */
 --
-.20.1
+.25.1

-New patch
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+Adjust the interface to take the OptContext parameter instead
 of TCGContext or both.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 file changed, 34 insertions(+), 33 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
  } TempOptInfo;
  typedef struct OptContext {
 +    TCGContext *tcg;
      TCGTempSet temps_used;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
      return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
 -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
 -        tcg_op_remove(s, op);
 +        tcg_op_remove(ctx->tcg, op);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
 -static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
 -                             TCGOp *op, TCGArg dst, uint64_t val)
 +static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
 +                             TCGArg dst, uint64_t val)
  {
      const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
      init_ts_info(ctx, tv);
 -    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 +    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
  {
      int nb_temps, nb_globals, i;
      TCGOp *op, *op_next, *prev_mb = NULL;
 -    OptContext ctx = {};
 +    OptContext ctx = { .tcg = s };
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(rotr):
              if (arg_is_const(op->args[1])
                  && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(or):
          CASE_OP_32_64_VEC(and):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             allocator where needed and possible.  Also detect copies. */
          switch (opc) {
          CASE_OP_32_64_VEC(mov):
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 .25.1

-[PULL 34/41] target/ppc: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
-There are only two uses.  Within dcbz_common, the local variable
+This will expose the variable to subroutines that
-mmu_idx already contains the epid computation, and we can avoid
+will be broken out of tcg_optimize.
 repeating it for the store.  Within helper_icbiep, the usage is
 trivially expanded using PPC_TLB_EPID_LOAD.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Acked-by: David Gibson <david@gibson.dropbear.id.au>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/ppc/cpu.h        |  2 --
+ tcg/optimize.c | 11 ++++++-----
- target/ppc/mem_helper.c | 11 ++---------
+file changed, 6 insertions(+), 5 deletions(-)
 files changed, 2 insertions(+), 11 deletions(-)
-diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/ppc/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct ppc_radix_page_info {
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-  * + real/paged mode combinations. The other two modes are for
-  * external PID load/store.
+ typedef struct OptContext {
-  */
+     TCGContext *tcg;
--#define MMU_MODE8_SUFFIX _epl
++    TCGOp *prev_mb;
--#define MMU_MODE9_SUFFIX _eps
+     TCGTempSet temps_used;
- #define PPC_TLB_EPID_LOAD 8
+ } OptContext;
- #define PPC_TLB_EPID_STORE 9
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
-diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
+ void tcg_optimize(TCGContext *s)
-index XXXXXXX..XXXXXXX 100644
+ {
---- a/target/ppc/mem_helper.c
+     int nb_temps, nb_globals, i;
-+++ b/target/ppc/mem_helper.c
+-    TCGOp *op, *op_next, *prev_mb = NULL;
-@@ -XXX,XX +XXX,XX @@ static void dcbz_common(CPUPPCState *env, target_ulong addr,
++    TCGOp *op, *op_next;
-     } else {
+     OptContext ctx = { .tcg = s };
-         /* Slow path */
-         for (i = 0; i < dcbz_size; i += 8) {
+     /* Array VALS has an element for each temp.
--            if (epid) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--#if !defined(CONFIG_USER_ONLY)
+         }
--                /* Does not make sense on USER_ONLY config */
--                cpu_stq_eps_ra(env, addr + i, 0, retaddr);
+         /* Eliminate duplicate and redundant fence instructions.  */
--#endif
+-        if (prev_mb) {
--            } else {
++        if (ctx.prev_mb) {
--                cpu_stq_data_ra(env, addr + i, 0, retaddr);
+             switch (opc) {
--            }
+             case INDEX_op_mb:
-+            cpu_stq_mmuidx_ra(env, addr + i, 0, mmu_idx, retaddr);
+                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                   * barrier.  This is stricter than specified but for
                   * the purposes of TCG is better than not optimizing.
                   */
 -                prev_mb->args[0] |= op->args[0];
 +                ctx.prev_mb->args[0] |= op->args[0];
                  tcg_op_remove(s, op);
                  break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              case INDEX_op_qemu_st_i64:
              case INDEX_op_call:
                  /* Opcodes that touch guest memory stop the optimization.  */
 -                prev_mb = NULL;
 +                ctx.prev_mb = NULL;
                  break;
              }
          } else if (opc == INDEX_op_mb) {
 -            prev_mb = op;
 +            ctx.prev_mb = op;
          }
      }
  }
-@@ -XXX,XX +XXX,XX @@ void helper_icbiep(CPUPPCState *env, target_ulong addr)
- #if !defined(CONFIG_USER_ONLY)
-     /* See comments above */
-     addr &= ~(env->dcache_line_size - 1);
--    cpu_ldl_epl_ra(env, addr, GETPC());
-+    cpu_ldl_mmuidx_ra(env, addr, PPC_TLB_EPID_LOAD, GETPC());
- #endif
- }
 --
-.20.1
+.25.1

-[PULL 10/41] trace: Remove trace_mem_build_info_no_se_[bl]e
+[PULL 11/56] tcg/optimize: Split out init_arguments
-It is easy for the atomic helpers to use trace_mem_build_info
+There was no real reason for calls to have separate code here.
-directly, without resorting to symbol pasting.  For this usage,
+Unify init for calls vs non-calls using the call path, which
-we cannot use trace_mem_get_info, because the MemOp does not
+handles TCG_CALL_DUMMY_ARG.
 support 16-byte accesses.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/atomic_template.h | 67 +++++++++++++------------------------
+ tcg/optimize.c | 25 +++++++++++--------------
- trace/mem-internal.h        | 17 ----------
+file changed, 11 insertions(+), 14 deletions(-)
 files changed, 24 insertions(+), 60 deletions(-)
-diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/atomic_template.h
+--- a/tcg/optimize.c
-+++ b/accel/tcg/atomic_template.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-    the ATOMIC_NAME macro, and redefined below.  */
+     }
  #if DATA_SIZE == 1
  # define END
 -# define MEND _be /* either le or be would be fine */
  #elif defined(HOST_WORDS_BIGENDIAN)
  # define END  _be
 -# define MEND _be
  #else
  # define END  _le
 -# define MEND _le
  #endif
  ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
      DATA_TYPE ret;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_rmw_pre(env, addr, info);
  #if DATA_SIZE == 16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
  {
      ATOMIC_MMU_DECLS;
      DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_ld_pre(env, addr, info);
      val = atomic16_read(haddr);
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
  {
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true,
 -                                                          ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, true,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_st_pre(env, addr, info);
      atomic16_set(haddr, val);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
      DATA_TYPE ret;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
 -                                                          ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_rmw_pre(env, addr, info);
      ret = atomic_xchg__nocheck(haddr, val);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
      ATOMIC_MMU_DECLS;                                               \
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
      DATA_TYPE ret;                                                  \
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
 -                                                           false,   \
 -                                                           ATOMIC_MMU_IDX); \
 -                                                                    \
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,    \
 +                                         ATOMIC_MMU_IDX);           \
      atomic_trace_rmw_pre(env, addr, info);                          \
      ret = atomic_##X(haddr, val);                                   \
      ATOMIC_MMU_CLEANUP;                                             \
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
      ATOMIC_MMU_DECLS;                                               \
      XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                          \
      XDATA_TYPE cmp, old, new, val = xval;                           \
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
 -                                                           false,   \
 -                                                           ATOMIC_MMU_IDX); \
 -                                                                    \
 +    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,    \
 +                                         ATOMIC_MMU_IDX);           \
      atomic_trace_rmw_pre(env, addr, info);                          \
      smp_mb();                                                       \
      cmp = atomic_read__nocheck(haddr);                              \
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX,  DATA_TYPE, new)
  #endif /* DATA SIZE >= 16 */
  #undef END
 -#undef MEND
  #if DATA_SIZE > 1
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX,  DATA_TYPE, new)
     within the ATOMIC_NAME macro.  */
  #ifdef HOST_WORDS_BIGENDIAN
  # define END  _le
 -# define MEND _le
  #else
  # define END  _be
 -# define MEND _be
  #endif
  ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
      DATA_TYPE ret;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
 -                                                           false,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_rmw_pre(env, addr, info);
  #if DATA_SIZE == 16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
  {
      ATOMIC_MMU_DECLS;
      DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
 -                                                           false,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_ld_pre(env, addr, info);
      val = atomic16_read(haddr);
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
  {
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
 -                                                           true,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, true,
 +                                         ATOMIC_MMU_IDX);
      val = BSWAP(val);
      atomic_trace_st_pre(env, addr, info);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
      ATOMIC_MMU_DECLS;
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
      ABI_TYPE ret;
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
 -                                                           false,
 -                                                           ATOMIC_MMU_IDX);
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
 +                                         ATOMIC_MMU_IDX);
      atomic_trace_rmw_pre(env, addr, info);
      ret = atomic_xchg__nocheck(haddr, BSWAP(val));
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
      ATOMIC_MMU_DECLS;                                               \
      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
      DATA_TYPE ret;                                                  \
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
 -                                                           false,   \
 -                                                           ATOMIC_MMU_IDX); \
 -                                                                    \
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP,    \
 +                                         false, ATOMIC_MMU_IDX);    \
      atomic_trace_rmw_pre(env, addr, info);                          \
      ret = atomic_##X(haddr, BSWAP(val));                            \
      ATOMIC_MMU_CLEANUP;                                             \
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
      ATOMIC_MMU_DECLS;                                               \
      XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                          \
      XDATA_TYPE ldo, ldn, old, new, val = xval;                      \
 -    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
 -                                                           false,   \
 -                                                           ATOMIC_MMU_IDX); \
 -                                                                    \
 +    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP,    \
 +                                         false, ATOMIC_MMU_IDX);    \
      atomic_trace_rmw_pre(env, addr, info);                          \
      smp_mb();                                                       \
      ldn = atomic_read__nocheck(haddr);                              \
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
  #endif /* DATA_SIZE >= 16 */
  #undef END
 -#undef MEND
  #endif /* DATA_SIZE > 1 */
  #undef BSWAP
 diff --git a/trace/mem-internal.h b/trace/mem-internal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/trace/mem-internal.h
 +++ b/trace/mem-internal.h
@@ -XXX,XX +XXX,XX @@ static inline uint16_t trace_mem_get_info(MemOp op,
                                  mmu_idx);
  }
--/* Used by the atomic helpers */
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
 -static inline
 -uint16_t trace_mem_build_info_no_se_be(int size_shift, bool store,
 -                                       TCGMemOpIdx oi)
 -{
--    return trace_mem_build_info(size_shift, false, MO_BE, store,
+-    init_ts_info(ctx, arg_temp(arg));
 -                                get_mmuidx(oi));
 -}
 -
--static inline
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
--uint16_t trace_mem_build_info_no_se_le(int size_shift, bool store,
+ {
--                                       TCGMemOpIdx oi)
+     TCGTemp *i, *g, *l;
--{
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
--    return trace_mem_build_info(size_shift, false, MO_LE, store,
+     return false;
--                                get_mmuidx(oi));
+ }
--}
--
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
- #endif /* TRACE__MEM_INTERNAL_H */
++{
 +    for (int i = 0; i < nb_args; i++) {
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        if (ts) {
 +            init_ts_info(ctx, ts);
 +        }
 +    }
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (opc == INDEX_op_call) {
              nb_oargs = TCGOP_CALLO(op);
              nb_iargs = TCGOP_CALLI(op);
 -            for (i = 0; i < nb_oargs + nb_iargs; i++) {
 -                TCGTemp *ts = arg_temp(op->args[i]);
 -                if (ts) {
 -                    init_ts_info(&ctx, ts);
 -                }
 -            }
          } else {
              nb_oargs = def->nb_oargs;
              nb_iargs = def->nb_iargs;
 -            for (i = 0; i < nb_oargs + nb_iargs; i++) {
 -                init_arg_info(&ctx, op->args[i]);
 -            }
          }
 +        init_arguments(&ctx, op, nb_oargs + nb_iargs);
          /* Do copy propagation */
          for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 --
-.20.1
+.25.1

-[PULL 14/41] linux-user: Include tcg.h in syscall.c
+[PULL 12/56] tcg/optimize: Split out copy_propagate
-Code movement in an upcoming patch will show that this file
+Continue splitting tcg_optimize.
 was implicitly depending on tcg.h being included indirectly.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- linux-user/syscall.c | 1 +
+ tcg/optimize.c | 22 ++++++++++++++--------
-file changed, 1 insertion(+)
+file changed, 14 insertions(+), 8 deletions(-)
-diff --git a/linux-user/syscall.c b/linux-user/syscall.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/linux-user/syscall.c
+--- a/tcg/optimize.c
-+++ b/linux-user/syscall.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
- #include "user/syscall-trace.h"
+     }
- #include "qapi/error.h"
+ }
- #include "fd-trans.h"
-+#include "tcg.h"
++static void copy_propagate(OptContext *ctx, TCGOp *op,
++                           int nb_oargs, int nb_iargs)
- #ifndef CLONE_IO
++{
- #define CLONE_IO                0x80000000      /* Clone io context */
++    TCGContext *s = ctx->tcg;
 +
 +    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        if (ts && ts_is_copy(ts)) {
 +            op->args[i] = temp_arg(find_better_copy(s, ts));
 +        }
 +    }
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              nb_iargs = def->nb_iargs;
          }
          init_arguments(&ctx, op, nb_oargs + nb_iargs);
 -
 -        /* Do copy propagation */
 -        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 -            TCGTemp *ts = arg_temp(op->args[i]);
 -            if (ts && ts_is_copy(ts)) {
 -                op->args[i] = temp_arg(find_better_copy(s, ts));
 -            }
 -        }
 +        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
          /* For commutative operations make constant second argument */
          switch (opc) {
 --
-.20.1
+.25.1

-New patch
+[PULL 13/56] tcg/optimize: Split out fold_call
+Calls are special in that they have a variable number
+of arguments, and need to be able to clobber globals.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
+file changed, 41 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static bool fold_call(OptContext *ctx, TCGOp *op)
++{
++    TCGContext *s = ctx->tcg;
++    int nb_oargs = TCGOP_CALLO(op);
++    int nb_iargs = TCGOP_CALLI(op);
++    int flags, i;
++
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
++    copy_propagate(ctx, op, nb_oargs, nb_iargs);
++
++    /* If the function reads or writes globals, reset temp data. */
++    flags = tcg_call_flags(op);
++    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
++        int nb_globals = s->nb_globals;
++
++        for (i = 0; i < nb_globals; i++) {
++            if (test_bit(i, ctx->temps_used.l)) {
++                reset_ts(&ctx->tcg->temps[i]);
++            }
++        }
++    }
++
++    /* Reset temp data for outputs. */
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++    }
++
++    /* Stop optimizing MB across calls. */
++    ctx->prev_mb = NULL;
++    return true;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+-    int nb_temps, nb_globals, i;
++    int nb_temps, i;
+     TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+        available through the doubly linked circular list. */
+     nb_temps = s->nb_temps;
+-    nb_globals = s->nb_globals;
+-
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+-        const TCGOpDef *def = &tcg_op_defs[opc];
++        const TCGOpDef *def;
+-        /* Count the arguments, and initialize the temps that are
+-           going to be used */
++        /* Calls are special. */
+         if (opc == INDEX_op_call) {
+-            nb_oargs = TCGOP_CALLO(op);
+-            nb_iargs = TCGOP_CALLI(op);
+-        } else {
+-            nb_oargs = def->nb_oargs;
+-            nb_iargs = def->nb_iargs;
++            fold_call(&ctx, op);
++            continue;
+         }
++
++        def = &tcg_op_defs[opc];
++        nb_oargs = def->nb_oargs;
++        nb_iargs = def->nb_iargs;
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
+-            if (opc == INDEX_op_call &&
+-                !(tcg_call_flags(op)
+-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+-                for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, ctx.temps_used.l)) {
+-                        reset_ts(&s->temps[i]);
+-                    }
+-                }
+-            }
+-
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i32:
+             case INDEX_op_qemu_st8_i32:
+             case INDEX_op_qemu_st_i64:
+-            case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+                 ctx.prev_mb = NULL;
+                 break;
+--
+.25.1

-[PULL 13/41] accel/tcg: Include tcg.h in tcg-runtime.c
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
-Code movement in an upcoming patch will show that this file
+Rather than try to keep these up-to-date across folding,
-was implicitly depending on tcg.h being included indirectly.
+re-read nb_oargs at the end, after re-reading the opcode.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+A couple of asserts need dropping, but that will take care
 of itself as we split the function further.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/tcg-runtime.c | 1 +
+ tcg/optimize.c | 14 ++++----------
-file changed, 1 insertion(+)
+file changed, 4 insertions(+), 10 deletions(-)
-diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-runtime.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/tcg-runtime.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #include "exec/tb-lookup.h"
- #include "disas/disas.h"
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
- #include "exec/log.h"
+         uint64_t z_mask, partmask, affected, tmp;
-+#include "tcg.h"
+-        int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
- /* 32-bit helpers */
+         const TCGOpDef *def;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          }
          def = &tcg_op_defs[opc];
 -        nb_oargs = def->nb_oargs;
 -        nb_iargs = def->nb_iargs;
 -        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 -        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 +        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
 +        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
          /* For commutative operations make constant second argument */
          switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(qemu_ld):
              {
 -                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
 +                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
                      z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          }
          if (partmask == 0) {
 -            tcg_debug_assert(nb_oargs == 1);
              tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
 -            tcg_debug_assert(nb_oargs == 1);
              tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              } else if (args_are_copies(op->args[1], op->args[2])) {
                  op->opc = INDEX_op_dup_vec;
                  TCGOP_VECE(op) = MO_32;
 -                nb_iargs = 1;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->opc = opc = (opc == INDEX_op_movcond_i32
                                   ? INDEX_op_setcond_i32
                                   : INDEX_op_setcond_i64);
 -                nb_iargs = 2;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (def->flags & TCG_OPF_BB_END) {
              memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
          } else {
 +            int nb_oargs = def->nb_oargs;
              for (i = 0; i < nb_oargs; i++) {
                  reset_temp(op->args[i]);
                  /* Save the corresponding known-zero bits mask for the
 --
-.20.1
+.25.1

-New patch
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
+Return -1 instead of 2 for failure, so that we can
 use comparisons against 0 for all cases.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 file changed, 74 insertions(+), 71 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
      }
  }
 -/* Return 2 if the condition can't be simplified, and the result
 -   of the condition (0 or 1) if it can */
 -static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
 -                                       TCGArg y, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +                                    TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
          case TCG_COND_GEU:
              return 1;
          default:
 -            return 2;
 +            return -1;
          }
      }
 -    return 2;
 +    return -1;
  }
 -/* Return 2 if the condition can't be simplified, and the result
 -   of the condition (0 or 1) if it can */
 -static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
  {
      TCGArg al = p1[0], ah = p1[1];
      TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
          return do_constant_folding_cond_eq(c);
      }
 -    return 2;
 +    return -1;
  }
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          CASE_OP_32_64(setcond):
 -            tmp = do_constant_folding_cond(opc, op->args[1],
 -                                           op->args[2], op->args[3]);
 -            if (tmp != 2) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +            i = do_constant_folding_cond(opc, op->args[1],
 +                                         op->args[2], op->args[3]);
 +            if (i >= 0) {
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              break;
          CASE_OP_32_64(brcond):
 -            tmp = do_constant_folding_cond(opc, op->args[0],
 -                                           op->args[1], op->args[2]);
 -            switch (tmp) {
 -            case 0:
 +            i = do_constant_folding_cond(opc, op->args[0],
 +                                         op->args[1], op->args[2]);
 +            if (i == 0) {
                  tcg_op_remove(s, op);
                  continue;
 -            case 1:
 +            } else if (i > 0) {
                  memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = opc = INDEX_op_br;
                  op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          CASE_OP_32_64(movcond):
 -            tmp = do_constant_folding_cond(opc, op->args[1],
 -                                           op->args[2], op->args[5]);
 -            if (tmp != 2) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
 +            i = do_constant_folding_cond(opc, op->args[1],
 +                                         op->args[2], op->args[5]);
 +            if (i >= 0) {
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          case INDEX_op_brcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
 -                                            op->args[4]);
 -            if (tmp == 0) {
 +            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
 +                                          op->args[4]);
 +            if (i == 0) {
              do_brcond_false:
                  tcg_op_remove(s, op);
                  continue;
              }
 -            if (tmp == 1) {
 +            if (i > 0) {
              do_brcond_true:
                  op->opc = opc = INDEX_op_br;
                  op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
              }
              break;
          case INDEX_op_setcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                            op->args[5]);
 -            if (tmp != 2) {
 +            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 +                                          op->args[5]);
 +            if (i >= 0) {
              do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_const;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
              }
 --
 .25.1

-[PULL 31/41] target/m68k: Use cpu_*_mmuidx_ra instead of MMU_MODE{0, 1}_SUFFIX
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
-The generated *_user functions are unused.  The *_kernel functions
+This will allow callers to tail call to these functions
-have a couple of users in op_helper.c; use *_mmuidx_ra instead,
+and return true indicating processing complete.
 with MMU_KERNEL_IDX.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Laurent Vivier <laurent@vivier.eu>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
-v2: Use *_mmuidx_ra directly, without intermediate macros.
+ tcg/optimize.c | 9 +++++----
----
+file changed, 5 insertions(+), 4 deletions(-)
  target/m68k/cpu.h       |  2 --
  target/m68k/op_helper.c | 77 +++++++++++++++++++++++++----------------
 files changed, 47 insertions(+), 32 deletions(-)
-diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/m68k/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/m68k/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ enum {
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
- #define cpu_list m68k_cpu_list
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  /* MMU modes definitions */
 -#define MMU_MODE0_SUFFIX _kernel
 -#define MMU_MODE1_SUFFIX _user
  #define MMU_KERNEL_IDX 0
  #define MMU_USER_IDX 1
  static inline int cpu_mmu_index (CPUM68KState *env, bool ifetch)
 diff --git a/target/m68k/op_helper.c b/target/m68k/op_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/op_helper.c
 +++ b/target/m68k/op_helper.c
@@ -XXX,XX +XXX,XX @@ static void cf_rte(CPUM68KState *env)
      uint32_t fmt;
      sp = env->aregs[7];
 -    fmt = cpu_ldl_kernel(env, sp);
 -    env->pc = cpu_ldl_kernel(env, sp + 4);
 +    fmt = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
 +    env->pc = cpu_ldl_mmuidx_ra(env, sp + 4, MMU_KERNEL_IDX, 0);
      sp |= (fmt >> 28) & 3;
      env->aregs[7] = sp + 8;
@@ -XXX,XX +XXX,XX @@ static void m68k_rte(CPUM68KState *env)
      sp = env->aregs[7];
  throwaway:
 -    sr = cpu_lduw_kernel(env, sp);
 +    sr = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
      sp += 2;
 -    env->pc = cpu_ldl_kernel(env, sp);
 +    env->pc = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
      sp += 4;
      if (m68k_feature(env, M68K_FEATURE_QUAD_MULDIV)) {
          /*  all except 68000 */
 -        fmt = cpu_lduw_kernel(env, sp);
 +        fmt = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
          sp += 2;
          switch (fmt >> 12) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static void cf_interrupt_all(CPUM68KState *env, int is_hw)
      /* ??? This could cause MMU faults.  */
      sp &= ~3;
      sp -= 4;
 -    cpu_stl_kernel(env, sp, retaddr);
 +    cpu_stl_mmuidx_ra(env, sp, retaddr, MMU_KERNEL_IDX, 0);
      sp -= 4;
 -    cpu_stl_kernel(env, sp, fmt);
 +    cpu_stl_mmuidx_ra(env, sp, fmt, MMU_KERNEL_IDX, 0);
      env->aregs[7] = sp;
      /* Jump to vector.  */
 -    env->pc = cpu_ldl_kernel(env, env->vbr + vector);
 +    env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
  }
- static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-@@ -XXX,XX +XXX,XX @@ static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-         switch (format) {
+ {
-         case 4:
+     TCGTemp *dst_ts = arg_temp(dst);
-             *sp -= 4;
+     TCGTemp *src_ts = arg_temp(src);
--            cpu_stl_kernel(env, *sp, env->pc);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-+            cpu_stl_mmuidx_ra(env, *sp, env->pc, MMU_KERNEL_IDX, 0);
-             *sp -= 4;
+     if (ts_are_copies(dst_ts, src_ts)) {
--            cpu_stl_kernel(env, *sp, addr);
+         tcg_op_remove(ctx->tcg, op);
-+            cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
+-        return;
-             break;
++        return true;
          case 3:
          case 2:
              *sp -= 4;
 -            cpu_stl_kernel(env, *sp, addr);
 +            cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
              break;
          }
          *sp -= 2;
 -        cpu_stw_kernel(env, *sp, (format << 12) + (cs->exception_index << 2));
 +        cpu_stw_mmuidx_ra(env, *sp, (format << 12) + (cs->exception_index << 2),
 +                          MMU_KERNEL_IDX, 0);
      }
-     *sp -= 4;
--    cpu_stl_kernel(env, *sp, retaddr);
+     reset_ts(dst_ts);
-+    cpu_stl_mmuidx_ra(env, *sp, retaddr, MMU_KERNEL_IDX, 0);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-     *sp -= 2;
+         di->is_const = si->is_const;
--    cpu_stw_kernel(env, *sp, sr);
+         di->val = si->val;
-+    cpu_stw_mmuidx_ra(env, *sp, sr, MMU_KERNEL_IDX, 0);
+     }
 +    return true;
  }
- static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-@@ -XXX,XX +XXX,XX @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
++static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-             cpu_abort(cs, "DOUBLE MMU FAULT\n");
+                              TCGArg dst, uint64_t val)
-         }
+ {
-         env->mmu.fault = true;
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-+        /* push data 3 */
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-         sp -= 4;
+     /* Convert movi to mov with constant temp. */
--        cpu_stl_kernel(env, sp, 0); /* push data 3 */
+     tv = tcg_constant_internal(type, val);
-+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+     init_ts_info(ctx, tv);
-+        /* push data 2 */
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-         sp -= 4;
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 -        cpu_stl_kernel(env, sp, 0); /* push data 2 */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* push data 1 */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* push data 1 */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 1 / push data 0 */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* write back 1 / push data 0 */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 1 address */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* write back 1 address */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 2 data */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* write back 2 data */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 2 address */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* write back 2 address */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 3 data */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, 0); /* write back 3 data */
 +        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 3 address */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, env->mmu.ar); /* write back 3 address */
 +        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
 +        /* fault address */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, env->mmu.ar); /* fault address */
 +        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
 +        /* write back 1 status */
          sp -= 2;
 -        cpu_stw_kernel(env, sp, 0); /* write back 1 status */
 +        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 2 status */
          sp -= 2;
 -        cpu_stw_kernel(env, sp, 0); /* write back 2 status */
 +        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* write back 3 status */
          sp -= 2;
 -        cpu_stw_kernel(env, sp, 0); /* write back 3 status */
 +        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
 +        /* special status word */
          sp -= 2;
 -        cpu_stw_kernel(env, sp, env->mmu.ssw); /* special status word */
 +        cpu_stw_mmuidx_ra(env, sp, env->mmu.ssw, MMU_KERNEL_IDX, 0);
 +        /* effective address */
          sp -= 4;
 -        cpu_stl_kernel(env, sp, env->mmu.ar); /* effective address */
 +        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
 +
          do_stack_frame(env, &sp, 7, oldsr, 0, retaddr);
          env->mmu.fault = false;
          if (qemu_loglevel_mask(CPU_LOG_INT)) {
@@ -XXX,XX +XXX,XX @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
      env->aregs[7] = sp;
      /* Jump to vector.  */
 -    env->pc = cpu_ldl_kernel(env, env->vbr + vector);
 +    env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
  }
- static void do_interrupt_all(CPUM68KState *env, int is_hw)
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
 --
-.20.1
+.25.1

-[PULL 07/41] configure: Support -static-pie if requested
+[PULL 17/56] tcg/optimize: Split out finish_folding
-Recent toolchains support static and pie at the same time.
+Copy z_mask into OptContext, for writeback to the
+first output within the new function.
 As with normal dynamic builds, allow --static to default to PIE
 if supported by the toolchain.  Allow --enable/--disable-pie to
 override the default.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
-v2: Fix --disable-pie --static
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
----
+file changed, 33 insertions(+), 16 deletions(-)
  configure | 19 ++++++++++++-------
 file changed, 12 insertions(+), 7 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ for opt do
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-   ;;
+     TCGContext *tcg;
-   --static)
+     TCGOp *prev_mb;
-     static="yes"
+     TCGTempSet temps_used;
--    LDFLAGS="-static $LDFLAGS"
++
-     QEMU_PKG_CONFIG_FLAGS="--static $QEMU_PKG_CONFIG_FLAGS"
++    /* In flight values from optimization. */
-   ;;
++    uint64_t z_mask;
-   --mandir=*) mandir="$optarg"
+ } OptContext;
-@@ -XXX,XX +XXX,XX @@ if test "$static" = "yes" ; then
-   if test "$modules" = "yes" ; then
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-     error_exit "static and modules are mutually incompatible"
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
-   fi
+     }
--  if test "$pie" = "yes" ; then
+ }
--    error_exit "static and pie are mutually incompatible"
--  else
++static void finish_folding(OptContext *ctx, TCGOp *op)
--    pie="no"
++{
--  fi
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
- fi
++    int i, nb_oargs;
++
- # Unconditional check for compiler __thread support
++    /*
-@@ -XXX,XX +XXX,XX @@ if compile_prog "-Werror -fno-pie" "-no-pie"; then
++     * For an opcode that ends a BB, reset all temp data.
-   LDFLAGS_NOPIE="-no-pie"
++     * We do no cross-BB optimization.
- fi
++     */
++    if (def->flags & TCG_OPF_BB_END) {
--if test "$pie" = "no"; then
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-+if test "$static" = "yes"; then
++        ctx->prev_mb = NULL;
-+  if test "$pie" != "no" && compile_prog "-fPIE -DPIE" "-static-pie"; then
++        return;
-+    QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
++    }
-+    LDFLAGS="-static-pie $LDFLAGS"
++
-+    pie="yes"
++    nb_oargs = def->nb_oargs;
-+  elif test "$pie" = "yes"; then
++    for (i = 0; i < nb_oargs; i++) {
-+    error_exit "-static-pie not available due to missing toolchain support"
++        reset_temp(op->args[i]);
-+  else
++        /*
-+    LDFLAGS="-static $LDFLAGS"
++         * Save the corresponding known-zero bits mask for the
-+    pie="no"
++         * first output argument (only one supported so far).
-+  fi
++         */
-+elif test "$pie" = "no"; then
++        if (i == 0) {
-   QEMU_CFLAGS="$CFLAGS_NOPIE $QEMU_CFLAGS"
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
-   LDFLAGS="$LDFLAGS_NOPIE $LDFLAGS"
++        }
- elif compile_prog "-fPIE -DPIE" "-pie"; then
++    }
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
      TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
 +        ctx.z_mask = z_mask;
          if (partmask == 0) {
              tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Some of the folding above can change opc. */
 -        opc = op->opc;
 -        def = &tcg_op_defs[opc];
 -        if (def->flags & TCG_OPF_BB_END) {
 -            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -        } else {
 -            int nb_oargs = def->nb_oargs;
 -            for (i = 0; i < nb_oargs; i++) {
 -                reset_temp(op->args[i]);
 -                /* Save the corresponding known-zero bits mask for the
 -                   first output argument (only one supported so far). */
 -                if (i == 0) {
 -                    arg_info(op->args[i])->z_mask = z_mask;
 -                }
 -            }
 -        }
 +        finish_folding(&ctx, op);
          /* Eliminate duplicate and redundant fence instructions.  */
          if (ctx.prev_mb) {
 --
-.20.1
+.25.1

-[PULL 05/41] configure: Unnest detection of -z,relro and -z,now
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
-There is nothing about these options that is related to PIE.
-Use them unconditionally.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Fangrui Song <i@maskray.me>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
-v2: Do not split into two tests.
+ tcg/optimize.c | 9 ++++++---
 ---
  configure | 9 ++++++---
 file changed, 6 insertions(+), 3 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ if test "$pie" != "no" ; then
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
+         uint64_t z_mask, partmask, affected, tmp;
-     LDFLAGS="-pie $LDFLAGS"
+         TCGOpcode opc = op->opc;
-     pie="yes"
+         const TCGOpDef *def;
--    if compile_prog "" "-Wl,-z,relro -Wl,-z,now" ; then
++        bool done = false;
--      LDFLAGS="-Wl,-z,relro -Wl,-z,now $LDFLAGS"
--    fi
+         /* Calls are special. */
-   else
+         if (opc == INDEX_op_call) {
-     if test "$pie" = "yes"; then
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-       error_exit "PIE not available due to missing toolchain support"
+            allocator where needed and possible.  Also detect copies. */
-@@ -XXX,XX +XXX,XX @@ if test "$pie" != "no" ; then
+         switch (opc) {
-   fi
+         CASE_OP_32_64_VEC(mov):
- fi
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            continue;
-+# Detect support for PT_GNU_RELRO + DT_BIND_NOW.
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+# The combination is known as "full relro", because .got.plt is read-only too.
++            break;
-+if compile_prog "" "-Wl,-z,relro -Wl,-z,now" ; then
-+  LDFLAGS="-Wl,-z,relro -Wl,-z,now $LDFLAGS"
+         case INDEX_op_dup_vec:
-+fi
+             if (arg_is_const(op->args[1])) {
-+
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- ##########################################
+             break;
- # __sync_fetch_and_and requires at least -march=i486. Many toolchains
+         }
- # use i686 as default anyway, but for those that don't, an explicit
 -        finish_folding(&ctx, op);
 +        if (!done) {
 +            finish_folding(&ctx, op);
 +        }
          /* Eliminate duplicate and redundant fence instructions.  */
          if (ctx.prev_mb) {
 --
-.20.1
+.25.1

-[PULL 20/41] cputlb: Provide cpu_(ld,st}*_mmuidx_ra for user-only
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
-This finishes the new interface began with the previous patch.
+This puts the separate mb optimization into the same framework
-Document the interface and deprecate MMU_MODE<N>_SUFFIX.
+as the others.  While fold_qemu_{ld,st} are currently identical,
 that won't last as more code gets moved.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h     |  80 +++++++++++++-
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
- docs/devel/loads-stores.rst | 211 ++++++++++++++++++++++++++----------
+file changed, 51 insertions(+), 38 deletions(-)
 files changed, 230 insertions(+), 61 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
-  *
+     return true;
-  * The syntax for the accessors is:
+ }
-  *
-- * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
++static bool fold_mb(OptContext *ctx, TCGOp *op)
 + * load:  cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
 + *        cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)
 + *        cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmu_idx, retaddr)
   *
 - * store: cpu_st{sign}{size}_{mmusuffix}(env, ptr, val)
 + * store: cpu_st{size}_{mmusuffix}(env, ptr, val)
 + *        cpu_st{size}_{mmusuffix}_ra(env, ptr, val, retaddr)
 + *        cpu_st{size}_mmuidx_ra(env, ptr, val, mmu_idx, retaddr)
   *
   * sign is:
   * (empty): for 32 and 64 bit sizes
@@ -XXX,XX +XXX,XX @@
   *   l: 32 bits
   *   q: 64 bits
   *
 - * mmusuffix is one of the generic suffixes "data" or "code", or
 - * (for softmmu configs)  a target-specific MMU mode suffix as defined
 - * in target cpu.h.
 + * mmusuffix is one of the generic suffixes "data" or "code", or "mmuidx".
 + * The "mmuidx" suffix carries an extra mmu_idx argument that specifies
 + * the index to use; the "data" and "code" suffixes take the index from
 + * cpu_mmu_index().
   */
  #ifndef CPU_LDST_H
  #define CPU_LDST_H
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
  #undef MEMSUFFIX
  #undef CODE_ACCESS
 +/*
 + * Provide the same *_mmuidx_ra interface as for softmmu.
 + * The mmu_idx argument is ignored.
 + */
 +
 +static inline uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                                          int mmu_idx, uintptr_t ra)
 +{
-+    return cpu_ldub_data_ra(env, addr, ra);
++    /* Eliminate duplicate and redundant fence instructions.  */
 +    if (ctx->prev_mb) {
 +        /*
 +         * Merge two barriers of the same type into one,
 +         * or a weaker barrier into a stronger one,
 +         * or two weaker barriers into a stronger one.
 +         *   mb X; mb Y => mb X|Y
 +         *   mb; strl => mb; st
 +         *   ldaq; mb => ld; mb
 +         *   ldaq; strl => ld; mb; st
 +         * Other combinations are also merged into a strong
 +         * barrier.  This is stricter than specified but for
 +         * the purposes of TCG is better than not optimizing.
 +         */
 +        ctx->prev_mb->args[0] |= op->args[0];
 +        tcg_op_remove(ctx->tcg, op);
 +    } else {
 +        ctx->prev_mb = op;
 +    }
 +    return true;
 +}
 +
-+static inline uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
++static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 +                                          int mmu_idx, uintptr_t ra)
 +{
-+    return cpu_lduw_data_ra(env, addr, ra);
++    /* Opcodes that touch guest memory stop the mb optimization.  */
 +    ctx->prev_mb = NULL;
 +    return false;
 +}
 +
-+static inline uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
++static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
 +                                         int mmu_idx, uintptr_t ra)
 +{
-+    return cpu_ldl_data_ra(env, addr, ra);
++    /* Opcodes that touch guest memory stop the mb optimization.  */
 +    ctx->prev_mb = NULL;
 +    return false;
 +}
 +
-+static inline uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ /* Propagate constants and copies, fold constant expressions. */
-+                                         int mmu_idx, uintptr_t ra)
+ void tcg_optimize(TCGContext *s)
-+{
+ {
-+    return cpu_ldq_data_ra(env, addr, ra);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+}
+             }
              break;
 +        case INDEX_op_mb:
 +            done = fold_mb(&ctx, op);
 +            break;
 +        case INDEX_op_qemu_ld_i32:
 +        case INDEX_op_qemu_ld_i64:
 +            done = fold_qemu_ld(&ctx, op);
 +            break;
 +        case INDEX_op_qemu_st_i32:
 +        case INDEX_op_qemu_st8_i32:
 +        case INDEX_op_qemu_st_i64:
 +            done = fold_qemu_st(&ctx, op);
 +            break;
 +
-+static inline int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+         default:
-+                                     int mmu_idx, uintptr_t ra)
+             break;
-+{
+         }
-+    return cpu_ldsb_data_ra(env, addr, ra);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+}
+         if (!done) {
-+
+             finish_folding(&ctx, op);
-+static inline int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+         }
 +                                     int mmu_idx, uintptr_t ra)
 +{
 +    return cpu_ldsw_data_ra(env, addr, ra);
 +}
 +
 +static inline void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                                     uint32_t val, int mmu_idx, uintptr_t ra)
 +{
 +    cpu_stb_data_ra(env, addr, val, ra);
 +}
 +
 +static inline void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                                     uint32_t val, int mmu_idx, uintptr_t ra)
 +{
 +    cpu_stw_data_ra(env, addr, val, ra);
 +}
 +
 +static inline void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                                     uint32_t val, int mmu_idx, uintptr_t ra)
 +{
 +    cpu_stl_data_ra(env, addr, val, ra);
 +}
 +
 +static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                                     uint64_t val, int mmu_idx, uintptr_t ra)
 +{
 +    cpu_stq_data_ra(env, addr, val, ra);
 +}
 +
  #else
  /* Needed for TCG_OVERSIZED_GUEST */
 diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
 index XXXXXXX..XXXXXXX 100644
 --- a/docs/devel/loads-stores.rst
 +++ b/docs/devel/loads-stores.rst
@@ -XXX,XX +XXX,XX @@ Regexes for git grep
   - ``\<ldn_\([hbl]e\)?_p\>``
   - ``\<stn_\([hbl]e\)?_p\>``
 -``cpu_{ld,st}_*``
 -~~~~~~~~~~~~~~~~~
 +``cpu_{ld,st}*_mmuidx_ra``
 +~~~~~~~~~~~~~~~~~~~~~~~~~~
 -These functions operate on a guest virtual address. Be aware
 -that these functions may cause a guest CPU exception to be
 -taken (e.g. for an alignment fault or MMU fault) which will
 -result in guest CPU state being updated and control longjumping
 -out of the function call. They should therefore only be used
 -in code that is implementing emulation of the target CPU.
 +These functions operate on a guest virtual address plus a context,
 +known as a "mmu index" or ``mmuidx``, which controls how that virtual
 +address is translated.  The meaning of the indexes are target specific,
 +but specifying a particular index might be necessary if, for instance,
 +the helper requires an "always as non-privileged" access rather that
 +the default access for the current state of the guest CPU.
 -These functions may throw an exception (longjmp() back out
 -to the top level TCG loop). This means they must only be used
 -from helper functions where the translator has saved all
 -necessary CPU state before generating the helper function call.
 -It's usually better to use the ``_ra`` variants described below
 -from helper functions, but these functions are the right choice
 -for calls made from hooks like the CPU do_interrupt hook or
 -when you know for certain that the translator had to save all
 -the CPU state that ``cpu_restore_state()`` would restore anyway.
 +These functions may cause a guest CPU exception to be taken
 +(e.g. for an alignment fault or MMU fault) which will result in
 +guest CPU state being updated and control longjmp'ing out of the
 +function call.  They should therefore only be used in code that is
 +implementing emulation of the guest CPU.
 +
 +The ``retaddr`` parameter is used to control unwinding of the
 +guest CPU state in case of a guest CPU exception.  This is passed
 +to ``cpu_restore_state()``.  Therefore the value should either be 0,
 +to indicate that the guest CPU state is already synchronized, or
 +the result of ``GETPC()`` from the top level ``HELPER(foo)``
 +function, which is a return address into the generated code.
  Function names follow the pattern:
 -load: ``cpu_ld{sign}{size}_{mmusuffix}(env, ptr)``
 +load: ``cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmuidx, retaddr)``
 -store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
 +store: ``cpu_st{size}_mmuidx_ra(env, ptr, val, mmuidx, retaddr)``
  ``sign``
   - (empty) : for 32 or 64 bit sizes
@@ -XXX,XX +XXX,XX @@ store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
   - ``l`` : 32 bits
   - ``q`` : 64 bits
 -``mmusuffix`` is one of the generic suffixes ``data`` or ``code``, or
 -(for softmmu configs) a target-specific MMU mode suffix as defined
 -in the target's ``cpu.h``.
 +Regexes for git grep:
 + - ``\<cpu_ld[us]\?[bwlq]_mmuidx_ra\>``
 + - ``\<cpu_st[bwlq]_mmuidx_ra\>``
 -Regexes for git grep
 - - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+\>``
 - - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+\>``
 +``cpu_{ld,st}*_data_ra``
 +~~~~~~~~~~~~~~~~~~~~~~~~
 -``cpu_{ld,st}_*_ra``
 -~~~~~~~~~~~~~~~~~~~~
 -
--These functions work like the ``cpu_{ld,st}_*`` functions except
+-        /* Eliminate duplicate and redundant fence instructions.  */
--that they also take a ``retaddr`` argument. This extra argument
+-        if (ctx.prev_mb) {
--allows for correct unwinding of any exception that is taken,
+-            switch (opc) {
--and should generally be the result of GETPC() called directly
+-            case INDEX_op_mb:
--from the top level HELPER(foo) function (i.e. the return address
+-                /* Merge two barriers of the same type into one,
--in the generated code).
+-                 * or a weaker barrier into a stronger one,
-+These functions work like the ``cpu_{ld,st}_mmuidx_ra`` functions
+-                 * or two weaker barriers into a stronger one.
-+except that the ``mmuidx`` parameter is taken from the current mode
+-                 *   mb X; mb Y => mb X|Y
-+of the guest CPU, as determined by ``cpu_mmu_index(env, false)``.
+-                 *   mb; strl => mb; st
+-                 *   ldaq; mb => ld; mb
- These are generally the preferred way to do accesses by guest
+-                 *   ldaq; strl => ld; mb; st
--virtual address from helper functions; see the documentation
+-                 * Other combinations are also merged into a strong
--of the non-``_ra`` variants for when those would be better.
+-                 * barrier.  This is stricter than specified but for
 -                 * the purposes of TCG is better than not optimizing.
 -                 */
 -                ctx.prev_mb->args[0] |= op->args[0];
 -                tcg_op_remove(s, op);
 -                break;
 -
--Calling these functions with a ``retaddr`` argument of 0 is
+-            default:
--equivalent to calling the non-``_ra`` version of the function.
+-                /* Opcodes that end the block stop the optimization.  */
-+virtual address from helper functions, unless the access should
+-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-+be performed with a context other than the default.
+-                    break;
+-                }
- Function names follow the pattern:
+-                /* fallthru */
+-            case INDEX_op_qemu_ld_i32:
--load: ``cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)``
+-            case INDEX_op_qemu_ld_i64:
-+load: ``cpu_ld{sign}{size}_data_ra(env, ptr, ra)``
+-            case INDEX_op_qemu_st_i32:
+-            case INDEX_op_qemu_st8_i32:
--store: ``cpu_st{sign}{size}_{mmusuffix}_ra(env, ptr, val, retaddr)``
+-            case INDEX_op_qemu_st_i64:
-+store: ``cpu_st{size}_data_ra(env, ptr, val, ra)``
+-                /* Opcodes that touch guest memory stop the optimization.  */
-+
+-                ctx.prev_mb = NULL;
-+``sign``
+-                break;
-+ - (empty) : for 32 or 64 bit sizes
+-            }
-+ - ``u`` : unsigned
+-        } else if (opc == INDEX_op_mb) {
-+ - ``s`` : signed
+-            ctx.prev_mb = op;
-+
+-        }
-+``size``
+     }
-+ - ``b`` : 8 bits
+ }
 + - ``w`` : 16 bits
 + - ``l`` : 32 bits
 + - ``q`` : 64 bits
 +
 +Regexes for git grep:
 + - ``\<cpu_ld[us]\?[bwlq]_data_ra\>``
 + - ``\<cpu_st[bwlq]_data_ra\>``
 +
 +``cpu_{ld,st}*_data``
 +~~~~~~~~~~~~~~~~~~~~~
 +
 +These functions work like the ``cpu_{ld,st}_data_ra`` functions
 +except that the ``retaddr`` parameter is 0, and thus does not
 +unwind guest CPU state.
 +
 +This means they must only be used from helper functions where the
 +translator has saved all necessary CPU state.  These functions are
 +the right choice for calls made from hooks like the CPU ``do_interrupt``
 +hook or when you know for certain that the translator had to save all
 +the CPU state anyway.
 +
 +Function names follow the pattern:
 +
 +load: ``cpu_ld{sign}{size}_data(env, ptr)``
 +
 +store: ``cpu_st{size}_data(env, ptr, val)``
 +
 +``sign``
 + - (empty) : for 32 or 64 bit sizes
 + - ``u`` : unsigned
 + - ``s`` : signed
 +
 +``size``
 + - ``b`` : 8 bits
 + - ``w`` : 16 bits
 + - ``l`` : 32 bits
 + - ``q`` : 64 bits
  Regexes for git grep
 - - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+_ra\>``
 - - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+_ra\>``
 + - ``\<cpu_ld[us]\?[bwlq]_data\>``
 + - ``\<cpu_st[bwlq]_data\+\>``
 -``helper_*_{ld,st}*mmu``
 -~~~~~~~~~~~~~~~~~~~~~~~~
 +``cpu_ld*_code``
 +~~~~~~~~~~~~~~~~
 +
 +These functions perform a read for instruction execution.  The ``mmuidx``
 +parameter is taken from the current mode of the guest CPU, as determined
 +by ``cpu_mmu_index(env, true)``.  The ``retaddr`` parameter is 0, and
 +thus does not unwind guest CPU state, because CPU state is always
 +synchronized while translating instructions.  Any guest CPU exception
 +that is raised will indicate an instruction execution fault rather than
 +a data read fault.
 +
 +In general these functions should not be used directly during translation.
 +There are wrapper functions that are to be used which also take care of
 +plugins for tracing.
 +
 +Function names follow the pattern:
 +
 +load: ``cpu_ld{sign}{size}_code(env, ptr)``
 +
 +``sign``
 + - (empty) : for 32 or 64 bit sizes
 + - ``u`` : unsigned
 + - ``s`` : signed
 +
 +``size``
 + - ``b`` : 8 bits
 + - ``w`` : 16 bits
 + - ``l`` : 32 bits
 + - ``q`` : 64 bits
 +
 +Regexes for git grep:
 + - ``\<cpu_ld[us]\?[bwlq]_code\>``
 +
 +``translator_ld*``
 +~~~~~~~~~~~~~~~~~~
 +
 +These functions are a wrapper for ``cpu_ld*_code`` which also perform
 +any actions required by any tracing plugins.  They are only to be
 +called during the translator callback ``translate_insn``.
 +
 +There is a set of functions ending in ``_swap`` which, if the parameter
 +is true, returns the value in the endianness that is the reverse of
 +the guest native endianness, as determined by ``TARGET_WORDS_BIGENDIAN``.
 +
 +Function names follow the pattern:
 +
 +load: ``translator_ld{sign}{size}(env, ptr)``
 +
 +swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)``
 +
 +``sign``
 + - (empty) : for 32 or 64 bit sizes
 + - ``u`` : unsigned
 + - ``s`` : signed
 +
 +``size``
 + - ``b`` : 8 bits
 + - ``w`` : 16 bits
 + - ``l`` : 32 bits
 + - ``q`` : 64 bits
 +
 +Regexes for git grep
 + - ``\<translator_ld[us]\?[bwlq]\(_swap\)\?\>``
 +
 +``helper_*_{ld,st}*_mmu``
 +~~~~~~~~~~~~~~~~~~~~~~~~~
  These functions are intended primarily to be called by the code
  generated by the TCG backend. They may also be called by target
 -CPU helper function code. Like the ``cpu_{ld,st}_*_ra`` functions
 -they perform accesses by guest virtual address; the difference is
 -that these functions allow you to specify an ``opindex`` parameter
 -which encodes (among other things) the mmu index to use for the
 -access. This is necessary if your helper needs to make an access
 -via a specific mmu index (for instance, an "always as non-privileged"
 -access) rather than using the default mmu index for the current state
 -of the guest CPU.
 +CPU helper function code. Like the ``cpu_{ld,st}_mmuidx_ra`` functions
 +they perform accesses by guest virtual address, with a given ``mmuidx``.
 -The ``opindex`` parameter should be created by calling ``make_memop_idx()``.
 +These functions specify an ``opindex`` parameter which encodes
 +(among other things) the mmu index to use for the access.  This parameter
 +should be created by calling ``make_memop_idx()``.
  The ``retaddr`` parameter should be the result of GETPC() called directly
  from the top level HELPER(foo) function (or 0 if no guest CPU state
@@ -XXX,XX +XXX,XX @@ unwinding is required).
  **TODO** The names of these functions are a bit odd for historical
  reasons because they were originally expected to be called only from
 -within generated code. We should rename them to bring them
 -more in line with the other memory access functions.
 +within generated code. We should rename them to bring them more in
 +line with the other memory access functions. The explicit endianness
 +is the only feature they have beyond ``*_mmuidx_ra``.
  load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
 --
-.20.1
+.25.1

-New patch
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
+Split out a whole bunch of placeholder functions, which are
 currently identical.  That won't last as more code gets moved.
 Use CASE_32_64_VEC for some logical operators that previously
 missed the addition of vectors.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 file changed, 219 insertions(+), 52 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      }
  }
 +/*
 + * The fold_* functions return true when processing is complete,
 + * usually by folding the operation to a constant or to a copy,
 + * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
 + * like collect information about the value produced, for use in
 + * optimizing a subsequent operation.
 + *
 + * These first fold_* functions are all helpers, used by other
 + * folders for more specific operations.
 + */
 +
 +static bool fold_const1(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = do_constant_folding(op->opc, t, 0);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_const2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = do_constant_folding(op->opc, t1, t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
 +/*
 + * These outermost fold_<op> functions are sorted alphabetically.
 + */
 +
 +static bool fold_add(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
      TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
      return true;
  }
 +static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_divide(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_eqv(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_exts(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_extu(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
  static bool fold_mb(OptContext *ctx, TCGOp *op)
  {
      /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
      return true;
  }
 +static bool fold_mul(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_nand(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_neg(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_remainder(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_shift(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_sub(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_xor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(not):
 -        CASE_OP_32_64(neg):
 -        CASE_OP_32_64(ext8s):
 -        CASE_OP_32_64(ext8u):
 -        CASE_OP_32_64(ext16s):
 -        CASE_OP_32_64(ext16u):
 -        CASE_OP_32_64(ctpop):
 -        case INDEX_op_ext32s_i64:
 -        case INDEX_op_ext32u_i64:
 -        case INDEX_op_ext_i32_i64:
 -        case INDEX_op_extu_i32_i64:
 -        case INDEX_op_extrl_i64_i32:
 -        case INDEX_op_extrh_i64_i32:
 -            if (arg_is_const(op->args[1])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(bswap16):
          CASE_OP_32_64(bswap32):
          case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(add):
 -        CASE_OP_32_64(sub):
 -        CASE_OP_32_64(mul):
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(and):
 -        CASE_OP_32_64(xor):
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -        CASE_OP_32_64(andc):
 -        CASE_OP_32_64(orc):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -        CASE_OP_32_64(div):
 -        CASE_OP_32_64(divu):
 -        CASE_OP_32_64(rem):
 -        CASE_OP_32_64(remu):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
 -                                          arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 +        default:
 +            break;
 +
 +        /* ---------------------------------------------------------- */
 +        /* Sorted alphabetically by opcode as much as possible. */
 +
 +        CASE_OP_32_64_VEC(add):
 +            done = fold_add(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(and):
 +            done = fold_and(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(andc):
 +            done = fold_andc(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ctpop):
 +            done = fold_ctpop(&ctx, op);
 +            break;
 +        CASE_OP_32_64(div):
 +        CASE_OP_32_64(divu):
 +            done = fold_divide(&ctx, op);
 +            break;
 +        CASE_OP_32_64(eqv):
 +            done = fold_eqv(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8s):
 +        CASE_OP_32_64(ext16s):
 +        case INDEX_op_ext32s_i64:
 +        case INDEX_op_ext_i32_i64:
 +            done = fold_exts(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8u):
 +        CASE_OP_32_64(ext16u):
 +        case INDEX_op_ext32u_i64:
 +        case INDEX_op_extu_i32_i64:
 +        case INDEX_op_extrl_i64_i32:
 +        case INDEX_op_extrh_i64_i32:
 +            done = fold_extu(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64(mul):
 +            done = fold_mul(&ctx, op);
 +            break;
 +        CASE_OP_32_64(mulsh):
 +        CASE_OP_32_64(muluh):
 +            done = fold_mul_highpart(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nand):
 +            done = fold_nand(&ctx, op);
 +            break;
 +        CASE_OP_32_64(neg):
 +            done = fold_neg(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nor):
 +            done = fold_nor(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(not):
 +            done = fold_not(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(or):
 +            done = fold_or(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(orc):
 +            done = fold_orc(&ctx, op);
 +            break;
          case INDEX_op_qemu_ld_i32:
          case INDEX_op_qemu_ld_i64:
              done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_qemu_st_i64:
              done = fold_qemu_st(&ctx, op);
              break;
 -
 -        default:
 +        CASE_OP_32_64(rem):
 +        CASE_OP_32_64(remu):
 +            done = fold_remainder(&ctx, op);
 +            break;
 +        CASE_OP_32_64(rotl):
 +        CASE_OP_32_64(rotr):
 +        CASE_OP_32_64(sar):
 +        CASE_OP_32_64(shl):
 +        CASE_OP_32_64(shr):
 +            done = fold_shift(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(sub):
 +            done = fold_sub(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(xor):
 +            done = fold_xor(&ctx, op);
              break;
          }
 --
 .25.1

-[PULL 06/41] configure: Override the os default with --disable-pie
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
-Some distributions, e.g. Ubuntu 19.10, enable PIE by default.
+Reduce some code duplication by folding the NE and EQ cases.
 If for some reason one wishes to build a non-pie binary, we
 must provide additional options to override.
 At the same time, reorg the code to an elif chain.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- configure | 25 ++++++++++++-------------
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
-file changed, 12 insertions(+), 13 deletions(-)
+file changed, 72 insertions(+), 73 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ if compile_prog "-Werror -fno-pie" "-no-pie"; then
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
-   LDFLAGS_NOPIE="-no-pie"
+     return fold_const2(ctx, op);
- fi
+ }
--if test "$pie" != "no" ; then
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
--  if compile_prog "-fPIE -DPIE" "-pie"; then
++{
--    QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
++    TCGCond cond = op->args[5];
--    LDFLAGS="-pie $LDFLAGS"
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
--    pie="yes"
++    int inv = 0;
--  else
++
--    if test "$pie" = "yes"; then
++    if (i >= 0) {
--      error_exit "PIE not available due to missing toolchain support"
++        goto do_setcond_const;
--    else
++    }
--      echo "Disabling PIE due to missing toolchain support"
++
--      pie="no"
++    switch (cond) {
--    fi
++    case TCG_COND_LT:
--  fi
++    case TCG_COND_GE:
-+if test "$pie" = "no"; then
++        /*
-+  QEMU_CFLAGS="$CFLAGS_NOPIE $QEMU_CFLAGS"
++         * Simplify LT/GE comparisons vs zero to a single compare
-+  LDFLAGS="$LDFLAGS_NOPIE $LDFLAGS"
++         * vs the high word of the input.
-+elif compile_prog "-fPIE -DPIE" "-pie"; then
++         */
-+  QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
-+  LDFLAGS="-pie $LDFLAGS"
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
-+  pie="yes"
++            goto do_setcond_high;
-+elif test "$pie" = "yes"; then
++        }
-+  error_exit "PIE not available due to missing toolchain support"
++        break;
-+else
++
-+  echo "Disabling PIE due to missing toolchain support"
++    case TCG_COND_NE:
-+  pie="no"
++        inv = 1;
- fi
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
- # Detect support for PT_GNU_RELRO + DT_BIND_NOW.
++        /*
 +         * Simplify EQ/NE comparisons where one of the pairs
 +         * can be simplified.
 +         */
 +        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +                                     op->args[3], cond);
 +        switch (i ^ inv) {
 +        case 0:
 +            goto do_setcond_const;
 +        case 1:
 +            goto do_setcond_high;
 +        }
 +
 +        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +                                     op->args[4], cond);
 +        switch (i ^ inv) {
 +        case 0:
 +            goto do_setcond_const;
 +        case 1:
 +            op->args[2] = op->args[3];
 +            op->args[3] = cond;
 +            op->opc = INDEX_op_setcond_i32;
 +            break;
 +        }
 +        break;
 +
 +    default:
 +        break;
 +
 +    do_setcond_high:
 +        op->args[1] = op->args[2];
 +        op->args[2] = op->args[4];
 +        op->args[3] = cond;
 +        op->opc = INDEX_op_setcond_i32;
 +        break;
 +    }
 +    return false;
 +
 + do_setcond_const:
 +    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +}
 +
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        case INDEX_op_setcond2_i32:
 -            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                          op->args[5]);
 -            if (i >= 0) {
 -            do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
 -                continue;
 -            }
 -            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 -                 && arg_is_const(op->args[3])
 -                 && arg_info(op->args[3])->val == 0
 -                 && arg_is_const(op->args[4])
 -                 && arg_info(op->args[4])->val == 0) {
 -                /* Simplify LT/GE comparisons vs zero to a single compare
 -                   vs the high word of the input.  */
 -            do_setcond_high:
 -                reset_temp(op->args[0]);
 -                arg_info(op->args[0])->z_mask = 1;
 -                op->opc = INDEX_op_setcond_i32;
 -                op->args[1] = op->args[2];
 -                op->args[2] = op->args[4];
 -                op->args[3] = op->args[5];
 -                break;
 -            }
 -            if (op->args[5] == TCG_COND_EQ) {
 -                /* Simplify EQ comparisons where one of the pairs
 -                   can be simplified.  */
 -                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                             op->args[1], op->args[3],
 -                                             TCG_COND_EQ);
 -                if (i == 0) {
 -                    goto do_setcond_const;
 -                } else if (i > 0) {
 -                    goto do_setcond_high;
 -                }
 -                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                             op->args[2], op->args[4],
 -                                             TCG_COND_EQ);
 -                if (i == 0) {
 -                    goto do_setcond_high;
 -                } else if (i < 0) {
 -                    break;
 -                }
 -            do_setcond_low:
 -                reset_temp(op->args[0]);
 -                arg_info(op->args[0])->z_mask = 1;
 -                op->opc = INDEX_op_setcond_i32;
 -                op->args[2] = op->args[3];
 -                op->args[3] = op->args[5];
 -                break;
 -            }
 -            if (op->args[5] == TCG_COND_NE) {
 -                /* Simplify NE comparisons where one of the pairs
 -                   can be simplified.  */
 -                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                             op->args[1], op->args[3],
 -                                             TCG_COND_NE);
 -                if (i == 0) {
 -                    goto do_setcond_high;
 -                } else if (i > 0) {
 -                    goto do_setcond_const;
 -                }
 -                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                             op->args[2], op->args[4],
 -                                             TCG_COND_NE);
 -                if (i == 0) {
 -                    goto do_setcond_low;
 -                } else if (i > 0) {
 -                    goto do_setcond_const;
 -                }
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(shr):
              done = fold_shift(&ctx, op);
              break;
 +        case INDEX_op_setcond2_i32:
 +            done = fold_setcond2(&ctx, op);
 +            break;
          CASE_OP_32_64_VEC(sub):
              done = fold_sub(&ctx, op);
              break;
 --
-.20.1
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-[PULL 12/41] target/arm: Include tcg.h in sve_helper.c
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
-Code movement in an upcoming patch will show that this file
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-was implicitly depending on tcg.h being included indirectly.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Cc: Peter Maydell <peter.maydell@linaro.org>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/sve_helper.c | 1 +
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
-file changed, 1 insertion(+)
+file changed, 21 insertions(+), 16 deletions(-)
-diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/sve_helper.c
+--- a/tcg/optimize.c
-+++ b/target/arm/sve_helper.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
- #include "exec/helper-proto.h"
+     return fold_const2(ctx, op);
- #include "tcg/tcg-gvec-desc.h"
+ }
- #include "fpu/softfloat.h"
-+#include "tcg.h"
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
- /* Note that vector data is stored in host-endian 64-bit chunks,
++        uint32_t a = arg_info(op->args[2])->val;
 +        uint32_t b = arg_info(op->args[3])->val;
 +        uint64_t r = (uint64_t)a * b;
 +        TCGArg rl, rh;
 +        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +
 +        rl = op->args[0];
 +        rh = op->args[1];
 +        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
 +        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
 +        return true;
 +    }
 +    return false;
 +}
 +
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        case INDEX_op_mulu2_i32:
 -            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
 -                uint32_t a = arg_info(op->args[2])->val;
 -                uint32_t b = arg_info(op->args[3])->val;
 -                uint64_t r = (uint64_t)a * b;
 -                TCGArg rl, rh;
 -                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 -
 -                rl = op->args[0];
 -                rh = op->args[1];
 -                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(muluh):
              done = fold_mul_highpart(&ctx, op);
              break;
 +        case INDEX_op_mulu2_i32:
 +            done = fold_mulu2_i32(&ctx, op);
 +            break;
          CASE_OP_32_64(nand):
              done = fold_nand(&ctx, op);
              break;
 --
-.20.1
+.25.1

-[PULL 36/41] cputlb: Expand cpu_ldst_template.h in cputlb.c
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
-Reduce the amount of preprocessor obfuscation by expanding
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
-the text of each of the functions generated.  The result is
+which will not be simple wrappers forever.
 only slightly smaller than the original.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h          |  67 +++++++-----------
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
- include/exec/cpu_ldst_template.h | 117 -------------------------------
+file changed, 44 insertions(+), 26 deletions(-)
  accel/tcg/cputlb.c               | 107 +++++++++++++++++++++++++++-
 files changed, 130 insertions(+), 161 deletions(-)
  delete mode 100644 include/exec/cpu_ldst_template.h
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef target_ulong abi_ptr;
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
- #define TARGET_ABI_FMT_ptr TARGET_ABI_FMT_lx
+     return fold_const2(ctx, op);
- #endif
+ }
--#if defined(CONFIG_USER_ONLY)
++static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
--
++{
--extern __thread uintptr_t helper_retaddr;
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
--
++        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
--static inline void set_helper_retaddr(uintptr_t ra)
++        uint32_t al = arg_info(op->args[2])->val;
--{
++        uint32_t ah = arg_info(op->args[3])->val;
--    helper_retaddr = ra;
++        uint32_t bl = arg_info(op->args[4])->val;
--    /*
++        uint32_t bh = arg_info(op->args[5])->val;
--     * Ensure that this write is visible to the SIGSEGV handler that
++        uint64_t a = ((uint64_t)ah << 32) | al;
--     * may be invoked due to a subsequent invalid memory operation.
++        uint64_t b = ((uint64_t)bh << 32) | bl;
--     */
++        TCGArg rl, rh;
--    signal_barrier();
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 -}
 -
 -static inline void clear_helper_retaddr(void)
 -{
 -    /*
 -     * Ensure that previous memory operations have succeeded before
 -     * removing the data visible to the signal handler.
 -     */
 -    signal_barrier();
 -    helper_retaddr = 0;
 -}
 -
 -/* In user-only mode we provide only the _code and _data accessors. */
 -
  uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr);
  uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr);
  uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr);
@@ -XXX,XX +XXX,XX @@ void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
  void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
                       uint64_t val, uintptr_t retaddr);
 +#if defined(CONFIG_USER_ONLY)
 +
-+extern __thread uintptr_t helper_retaddr;
++        if (add) {
 +            a += b;
 +        } else {
 +            a -= b;
 +        }
 +
-+static inline void set_helper_retaddr(uintptr_t ra)
++        rl = op->args[0];
-+{
++        rh = op->args[1];
-+    helper_retaddr = ra;
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-+    /*
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
-+     * Ensure that this write is visible to the SIGSEGV handler that
++        return true;
-+     * may be invoked due to a subsequent invalid memory operation.
++    }
-+     */
++    return false;
 +    signal_barrier();
 +}
 +
-+static inline void clear_helper_retaddr(void)
++static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 +{
-+    /*
++    return fold_addsub2_i32(ctx, op, true);
 +     * Ensure that previous memory operations have succeeded before
 +     * removing the data visible to the signal handler.
 +     */
 +    signal_barrier();
 +    helper_retaddr = 0;
 +}
 +
- /*
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-  * Provide the same *_mmuidx_ra interface as for softmmu.
+ {
-  * The mmu_idx argument is ignored.
+     return fold_const2(ctx, op);
-@@ -XXX,XX +XXX,XX @@ void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
- void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+     return fold_const2(ctx, op);
                         int mmu_idx, uintptr_t retaddr);
 -/* these access are slower, they must be as rare as possible */
 -#define CPU_MMU_INDEX (cpu_mmu_index(env, false))
 -#define MEMSUFFIX _data
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -
  #endif /* defined(CONFIG_USER_ONLY) */
  uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
 diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
 --- a/include/exec/cpu_ldst_template.h
 +++ /dev/null
@@ -XXX,XX +XXX,XX @@
 -/*
 - *  Software MMU support
 - *
 - * Generate inline load/store functions for one MMU mode and data
 - * size.
 - *
 - * Generate a store function as well as signed and unsigned loads.
 - *
 - * Not used directly but included from cpu_ldst.h.
 - *
 - *  Copyright (c) 2003 Fabrice Bellard
 - *
 - * This library is free software; you can redistribute it and/or
 - * modify it under the terms of the GNU Lesser General Public
 - * License as published by the Free Software Foundation; either
 - * version 2 of the License, or (at your option) any later version.
 - *
 - * This library is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 - * Lesser General Public License for more details.
 - *
 - * You should have received a copy of the GNU Lesser General Public
 - * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 - */
 -
 -#if DATA_SIZE == 8
 -#define SUFFIX q
 -#define USUFFIX q
 -#define DATA_TYPE uint64_t
 -#define SHIFT 3
 -#elif DATA_SIZE == 4
 -#define SUFFIX l
 -#define USUFFIX l
 -#define DATA_TYPE uint32_t
 -#define SHIFT 2
 -#elif DATA_SIZE == 2
 -#define SUFFIX w
 -#define USUFFIX uw
 -#define DATA_TYPE uint16_t
 -#define DATA_STYPE int16_t
 -#define SHIFT 1
 -#elif DATA_SIZE == 1
 -#define SUFFIX b
 -#define USUFFIX ub
 -#define DATA_TYPE uint8_t
 -#define DATA_STYPE int8_t
 -#define SHIFT 0
 -#else
 -#error unsupported data size
 -#endif
 -
 -#if DATA_SIZE == 8
 -#define RES_TYPE uint64_t
 -#else
 -#define RES_TYPE uint32_t
 -#endif
 -
 -/* generic load/store macros */
 -
 -static inline RES_TYPE
 -glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                  target_ulong ptr,
 -                                                  uintptr_t retaddr)
 -{
 -    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
 -                                                   retaddr);
 -}
 -
 -static inline RES_TYPE
 -glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
 -{
 -    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
 -}
 -
 -#if DATA_SIZE <= 2
 -static inline int
 -glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                  target_ulong ptr,
 -                                                  uintptr_t retaddr)
 -{
 -    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
 -                                                   retaddr);
 -}
 -
 -static inline int
 -glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
 -{
 -    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
 -}
 -#endif
 -
 -/* generic store macro */
 -
 -static inline void
 -glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
 -                                                 target_ulong ptr,
 -                                                 RES_TYPE v, uintptr_t retaddr)
 -{
 -    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX,
 -                                           retaddr);
 -}
 -
 -static inline void
 -glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
 -                                      RES_TYPE v)
 -{
 -    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
 -}
 -
 -#undef RES_TYPE
 -#undef DATA_TYPE
 -#undef DATA_STYPE
 -#undef SUFFIX
 -#undef USUFFIX
 -#undef DATA_SIZE
 -#undef SHIFT
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/atomic128.h"
  #include "translate-all.h"
  #include "trace-root.h"
 -#include "qemu/plugin.h"
  #include "trace/mem.h"
  #ifdef CONFIG_PLUGIN
  #include "qemu/plugin-memory.h"
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                             ? helper_le_ldq_mmu : helper_be_ldq_mmu);
  }
-+uint32_t cpu_ldub_data_ra(CPUArchState *env, target_ulong ptr,
++static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 +                          uintptr_t retaddr)
 +{
-+    return cpu_ldub_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
++    return fold_addsub2_i32(ctx, op, false);
 +}
 +
-+int cpu_ldsb_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
-+{
+ {
-+    return cpu_ldsb_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+     return fold_const2(ctx, op);
-+}
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+
+             }
-+uint32_t cpu_lduw_data_ra(CPUArchState *env, target_ulong ptr,
+             break;
-+                          uintptr_t retaddr)
-+{
+-        case INDEX_op_add2_i32:
-+    return cpu_lduw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+-        case INDEX_op_sub2_i32:
-+}
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-+
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-+int cpu_ldsw_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+-                uint32_t al = arg_info(op->args[2])->val;
-+{
+-                uint32_t ah = arg_info(op->args[3])->val;
-+    return cpu_ldsw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+-                uint32_t bl = arg_info(op->args[4])->val;
-+}
+-                uint32_t bh = arg_info(op->args[5])->val;
-+
+-                uint64_t a = ((uint64_t)ah << 32) | al;
-+uint32_t cpu_ldl_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+-                uint64_t b = ((uint64_t)bh << 32) | bl;
-+{
+-                TCGArg rl, rh;
-+    return cpu_ldl_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-+}
+-
-+
+-                if (opc == INDEX_op_add2_i32) {
-+uint64_t cpu_ldq_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+-                    a += b;
-+{
+-                } else {
-+    return cpu_ldq_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+-                    a -= b;
-+}
+-                }
-+
+-
-+uint32_t cpu_ldub_data(CPUArchState *env, target_ulong ptr)
+-                rl = op->args[0];
-+{
+-                rh = op->args[1];
-+    return cpu_ldub_data_ra(env, ptr, 0);
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-+}
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-+
+-                continue;
-+int cpu_ldsb_data(CPUArchState *env, target_ulong ptr)
+-            }
-+{
+-            break;
-+    return cpu_ldsb_data_ra(env, ptr, 0);
-+}
+         default:
-+
+             break;
-+uint32_t cpu_lduw_data(CPUArchState *env, target_ulong ptr)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+{
+         CASE_OP_32_64_VEC(add):
-+    return cpu_lduw_data_ra(env, ptr, 0);
+             done = fold_add(&ctx, op);
-+}
+             break;
-+
++        case INDEX_op_add2_i32:
-+int cpu_ldsw_data(CPUArchState *env, target_ulong ptr)
++            done = fold_add2_i32(&ctx, op);
-+{
++            break;
-+    return cpu_ldsw_data_ra(env, ptr, 0);
+         CASE_OP_32_64_VEC(and):
-+}
+             done = fold_and(&ctx, op);
-+
+             break;
-+uint32_t cpu_ldl_data(CPUArchState *env, target_ulong ptr)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+{
+         CASE_OP_32_64_VEC(sub):
-+    return cpu_ldl_data_ra(env, ptr, 0);
+             done = fold_sub(&ctx, op);
-+}
+             break;
-+
++        case INDEX_op_sub2_i32:
-+uint64_t cpu_ldq_data(CPUArchState *env, target_ulong ptr)
++            done = fold_sub2_i32(&ctx, op);
-+{
++            break;
-+    return cpu_ldq_data_ra(env, ptr, 0);
+         CASE_OP_32_64_VEC(xor):
-+}
+             done = fold_xor(&ctx, op);
-+
+             break;
  /*
   * Store Helpers
   */
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
      cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ);
  }
 +void cpu_stb_data_ra(CPUArchState *env, target_ulong ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    cpu_stb_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
 +}
 +
 +void cpu_stw_data_ra(CPUArchState *env, target_ulong ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    cpu_stw_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
 +}
 +
 +void cpu_stl_data_ra(CPUArchState *env, target_ulong ptr,
 +                     uint32_t val, uintptr_t retaddr)
 +{
 +    cpu_stl_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
 +}
 +
 +void cpu_stq_data_ra(CPUArchState *env, target_ulong ptr,
 +                     uint64_t val, uintptr_t retaddr)
 +{
 +    cpu_stq_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
 +}
 +
 +void cpu_stb_data(CPUArchState *env, target_ulong ptr, uint32_t val)
 +{
 +    cpu_stb_data_ra(env, ptr, val, 0);
 +}
 +
 +void cpu_stw_data(CPUArchState *env, target_ulong ptr, uint32_t val)
 +{
 +    cpu_stw_data_ra(env, ptr, val, 0);
 +}
 +
 +void cpu_stl_data(CPUArchState *env, target_ulong ptr, uint32_t val)
 +{
 +    cpu_stl_data_ra(env, ptr, val, 0);
 +}
 +
 +void cpu_stq_data(CPUArchState *env, target_ulong ptr, uint64_t val)
 +{
 +    cpu_stq_data_ra(env, ptr, val, 0);
 +}
 +
  /* First set of helpers allows passing in of OI and RETADDR.  This makes
     them callable from other helpers.  */
 --
-.20.1
+.25.1

-[PULL 41/41] MAINTAINERS: Replace Claudio Fontana for tcg/aarch64
+[PULL 27/56] tcg/optimize: Split out fold_movcond
-Claudio's Huawei address has been defunct for quite a while.  In
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
   https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg06872.html
 he asked for his personal address to be removed as well.
 I will take over officially.
 Cc: Claudio Fontana <claudio.fontana@gmail.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- MAINTAINERS | 3 +--
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
-file changed, 1 insertion(+), 2 deletions(-)
+file changed, 31 insertions(+), 25 deletions(-)
-diff --git a/MAINTAINERS b/MAINTAINERS
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/tcg/optimize.c
-+++ b/MAINTAINERS
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ F: plugins/
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
- F: tests/plugin
+     return true;
+ }
- AArch64 TCG target
--M: Claudio Fontana <claudio.fontana@huawei.com>
++static bool fold_movcond(OptContext *ctx, TCGOp *op)
--M: Claudio Fontana <claudio.fontana@gmail.com>
++{
-+M: Richard Henderson <richard.henderson@linaro.org>
++    TCGOpcode opc = op->opc;
- S: Maintained
++    TCGCond cond = op->args[5];
- L: qemu-arm@nongnu.org
++    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
- F: tcg/aarch64/
++
 +    if (i >= 0) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
 +    }
 +
 +    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
 +        uint64_t tv = arg_info(op->args[3])->val;
 +        uint64_t fv = arg_info(op->args[4])->val;
 +
 +        opc = (opc == INDEX_op_movcond_i32
 +               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +
 +        if (tv == 1 && fv == 0) {
 +            op->opc = opc;
 +            op->args[3] = cond;
 +        } else if (fv == 1 && tv == 0) {
 +            op->opc = opc;
 +            op->args[3] = tcg_invert_cond(cond);
 +        }
 +    }
 +    return false;
 +}
 +
  static bool fold_mul(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(movcond):
 -            i = do_constant_folding_cond(opc, op->args[1],
 -                                         op->args[2], op->args[5]);
 -            if (i >= 0) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
 -                continue;
 -            }
 -            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
 -                uint64_t tv = arg_info(op->args[3])->val;
 -                uint64_t fv = arg_info(op->args[4])->val;
 -                TCGCond cond = op->args[5];
 -
 -                if (fv == 1 && tv == 0) {
 -                    cond = tcg_invert_cond(cond);
 -                } else if (!(tv == 1 && fv == 0)) {
 -                    break;
 -                }
 -                op->args[3] = cond;
 -                op->opc = opc = (opc == INDEX_op_movcond_i32
 -                                 ? INDEX_op_setcond_i32
 -                                 : INDEX_op_setcond_i64);
 -            }
 -            break;
 -
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64(movcond):
 +            done = fold_movcond(&ctx, op);
 +            break;
          CASE_OP_32_64(mul):
              done = fold_mul(&ctx, op);
              break;
 --
-.20.1
+.25.1

-[PULL 40/41] configure: Remove tcg/ from the preprocessor include search list
+[PULL 28/56] tcg/optimize: Split out fold_extract2
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 All tcg includes are relative to the repository root directory,
 we can safely remove the tcg/ directory from the include search
 path list.
 Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
 Reviewed-by: Stefan Weil <sw@weilnetz.de>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-Id: <20200101112303.20724-5-philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- configure | 1 -
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
-file changed, 1 deletion(-)
+file changed, 22 insertions(+), 17 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ elif test "$ARCH" = "riscv32" || test "$ARCH" = "riscv64" ; then
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
- else
+     return fold_const2(ctx, op);
-   QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg/\$(ARCH) $QEMU_INCLUDES"
+ }
- fi
--QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg $QEMU_INCLUDES"
++static bool fold_extract2(OptContext *ctx, TCGOp *op)
++{
- echo "TOOLS=$tools" >> $config_host_mak
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
- echo "ROMS=$roms" >> $config_host_mak
++        uint64_t v1 = arg_info(op->args[1])->val;
 +        uint64_t v2 = arg_info(op->args[2])->val;
 +        int shr = op->args[3];
 +
 +        if (op->opc == INDEX_op_extract2_i64) {
 +            v1 >>= shr;
 +            v2 <<= 64 - shr;
 +        } else {
 +            v1 = (uint32_t)v1 >> shr;
 +            v2 = (int32_t)v2 << (32 - shr);
 +        }
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
 +    }
 +    return false;
 +}
 +
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
      return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(extract2):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                uint64_t v1 = arg_info(op->args[1])->val;
 -                uint64_t v2 = arg_info(op->args[2])->val;
 -                int shr = op->args[3];
 -
 -                if (opc == INDEX_op_extract2_i64) {
 -                    tmp = (v1 >> shr) | (v2 << (64 - shr));
 -                } else {
 -                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
 -                                    ((uint32_t)v2 << (32 - shr)));
 -                }
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(eqv):
              done = fold_eqv(&ctx, op);
              break;
 +        CASE_OP_32_64(extract2):
 +            done = fold_extract2(&ctx, op);
 +            break;
          CASE_OP_32_64(ext8s):
          CASE_OP_32_64(ext16s):
          case INDEX_op_ext32s_i64:
 --
-.20.1
+.25.1

-[PULL 39/41] tcg: Move TCG headers to include/tcg/
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
 Reviewed-by: Stefan Weil <sw@weilnetz.de>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-Id: <20200101112303.20724-4-philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- {tcg => include/tcg}/tcg-gvec-desc.h | 0
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
- {tcg => include/tcg}/tcg-mo.h        | 0
+file changed, 30 insertions(+), 18 deletions(-)
  {tcg => include/tcg}/tcg-op-gvec.h   | 0
  {tcg => include/tcg}/tcg-op.h        | 0
  {tcg => include/tcg}/tcg-opc.h       | 0
  {tcg => include/tcg}/tcg.h           | 0
  MAINTAINERS                          | 1 +
 files changed, 1 insertion(+)
  rename {tcg => include/tcg}/tcg-gvec-desc.h (100%)
  rename {tcg => include/tcg}/tcg-mo.h (100%)
  rename {tcg => include/tcg}/tcg-op-gvec.h (100%)
  rename {tcg => include/tcg}/tcg-op.h (100%)
  rename {tcg => include/tcg}/tcg-opc.h (100%)
  rename {tcg => include/tcg}/tcg.h (100%)
-diff --git a/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 similarity index 100%
 rename from tcg/tcg-gvec-desc.h
 rename to include/tcg/tcg-gvec-desc.h
 diff --git a/tcg/tcg-mo.h b/include/tcg/tcg-mo.h
 similarity index 100%
 rename from tcg/tcg-mo.h
 rename to include/tcg/tcg-mo.h
 diff --git a/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
 similarity index 100%
 rename from tcg/tcg-op-gvec.h
 rename to include/tcg/tcg-op-gvec.h
 diff --git a/tcg/tcg-op.h b/include/tcg/tcg-op.h
 similarity index 100%
 rename from tcg/tcg-op.h
 rename to include/tcg/tcg-op.h
 diff --git a/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
 similarity index 100%
 rename from tcg/tcg-opc.h
 rename to include/tcg/tcg-opc.h
 diff --git a/tcg/tcg.h b/include/tcg/tcg.h
 similarity index 100%
 rename from tcg/tcg.h
 rename to include/tcg/tcg.h
 diff --git a/MAINTAINERS b/MAINTAINERS
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/tcg/optimize.c
-+++ b/MAINTAINERS
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ Common TCG code
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
- M: Richard Henderson <rth@twiddle.net>
+     return fold_const2(ctx, op);
- S: Maintained
+ }
- F: tcg/
-+F: include/tcg/
++static bool fold_extract(OptContext *ctx, TCGOp *op)
++{
- TCG Plugins
++    if (arg_is_const(op->args[1])) {
- M: Alex Bennée <alex.bennee@linaro.org>
++        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = extract64(t, op->args[2], op->args[3]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
  {
      if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
      return tcg_opt_gen_movi(ctx, op, op->args[0], i);
  }
 +static bool fold_sextract(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = sextract64(t, op->args[2], op->args[3]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(extract):
 -            if (arg_is_const(op->args[1])) {
 -                tmp = extract64(arg_info(op->args[1])->val,
 -                                op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(sextract):
 -            if (arg_is_const(op->args[1])) {
 -                tmp = sextract64(arg_info(op->args[1])->val,
 -                                 op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(eqv):
              done = fold_eqv(&ctx, op);
              break;
 +        CASE_OP_32_64(extract):
 +            done = fold_extract(&ctx, op);
 +            break;
          CASE_OP_32_64(extract2):
              done = fold_extract2(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_setcond2_i32:
              done = fold_setcond2(&ctx, op);
              break;
 +        CASE_OP_32_64(sextract):
 +            done = fold_sextract(&ctx, op);
 +            break;
          CASE_OP_32_64_VEC(sub):
              done = fold_sub(&ctx, op);
              break;
 --
-.20.1
+.25.1

-[PULL 35/41] cputlb: Remove support for MMU_MODE*_SUFFIX
+[PULL 30/56] tcg/optimize: Split out fold_deposit
-All users have now been converted to cpu_*_mmuidx_ra.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h | 230 ----------------------------------------
+ tcg/optimize.c | 25 +++++++++++++++----------
-file changed, 230 deletions(-)
+file changed, 15 insertions(+), 10 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
- void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+     return fold_const1(ctx, op);
-                        int mmu_idx, uintptr_t retaddr);
+ }
--#ifdef MMU_MODE0_SUFFIX
++static bool fold_deposit(OptContext *ctx, TCGOp *op)
--#define CPU_MMU_INDEX 0
++{
--#define MEMSUFFIX MMU_MODE0_SUFFIX
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
--#define DATA_SIZE 1
++        uint64_t t1 = arg_info(op->args[1])->val;
--#include "exec/cpu_ldst_template.h"
++        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = deposit64(t1, op->args[3], op->args[4], t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
  static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(deposit):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = deposit64(arg_info(op->args[1])->val,
 -                                op->args[3], op->args[4],
 -                                arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
--#define DATA_SIZE 2
+         default:
--#include "exec/cpu_ldst_template.h"
+             break;
--
--#define DATA_SIZE 4
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--#include "exec/cpu_ldst_template.h"
+         CASE_OP_32_64(ctpop):
--
+             done = fold_ctpop(&ctx, op);
--#define DATA_SIZE 8
+             break;
--#include "exec/cpu_ldst_template.h"
++        CASE_OP_32_64(deposit):
--#undef CPU_MMU_INDEX
++            done = fold_deposit(&ctx, op);
--#undef MEMSUFFIX
++            break;
--#endif
+         CASE_OP_32_64(div):
--
+         CASE_OP_32_64(divu):
--#if (NB_MMU_MODES >= 2) && defined(MMU_MODE1_SUFFIX)
+             done = fold_divide(&ctx, op);
 -#define CPU_MMU_INDEX 1
 -#define MEMSUFFIX MMU_MODE1_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif
 -
 -#if (NB_MMU_MODES >= 3) && defined(MMU_MODE2_SUFFIX)
 -
 -#define CPU_MMU_INDEX 2
 -#define MEMSUFFIX MMU_MODE2_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 3) */
 -
 -#if (NB_MMU_MODES >= 4) && defined(MMU_MODE3_SUFFIX)
 -
 -#define CPU_MMU_INDEX 3
 -#define MEMSUFFIX MMU_MODE3_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 4) */
 -
 -#if (NB_MMU_MODES >= 5) && defined(MMU_MODE4_SUFFIX)
 -
 -#define CPU_MMU_INDEX 4
 -#define MEMSUFFIX MMU_MODE4_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 5) */
 -
 -#if (NB_MMU_MODES >= 6) && defined(MMU_MODE5_SUFFIX)
 -
 -#define CPU_MMU_INDEX 5
 -#define MEMSUFFIX MMU_MODE5_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 6) */
 -
 -#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX)
 -
 -#define CPU_MMU_INDEX 6
 -#define MEMSUFFIX MMU_MODE6_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 7) */
 -
 -#if (NB_MMU_MODES >= 8) && defined(MMU_MODE7_SUFFIX)
 -
 -#define CPU_MMU_INDEX 7
 -#define MEMSUFFIX MMU_MODE7_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 8) */
 -
 -#if (NB_MMU_MODES >= 9) && defined(MMU_MODE8_SUFFIX)
 -
 -#define CPU_MMU_INDEX 8
 -#define MEMSUFFIX MMU_MODE8_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 9) */
 -
 -#if (NB_MMU_MODES >= 10) && defined(MMU_MODE9_SUFFIX)
 -
 -#define CPU_MMU_INDEX 9
 -#define MEMSUFFIX MMU_MODE9_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 10) */
 -
 -#if (NB_MMU_MODES >= 11) && defined(MMU_MODE10_SUFFIX)
 -
 -#define CPU_MMU_INDEX 10
 -#define MEMSUFFIX MMU_MODE10_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 11) */
 -
 -#if (NB_MMU_MODES >= 12) && defined(MMU_MODE11_SUFFIX)
 -
 -#define CPU_MMU_INDEX 11
 -#define MEMSUFFIX MMU_MODE11_SUFFIX
 -#define DATA_SIZE 1
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 2
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#endif /* (NB_MMU_MODES >= 12) */
 -
 -#if (NB_MMU_MODES > 12)
 -#error "NB_MMU_MODES > 12 is not supported for now"
 -#endif /* (NB_MMU_MODES > 12) */
 -
  /* these access are slower, they must be as rare as possible */
  #define CPU_MMU_INDEX (cpu_mmu_index(env, false))
  #define MEMSUFFIX _data
 --
-.20.1
+.25.1

-[PULL 30/41] target/xtensa: Remove MMU_MODE{0,1,2,3}_SUFFIX
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
-The functions generated by these macros are unused.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Acked-by: Max Filippov <jcmvbkbc@gmail.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/xtensa/cpu.h | 4 ----
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
-file changed, 4 deletions(-)
+file changed, 18 insertions(+), 14 deletions(-)
-diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/xtensa/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/xtensa/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline uint32_t xtensa_replicate_windowstart(CPUXtensaState *env)
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
      return true;
  }
- /* MMU modes definitions */
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
--#define MMU_MODE0_SUFFIX _ring0
++{
--#define MMU_MODE1_SUFFIX _ring1
++    if (arg_is_const(op->args[1])) {
--#define MMU_MODE2_SUFFIX _ring2
++        uint64_t t = arg_info(op->args[1])->val;
--#define MMU_MODE3_SUFFIX _ring3
++
- #define MMU_USER_IDX 3
++        if (t != 0) {
++            t = do_constant_folding(op->opc, t, 0);
- static inline int cpu_mmu_index(CPUXtensaState *env, bool ifetch)
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +        }
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
 +    }
 +    return false;
 +}
 +
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
      return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(clz):
 -        CASE_OP_32_64(ctz):
 -            if (arg_is_const(op->args[1])) {
 -                TCGArg v = arg_info(op->args[1])->val;
 -                if (v != 0) {
 -                    tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                } else {
 -                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
 -                }
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_brcond2_i32:
              done = fold_brcond2(&ctx, op);
              break;
 +        CASE_OP_32_64(clz):
 +        CASE_OP_32_64(ctz):
 +            done = fold_count_zeros(&ctx, op);
 +            break;
          CASE_OP_32_64(ctpop):
              done = fold_ctpop(&ctx, op);
              break;
 --
-.20.1
+.25.1

-[PULL 29/41] target/unicore32: Remove MMU_MODE{0,1}_SUFFIX
+[PULL 32/56] tcg/optimize: Split out fold_bswap
-The functions generated by these macros are unused.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/unicore32/cpu.h | 2 --
+ tcg/optimize.c | 27 ++++++++++++++++-----------
-file changed, 2 deletions(-)
+file changed, 16 insertions(+), 11 deletions(-)
-diff --git a/target/unicore32/cpu.h b/target/unicore32/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/unicore32/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/unicore32/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void cpu_asr_write(CPUUniCore32State *env1, target_ulong val, target_ulong mask)
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
- int uc32_cpu_signal_handler(int host_signum, void *pinfo, void *puc);
+     return false;
+ }
- /* MMU modes definitions */
--#define MMU_MODE0_SUFFIX _kernel
++static bool fold_bswap(OptContext *ctx, TCGOp *op)
--#define MMU_MODE1_SUFFIX _user
++{
- #define MMU_USER_IDX 1
++    if (arg_is_const(op->args[1])) {
- static inline int cpu_mmu_index(CPUUniCore32State *env, bool ifetch)
++        uint64_t t = arg_info(op->args[1])->val;
 +
 +        t = do_constant_folding(op->opc, t, op->args[2]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(bswap16):
+-        CASE_OP_32_64(bswap32):
+-        case INDEX_op_bswap64_i64:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          op->args[2]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(bswap16):
++        CASE_OP_32_64(bswap32):
++        case INDEX_op_bswap64_i64:
++            done = fold_bswap(&ctx, op);
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             done = fold_count_zeros(&ctx, op);
 --
-.20.1
+.25.1

-[PULL 28/41] target/sh4: Remove MMU_MODE{0,1}_SUFFIX
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
-The functions generated by these macros are unused.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Cc: Aurelien Jarno <aurelien@aurel32.net>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/sh4/cpu.h | 2 --
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
-file changed, 2 deletions(-)
+file changed, 31 insertions(+), 22 deletions(-)
-diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/sh4/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/sh4/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void cpu_load_tlb(CPUSH4State * env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
- #define cpu_list sh4_cpu_list
+     return fold_const2(ctx, op);
+ }
- /* MMU modes definitions */
--#define MMU_MODE0_SUFFIX _kernel
++static bool fold_dup(OptContext *ctx, TCGOp *op)
--#define MMU_MODE1_SUFFIX _user
++{
- #define MMU_USER_IDX 1
++    if (arg_is_const(op->args[1])) {
- static inline int cpu_mmu_index (CPUSH4State *env, bool ifetch)
++        uint64_t t = arg_info(op->args[1])->val;
 +        t = dup_const(TCGOP_VECE(op), t);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_dup2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
 +                               arg_info(op->args[2])->val);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        op->opc = INDEX_op_dup_vec;
 +        TCGOP_VECE(op) = MO_32;
 +    }
 +    return false;
 +}
 +
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             break;
+-        case INDEX_op_dup_vec:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = arg_info(op->args[1])->val;
+-                tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        case INDEX_op_dup2_vec:
+-            assert(TCG_TARGET_REG_BITS == 32);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0],
+-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+-                                           arg_info(op->args[2])->val));
+-                continue;
+-            } else if (args_are_copies(op->args[1], op->args[2])) {
+-                op->opc = INDEX_op_dup_vec;
+-                TCGOP_VECE(op) = MO_32;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+             break;
++        case INDEX_op_dup_vec:
++            done = fold_dup(&ctx, op);
++            break;
++        case INDEX_op_dup2_vec:
++            done = fold_dup2(&ctx, op);
++            break;
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
 --
-.20.1
+.25.1

-[PULL 27/41] target/microblaze: Remove MMU_MODE{0,1,2}_SUFFIX
+[PULL 34/56] tcg/optimize: Split out fold_mov
-The functions generated by these macros are unused.
+This is the final entry in the main switch that was in a
 different form.  After this, we have the option to convert
 the switch into a function dispatch table.
-Cc: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/microblaze/cpu.h | 3 ---
+ tcg/optimize.c | 27 ++++++++++++++-------------
-file changed, 3 deletions(-)
+file changed, 14 insertions(+), 13 deletions(-)
-diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/microblaze/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/microblaze/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ int cpu_mb_signal_handler(int host_signum, void *pinfo,
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
- #define cpu_signal_handler cpu_mb_signal_handler
+     return true;
+ }
- /* MMU modes definitions */
--#define MMU_MODE0_SUFFIX _nommu
++static bool fold_mov(OptContext *ctx, TCGOp *op)
--#define MMU_MODE1_SUFFIX _kernel
++{
--#define MMU_MODE2_SUFFIX _user
++    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
- #define MMU_NOMMU_IDX   0
++}
- #define MMU_KERNEL_IDX  1
++
- #define MMU_USER_IDX    2
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Propagate constants through copy operations and do constant
 -           folding.  Constants will be substituted to arguments by register
 -           allocator where needed and possible.  Also detect copies. */
 +        /*
 +         * Process each opcode.
 +         * Sorted alphabetically by opcode as much as possible.
 +         */
          switch (opc) {
 -        CASE_OP_32_64_VEC(mov):
 -            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            break;
 -
 -        default:
 -            break;
 -
 -        /* ---------------------------------------------------------- */
 -        /* Sorted alphabetically by opcode as much as possible. */
 -
          CASE_OP_32_64_VEC(add):
              done = fold_add(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64_VEC(mov):
 +            done = fold_mov(&ctx, op);
 +            break;
          CASE_OP_32_64(movcond):
              done = fold_movcond(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
              break;
 +        default:
 +            break;
          }
          if (!done) {
 --
-.20.1
+.25.1

-[PULL 15/41] linux-user: Include trace-root.h in syscall-trace.h
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
-Code movement in an upcoming patch will show that this file
+Pull the "op r, a, a => movi r, 0" optimization into a function,
-was implicitly depending on trace-root.h being included beforehand.
+and use it in the outer opcode fold functions.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/user/syscall-trace.h | 2 ++
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
-file changed, 2 insertions(+)
+file changed, 24 insertions(+), 17 deletions(-)
-diff --git a/include/user/syscall-trace.h b/include/user/syscall-trace.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/user/syscall-trace.h
+--- a/tcg/optimize.c
-+++ b/include/user/syscall-trace.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
- #ifndef _SYSCALL_TRACE_H_
+     return false;
- #define _SYSCALL_TRACE_H_
+ }
-+#include "trace-root.h"
++/* If the binary operation has both arguments equal, fold to @i. */
 +static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
  /*
-  * These helpers just provide a common place for the various
+  * These outermost fold_<op> functions are sorted alphabetically.
-  * subsystems that want to track syscalls to put their hooks in. We
+  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
  static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expression for "op r, a, a => movi r, 0" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(andc):
 -        CASE_OP_32_64_VEC(sub):
 -        CASE_OP_32_64_VEC(xor):
 -            if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /*
           * Process each opcode.
           * Sorted alphabetically by opcode as much as possible.
 --
-.20.1
+.25.1

-[PULL 17/41] cputlb: Move body of cpu_ldst_template.h out of line
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
-With the tracing hooks, the inline functions are no longer
+Pull the "op r, a, a => mov r, a" optimization into a function,
-so simple.  Once out-of-line, the current tlb_entry lookup
+and use it in the outer opcode fold functions.
 is redundant with the one in the main load/store_helper.
-This also begins the introduction of a new target facing
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-interface, with suffix *_mmuidx_ra.  This is not yet
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 official because the interface is not done for user-only.
 Use abi_ptr instead of target_ulong in preparation for
 user-only; the two types are identical for softmmu.
 What remains in cpu_ldst_template.h are the expansions
 for _code, _data, and MMU_MODE<N>_SUFFIX.
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h          |  25 ++++++-
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
- include/exec/cpu_ldst_template.h | 125 +++++++------------------------
+file changed, 24 insertions(+), 15 deletions(-)
  accel/tcg/cputlb.c               | 116 ++++++++++++++++++++++++++++
 files changed, 166 insertions(+), 100 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+     return false;
  #else
 -/* The memory helpers for tcg-generated code need tcg_target_long etc.  */
 +/* Needed for TCG_OVERSIZED_GUEST */
  #include "tcg.h"
  static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
      return &env_tlb(env)->f[mmu_idx].table[tlb_index(env, mmu_idx, addr)];
  }
-+uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
++/* If the binary operation has both arguments equal, fold to identity. */
-+                            int mmu_idx, uintptr_t ra);
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 +uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                            int mmu_idx, uintptr_t ra);
 +uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                           int mmu_idx, uintptr_t ra);
 +uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                           int mmu_idx, uintptr_t ra);
 +
 +int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                       int mmu_idx, uintptr_t ra);
 +int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                       int mmu_idx, uintptr_t ra);
 +
 +void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
 +                       int mmu_idx, uintptr_t retaddr);
 +void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
 +                       int mmu_idx, uintptr_t retaddr);
 +void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
 +                       int mmu_idx, uintptr_t retaddr);
 +void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
 +                       int mmu_idx, uintptr_t retaddr);
 +
  #ifdef MMU_MODE0_SUFFIX
  #define CPU_MMU_INDEX 0
  #define MEMSUFFIX MMU_MODE0_SUFFIX
 diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/cpu_ldst_template.h
 +++ b/include/exec/cpu_ldst_template.h
@@ -XXX,XX +XXX,XX @@
   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
   */
 -#if !defined(SOFTMMU_CODE_ACCESS)
 -#include "trace-root.h"
 -#endif
 -
 -#include "qemu/plugin.h"
 -#include "trace/mem.h"
 -
  #if DATA_SIZE == 8
  #define SUFFIX q
  #define USUFFIX q
@@ -XXX,XX +XXX,XX @@
  #define RES_TYPE uint32_t
  #endif
 +/* generic load/store macros */
 +
  #ifdef SOFTMMU_CODE_ACCESS
 -#define ADDR_READ addr_code
 -#define MMUSUFFIX _cmmu
 -#define URETSUFFIX USUFFIX
 -#define SRETSUFFIX glue(s, SUFFIX)
 -#else
 -#define ADDR_READ addr_read
 -#define MMUSUFFIX _mmu
 -#define URETSUFFIX USUFFIX
 -#define SRETSUFFIX glue(s, SUFFIX)
 +
 +static inline RES_TYPE
 +glue(glue(cpu_ld, USUFFIX), _code)(CPUArchState *env, target_ulong ptr)
 +{
-+    TCGMemOpIdx oi = make_memop_idx(MO_TE | SHIFT, CPU_MMU_INDEX);
++    if (args_are_copies(op->args[1], op->args[2])) {
-+    return glue(glue(helper_ret_ld, USUFFIX), _cmmu)(env, ptr, oi, 0);
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-+}
++    }
-+
++    return false;
 +#if DATA_SIZE <= 2
 +static inline int
 +glue(glue(cpu_lds, SUFFIX), _code)(CPUArchState *env, target_ulong ptr)
 +{
 +    return (DATA_STYPE)glue(glue(cpu_ld, USUFFIX), _code)(env, ptr);
 +}
  #endif
 -/* generic load/store macros */
 +#else
  static inline RES_TYPE
  glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                    target_ulong ptr,
                                                    uintptr_t retaddr)
  {
 -    CPUTLBEntry *entry;
 -    RES_TYPE res;
 -    target_ulong addr;
 -    int mmu_idx = CPU_MMU_INDEX;
 -    MemOp op = MO_TE | SHIFT;
 -#if !defined(SOFTMMU_CODE_ACCESS)
 -    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -#endif
 -
 -    addr = ptr;
 -    entry = tlb_entry(env, mmu_idx, addr);
 -    if (unlikely(entry->ADDR_READ !=
 -                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
 -        res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
 -                                                               oi, retaddr);
 -    } else {
 -        uintptr_t hostaddr = addr + entry->addend;
 -        res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
 -    }
 -#ifndef SOFTMMU_CODE_ACCESS
 -    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 -#endif
 -    return res;
 +    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
 +                                                   retaddr);
  }
  static inline RES_TYPE
  glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
  {
 -    return glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
 +    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
  }
  #if DATA_SIZE <= 2
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                    target_ulong ptr,
                                                    uintptr_t retaddr)
  {
 -    CPUTLBEntry *entry;
 -    int res;
 -    target_ulong addr;
 -    int mmu_idx = CPU_MMU_INDEX;
 -    MemOp op = MO_TE | MO_SIGN | SHIFT;
 -#ifndef SOFTMMU_CODE_ACCESS
 -    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -#endif
 -
 -    addr = ptr;
 -    entry = tlb_entry(env, mmu_idx, addr);
 -    if (unlikely(entry->ADDR_READ !=
 -                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        TCGMemOpIdx oi = make_memop_idx(op & ~MO_SIGN, mmu_idx);
 -        res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
 -                               MMUSUFFIX)(env, addr, oi, retaddr);
 -    } else {
 -        uintptr_t hostaddr = addr + entry->addend;
 -        res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr);
 -    }
 -#ifndef SOFTMMU_CODE_ACCESS
 -    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 -#endif
 -    return res;
 +    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
 +                                                   retaddr);
  }
  static inline int
  glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
  {
 -    return glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
 +    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
  }
  #endif
 -#ifndef SOFTMMU_CODE_ACCESS
 -
  /* generic store macro */
  static inline void
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
                                                   RES_TYPE v, uintptr_t retaddr)
  {
 -    CPUTLBEntry *entry;
 -    target_ulong addr;
 -    int mmu_idx = CPU_MMU_INDEX;
 -    MemOp op = MO_TE | SHIFT;
 -#if !defined(SOFTMMU_CODE_ACCESS)
 -    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, true);
 -    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 -#endif
 -
 -    addr = ptr;
 -    entry = tlb_entry(env, mmu_idx, addr);
 -    if (unlikely(tlb_addr_write(entry) !=
 -                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
 -        glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
 -                                                     retaddr);
 -    } else {
 -        uintptr_t hostaddr = addr + entry->addend;
 -        glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v);
 -    }
 -#ifndef SOFTMMU_CODE_ACCESS
 -    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
 -#endif
 +    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX,
 +                                           retaddr);
  }
  static inline void
  glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
                                        RES_TYPE v)
  {
 -    glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(env, ptr, v, 0);
 +    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
  }
  #endif /* !SOFTMMU_CODE_ACCESS */
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
  #undef SUFFIX
  #undef USUFFIX
  #undef DATA_SIZE
 -#undef MMUSUFFIX
 -#undef ADDR_READ
 -#undef URETSUFFIX
 -#undef SRETSUFFIX
  #undef SHIFT
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/atomic.h"
  #include "qemu/atomic128.h"
  #include "translate-all.h"
 +#include "trace-root.h"
 +#include "qemu/plugin.h"
 +#include "trace/mem.h"
  #ifdef CONFIG_PLUGIN
  #include "qemu/plugin-memory.h"
  #endif
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
      return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr);
  }
 +/*
 + * Load helpers for cpu_ldst.h.
 + */
 +
 +static inline uint64_t cpu_load_helper(CPUArchState *env, abi_ptr addr,
 +                                       int mmu_idx, uintptr_t retaddr,
 +                                       MemOp op, FullLoadHelper *full_load)
 +{
 +    uint16_t meminfo;
 +    TCGMemOpIdx oi;
 +    uint64_t ret;
 +
 +    meminfo = trace_mem_get_info(op, mmu_idx, false);
 +    trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
 +
 +    op &= ~MO_SIGN;
 +    oi = make_memop_idx(op, mmu_idx);
 +    ret = full_load(env, addr, oi, retaddr);
 +
 +    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
 +
 +    return ret;
 +}
 +
 +uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                            int mmu_idx, uintptr_t ra)
 +{
 +    return cpu_load_helper(env, addr, mmu_idx, ra, MO_UB, full_ldub_mmu);
 +}
 +
 +int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                       int mmu_idx, uintptr_t ra)
 +{
 +    return (int8_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_SB,
 +                                   full_ldub_mmu);
 +}
 +
 +uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                            int mmu_idx, uintptr_t ra)
 +{
 +    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUW,
 +                           MO_TE == MO_LE
 +                           ? full_le_lduw_mmu : full_be_lduw_mmu);
 +}
 +
 +int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                       int mmu_idx, uintptr_t ra)
 +{
 +    return (int16_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_TESW,
 +                                    MO_TE == MO_LE
 +                                    ? full_le_lduw_mmu : full_be_lduw_mmu);
 +}
 +
 +uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                           int mmu_idx, uintptr_t ra)
 +{
 +    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUL,
 +                           MO_TE == MO_LE
 +                           ? full_le_ldul_mmu : full_be_ldul_mmu);
 +}
 +
 +uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 +                           int mmu_idx, uintptr_t ra)
 +{
 +    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEQ,
 +                           MO_TE == MO_LE
 +                           ? helper_le_ldq_mmu : helper_be_ldq_mmu);
 +}
 +
  /*
-  * Store Helpers
+  * These outermost fold_<op> functions are sorted alphabetically.
 + *
 + * The ordering of the transformations should be:
 + *   1) those that produce a constant
 + *   2) those that produce a copy
 + *   3) those that produce information about the result value.
   */
-@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-     store_helper(env, addr, val, oi, retaddr, MO_BEQ);
+ static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_x(ctx, op)) {
 +        return true;
 +    }
 +    return false;
  }
-+/*
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+ * Store Helpers for cpu_ldst.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
-+ */
-+
+ static bool fold_or(OptContext *ctx, TCGOp *op)
-+static inline void QEMU_ALWAYS_INLINE
+ {
-+cpu_store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+-    return fold_const2(ctx, op);
-+                 int mmu_idx, uintptr_t retaddr, MemOp op)
++    if (fold_const2(ctx, op) ||
-+{
++        fold_xx_to_x(ctx, op)) {
-+    TCGMemOpIdx oi;
++        return true;
-+    uint16_t meminfo;
++    }
-+
++    return false;
-+    meminfo = trace_mem_get_info(op, mmu_idx, true);
+ }
-+    trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
-+
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
-+    oi = make_memop_idx(op, mmu_idx);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    store_helper(env, addr, val, oi, retaddr, op);
+             break;
-+
+         }
-+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
-+}
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-+
+-        switch (opc) {
-+void cpu_stb_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+-        CASE_OP_32_64_VEC(or):
-+                       int mmu_idx, uintptr_t retaddr)
+-        CASE_OP_32_64_VEC(and):
-+{
+-            if (args_are_copies(op->args[1], op->args[2])) {
-+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_UB);
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+}
+-                continue;
-+
+-            }
-+void cpu_stw_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+-            break;
-+                       int mmu_idx, uintptr_t retaddr)
+-        default:
-+{
+-            break;
-+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUW);
+-        }
-+}
+-
-+
+         /*
-+void cpu_stl_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+          * Process each opcode.
-+                       int mmu_idx, uintptr_t retaddr)
+          * Sorted alphabetically by opcode as much as possible.
 +{
 +    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUL);
 +}
 +
 +void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
 +                       int mmu_idx, uintptr_t retaddr)
 +{
 +    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ);
 +}
 +
  /* First set of helpers allows passing in of OI and RETADDR.  This makes
     them callable from other helpers.  */
 --
-.20.1
+.25.1

-[PULL 26/41] target/i386: Remove MMU_MODE{0,1,2}_SUFFIX
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
-The functions generated by these macros are unused.
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
 and use it in the outer opcode fold functions.
-Cc: Eduardo Habkost <ehabkost@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/i386/cpu.h | 3 ---
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
-file changed, 3 deletions(-)
+file changed, 20 insertions(+), 18 deletions(-)
-diff --git a/target/i386/cpu.h b/target/i386/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/i386/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/i386/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uint64_t cpu_get_tsc(CPUX86State *env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
- #define cpu_list x86_cpu_list
+     return false;
+ }
- /* MMU modes definitions */
--#define MMU_MODE0_SUFFIX _ksmap
++/* If the binary operation has second argument @i, fold to @i. */
--#define MMU_MODE1_SUFFIX _user
++static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
--#define MMU_MODE2_SUFFIX _knosmap /* SMAP disabled or CPL<3 && AC=1 */
++{
- #define MMU_KSMAP_IDX   0
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
- #define MMU_USER_IDX    1
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
- #define MMU_KNOSMAP_IDX 2
++    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
  static bool fold_mul(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              continue;
          }
 -        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            if (arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /*
           * Process each opcode.
           * Sorted alphabetically by opcode as much as possible.
 --
-.20.1
+.25.1

-New patch
+[PULL 38/56] tcg/optimize: Add type to OptContext
+Compute the type of the operation early.
 There are at least 4 places that used a def->flags ladder
 to determine the type of the operation being optimized.
 There were two places that assumed !TCG_OPF_64BIT means
 TCG_TYPE_I32, and so could potentially compute incorrect
 results for vector operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 file changed, 89 insertions(+), 60 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t z_mask;
 +    TCGType type;
  } OptContext;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
 -    const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
      uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      reset_ts(dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
 -    def = &tcg_op_defs[op->opc];
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        new_op = INDEX_op_mov_vec;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        new_op = INDEX_op_mov_i64;
 -    } else {
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
          new_op = INDEX_op_mov_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        new_op = INDEX_op_mov_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      op->opc = new_op;
 -    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
      op->args[0] = dst;
      op->args[1] = src;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      }
  }
 -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 +static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
 +                                    uint64_t x, uint64_t y)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op];
      uint64_t res = do_constant_folding_2(op, x, y);
 -    if (!(def->flags & TCG_OPF_64BIT)) {
 +    if (type == TCG_TYPE_I32) {
          res = (int32_t)res;
      }
      return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
   * Return -1 if the condition can't be simplified,
   * and the result of the condition (0 or 1) if it can.
   */
 -static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
      if (arg_is_const(x) && arg_is_const(y)) {
 -        const TCGOpDef *def = &tcg_op_defs[op];
 -        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
 -        if (def->flags & TCG_OPF_64BIT) {
 -            return do_constant_folding_cond_64(xv, yv, c);
 -        } else {
 +        switch (type) {
 +        case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
 +        case TCG_TYPE_I64:
 +            return do_constant_folding_cond_64(xv, yv, c);
 +        default:
 +            /* Only scalar comparisons are optimizable */
 +            return -1;
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, 0);
 +        t = do_constant_folding(op->opc, ctx->type, t, 0);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
 -        t1 = do_constant_folding(op->opc, t1, t2);
 +        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
 +        case TCG_TYPE_I64:
 +            opc = INDEX_op_setcond_i64;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          if (tv == 1 && fv == 0) {
              op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
          /* For commutative operations make constant second argument */
          switch (opc) {
          CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Proceed with possible constant folding. */
                      break;
                  }
 -                if (opc == INDEX_op_sub_i32) {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      neg_op = INDEX_op_neg_i32;
                      have_neg = TCG_TARGET_HAS_neg_i32;
 -                } else if (opc == INDEX_op_sub_i64) {
 +                    break;
 +                case TCG_TYPE_I64:
                      neg_op = INDEX_op_neg_i64;
                      have_neg = TCG_TARGET_HAS_neg_i64;
 -                } else if (TCG_TARGET_HAS_neg_vec) {
 -                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -                    unsigned vece = TCGOP_VECE(op);
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
 -                } else {
                      break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    neg_op = INDEX_op_neg_vec;
 +                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 +                                                   TCGOP_VECE(op)) > 0;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_neg) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGOpcode not_op;
                  bool have_not;
 -                if (def->flags & TCG_OPF_VECTOR) {
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 .25.1

-New patch
+[PULL 39/56] tcg/optimize: Split out fold_to_not
+Split out the conditional conversion from a more complex logical
 operation to a simple NOT.  Create a couple more helpers to make
 this easy for the outer-most logical operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 file changed, 86 insertions(+), 72 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +/*
 + * Convert @op to NOT, if NOT is supported by the host.
 + * Return true f the conversion is successful, which will still
 + * indicate that the processing is complete.
 + */
 +static bool fold_not(OptContext *ctx, TCGOp *op);
 +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
 +{
 +    TCGOpcode not_op;
 +    bool have_not;
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        not_op = INDEX_op_not_i32;
 +        have_not = TCG_TARGET_HAS_not_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        not_op = INDEX_op_not_i64;
 +        have_not = TCG_TARGET_HAS_not_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        not_op = INDEX_op_not_vec;
 +        have_not = TCG_TARGET_HAS_not_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              }
              break;
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(nand):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64(nor):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(andc):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == -1) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        try_not:
 -            {
 -                TCGOpcode not_op;
 -                bool have_not;
 -
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    not_op = INDEX_op_not_i32;
 -                    have_not = TCG_TARGET_HAS_not_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
          }
 --
 .25.1

-[PULL 25/41] target/cris: Remove MMU_MODE{0,1}_SUFFIX
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
-The functions generated by these macros are unused.
+Even though there is only one user, place this more complex
 conversion into its own helper.
-Cc: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/cris/cpu.h | 2 --
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
-file changed, 2 deletions(-)
+file changed, 47 insertions(+), 42 deletions(-)
-diff --git a/target/cris/cpu.h b/target/cris/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/cris/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/cris/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ enum {
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
- #define cpu_signal_handler cpu_cris_signal_handler
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
  /* MMU modes definitions */
 -#define MMU_MODE0_SUFFIX _kernel
 -#define MMU_MODE1_SUFFIX _user
  #define MMU_USER_IDX 1
  static inline int cpu_mmu_index (CPUCRISState *env, bool ifetch)
  {
+-    return fold_const1(ctx, op);
++    if (fold_const1(ctx, op)) {
++        return true;
++    }
++    /*
++     * Because of fold_sub_to_neg, we want to always return true,
++     * via finish_folding.
++     */
++    finish_folding(ctx, op);
++    return true;
+ }
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode neg_op;
++    bool have_neg;
++
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
++        return false;
++    }
++
++    switch (ctx->type) {
++    case TCG_TYPE_I32:
++        neg_op = INDEX_op_neg_i32;
++        have_neg = TCG_TARGET_HAS_neg_i32;
++        break;
++    case TCG_TYPE_I64:
++        neg_op = INDEX_op_neg_i64;
++        have_neg = TCG_TARGET_HAS_neg_i64;
++        break;
++    case TCG_TYPE_V64:
++    case TCG_TYPE_V128:
++    case TCG_TYPE_V256:
++        neg_op = INDEX_op_neg_vec;
++        have_neg = (TCG_TARGET_HAS_neg_vec &&
++                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
++        break;
++    default:
++        g_assert_not_reached();
++    }
++    if (have_neg) {
++        op->opc = neg_op;
++        op->args[1] = op->args[2];
++        return fold_neg(ctx, op);
++    }
++    return false;
++}
++
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+-        fold_xx_to_i(ctx, op, 0)) {
++        fold_xx_to_i(ctx, op, 0) ||
++        fold_sub_to_neg(ctx, op)) {
+         return true;
+     }
+     return false;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 continue;
+             }
+             break;
+-        CASE_OP_32_64_VEC(sub):
+-            {
+-                TCGOpcode neg_op;
+-                bool have_neg;
+-
+-                if (arg_is_const(op->args[2])) {
+-                    /* Proceed with possible constant folding. */
+-                    break;
+-                }
+-                switch (ctx.type) {
+-                case TCG_TYPE_I32:
+-                    neg_op = INDEX_op_neg_i32;
+-                    have_neg = TCG_TARGET_HAS_neg_i32;
+-                    break;
+-                case TCG_TYPE_I64:
+-                    neg_op = INDEX_op_neg_i64;
+-                    have_neg = TCG_TARGET_HAS_neg_i64;
+-                    break;
+-                case TCG_TYPE_V64:
+-                case TCG_TYPE_V128:
+-                case TCG_TYPE_V256:
+-                    neg_op = INDEX_op_neg_vec;
+-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+-                                                   TCGOP_VECE(op)) > 0;
+-                    break;
+-                default:
+-                    g_assert_not_reached();
+-                }
+-                if (!have_neg) {
+-                    break;
+-                }
+-                if (arg_is_const(op->args[1])
+-                    && arg_info(op->args[1])->val == 0) {
+-                    op->opc = neg_op;
+-                    reset_temp(op->args[0]);
+-                    op->args[1] = op->args[2];
+-                    continue;
+-                }
+-            }
+-            break;
+         default:
+             break;
+         }
 --
-.20.1
+.25.1

-[PULL 38/41] tcg: Search includes in the parent source directory
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Pull the "op r, a, i => mov r, a" optimization into a function,
 and use them in the outer-most logical operations.
-All the *.inc.c files included by tcg/$TARGET/tcg-target.inc.c
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 are in tcg/, their parent directory. To simplify the preprocessor
 search path, include the relative parent path: '..'.
 Patch created mechanically by running:
   $ for x in tcg-pool.inc.c tcg-ldst.inc.c; do \
     sed -i "s,#include \"$x\",#include \"../$x\"," \
       $(git grep -l "#include \"$x\""); \
     done
 Acked-by: David Gibson <david@gibson.dropbear.id.au> (ppc parts)
 Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
 Reviewed-by: Stefan Weil <sw@weilnetz.de>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-Id: <20200101112303.20724-3-philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.inc.c | 4 ++--
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
- tcg/arm/tcg-target.inc.c     | 4 ++--
+file changed, 26 insertions(+), 35 deletions(-)
  tcg/i386/tcg-target.inc.c    | 4 ++--
  tcg/mips/tcg-target.inc.c    | 2 +-
  tcg/ppc/tcg-target.inc.c     | 4 ++--
  tcg/riscv/tcg-target.inc.c   | 4 ++--
  tcg/s390/tcg-target.inc.c    | 4 ++--
  tcg/sparc/tcg-target.inc.c   | 2 +-
 files changed, 14 insertions(+), 14 deletions(-)
-diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.inc.c
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.inc.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-  * See the COPYING file in the top-level directory for details.
+     return false;
   */
 -#include "tcg-pool.inc.c"
 +#include "../tcg-pool.inc.c"
  #include "qemu/bitops.h"
  /* We're going to re-use TCGType in setting of the SF bit, which controls
@@ -XXX,XX +XXX,XX @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
  }
- #ifdef CONFIG_SOFTMMU
++/* If the binary operation has second argument @i, fold to identity. */
--#include "tcg-ldst.inc.c"
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
-+#include "../tcg-ldst.inc.c"
++{
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
- /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-  *                                     TCGMemOpIdx oi, uintptr_t ra)
++    }
-diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
++    return false;
-index XXXXXXX..XXXXXXX 100644
++}
---- a/tcg/arm/tcg-target.inc.c
++
-+++ b/tcg/arm/tcg-target.inc.c
+ /* If the binary operation has second argument @i, fold to NOT. */
-@@ -XXX,XX +XXX,XX @@
+ static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
-  */
+ {
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
- #include "elf.h"
--#include "tcg-pool.inc.c"
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+#include "../tcg-pool.inc.c"
+ {
+-    return fold_const2(ctx, op);
- int arm_arch = __ARM_ARCH;
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 0)) {
-@@ -XXX,XX +XXX,XX @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args,
++        return true;
 +    }
 +    return false;
  }
- #ifdef CONFIG_SOFTMMU
+ static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
--#include "tcg-ldst.inc.c"
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-+#include "../tcg-ldst.inc.c"
+ {
+     if (fold_const2(ctx, op) ||
- /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+         fold_xi_to_i(ctx, op, 0) ||
-  *                                     int mmu_idx, uintptr_t ra)
++        fold_xi_to_x(ctx, op, -1) ||
-diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
+         fold_xx_to_x(ctx, op)) {
-index XXXXXXX..XXXXXXX 100644
+         return true;
---- a/tcg/i386/tcg-target.inc.c
+     }
-+++ b/tcg/i386/tcg-target.inc.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@
+ {
-  * THE SOFTWARE.
+     if (fold_const2(ctx, op) ||
-  */
+         fold_xx_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 0) ||
--#include "tcg-pool.inc.c"
+         fold_ix_to_not(ctx, op, -1)) {
-+#include "../tcg-pool.inc.c"
+         return true;
+     }
- #ifdef CONFIG_DEBUG_TCG
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
- static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_nopn(TCGContext *s, int n)
+ {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
- #if defined(CONFIG_SOFTMMU)
+ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
--#include "tcg-ldst.inc.c"
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
-+#include "../tcg-ldst.inc.c"
+ {
+     if (fold_const2(ctx, op) ||
- /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+         fold_xx_to_i(ctx, op, 0) ||
-  *                                     int mmu_idx, uintptr_t ra)
++        fold_xi_to_x(ctx, op, 0) ||
-diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
+         fold_sub_to_neg(ctx, op)) {
-index XXXXXXX..XXXXXXX 100644
+         return true;
---- a/tcg/mips/tcg-target.inc.c
+     }
-+++ b/tcg/mips/tcg-target.inc.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
+ {
- }
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, 0) ||
- #if defined(CONFIG_SOFTMMU)
++        fold_xi_to_x(ctx, op, 0) ||
--#include "tcg-ldst.inc.c"
+         fold_xi_to_not(ctx, op, -1)) {
-+#include "../tcg-ldst.inc.c"
+         return true;
+     }
- static void * const qemu_ld_helpers[16] = {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     [MO_UB]   = helper_ret_ldub_mmu,
+             break;
-diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
+         }
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/ppc/tcg-target.inc.c
+-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-+++ b/tcg/ppc/tcg-target.inc.c
+-        switch (opc) {
-@@ -XXX,XX +XXX,XX @@
+-        CASE_OP_32_64_VEC(add):
-  */
+-        CASE_OP_32_64_VEC(sub):
+-        CASE_OP_32_64_VEC(or):
- #include "elf.h"
+-        CASE_OP_32_64_VEC(xor):
--#include "tcg-pool.inc.c"
+-        CASE_OP_32_64_VEC(andc):
-+#include "../tcg-pool.inc.c"
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
- #if defined _CALL_DARWIN || defined __APPLE__
+-        CASE_OP_32_64(sar):
- #define TCG_TARGET_CALL_DARWIN
+-        CASE_OP_32_64(rotl):
-@@ -XXX,XX +XXX,XX @@ static const uint32_t qemu_exts_opc[4] = {
+-        CASE_OP_32_64(rotr):
- };
+-            if (!arg_is_const(op->args[1])
+-                && arg_is_const(op->args[2])
- #if defined (CONFIG_SOFTMMU)
+-                && arg_info(op->args[2])->val == 0) {
--#include "tcg-ldst.inc.c"
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+#include "../tcg-ldst.inc.c"
+-                continue;
+-            }
- /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
+-            break;
-  *                                 int mmu_idx, uintptr_t ra)
+-        CASE_OP_32_64_VEC(and):
-diff --git a/tcg/riscv/tcg-target.inc.c b/tcg/riscv/tcg-target.inc.c
+-        CASE_OP_32_64_VEC(orc):
-index XXXXXXX..XXXXXXX 100644
+-        CASE_OP_32_64(eqv):
---- a/tcg/riscv/tcg-target.inc.c
+-            if (!arg_is_const(op->args[1])
-+++ b/tcg/riscv/tcg-target.inc.c
+-                && arg_is_const(op->args[2])
-@@ -XXX,XX +XXX,XX @@
+-                && arg_info(op->args[2])->val == -1) {
-  * THE SOFTWARE.
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-  */
+-                continue;
+-            }
--#include "tcg-pool.inc.c"
+-            break;
-+#include "../tcg-pool.inc.c"
+-        default:
+-            break;
- #ifdef CONFIG_DEBUG_TCG
+-        }
- static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
+-
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
+         /* Simplify using known-zero bits. Currently only ops with a single
-  */
+            output argument is supported. */
+         z_mask = -1;
  #if defined(CONFIG_SOFTMMU)
 -#include "tcg-ldst.inc.c"
 +#include "../tcg-ldst.inc.c"
  /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
   *                                     TCGMemOpIdx oi, uintptr_t ra)
 diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390/tcg-target.inc.c
 +++ b/tcg/s390/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  #error "unsupported code generation mode"
  #endif
 -#include "tcg-pool.inc.c"
 +#include "../tcg-pool.inc.c"
  #include "elf.h"
  /* ??? The translation blocks produced by TCG are generally small enough to
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data,
  }
  #if defined(CONFIG_SOFTMMU)
 -#include "tcg-ldst.inc.c"
 +#include "../tcg-ldst.inc.c"
  /* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
  QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.inc.c
 +++ b/tcg/sparc/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 -#include "tcg-pool.inc.c"
 +#include "../tcg-pool.inc.c"
  #ifdef CONFIG_DEBUG_TCG
  static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 --
-.20.1
+.25.1

-[PULL 24/41] target/alpha: Remove MMU_MODE{0,1}_SUFFIX
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
-The functions generated by these macros are unused.
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
 and use it in fold_shift.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/alpha/cpu.h | 2 --
+ tcg/optimize.c | 28 ++++++++++------------------
-file changed, 2 deletions(-)
+file changed, 10 insertions(+), 18 deletions(-)
-diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/alpha/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/alpha/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ enum {
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
-    PALcode cheats and usees the KSEG mapping for its code+data rather than
+     return false;
-    physical addresses.  */
+ }
--#define MMU_MODE0_SUFFIX _kernel
++/* If the binary operation has first argument @i, fold to @i. */
--#define MMU_MODE1_SUFFIX _user
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
- #define MMU_KERNEL_IDX   0
++{
- #define MMU_USER_IDX     1
++    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
- #define MMU_PHYS_IDX     2
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has first argument @i, fold to NOT. */
  static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
 -           and "sub r, 0, a => neg r, a" case.  */
 -        switch (opc) {
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Simplify using known-zero bits. Currently only ops with a single
             output argument is supported. */
          z_mask = -1;
 --
-.20.1
+.25.1

-[PULL 01/41] configure: Drop adjustment of textseg
+[PULL 43/56] tcg/optimize: Split out fold_masks
-This adjustment was random and unnecessary.  The user mode
+Move all of the known-zero optimizations into the per-opcode
-startup code in probe_guest_base() will choose a value for
+functions.  Use fold_masks when there is a possibility of the
-guest_base that allows the host qemu binary to not conflict
+result being determined, and simply set ctx->z_mask otherwise.
 with the guest binary.
 With modern distributions, this isn't even used, as the default
 is PIE, which does the same job in a more portable way.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Thomas Huth <thuth@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
-v2: Remove mention of config-host.ld from make distclean
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
----
+file changed, 294 insertions(+), 251 deletions(-)
  Makefile  |  2 +-
  configure | 47 -----------------------------------------------
 files changed, 1 insertion(+), 48 deletions(-)
-diff --git a/Makefile b/Makefile
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/Makefile
+--- a/tcg/optimize.c
-+++ b/Makefile
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ rm -f $(MANUAL_BUILDDIR)/$1/objects.inv $(MANUAL_BUILDDIR)/$1/searchindex.js $(M
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
- endef
+     TCGTempSet temps_used;
- distclean: clean
+     /* In flight values from optimization. */
--    rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi qemu-monitor-info.texi
+-    uint64_t z_mask;
-+    rm -f config-host.mak config-host.h* $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi qemu-monitor-info.texi
++    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
-     rm -f tests/tcg/config-*.mak
++    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-     rm -f config-all-devices.mak config-all-disas.mak config.status
+     TCGType type;
-     rm -f $(SUBDIR_DEVICES_MAK)
+ } OptContext;
-diff --git a/configure b/configure
-index XXXXXXX..XXXXXXX 100755
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
---- a/configure
+     return false;
-+++ b/configure
+ }
-@@ -XXX,XX +XXX,XX @@ if test "$cpu" = "s390x" ; then
-   fi
++static bool fold_masks(OptContext *ctx, TCGOp *op)
- fi
++{
++    uint64_t a_mask = ctx->a_mask;
--# Probe for the need for relocating the user-only binary.
++    uint64_t z_mask = ctx->z_mask;
--if ( [ "$linux_user" = yes ] || [ "$bsd_user" = yes ] ) && [ "$pie" = no ]; then
++
--  textseg_addr=
++    /*
--  case "$cpu" in
++     * 32-bit ops generate 32-bit results.  For the result is zero test
--    arm | i386 | ppc* | s390* | sparc* | x86_64 | x32)
++     * below, we can ignore high bits, but for further optimizations we
--      # ??? Rationale for choosing this address
++     * need to record that the high bits contain garbage.
--      textseg_addr=0x60000000
++     */
--      ;;
++    if (ctx->type == TCG_TYPE_I32) {
--    mips)
++        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
--      # A 256M aligned address, high in the address space, with enough
++        a_mask &= MAKE_64BIT_MASK(0, 32);
--      # room for the code_gen_buffer above it before the stack.
++        z_mask &= MAKE_64BIT_MASK(0, 32);
--      textseg_addr=0x60000000
++    }
--      ;;
++
--  esac
++    if (z_mask == 0) {
--  if [ -n "$textseg_addr" ]; then
++        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
--    cat > $TMPC <<EOF
++    }
--    int main(void) { return 0; }
++    if (a_mask == 0) {
--EOF
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
--    textseg_ldflags="-Wl,-Ttext-segment=$textseg_addr"
++    }
--    if ! compile_prog "" "$textseg_ldflags"; then
++    return false;
--      # In case ld does not support -Ttext-segment, edit the default linker
++}
--      # script via sed to set the .text start addr.  This is needed on FreeBSD
++
--      # at least.
+ /*
--      if ! $ld --verbose >/dev/null 2>&1; then
+  * Convert @op to NOT, if NOT is supported by the host.
--        error_exit \
+  * Return true f the conversion is successful, which will still
--            "We need to link the QEMU user mode binaries at a" \
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
--            "specific text address. Unfortunately your linker" \
--            "doesn't support either the -Ttext-segment option or" \
+ static bool fold_and(OptContext *ctx, TCGOp *op)
--            "printing the default linker script with --verbose." \
+ {
--            "If you don't want the user mode binaries, pass the" \
++    uint64_t z1, z2;
--            "--disable-user option to configure."
++
--      fi
+     if (fold_const2(ctx, op) ||
--
+         fold_xi_to_i(ctx, op, 0) ||
--      $ld --verbose | sed \
+         fold_xi_to_x(ctx, op, -1) ||
--        -e '1,/==================================================/d' \
+         fold_xx_to_x(ctx, op)) {
--        -e '/==================================================/,$d' \
+         return true;
--        -e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. = $textseg_addr + SIZEOF_HEADERS/" \
+     }
--        -e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld
+-    return false;
--      textseg_ldflags="-Wl,-T../config-host.ld"
++
--    fi
++    z1 = arg_info(op->args[1])->z_mask;
--  fi
++    z2 = arg_info(op->args[2])->z_mask;
--fi
++    ctx->z_mask = z1 & z2;
--
++
- # Check that the C++ compiler exists and works with the C compiler.
++    /*
- # All the QEMU_CXXFLAGS are based on QEMU_CFLAGS. Keep this at the end to don't miss any other that could be added.
++     * Known-zeros does not imply known-ones.  Therefore unless
- if has $cxx; then
++     * arg2 is constant, we can't infer affected bits from it.
-@@ -XXX,XX +XXX,XX @@ if test "$gprof" = "yes" ; then
++     */
-   fi
++    if (arg_is_const(op->args[2])) {
- fi
++        ctx->a_mask = z1 & ~z2;
++    }
--if test "$target_linux_user" = "yes" || test "$target_bsd_user" = "yes" ; then
++
--  ldflags="$ldflags $textseg_ldflags"
++    return fold_masks(ctx, op);
--fi
+ }
--
- # Newer kernels on s390 check for an S390_PGSTE program header and
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
- # enable the pgste page table extensions in that case. This makes
+ {
- # the vm.allocate_pgste sysctl unnecessary. We enable this program
++    uint64_t z1;
 +
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer anything from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
 +        ctx->a_mask = z1 & ~z2;
 +        z1 &= z2;
 +    }
 +    ctx->z_mask = z1;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, sign;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
          t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    switch (op->opc) {
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
 +        z_mask = bswap16(z_mask);
 +        sign = INT16_MIN;
 +        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
 +        z_mask = bswap32(z_mask);
 +        sign = INT32_MIN;
 +        break;
 +    case INDEX_op_bswap64_i64:
 +        z_mask = bswap64(z_mask);
 +        sign = INT64_MIN;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
      }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        z_mask = 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        z_mask = 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
      }
 +
 +    ctx->z_mask = 1;
      return false;
   do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 +    int64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
          t = sextract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0 && z_mask >= 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +
 +    if (arg_is_const(op->args[2])) {
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
 -
 -        /* 32-bit ops generate 32-bit results.  For the result is zero test
 -           below, we can ignore high bits, but for further optimizations we
 -           need to record that the high bits contain garbage.  */
 -        partmask = z_mask;
 -        if (ctx.type == TCG_TYPE_I32) {
 -            z_mask |= ~(tcg_target_ulong)0xffffffffu;
 -            partmask &= 0xffffffffu;
 -            affected &= 0xffffffffu;
 -        }
 -        ctx.z_mask = z_mask;
 -
 -        if (partmask == 0) {
 -            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -            continue;
 -        }
 -        if (affected == 0) {
 -            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            continue;
 -        }
 +        /* Assume all bits affected, and no bits known zero. */
 +        ctx.a_mask = -1;
 +        ctx.z_mask = -1;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32u_i64:
 +            done = fold_tcg_ld(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 --
-.20.1
+.25.1

-[PULL 21/41] target/i386: Use cpu_*_mmuidx_ra instead of templates
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
-Do not use exec/cpu_ldst_{,useronly_}template.h directly,
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
-but instead use the functional interface.
+and muls2_i64.
-Cc: Eduardo Habkost <ehabkost@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/i386/seg_helper.c | 56 ++++++++++++++++++++--------------------
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
-file changed, 28 insertions(+), 28 deletions(-)
+file changed, 35 insertions(+), 9 deletions(-)
-diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/i386/seg_helper.c
+--- a/tcg/optimize.c
-+++ b/target/i386/seg_helper.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
- # define LOG_PCALL_STATE(cpu) do { } while (0)
+     return false;
- #endif
+ }
--#ifdef CONFIG_USER_ONLY
+-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
--#define MEMSUFFIX _kernel
++static bool fold_multiply2(OptContext *ctx, TCGOp *op)
--#define DATA_SIZE 1
+ {
--#include "exec/cpu_ldst_useronly_template.h"
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-+/*
+-        uint32_t a = arg_info(op->args[2])->val;
-+ * TODO: Convert callers to compute cpu_mmu_index_kernel once
+-        uint32_t b = arg_info(op->args[3])->val;
-+ * and use *_mmuidx_ra directly.
+-        uint64_t r = (uint64_t)a * b;
-+ */
++        uint64_t a = arg_info(op->args[2])->val;
-+#define cpu_ldub_kernel_ra(e, p, r) \
++        uint64_t b = arg_info(op->args[3])->val;
-+    cpu_ldub_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
++        uint64_t h, l;
-+#define cpu_lduw_kernel_ra(e, p, r) \
+         TCGArg rl, rh;
-+    cpu_lduw_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
-+#define cpu_ldl_kernel_ra(e, p, r) \
++        TCGOp *op2;
-+    cpu_ldl_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
++
-+#define cpu_ldq_kernel_ra(e, p, r) \
++        switch (op->opc) {
-+    cpu_ldq_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
++        case INDEX_op_mulu2_i32:
++            l = (uint64_t)(uint32_t)a * (uint32_t)b;
--#define DATA_SIZE 2
++            h = (int32_t)(l >> 32);
--#include "exec/cpu_ldst_useronly_template.h"
++            l = (int32_t)l;
-+#define cpu_stb_kernel_ra(e, p, v, r) \
++            break;
-+    cpu_stb_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
++        case INDEX_op_muls2_i32:
-+#define cpu_stw_kernel_ra(e, p, v, r) \
++            l = (int64_t)(int32_t)a * (int32_t)b;
-+    cpu_stw_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
++            h = l >> 32;
-+#define cpu_stl_kernel_ra(e, p, v, r) \
++            l = (int32_t)l;
-+    cpu_stl_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
++            break;
-+#define cpu_stq_kernel_ra(e, p, v, r) \
++        case INDEX_op_mulu2_i64:
-+    cpu_stq_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
++            mulu64(&l, &h, a, b);
++            break;
--#define DATA_SIZE 4
++        case INDEX_op_muls2_i64:
--#include "exec/cpu_ldst_useronly_template.h"
++            muls64(&l, &h, a, b);
-+#define cpu_ldub_kernel(e, p)    cpu_ldub_kernel_ra(e, p, 0)
++            break;
-+#define cpu_lduw_kernel(e, p)    cpu_lduw_kernel_ra(e, p, 0)
++        default:
-+#define cpu_ldl_kernel(e, p)     cpu_ldl_kernel_ra(e, p, 0)
++            g_assert_not_reached();
-+#define cpu_ldq_kernel(e, p)     cpu_ldq_kernel_ra(e, p, 0)
++        }
--#define DATA_SIZE 8
+         rl = op->args[0];
--#include "exec/cpu_ldst_useronly_template.h"
+         rh = op->args[1];
--#undef MEMSUFFIX
+-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
--#else
+-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
--#define CPU_MMU_INDEX (cpu_mmu_index_kernel(env))
++
--#define MEMSUFFIX _kernel
++        /* The proper opcode is supplied by tcg_opt_gen_mov. */
--#define DATA_SIZE 1
++        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
--#include "exec/cpu_ldst_template.h"
++
--
++        tcg_opt_gen_movi(ctx, op, rl, l);
--#define DATA_SIZE 2
++        tcg_opt_gen_movi(ctx, op2, rh, h);
--#include "exec/cpu_ldst_template.h"
+         return true;
--
+     }
--#define DATA_SIZE 4
+     return false;
--#include "exec/cpu_ldst_template.h"
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--
+         CASE_OP_32_64(muluh):
--#define DATA_SIZE 8
+             done = fold_mul_highpart(&ctx, op);
--#include "exec/cpu_ldst_template.h"
+             break;
--#undef CPU_MMU_INDEX
+-        case INDEX_op_mulu2_i32:
--#undef MEMSUFFIX
+-            done = fold_mulu2_i32(&ctx, op);
--#endif
++        CASE_OP_32_64(muls2):
-+#define cpu_stb_kernel(e, p, v)  cpu_stb_kernel_ra(e, p, v, 0)
++        CASE_OP_32_64(mulu2):
-+#define cpu_stw_kernel(e, p, v)  cpu_stw_kernel_ra(e, p, v, 0)
++            done = fold_multiply2(&ctx, op);
-+#define cpu_stl_kernel(e, p, v)  cpu_stl_kernel_ra(e, p, v, 0)
+             break;
-+#define cpu_stq_kernel(e, p, v)  cpu_stq_kernel_ra(e, p, v, 0)
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
  /* return non zero if error */
  static inline int load_segment_ra(CPUX86State *env, uint32_t *e1_ptr,
 --
-.20.1
+.25.1

-[PULL 37/41] tcg: Search includes from the project root source directory
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+Rename to fold_addsub2.
 Use Int128 to implement the wider operation.
-We currently search both the root and the tcg/ directories for tcg
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-files:
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
   $ git grep '#include "tcg/' | wc -l
   $ git grep '#include "tcg[^/]' | wc -l
 To simplify the preprocessor search path, unify by expliciting the
 tcg/ directory.
 Patch created mechanically by running:
   $ for x in \
       tcg.h tcg-mo.h tcg-op.h tcg-opc.h \
       tcg-op-gvec.h tcg-gvec-desc.h; do \
     sed -i "s,#include \"$x\",#include \"tcg/$x\"," \
       $(git grep -l "#include \"$x\""); \
     done
 Acked-by: David Gibson <david@gibson.dropbear.id.au> (ppc parts)
 Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
 Reviewed-by: Stefan Weil <sw@weilnetz.de>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Message-Id: <20200101112303.20724-2-philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h       | 2 +-
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
- tcg/i386/tcg-target.h         | 2 +-
+file changed, 44 insertions(+), 21 deletions(-)
  tcg/tcg-op.h                  | 2 +-
  tcg/tcg.h                     | 4 ++--
  accel/tcg/cpu-exec.c          | 2 +-
  accel/tcg/tcg-runtime-gvec.c  | 2 +-
  accel/tcg/tcg-runtime.c       | 2 +-
  accel/tcg/translate-all.c     | 2 +-
  accel/tcg/user-exec.c         | 2 +-
  bsd-user/main.c               | 2 +-
  cpus.c                        | 2 +-
  exec.c                        | 2 +-
  linux-user/main.c             | 2 +-
  linux-user/syscall.c          | 2 +-
  target/alpha/translate.c      | 2 +-
  target/arm/helper-a64.c       | 2 +-
  target/arm/sve_helper.c       | 2 +-
  target/arm/translate-a64.c    | 4 ++--
  target/arm/translate-sve.c    | 6 +++---
  target/arm/translate.c        | 4 ++--
  target/cris/translate.c       | 2 +-
  target/hppa/translate.c       | 2 +-
  target/i386/mem_helper.c      | 2 +-
  target/i386/translate.c       | 2 +-
  target/lm32/translate.c       | 2 +-
  target/m68k/translate.c       | 2 +-
  target/microblaze/translate.c | 2 +-
  target/mips/translate.c       | 2 +-
  target/moxie/translate.c      | 2 +-
  target/nios2/translate.c      | 2 +-
  target/openrisc/translate.c   | 2 +-
  target/ppc/mem_helper.c       | 2 +-
  target/ppc/translate.c        | 4 ++--
  target/riscv/cpu_helper.c     | 2 +-
  target/riscv/translate.c      | 2 +-
  target/s390x/mem_helper.c     | 2 +-
  target/s390x/translate.c      | 4 ++--
  target/sh4/translate.c        | 2 +-
  target/sparc/ldst_helper.c    | 2 +-
  target/sparc/translate.c      | 2 +-
  target/tilegx/translate.c     | 2 +-
  target/tricore/translate.c    | 2 +-
  target/unicore32/translate.c  | 2 +-
  target/xtensa/translate.c     | 2 +-
  tcg/optimize.c                | 2 +-
  tcg/tcg-common.c              | 2 +-
  tcg/tcg-op-gvec.c             | 8 ++++----
  tcg/tcg-op-vec.c              | 6 +++---
  tcg/tcg-op.c                  | 6 +++---
  tcg/tcg.c                     | 2 +-
  tcg/tci.c                     | 2 +-
 files changed, 65 insertions(+), 65 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
-+++ b/include/exec/cpu_ldst.h
-@@ -XXX,XX +XXX,XX @@ static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
- #else
- /* Needed for TCG_OVERSIZED_GUEST */
--#include "tcg.h"
-+#include "tcg/tcg.h"
- static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
- {
-diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/i386/tcg-target.h
-+++ b/tcg/i386/tcg-target.h
-@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
-  * The x86 has a pretty strong memory ordering which only really
-  * allows for some stores to be re-ordered after loads.
-  */
--#include "tcg-mo.h"
-+#include "tcg/tcg-mo.h"
- #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
-diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-op.h
-+++ b/tcg/tcg-op.h
-@@ -XXX,XX +XXX,XX @@
- #ifndef TCG_TCG_OP_H
- #define TCG_TCG_OP_H
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "exec/helper-proto.h"
- #include "exec/helper-gen.h"
-diff --git a/tcg/tcg.h b/tcg/tcg.h
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.h
-+++ b/tcg/tcg.h
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/bitops.h"
- #include "qemu/plugin.h"
- #include "qemu/queue.h"
--#include "tcg-mo.h"
-+#include "tcg/tcg-mo.h"
- #include "tcg-target.h"
- #include "qemu/int128.h"
-@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
- typedef enum TCGOpcode {
- #define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name,
--#include "tcg-opc.h"
-+#include "tcg/tcg-opc.h"
- #undef DEF
-     NB_OPS,
- } TCGOpcode;
-diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cpu-exec.c
-+++ b/accel/tcg/cpu-exec.c
-@@ -XXX,XX +XXX,XX @@
- #include "trace.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "qemu/atomic.h"
- #include "sysemu/qtest.h"
- #include "qemu/timer.h"
-diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-runtime-gvec.c
-+++ b/accel/tcg/tcg-runtime-gvec.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/host-utils.h"
- #include "cpu.h"
- #include "exec/helper-proto.h"
--#include "tcg-gvec-desc.h"
-+#include "tcg/tcg-gvec-desc.h"
- /* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
-diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-runtime.c
-+++ b/accel/tcg/tcg-runtime.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/tb-lookup.h"
- #include "disas/disas.h"
- #include "exec/log.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- /* 32-bit helpers */
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.c
-+++ b/accel/tcg/translate-all.c
-@@ -XXX,XX +XXX,XX @@
- #include "trace.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #if defined(CONFIG_USER_ONLY)
- #include "qemu.h"
- #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
-diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/user-exec.c
-+++ b/accel/tcg/user-exec.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "qemu/bitops.h"
- #include "exec/cpu_ldst.h"
- #include "translate-all.h"
-diff --git a/bsd-user/main.c b/bsd-user/main.c
-index XXXXXXX..XXXXXXX 100644
---- a/bsd-user/main.c
-+++ b/bsd-user/main.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/module.h"
- #include "cpu.h"
- #include "exec/exec-all.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "qemu/timer.h"
- #include "qemu/envlist.h"
- #include "exec/log.h"
-diff --git a/cpus.c b/cpus.c
-index XXXXXXX..XXXXXXX 100644
---- a/cpus.c
-+++ b/cpus.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/bitmap.h"
- #include "qemu/seqlock.h"
- #include "qemu/guest-random.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "hw/nmi.h"
- #include "sysemu/replay.h"
- #include "sysemu/runstate.h"
-diff --git a/exec.c b/exec.c
-index XXXXXXX..XXXXXXX 100644
---- a/exec.c
-+++ b/exec.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "exec/exec-all.h"
- #include "exec/target_page.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "hw/qdev-core.h"
- #include "hw/qdev-properties.h"
- #if !defined(CONFIG_USER_ONLY)
-diff --git a/linux-user/main.c b/linux-user/main.c
-index XXXXXXX..XXXXXXX 100644
---- a/linux-user/main.c
-+++ b/linux-user/main.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/plugin.h"
- #include "cpu.h"
- #include "exec/exec-all.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "qemu/timer.h"
- #include "qemu/envlist.h"
- #include "qemu/guest-random.h"
-diff --git a/linux-user/syscall.c b/linux-user/syscall.c
-index XXXXXXX..XXXXXXX 100644
---- a/linux-user/syscall.c
-+++ b/linux-user/syscall.c
-@@ -XXX,XX +XXX,XX @@
- #include "user/syscall-trace.h"
- #include "qapi/error.h"
- #include "fd-trans.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #ifndef CLONE_IO
- #define CLONE_IO                0x80000000      /* Clone io context */
-diff --git a/target/alpha/translate.c b/target/alpha/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/alpha/translate.c
-+++ b/target/alpha/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "disas/disas.h"
- #include "qemu/host-utils.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "exec/helper-proto.h"
- #include "exec/helper-gen.h"
-diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/helper-a64.c
-+++ b/target/arm/helper-a64.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/cpu_ldst.h"
- #include "qemu/int128.h"
- #include "qemu/atomic128.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "fpu/softfloat.h"
- #include <zlib.h> /* For crc32 */
-diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/sve_helper.c
-+++ b/target/arm/sve_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/helper-proto.h"
- #include "tcg/tcg-gvec-desc.h"
- #include "fpu/softfloat.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- /* Note that vector data is stored in host-endian 64-bit chunks,
-diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/translate-a64.c
-+++ b/target/arm/translate-a64.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
--#include "tcg-op-gvec.h"
-+#include "tcg/tcg-op.h"
-+#include "tcg/tcg-op-gvec.h"
- #include "qemu/log.h"
- #include "arm_ldst.h"
- #include "translate.h"
-diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/translate-sve.c
-+++ b/target/arm/translate-sve.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/osdep.h"
- #include "cpu.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
--#include "tcg-op-gvec.h"
--#include "tcg-gvec-desc.h"
-+#include "tcg/tcg-op.h"
-+#include "tcg/tcg-op-gvec.h"
-+#include "tcg/tcg-gvec-desc.h"
- #include "qemu/log.h"
- #include "arm_ldst.h"
- #include "translate.h"
-diff --git a/target/arm/translate.c b/target/arm/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/translate.c
-+++ b/target/arm/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "internals.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
--#include "tcg-op-gvec.h"
-+#include "tcg/tcg-op.h"
-+#include "tcg/tcg-op-gvec.h"
- #include "qemu/log.h"
- #include "qemu/bitops.h"
- #include "arm_ldst.h"
-diff --git a/target/cris/translate.c b/target/cris/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/cris/translate.c
-+++ b/target/cris/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/helper-proto.h"
- #include "mmu.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/hppa/translate.c b/target/hppa/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/hppa/translate.c
-+++ b/target/hppa/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "disas/disas.h"
- #include "qemu/host-utils.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "exec/helper-proto.h"
- #include "exec/helper-gen.h"
-diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/i386/mem_helper.c
-+++ b/target/i386/mem_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/cpu_ldst.h"
- #include "qemu/int128.h"
- #include "qemu/atomic128.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
- {
-diff --git a/target/i386/translate.c b/target/i386/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/i386/translate.c
-+++ b/target/i386/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "exec/translator.h"
-diff --git a/target/lm32/translate.c b/target/lm32/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/lm32/translate.c
-+++ b/target/lm32/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/helper-proto.h"
- #include "exec/exec-all.h"
- #include "exec/translator.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "qemu/qemu-print.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/m68k/translate.c b/target/m68k/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/m68k/translate.c
-+++ b/target/m68k/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "qemu/log.h"
- #include "qemu/qemu-print.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/microblaze/translate.c
-+++ b/target/microblaze/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/helper-proto.h"
- #include "microblaze-decode.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/mips/translate.c b/target/mips/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/translate.c
-+++ b/target/mips/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "internal.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "hw/mips/cpudevs.h"
-diff --git a/target/moxie/translate.c b/target/moxie/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/moxie/translate.c
-+++ b/target/moxie/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "exec/exec-all.h"
- #include "disas/disas.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "qemu/qemu-print.h"
-diff --git a/target/nios2/translate.c b/target/nios2/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/nios2/translate.c
-+++ b/target/nios2/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/osdep.h"
- #include "cpu.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/exec-all.h"
- #include "disas/disas.h"
- #include "exec/helper-proto.h"
-diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/openrisc/translate.c
-+++ b/target/openrisc/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "exec/exec-all.h"
- #include "disas/disas.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "qemu/log.h"
- #include "qemu/bitops.h"
- #include "qemu/qemu-print.h"
-diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/mem_helper.c
-+++ b/target/ppc/mem_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/helper-proto.h"
- #include "helper_regs.h"
- #include "exec/cpu_ldst.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "internal.h"
- #include "qemu/atomic128.h"
-diff --git a/target/ppc/translate.c b/target/ppc/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/translate.c
-+++ b/target/ppc/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "internal.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
--#include "tcg-op-gvec.h"
-+#include "tcg/tcg-op.h"
-+#include "tcg/tcg-op-gvec.h"
- #include "qemu/host-utils.h"
- #include "qemu/main-loop.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/cpu_helper.c
-+++ b/target/riscv/cpu_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/main-loop.h"
- #include "cpu.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "trace.h"
- int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
-diff --git a/target/riscv/translate.c b/target/riscv/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/translate.c
-+++ b/target/riscv/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/osdep.h"
- #include "qemu/log.h"
- #include "cpu.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "disas/disas.h"
- #include "exec/cpu_ldst.h"
- #include "exec/exec-all.h"
-diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/mem_helper.c
-+++ b/target/s390x/mem_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/cpu_ldst.h"
- #include "qemu/int128.h"
- #include "qemu/atomic128.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #if !defined(CONFIG_USER_ONLY)
- #include "hw/s390x/storage-keys.h"
-diff --git a/target/s390x/translate.c b/target/s390x/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/translate.c
-+++ b/target/s390x/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "internal.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
--#include "tcg-op-gvec.h"
-+#include "tcg/tcg-op.h"
-+#include "tcg/tcg-op-gvec.h"
- #include "qemu/log.h"
- #include "qemu/host-utils.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/sh4/translate.c b/target/sh4/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/sh4/translate.c
-+++ b/target/sh4/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "exec/helper-proto.h"
- #include "exec/helper-gen.h"
-diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/sparc/ldst_helper.c
-+++ b/target/sparc/ldst_helper.c
-@@ -XXX,XX +XXX,XX @@
- #include "qemu/osdep.h"
- #include "cpu.h"
--#include "tcg.h"
-+#include "tcg/tcg.h"
- #include "exec/helper-proto.h"
- #include "exec/exec-all.h"
- #include "exec/cpu_ldst.h"
-diff --git a/target/sparc/translate.c b/target/sparc/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/sparc/translate.c
-+++ b/target/sparc/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "disas/disas.h"
- #include "exec/helper-proto.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "exec/helper-gen.h"
-diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/tilegx/translate.c
-+++ b/target/tilegx/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "exec/log.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "linux-user/syscall_defs.h"
-diff --git a/target/tricore/translate.c b/target/tricore/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/tricore/translate.c
-+++ b/target/tricore/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "exec/cpu_ldst.h"
- #include "qemu/qemu-print.h"
-diff --git a/target/unicore32/translate.c b/target/unicore32/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/unicore32/translate.c
-+++ b/target/unicore32/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "disas/disas.h"
- #include "exec/exec-all.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "qemu/log.h"
- #include "exec/cpu_ldst.h"
- #include "exec/translator.h"
-diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/xtensa/translate.c
-+++ b/target/xtensa/translate.c
-@@ -XXX,XX +XXX,XX @@
- #include "cpu.h"
- #include "exec/exec-all.h"
- #include "disas/disas.h"
--#include "tcg-op.h"
-+#include "tcg/tcg-op.h"
- #include "qemu/log.h"
- #include "qemu/qemu-print.h"
- #include "exec/cpu_ldst.h"
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
 @@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
--#include "tcg-op.h"
++#include "qemu/int128.h"
-+#include "tcg/tcg-op.h"
+ #include "tcg/tcg-op.h"
+ #include "tcg-internal.h"
- #define CASE_OP_32_64(x)                        \
-         glue(glue(case INDEX_op_, x), _i32):    \
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
-diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c
+     return false;
-index XXXXXXX..XXXXXXX 100644
+ }
---- a/tcg/tcg-common.c
-+++ b/tcg/tcg-common.c
+-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
-@@ -XXX,XX +XXX,XX @@ uintptr_t tci_tb_ptr;
++static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
- TCGOpDef tcg_op_defs[] = {
+ {
- #define DEF(s, oargs, iargs, cargs, flags) \
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
-          { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
+         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
--#include "tcg-opc.h"
+-        uint32_t al = arg_info(op->args[2])->val;
-+#include "tcg/tcg-opc.h"
+-        uint32_t ah = arg_info(op->args[3])->val;
- #undef DEF
+-        uint32_t bl = arg_info(op->args[4])->val;
- };
+-        uint32_t bh = arg_info(op->args[5])->val;
- const size_t tcg_op_defs_max = ARRAY_SIZE(tcg_op_defs);
+-        uint64_t a = ((uint64_t)ah << 32) | al;
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
+-        uint64_t b = ((uint64_t)bh << 32) | bl;
-index XXXXXXX..XXXXXXX 100644
++        uint64_t al = arg_info(op->args[2])->val;
---- a/tcg/tcg-op-gvec.c
++        uint64_t ah = arg_info(op->args[3])->val;
-+++ b/tcg/tcg-op-gvec.c
++        uint64_t bl = arg_info(op->args[4])->val;
-@@ -XXX,XX +XXX,XX @@
++        uint64_t bh = arg_info(op->args[5])->val;
-  */
+         TCGArg rl, rh;
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
- #include "qemu/osdep.h"
++        TCGOp *op2;
--#include "tcg.h"
--#include "tcg-op.h"
+-        if (add) {
--#include "tcg-op-gvec.h"
+-            a += b;
-+#include "tcg/tcg.h"
++        if (ctx->type == TCG_TYPE_I32) {
-+#include "tcg/tcg-op.h"
++            uint64_t a = deposit64(al, 32, 32, ah);
-+#include "tcg/tcg-op-gvec.h"
++            uint64_t b = deposit64(bl, 32, 32, bh);
- #include "qemu/main-loop.h"
++
--#include "tcg-gvec-desc.h"
++            if (add) {
-+#include "tcg/tcg-gvec-desc.h"
++                a += b;
++            } else {
- #define MAX_UNROLL  4
++                a -= b;
++            }
-diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
++
-index XXXXXXX..XXXXXXX 100644
++            al = sextract64(a, 0, 32);
---- a/tcg/tcg-op-vec.c
++            ah = sextract64(a, 32, 32);
-+++ b/tcg/tcg-op-vec.c
+         } else {
-@@ -XXX,XX +XXX,XX @@
+-            a -= b;
++            Int128 a = int128_make128(al, ah);
- #include "qemu/osdep.h"
++            Int128 b = int128_make128(bl, bh);
- #include "cpu.h"
++
--#include "tcg.h"
++            if (add) {
--#include "tcg-op.h"
++                a = int128_add(a, b);
--#include "tcg-mo.h"
++            } else {
-+#include "tcg/tcg.h"
++                a = int128_sub(a, b);
-+#include "tcg/tcg-op.h"
++            }
-+#include "tcg/tcg-mo.h"
++
++            al = int128_getlo(a);
- /* Reduce the number of ifdefs below.  This assumes that all uses of
++            ah = int128_gethi(a);
-    TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+         }
-diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
-index XXXXXXX..XXXXXXX 100644
+         rl = op->args[0];
---- a/tcg/tcg-op.c
+         rh = op->args[1];
-+++ b/tcg/tcg-op.c
+-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-@@ -XXX,XX +XXX,XX @@
+-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
- #include "qemu/osdep.h"
++
- #include "cpu.h"
++        /* The proper opcode is supplied by tcg_opt_gen_mov. */
- #include "exec/exec-all.h"
++        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
--#include "tcg.h"
++
--#include "tcg-op.h"
++        tcg_opt_gen_movi(ctx, op, rl, al);
--#include "tcg-mo.h"
++        tcg_opt_gen_movi(ctx, op2, rh, ah);
-+#include "tcg/tcg.h"
+         return true;
-+#include "tcg/tcg-op.h"
+     }
-+#include "tcg/tcg-mo.h"
+     return false;
- #include "trace-tcg.h"
+ }
- #include "trace/mem.h"
- #include "exec/plugin-gen.h"
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
++static bool fold_add2(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+ {
---- a/tcg/tcg.c
+-    return fold_addsub2_i32(ctx, op, true);
-+++ b/tcg/tcg.c
++    return fold_addsub2(ctx, op, true);
-@@ -XXX,XX +XXX,XX @@
+ }
- #include "hw/boards.h"
- #endif
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
--#include "tcg-op.h"
+     return false;
-+#include "tcg/tcg-op.h"
+ }
- #if UINTPTR_MAX == UINT32_MAX
+-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
- # define ELF_CLASS  ELFCLASS32
++static bool fold_sub2(OptContext *ctx, TCGOp *op)
-diff --git a/tcg/tci.c b/tcg/tci.c
+ {
-index XXXXXXX..XXXXXXX 100644
+-    return fold_addsub2_i32(ctx, op, false);
---- a/tcg/tci.c
++    return fold_addsub2(ctx, op, false);
-+++ b/tcg/tci.c
+ }
-@@ -XXX,XX +XXX,XX @@
- #include "qemu-common.h"
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
- #include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #include "exec/cpu_ldst.h"
+         CASE_OP_32_64_VEC(add):
--#include "tcg-op.h"
+             done = fold_add(&ctx, op);
-+#include "tcg/tcg-op.h"
+             break;
+-        case INDEX_op_add2_i32:
- /* Marker for missing code. */
+-            done = fold_add2_i32(&ctx, op);
- #define TODO() \
++        CASE_OP_32_64(add2):
 +            done = fold_add2(&ctx, op);
              break;
          CASE_OP_32_64_VEC(and):
              done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
              done = fold_sub(&ctx, op);
              break;
 -        case INDEX_op_sub2_i32:
 -            done = fold_sub2_i32(&ctx, op);
 +        CASE_OP_32_64(sub2):
 +            done = fold_sub2(&ctx, op);
              break;
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
 --
-.20.1
+.25.1

-New patch
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
+Most of these are handled by creating a fold_const2_commutative
 to handle all of the binary operators.  The rest were already
 handled on a case-by-case basis in the switch, and have their
 own fold function in which to place the call.
 We now have only one major switch on TCGOpcode.
 Introduce NO_DEST and a block comment for swap_commutative in
 order to make the handling of brcond and movcond opcodes cleaner.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 file changed, 70 insertions(+), 72 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      return -1;
  }
 +/**
 + * swap_commutative:
 + * @dest: TCGArg of the destination argument, or NO_DEST.
 + * @p1: first paired argument
 + * @p2: second paired argument
 + *
 + * If *@p1 is a constant and *@p2 is not, swap.
 + * If *@p2 matches @dest, swap.
 + * Return true if a swap was performed.
 + */
 +
 +#define NO_DEST  temp_arg(NULL)
 +
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
  {
      TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 +{
 +    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
 +    /* Note that the high and low parts may be independently swapped. */
 +    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 +    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 +
      return fold_addsub2(ctx, op, true);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      uint64_t z1, z2;
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
 +        op->args[2] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* For commutative operations make constant second argument */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 -            break;
 -        CASE_OP_32_64(brcond):
 -            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
 -                op->args[2] = tcg_swap_cond(op->args[2]);
 -            }
 -            break;
 -        CASE_OP_32_64(setcond):
 -            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 -                op->args[3] = tcg_swap_cond(op->args[3]);
 -            }
 -            break;
 -        CASE_OP_32_64(movcond):
 -            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            /* For movcond, we canonicalize the "false" input reg to match
 -               the destination reg so that the tcg backend can implement
 -               a "move if true" operation.  */
 -            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 -                op->args[5] = tcg_invert_cond(op->args[5]);
 -            }
 -            break;
 -        CASE_OP_32_64(add2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 -            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 -            break;
 -        CASE_OP_32_64(mulu2):
 -        CASE_OP_32_64(muls2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 -            break;
 -        case INDEX_op_brcond2_i32:
 -            if (swap_commutative2(&op->args[0], &op->args[2])) {
 -                op->args[4] = tcg_swap_cond(op->args[4]);
 -            }
 -            break;
 -        case INDEX_op_setcond2_i32:
 -            if (swap_commutative2(&op->args[1], &op->args[3])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Assume all bits affected, and no bits known zero. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 --
 .25.1

-[PULL 18/41] translator: Use cpu_ld*_code instead of open-coding
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-The DO_LOAD macros replicate the distinction already performed
+This "garbage" setting pre-dates the addition of the type
-by the cpu_ldst.h functions.  Use them.
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
 and INDEX_op_extr{l,h}_i64_i32.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+So now we have a definitive points at which to adjust z_mask
 to eliminate such bits from the 32-bit operands.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h   | 11 ---------
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
- include/exec/translator.h | 48 +++++++++++----------------------------
+file changed, 16 insertions(+), 19 deletions(-)
 files changed, 13 insertions(+), 46 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
- #include "exec/cpu_ldst_useronly_template.h"
+         ti->is_const = true;
- #undef MEMSUFFIX
+         ti->val = ts->val;
+         ti->z_mask = ts->val;
--/*
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-- * Code access is deprecated in favour of translator_ld* functions
+-            /* High bits of a 32-bit quantity are garbage.  */
-- * (see translator.h). However there are still users that need to
+-            ti->z_mask |= ~0xffffffffull;
-- * converted so for now these stay.
+-        }
-- */
+     } else {
- #define MEMSUFFIX _code
+         ti->is_const = false;
- #define CODE_ACCESS
+         ti->z_mask = -1;
- #define DATA_SIZE 1
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+     TCGTemp *src_ts = arg_temp(src);
- #undef CPU_MMU_INDEX
+     TempOptInfo *di;
- #undef MEMSUFFIX
+     TempOptInfo *si;
+-    uint64_t z_mask;
--/*
+     TCGOpcode new_op;
-- * Code access is deprecated in favour of translator_ld* functions
-- * (see translator.h). However there are still users that need to
+     if (ts_are_copies(dst_ts, src_ts)) {
-- * converted so for now these stay.
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-- */
+     op->args[0] = dst;
--
+     op->args[1] = src;
- #define CPU_MMU_INDEX (cpu_mmu_index(env, true))
- #define MEMSUFFIX _code
+-    z_mask = si->z_mask;
- #define SOFTMMU_CODE_ACCESS
+-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-diff --git a/include/exec/translator.h b/include/exec/translator.h
+-        /* High bits of the destination are now garbage.  */
-index XXXXXXX..XXXXXXX 100644
+-        z_mask |= ~0xffffffffull;
---- a/include/exec/translator.h
+-    }
-+++ b/include/exec/translator.h
+-    di->z_mask = z_mask;
-@@ -XXX,XX +XXX,XX @@ void translator_loop_temp_check(DisasContextBase *db);
++    di->z_mask = si->z_mask;
- /*
-  * Translator Load Functions
+     if (src_ts->type == dst_ts->type) {
-  *
+         TempOptInfo *ni = ts_info(si->next_copy);
-- * These are intended to replace the old cpu_ld*_code functions and
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-- * are mandatory for front-ends that have been migrated to the common
+ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-- * translator_loop. These functions are only intended to be called
+                              TCGArg dst, uint64_t val)
-- * from the translation stage and should not be called from helper
+ {
-- * functions. Those functions should be converted to encode the
+-    /* Convert movi to mov with constant temp. */
-- * relevant information at translation time.
+-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
-+ * These are intended to replace the direct usage of the cpu_ld*_code
++    TCGTemp *tv;
-+ * functions and are mandatory for front-ends that have been migrated
-+ * to the common translator_loop. These functions are only intended
++    if (ctx->type == TCG_TYPE_I32) {
-+ * to be called from the translation stage and should not be called
++        val = (int32_t)val;
-+ * from helper functions. Those functions should be converted to encode
++    }
-+ * the relevant information at translation time.
++
-  */
++    /* Convert movi to mov with constant temp. */
++    tv = tcg_constant_internal(ctx->type, val);
--#ifdef CONFIG_USER_ONLY
+     init_ts_info(ctx, tv);
--
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
--#define DO_LOAD(type, name, shift)               \
+ }
--    do {                                         \
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
--        set_helper_retaddr(1);                   \
+     uint64_t z_mask = ctx->z_mask;
--        ret = name ## _p(g2h(pc));               \
--        clear_helper_retaddr();                  \
+     /*
--    } while (0)
+-     * 32-bit ops generate 32-bit results.  For the result is zero test
--
+-     * below, we can ignore high bits, but for further optimizations we
--#else
+-     * need to record that the high bits contain garbage.
--
++     * 32-bit ops generate 32-bit results, which for the purpose of
--#define DO_LOAD(type, name, shift)                          \
++     * simplifying tcg are sign-extended.  Certainly that's how we
--    do {                                                    \
++     * represent our constants elsewhere.  Note that the bits will
--        int mmu_idx = cpu_mmu_index(env, true);             \
++     * be reset properly for a 64-bit value when encountering the
--        TCGMemOpIdx oi = make_memop_idx(shift, mmu_idx);    \
++     * type changing opcodes.
--        ret = helper_ret_ ## name ## _cmmu(env, pc, oi, 0); \
+      */
--    } while (0)
+     if (ctx->type == TCG_TYPE_I32) {
--
+-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
--#endif
+-        a_mask &= MAKE_64BIT_MASK(0, 32);
--
+-        z_mask &= MAKE_64BIT_MASK(0, 32);
--#define GEN_TRANSLATOR_LD(fullname, name, type, shift, swap_fn)         \
++        a_mask = (int32_t)a_mask;
-+#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn)             \
++        z_mask = (int32_t)z_mask;
-     static inline type                                                  \
++        ctx->z_mask = z_mask;
      fullname ## _swap(CPUArchState *env, abi_ptr pc, bool do_swap)      \
      {                                                                   \
 -        type ret;                                                       \
 -        DO_LOAD(type, name, shift);                                     \
 -                                                                        \
 +        type ret = load_fn(env, pc);                                    \
          if (do_swap) {                                                  \
              ret = swap_fn(ret);                                         \
          }                                                               \
@@ -XXX,XX +XXX,XX @@ void translator_loop_temp_check(DisasContextBase *db);
          return fullname ## _swap(env, pc, false);                       \
      }
--GEN_TRANSLATOR_LD(translator_ldub, ldub, uint8_t, 0, /* no swap */ )
+     if (z_mask == 0) {
 -GEN_TRANSLATOR_LD(translator_ldsw, ldsw, int16_t, 1, bswap16)
 -GEN_TRANSLATOR_LD(translator_lduw, lduw, uint16_t, 1, bswap16)
 -GEN_TRANSLATOR_LD(translator_ldl, ldl, uint32_t, 2, bswap32)
 -GEN_TRANSLATOR_LD(translator_ldq, ldq, uint64_t, 3, bswap64)
 +GEN_TRANSLATOR_LD(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */)
 +GEN_TRANSLATOR_LD(translator_ldsw, int16_t, cpu_ldsw_code, bswap16)
 +GEN_TRANSLATOR_LD(translator_lduw, uint16_t, cpu_lduw_code, bswap16)
 +GEN_TRANSLATOR_LD(translator_ldl, uint32_t, cpu_ldl_code, bswap32)
 +GEN_TRANSLATOR_LD(translator_ldq, uint64_t, cpu_ldq_code, bswap64)
  #undef GEN_TRANSLATOR_LD
  #endif  /* EXEC__TRANSLATOR_H */
 --
-.20.1
+.25.1

-[PULL 11/41] target/s390x: Include tcg.h in mem_helper.c
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-Code movement in an upcoming patch will show that this file
+Recognize the constant function for or-complement.
 was implicitly depending on tcg.h being included indirectly.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: David Hildenbrand <david@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/s390x/mem_helper.c | 1 +
+ tcg/optimize.c | 1 +
 file changed, 1 insertion(+)
-diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/mem_helper.c
+--- a/tcg/optimize.c
-+++ b/target/s390x/mem_helper.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
- #include "exec/cpu_ldst.h"
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
- #include "qemu/int128.h"
+ {
- #include "qemu/atomic128.h"
+     if (fold_const2(ctx, op) ||
-+#include "tcg.h"
++        fold_xx_to_i(ctx, op, -1) ||
+         fold_xi_to_x(ctx, op, -1) ||
- #if !defined(CONFIG_USER_ONLY)
+         fold_ix_to_not(ctx, op, 0)) {
- #include "hw/s390x/storage-keys.h"
+         return true;
 --
-.20.1
+.25.1

-[PULL 09/41] cputlb: Use trace_mem_get_info instead of trace_mem_build_info
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
-In the cpu_ldst templates, we already require a MemOp, and it
+Recognize the identity function for low-part multiply.
 is cleaner and clearer to pass that instead of 3 separate
 arguments describing the memory operation.
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst_template.h          | 22 +++++++++++-----------
+ tcg/optimize.c | 3 ++-
- include/exec/cpu_ldst_useronly_template.h | 12 ++++++------
+file changed, 2 insertions(+), 1 deletion(-)
 files changed, 17 insertions(+), 17 deletions(-)
-diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst_template.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst_template.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-     RES_TYPE res;
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
      target_ulong addr;
      int mmu_idx = CPU_MMU_INDEX;
 -    TCGMemOpIdx oi;
 +    MemOp op = MO_TE | SHIFT;
  #if !defined(SOFTMMU_CODE_ACCESS)
 -    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false, mmu_idx);
 +    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
      trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
  #endif
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
      entry = tlb_entry(env, mmu_idx, addr);
      if (unlikely(entry->ADDR_READ !=
                   (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        oi = make_memop_idx(SHIFT, mmu_idx);
 +        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
          res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
 -                                                            oi, retaddr);
 +                                                               oi, retaddr);
      } else {
          uintptr_t hostaddr = addr + entry->addend;
          res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
      int res;
      target_ulong addr;
      int mmu_idx = CPU_MMU_INDEX;
 -    TCGMemOpIdx oi;
 -#if !defined(SOFTMMU_CODE_ACCESS)
 -    uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false, mmu_idx);
 +    MemOp op = MO_TE | MO_SIGN | SHIFT;
 +#ifndef SOFTMMU_CODE_ACCESS
 +    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
      trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
  #endif
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
      entry = tlb_entry(env, mmu_idx, addr);
      if (unlikely(entry->ADDR_READ !=
                   (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        oi = make_memop_idx(SHIFT, mmu_idx);
 +        TCGMemOpIdx oi = make_memop_idx(op & ~MO_SIGN, mmu_idx);
          res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
                                 MMUSUFFIX)(env, addr, oi, retaddr);
      } else {
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
      CPUTLBEntry *entry;
      target_ulong addr;
      int mmu_idx = CPU_MMU_INDEX;
 -    TCGMemOpIdx oi;
 +    MemOp op = MO_TE | SHIFT;
  #if !defined(SOFTMMU_CODE_ACCESS)
 -    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true, mmu_idx);
 +    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, true);
      trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
  #endif
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
      entry = tlb_entry(env, mmu_idx, addr);
      if (unlikely(tlb_addr_write(entry) !=
                   (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
 -        oi = make_memop_idx(SHIFT, mmu_idx);
 +        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
          glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
                                                       retaddr);
      } else {
 diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/cpu_ldst_useronly_template.h
 +++ b/include/exec/cpu_ldst_useronly_template.h
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
      ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
      clear_helper_retaddr();
  #else
 -    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false,
 -                                            MMU_USER_IDX);
 +    MemOp op = MO_TE | SHIFT;
 +    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
      trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
      ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
  #endif
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
      ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
      clear_helper_retaddr();
  #else
 -    uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false,
 -                                            MMU_USER_IDX);
 +    MemOp op = MO_TE | MO_SIGN | SHIFT;
 +    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
      trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
      ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
      qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
@@ -XXX,XX +XXX,XX @@ static inline void
  glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr,
                                        RES_TYPE v)
  {
--    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true,
+     if (fold_const2(ctx, op) ||
--                                            MMU_USER_IDX);
+-        fold_xi_to_i(ctx, op, 0)) {
-+    MemOp op = MO_TE | SHIFT;
++        fold_xi_to_i(ctx, op, 0) ||
-+    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, true);
++        fold_xi_to_x(ctx, op, 1)) {
-     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+         return true;
-     glue(glue(st, SUFFIX), _p)(g2h(ptr), v);
+     }
-     qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+     return false;
 --
-.20.1
+.25.1

-[PULL 04/41] configure: Always detect -no-pie toolchain support
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
-The CFLAGS_NOPIE and LDFLAGS_NOPIE variables are used
+Recognize the identity function for division.
 in pc-bios/optionrom/Makefile, which has nothing to do
 with the PIE setting of the main qemu executables.
-This overrides any operating system default to build
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-all executables as PIE, which is important for ROMs.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- configure | 18 ++++++++----------
+ tcg/optimize.c | 6 +++++-
-file changed, 8 insertions(+), 10 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ if ! compile_prog "-Werror" "" ; then
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
-     "Thread-Local Storage (TLS). Please upgrade to a version that does."
- fi
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+ {
--if test "$pie" != "no" ; then
+-    return fold_const2(ctx, op);
--  cat > $TMPC << EOF
++    if (fold_const2(ctx, op) ||
-+cat > $TMPC << EOF
++        fold_xi_to_x(ctx, op, 1)) {
++        return true;
- #ifdef __linux__
++    }
- #  define THREAD __thread
++    return false;
- #else
+ }
- #  define THREAD
- #endif
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
 -
  static THREAD int tls_var;
 -
  int main(void) { return tls_var; }
 -
  EOF
 -  # check we support --no-pie first...
 -  if compile_prog "-Werror -fno-pie" "-no-pie"; then
 -    CFLAGS_NOPIE="-fno-pie"
 -    LDFLAGS_NOPIE="-nopie"
 -  fi
 +# Check we support --no-pie first; we will need this for building ROMs.
 +if compile_prog "-Werror -fno-pie" "-no-pie"; then
 +  CFLAGS_NOPIE="-fno-pie"
 +  LDFLAGS_NOPIE="-no-pie"
 +fi
 +
 +if test "$pie" != "no" ; then
    if compile_prog "-fPIE -DPIE" "-pie"; then
      QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
      LDFLAGS="-pie $LDFLAGS"
 --
-.20.1
+.25.1

-[PULL 08/41] target/xtensa: Use probe_access for itlb_hit_test
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-We don't actually need the result of the read, only to probe that the
+Recognize the constant function for remainder.
 memory mapping exists.  This is exactly what probe_access does.
-This is also the only user of any cpu_ld*_code_ra function.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-Removing this allows the interface to be removed shortly.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Acked-by: Max Filippov <jcmvbkbc@gmail.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/xtensa/mmu_helper.c | 5 +++--
+ tcg/optimize.c | 6 +++++-
-file changed, 3 insertions(+), 2 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/target/xtensa/mmu_helper.c b/target/xtensa/mmu_helper.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/xtensa/mmu_helper.c
+--- a/tcg/optimize.c
-+++ b/target/xtensa/mmu_helper.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
- void HELPER(itlb_hit_test)(CPUXtensaState *env, uint32_t vaddr)
  static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
-     /*
+-    return fold_const2(ctx, op);
--     * Attempt the memory load; we don't care about the result but
++    if (fold_const2(ctx, op) ||
-+     * Probe the memory; we don't care about the result but
++        fold_xx_to_i(ctx, op, 0)) {
-      * only the side-effects (ie any MMU or other exception)
++        return true;
-      */
++    }
--    cpu_ldub_code_ra(env, vaddr, GETPC());
++    return false;
 +    probe_access(env, vaddr, 1, MMU_INST_FETCH,
 +                 cpu_mmu_index(env, true), GETPC());
  }
- void HELPER(wsr_rasid)(CPUXtensaState *env, uint32_t v)
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 --
-.20.1
+.25.1

-New patch
+[PULL 52/56] tcg/optimize: Optimize sign extensions
+Certain targets, like riscv, produce signed 32-bit results.
 This can lead to lots of redundant extensions as values are
 manipulated.
 Begin by tracking only the obvious sign-extensions, and
 converting them to simple copies when possible.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 file changed, 102 insertions(+), 21 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
      TCGTemp *next_copy;
      uint64_t val;
      uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 +    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
  } TempOptInfo;
  typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
 +    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
 +/* Calculate the smask for a specific value. */
 +static uint64_t smask_from_value(uint64_t value)
 +{
 +    int rep = clrsb64(value);
 +    return ~(~0ull >> rep);
 +}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
 +    rep -= 1;
 +    return ~(~0ull >> rep);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
 +        ti->s_mask = 0;
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[1] = src;
      di->z_mask = si->z_mask;
 +    di->s_mask = si->s_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
 -        reset_temp(op->args[i]);
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        reset_ts(ts);
          /*
 -         * Save the corresponding known-zero bits mask for the
 +         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
 +    uint64_t s_mask = ctx->s_mask;
      /*
       * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      if (ctx->type == TCG_TYPE_I32) {
          a_mask = (int32_t)a_mask;
          z_mask = (int32_t)z_mask;
 +        s_mask |= MAKE_64BIT_MASK(32, 32);
          ctx->z_mask = z_mask;
 +        ctx->s_mask = s_mask;
      }
      if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask, sign;
 +    uint64_t z_mask, s_mask, sign;
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
      z_mask = arg_info(op->args[1])->z_mask;
 +
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
 +    s_mask = smask_from_zmask(z_mask);
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
 +            s_mask = sign << 1;
          }
          break;
      default:
          /* The high bits are undefined: force all bits above the sign to 1. */
          z_mask |= sign << 1;
 +        s_mask = 0;
          break;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask_old, z_mask;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = extract64(t, op->args[2], op->args[3]);
 +        t = extract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0) {
 +    z_mask = extract64(z_mask_old, pos, len);
 +    if (pos == 0) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask_old, z_mask, sign;
 +    uint64_t s_mask_old, s_mask, z_mask, sign;
      bool type_change = false;
      if (fold_const1(ctx, op)) {
          return true;
      }
 -    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    s_mask_old = s_mask;
      switch (op->opc) {
      CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
      if (z_mask & sign) {
          z_mask |= sign;
 -    } else if (!type_change) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
      }
 +    s_mask |= sign << 1;
 +
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
 +    if (!type_change) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      if (!type_change) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
 -    if (!(mop & MO_SIGN) && width < 64) {
 -        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    if (width < 64) {
 +        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
 +        if (!(mop & MO_SIGN)) {
 +            ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +            ctx->s_mask <<= 1;
 +        }
      }
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 -    int64_t z_mask_old, z_mask;
 +    uint64_t z_mask, s_mask, s_mask_old;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = sextract64(t, op->args[2], op->args[3]);
 +        t = sextract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0 && z_mask >= 0) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
 -    }
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask, pos, len);
      ctx->z_mask = z_mask;
 +    s_mask_old = arg_info(op->args[1])->s_mask;
 +    s_mask = sextract64(s_mask_old, pos, len);
 +    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
 +    ctx->s_mask = s_mask;
 +
 +    if (pos == 0) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
 +
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  {
      /* We can't do any folding with a load, but we can record bits. */
      switch (op->opc) {
 +    CASE_OP_32_64(ld8s):
 +        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
 +        break;
      CASE_OP_32_64(ld8u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
 +        break;
 +    CASE_OP_32_64(ld16s):
 +        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
          break;
      CASE_OP_32_64(ld16u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
 +        break;
 +    case INDEX_op_ld32s_i64:
 +        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
          break;
      case INDEX_op_ld32u_i64:
          ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
          break;
      default:
          g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* Assume all bits affected, and no bits known zero. */
 +        /* Assume all bits affected, no bits known zero, no sign reps. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 +        ctx.s_mask = 0;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8s):
          CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16s):
          CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32s_i64:
          case INDEX_op_ld32u_i64:
              done = fold_tcg_ld(&ctx, op);
              break;
 --
 .25.1

-[PULL 19/41] cputlb: Rename helper_ret_ld*_cmmu to cpu_ld*_code
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-There are no uses of the *_cmmu names other than the bare wrapping
+Sign repetitions are perforce all identical, whether they are 1 or 0.
-within the *_code inlines.  Therefore rename the functions so we
+Bitwise operations preserve the relative quantity of the repetitions.
 can drop the inlines.
-Use abi_ptr instead of target_ulong in preparation for user-only;
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-the two types are identical for softmmu.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h          | 29 ++++------
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
- include/exec/cpu_ldst_template.h | 21 -------
+file changed, 29 insertions(+)
  tcg/tcg.h                        | 29 ----------
  accel/tcg/cputlb.c               | 94 ++++++++------------------------
  docs/devel/loads-stores.rst      |  4 +-
 files changed, 36 insertions(+), 141 deletions(-)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu_ldst.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
- #undef CPU_MMU_INDEX
+     z2 = arg_info(op->args[2])->z_mask;
- #undef MEMSUFFIX
+     ctx->z_mask = z1 & z2;
--#define CPU_MMU_INDEX (cpu_mmu_index(env, true))
++    /*
--#define MEMSUFFIX _code
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
--#define SOFTMMU_CODE_ACCESS
++     * Bitwise operations preserve the relative quantity of the repetitions.
-+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
++     */
-+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
++                & arg_info(op->args[2])->s_mask;
-+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr);
++
+     /*
--#define DATA_SIZE 1
+      * Known-zeros does not imply known-ones.  Therefore unless
--#include "exec/cpu_ldst_template.h"
+      * arg2 is constant, we can't infer affected bits from it.
-+static inline int cpu_ldsb_code(CPUArchState *env, abi_ptr addr)
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+{
+     }
-+    return (int8_t)cpu_ldub_code(env, addr);
+     ctx->z_mask = z1;
-+}
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--#define DATA_SIZE 2
++                & arg_info(op->args[2])->s_mask;
--#include "exec/cpu_ldst_template.h"
+     return fold_masks(ctx, op);
 -
 -#define DATA_SIZE 4
 -#include "exec/cpu_ldst_template.h"
 -
 -#define DATA_SIZE 8
 -#include "exec/cpu_ldst_template.h"
 -
 -#undef CPU_MMU_INDEX
 -#undef MEMSUFFIX
 -#undef SOFTMMU_CODE_ACCESS
 +static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr)
 +{
 +    return (int16_t)cpu_lduw_code(env, addr);
 +}
  #endif /* defined(CONFIG_USER_ONLY) */
 diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/cpu_ldst_template.h
 +++ b/include/exec/cpu_ldst_template.h
@@ -XXX,XX +XXX,XX @@
  /* generic load/store macros */
 -#ifdef SOFTMMU_CODE_ACCESS
 -
 -static inline RES_TYPE
 -glue(glue(cpu_ld, USUFFIX), _code)(CPUArchState *env, target_ulong ptr)
 -{
 -    TCGMemOpIdx oi = make_memop_idx(MO_TE | SHIFT, CPU_MMU_INDEX);
 -    return glue(glue(helper_ret_ld, USUFFIX), _cmmu)(env, ptr, oi, 0);
 -}
 -
 -#if DATA_SIZE <= 2
 -static inline int
 -glue(glue(cpu_lds, SUFFIX), _code)(CPUArchState *env, target_ulong ptr)
 -{
 -    return (DATA_STYPE)glue(glue(cpu_ld, USUFFIX), _code)(env, ptr);
 -}
 -#endif
 -
 -#else
 -
  static inline RES_TYPE
  glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                    target_ulong ptr,
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
      glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
  }
--#endif /* !SOFTMMU_CODE_ACCESS */
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
--
+         fold_xi_to_not(ctx, op, 0)) {
- #undef RES_TYPE
+         return true;
- #undef DATA_TYPE
+     }
- #undef DATA_STYPE
++
-diff --git a/tcg/tcg.h b/tcg/tcg.h
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-index XXXXXXX..XXXXXXX 100644
++                & arg_info(op->args[2])->s_mask;
---- a/tcg/tcg.h
+     return false;
 +++ b/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
  void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
                         TCGMemOpIdx oi, uintptr_t retaddr);
 -uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
 -                             TCGMemOpIdx oi, uintptr_t retaddr);
 -int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
 -                             TCGMemOpIdx oi, uintptr_t retaddr);
 -uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
 -                             TCGMemOpIdx oi, uintptr_t retaddr);
 -int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
 -                             TCGMemOpIdx oi, uintptr_t retaddr);
 -uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
 -                            TCGMemOpIdx oi, uintptr_t retaddr);
 -
  /* Temporary aliases until backends are converted.  */
  #ifdef TARGET_WORDS_BIGENDIAN
  # define helper_ret_ldsw_mmu  helper_be_ldsw_mmu
@@ -XXX,XX +XXX,XX @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
  # define helper_ret_stw_mmu   helper_be_stw_mmu
  # define helper_ret_stl_mmu   helper_be_stl_mmu
  # define helper_ret_stq_mmu   helper_be_stq_mmu
 -# define helper_ret_lduw_cmmu  helper_be_lduw_cmmu
 -# define helper_ret_ldsw_cmmu  helper_be_ldsw_cmmu
 -# define helper_ret_ldl_cmmu  helper_be_ldl_cmmu
 -# define helper_ret_ldq_cmmu  helper_be_ldq_cmmu
  #else
  # define helper_ret_ldsw_mmu  helper_le_ldsw_mmu
  # define helper_ret_lduw_mmu  helper_le_lduw_mmu
@@ -XXX,XX +XXX,XX @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
  # define helper_ret_stw_mmu   helper_le_stw_mmu
  # define helper_ret_stl_mmu   helper_le_stl_mmu
  # define helper_ret_stq_mmu   helper_le_stq_mmu
 -# define helper_ret_lduw_cmmu  helper_le_lduw_cmmu
 -# define helper_ret_ldsw_cmmu  helper_le_ldsw_cmmu
 -# define helper_ret_ldl_cmmu  helper_le_ldl_cmmu
 -# define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
  #endif
  uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
  /* Code access functions.  */
 -static uint64_t full_ldub_cmmu(CPUArchState *env, target_ulong addr,
 +static uint64_t full_ldub_code(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr)
  {
 -    return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_cmmu);
 +    return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_code);
  }
--uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
--                            TCGMemOpIdx oi, uintptr_t retaddr)
-+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr)
+     ctx->z_mask = arg_info(op->args[3])->z_mask
- {
+                 | arg_info(op->args[4])->z_mask;
--    return full_ldub_cmmu(env, addr, oi, retaddr);
++    ctx->s_mask = arg_info(op->args[3])->s_mask
-+    TCGMemOpIdx oi = make_memop_idx(MO_UB, cpu_mmu_index(env, true));
++                & arg_info(op->args[4])->s_mask;
-+    return full_ldub_code(env, addr, oi, 0);
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 +
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
--int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
--                            TCGMemOpIdx oi, uintptr_t retaddr)
+         fold_xi_to_not(ctx, op, 0)) {
-+static uint64_t full_lduw_code(CPUArchState *env, target_ulong addr,
+         return true;
-+                               TCGMemOpIdx oi, uintptr_t retaddr)
+     }
- {
++
--    return (int8_t) full_ldub_cmmu(env, addr, oi, retaddr);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+    return load_helper(env, addr, oi, retaddr, MO_TEUW, true, full_lduw_code);
++                & arg_info(op->args[2])->s_mask;
      return false;
  }
--static uint64_t full_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
--                                  TCGMemOpIdx oi, uintptr_t retaddr)
+         return true;
-+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr)
+     }
- {
--    return load_helper(env, addr, oi, retaddr, MO_LEUW, true,
++    ctx->s_mask = arg_info(op->args[1])->s_mask;
--                       full_le_lduw_cmmu);
++
-+    TCGMemOpIdx oi = make_memop_idx(MO_TEUW, cpu_mmu_index(env, true));
+     /* Because of fold_to_not, we want to always return true, via finish. */
-+    return full_lduw_code(env, addr, oi, 0);
+     finish_folding(ctx, op);
      return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
--uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
--                            TCGMemOpIdx oi, uintptr_t retaddr)
+         fold_ix_to_not(ctx, op, 0)) {
-+static uint64_t full_ldl_code(CPUArchState *env, target_ulong addr,
+         return true;
-+                              TCGMemOpIdx oi, uintptr_t retaddr)
+     }
- {
++
--    return full_le_lduw_cmmu(env, addr, oi, retaddr);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+    return load_helper(env, addr, oi, retaddr, MO_TEUL, true, full_ldl_code);
++                & arg_info(op->args[2])->s_mask;
      return false;
  }
--int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
--                            TCGMemOpIdx oi, uintptr_t retaddr)
-+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr)
+     ctx->z_mask = arg_info(op->args[1])->z_mask
- {
+                 | arg_info(op->args[2])->z_mask;
--    return (int16_t) full_le_lduw_cmmu(env, addr, oi, retaddr);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+    TCGMemOpIdx oi = make_memop_idx(MO_TEUL, cpu_mmu_index(env, true));
++                & arg_info(op->args[2])->s_mask;
-+    return full_ldl_code(env, addr, oi, 0);
+     return fold_masks(ctx, op);
  }
--static uint64_t full_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
--                                  TCGMemOpIdx oi, uintptr_t retaddr)
-+static uint64_t full_ldq_code(CPUArchState *env, target_ulong addr,
-+                              TCGMemOpIdx oi, uintptr_t retaddr)
- {
--    return load_helper(env, addr, oi, retaddr, MO_BEUW, true,
--                       full_be_lduw_cmmu);
-+    return load_helper(env, addr, oi, retaddr, MO_TEQ, true, full_ldq_code);
- }
--uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
-+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
- {
--    return full_be_lduw_cmmu(env, addr, oi, retaddr);
--}
--
--int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return (int16_t) full_be_lduw_cmmu(env, addr, oi, retaddr);
--}
--
--static uint64_t full_le_ldul_cmmu(CPUArchState *env, target_ulong addr,
--                                  TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return load_helper(env, addr, oi, retaddr, MO_LEUL, true,
--                       full_le_ldul_cmmu);
--}
--
--uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return full_le_ldul_cmmu(env, addr, oi, retaddr);
--}
--
--static uint64_t full_be_ldul_cmmu(CPUArchState *env, target_ulong addr,
--                                  TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return load_helper(env, addr, oi, retaddr, MO_BEUL, true,
--                       full_be_ldul_cmmu);
--}
--
--uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return full_be_ldul_cmmu(env, addr, oi, retaddr);
--}
--
--uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return load_helper(env, addr, oi, retaddr, MO_LEQ, true,
--                       helper_le_ldq_cmmu);
--}
--
--uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
--                            TCGMemOpIdx oi, uintptr_t retaddr)
--{
--    return load_helper(env, addr, oi, retaddr, MO_BEQ, true,
--                       helper_be_ldq_cmmu);
-+    TCGMemOpIdx oi = make_memop_idx(MO_TEQ, cpu_mmu_index(env, true));
-+    return full_ldq_code(env, addr, oi, 0);
- }
-diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
-index XXXXXXX..XXXXXXX 100644
---- a/docs/devel/loads-stores.rst
-+++ b/docs/devel/loads-stores.rst
-@@ -XXX,XX +XXX,XX @@ more in line with the other memory access functions.
- load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
--load (code): ``helper_{endian}_ld{sign}{size}_cmmu(env, addr, opindex, retaddr)``
--
- store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
- ``sign``
-@@ -XXX,XX +XXX,XX @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
-  - ``ret`` : target endianness
- Regexes for git grep
-- - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_c\?mmu\>``
-+ - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_mmu\>``
-  - ``\<helper_\(le\|be\|ret\)_st[bwlq]_mmu\>``
- ``address_space_*``
 --
-.20.1
+.25.1

-[PULL 02/41] tcg: Remove softmmu code_gen_buffer fixed address
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-The commentary talks about "in concert with the addresses
+The result is either 0 or 1, which means that we have
-assigned in the relevant linker script", except there is no
+a 2 bit signed result, and thus 62 bits of sign.
-linker script for softmmu, nor has there been for some time.
+For clarity, use the smask_from_zmask function.
 (Do not confuse the user-only linker script editing that was
 removed in the previous patch, because user-only does not
 use this code_gen_buffer allocation method.)
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Thomas Huth <thuth@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/translate-all.c | 37 +++++--------------------------------
+ tcg/optimize.c | 2 ++
-file changed, 5 insertions(+), 32 deletions(-)
+file changed, 2 insertions(+)
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/translate-all.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void *alloc_code_gen_buffer(void)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      int prot = PROT_WRITE | PROT_READ | PROT_EXEC;
      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
 -    uintptr_t start = 0;
      size_t size = tcg_ctx->code_gen_buffer_size;
      void *buf;
 -    /* Constrain the position of the buffer based on the host cpu.
 -       Note that these addresses are chosen in concert with the
 -       addresses assigned in the relevant linker script file.  */
 -# if defined(__PIE__) || defined(__PIC__)
 -    /* Don't bother setting a preferred location if we're building
 -       a position-independent executable.  We're more likely to get
 -       an address near the main executable if we let the kernel
 -       choose the address.  */
 -# elif defined(__x86_64__) && defined(MAP_32BIT)
 -    /* Force the memory down into low memory with the executable.
 -       Leave the choice of exact location with the kernel.  */
 -    flags |= MAP_32BIT;
 -    /* Cannot expect to map more than 800MB in low memory.  */
 -    if (size > 800u * 1024 * 1024) {
 -        tcg_ctx->code_gen_buffer_size = size = 800u * 1024 * 1024;
 -    }
 -# elif defined(__sparc__)
 -    start = 0x40000000ul;
 -# elif defined(__s390x__)
 -    start = 0x90000000ul;
 -# elif defined(__mips__)
 -#  if _MIPS_SIM == _ABI64
 -    start = 0x128000000ul;
 -#  else
 -    start = 0x08000000ul;
 -#  endif
 -# endif
 -
 -    buf = mmap((void *)start, size, prot, flags, -1, 0);
 +    buf = mmap(NULL, size, prot, flags, -1, 0);
      if (buf == MAP_FAILED) {
          return NULL;
      }
- #ifdef __mips__
+     ctx->z_mask = 1;
-     if (cross_256mb(buf, size)) {
++    ctx->s_mask = smask_from_zmask(1);
--        /* Try again, with the original still mapped, to avoid re-acquiring
+     return false;
--           that 256mb crossing.  This time don't specify an address.  */
+ }
-+        /*
-+         * Try again, with the original still mapped, to avoid re-acquiring
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-+         * the same 256mb crossing.
+     }
-+         */
-         size_t size2;
+     ctx->z_mask = 1;
-         void *buf2 = mmap(NULL, size, prot, flags, -1, 0);
++    ctx->s_mask = smask_from_zmask(1);
-         switch ((int)(buf2 != MAP_FAILED)) {
+     return false;
   do_setcond_const:
 --
-.20.1
+.25.1

-[PULL 23/41] target/nios2: Remove MMU_MODE{0,1}_SUFFIX
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-The functions generated by these macros are unused.
+The results are generally 6 bit unsigned values, though
 the count leading and trailing bits may produce any value
 for a zero input.
-Cc: Chris Wulff <crwulff@gmail.com>
-Cc: Marek Vasut <marex@denx.de>
-Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/nios2/cpu.h | 2 --
+ tcg/optimize.c | 3 ++-
-file changed, 2 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/target/nios2/cpu.h b/target/nios2/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/nios2/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/nios2/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void do_nios2_semihosting(CPUNios2State *env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
- #define CPU_SAVE_VERSION 1
+         g_assert_not_reached();
+     }
- /* MMU modes definitions */
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
--#define MMU_MODE0_SUFFIX _kernel
+-
--#define MMU_MODE1_SUFFIX _user
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
- #define MMU_SUPERVISOR_IDX  0
+     return false;
- #define MMU_USER_IDX        1
+ }
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
 +    ctx->s_mask = smask_from_zmask(ctx->z_mask);
      return false;
  }
 --
-.20.1
+.25.1

-[PULL 03/41] configure: Do not force pie=no for non-x86
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-PIE is supported on many other hosts besides x86.
+For constant shifts, we can simply shift the s_mask.
-The default for non-x86 is now the same as x86: pie is used
+For variable shifts, we know that sar does not reduce
-if supported, and may be forced via --enable/--disable-pie.
+the s_mask, which helps for sequences like
-The original commit (40d6444e91c) said:
+    ext32s_i64  t, in
     sar_i64     t, t, v
     ext32s_i64  out, t
-  "Non-x86 are not changed, as they require TCG changes"
+allowing the final extend to be eliminated.
-but I think that's wrong -- there's nothing about PIE that
-affects TCG one way or another.
-Tested on aarch64 (bionic) and ppc64le (centos 7) hosts.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- configure | 10 ----------
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
-file changed, 10 deletions(-)
+file changed, 47 insertions(+), 3 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ if ! compile_prog "-Werror" "" ; then
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
-     "Thread-Local Storage (TLS). Please upgrade to a version that does."
+     return ~(~0ull >> rep);
- fi
+ }
--if test "$pie" = ""; then
++/*
--  case "$cpu-$targetos" in
++ * Recreate a properly left-aligned smask after manipulation.
--    i386-Linux|x86_64-Linux|x32-Linux|i386-OpenBSD|x86_64-OpenBSD)
++ * Some bit-shuffling, particularly shifts and rotates, may
--      ;;
++ * retain sign bits on the left, but may scatter disconnected
--    *)
++ * sign bits on the right.  Retain only what remains to the left.
--      pie="no"
++ */
--      ;;
++static uint64_t smask_from_smask(int64_t smask)
--  esac
++{
--fi
++    /* Only the 1 bits are significant for smask */
--
++    return smask_from_zmask(~smask);
- if test "$pie" != "no" ; then
++}
-   cat > $TMPC << EOF
++
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t s_mask, z_mask, sign;
 +
      if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +
      if (arg_is_const(op->args[2])) {
 -        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 -                                          arg_info(op->args[1])->z_mask,
 -                                          arg_info(op->args[2])->val);
 +        int sh = arg_info(op->args[2])->val;
 +
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 +
 +        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 +        ctx->s_mask = smask_from_smask(s_mask);
 +
          return fold_masks(ctx, op);
      }
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(sar):
 +        /*
 +         * Arithmetic right shift will not reduce the number of
 +         * input sign repetitions.
 +         */
 +        ctx->s_mask = s_mask;
 +        break;
 +    CASE_OP_32_64(shr):
 +        /*
 +         * If the sign bit is known zero, then logical right shift
 +         * will not reduced the number of input sign repetitions.
 +         */
 +        sign = (s_mask & -s_mask) >> 1;
 +        if (!(z_mask & sign)) {
 +            ctx->s_mask = s_mask;
 +        }
 +        break;
 +    default:
 +        break;
 +    }
 +
      return false;
  }
 --
-.20.1
+.25.1

The following changes since commit 035eed4c0d257c905a556fa0f4865a0c077b4e7f:

Merge remote-tracking branch 'remotes/vivier/tags/q800-for-5.0-pull-request' into staging (2020-01-07 17:08:21 +0000)

are available in the Git repository at:

https://github.com/rth7680/qemu.git tags/pull-tcg-20200108

for you to fetch changes up to 5e7ef51cbe47e726f76bfbc208e167085cf398c4:

MAINTAINERS: Replace Claudio Fontana for tcg/aarch64 (2020-01-08 11:54:12 +1100)

----------------------------------------------------------------
Improve -static and -pie linking
Add cpu_{ld,st}*_mmuidx_ra
Remove MMU_MODE*_SUFFIX
Move tcg headers under include/

----------------------------------------------------------------
Philippe Mathieu-Daudé (4):
      tcg: Search includes from the project root source directory
      tcg: Search includes in the parent source directory
      tcg: Move TCG headers to include/tcg/
      configure: Remove tcg/ from the preprocessor include search list

Richard Henderson (37):
      configure: Drop adjustment of textseg
      tcg: Remove softmmu code_gen_buffer fixed address
      configure: Do not force pie=no for non-x86
      configure: Always detect -no-pie toolchain support
      configure: Unnest detection of -z,relro and -z,now
      configure: Override the os default with --disable-pie
      configure: Support -static-pie if requested
      target/xtensa: Use probe_access for itlb_hit_test
      cputlb: Use trace_mem_get_info instead of trace_mem_build_info
      trace: Remove trace_mem_build_info_no_se_[bl]e
      target/s390x: Include tcg.h in mem_helper.c
      target/arm: Include tcg.h in sve_helper.c
      accel/tcg: Include tcg.h in tcg-runtime.c
      linux-user: Include tcg.h in syscall.c
      linux-user: Include trace-root.h in syscall-trace.h
      plugins: Include trace/mem.h in api.c
      cputlb: Move body of cpu_ldst_template.h out of line
      translator: Use cpu_ld*_code instead of open-coding
      cputlb: Rename helper_ret_ld*_cmmu to cpu_ld*_code
      cputlb: Provide cpu_(ld,st}*_mmuidx_ra for user-only
      target/i386: Use cpu_*_mmuidx_ra instead of templates
      cputlb: Expand cpu_ldst_useronly_template.h in user-exec.c
      target/nios2: Remove MMU_MODE{0,1}_SUFFIX
      target/alpha: Remove MMU_MODE{0,1}_SUFFIX
      target/cris: Remove MMU_MODE{0,1}_SUFFIX
      target/i386: Remove MMU_MODE{0,1,2}_SUFFIX
      target/microblaze: Remove MMU_MODE{0,1,2}_SUFFIX
      target/sh4: Remove MMU_MODE{0,1}_SUFFIX
      target/unicore32: Remove MMU_MODE{0,1}_SUFFIX
      target/xtensa: Remove MMU_MODE{0,1,2,3}_SUFFIX
      target/m68k: Use cpu_*_mmuidx_ra instead of MMU_MODE{0,1}_SUFFIX
      target/mips: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
      target/s390x: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
      target/ppc: Use cpu_*_mmuidx_ra instead of MMU_MODE*_SUFFIX
      cputlb: Remove support for MMU_MODE*_SUFFIX
      cputlb: Expand cpu_ldst_template.h in cputlb.c
      MAINTAINERS: Replace Claudio Fontana for tcg/aarch64

Makefile                                  |   2 +-
 accel/tcg/atomic_template.h               |  67 ++---
 include/exec/cpu_ldst.h                   | 446 +++++++++---------------------
 include/exec/cpu_ldst_template.h          | 211 --------------
 include/exec/cpu_ldst_useronly_template.h | 159 -----------
 include/exec/translator.h                 |  48 +---
 {tcg => include/tcg}/tcg-gvec-desc.h      |   0
 {tcg => include/tcg}/tcg-mo.h             |   0
 {tcg => include/tcg}/tcg-op-gvec.h        |   0
 {tcg => include/tcg}/tcg-op.h             |   2 +-
 {tcg => include/tcg}/tcg-opc.h            |   0
 {tcg => include/tcg}/tcg.h                |  33 +--
 include/user/syscall-trace.h              |   2 +
 target/alpha/cpu.h                        |   2 -
 target/cris/cpu.h                         |   2 -
 target/i386/cpu.h                         |   3 -
 target/m68k/cpu.h                         |   2 -
 target/microblaze/cpu.h                   |   3 -
 target/mips/cpu.h                         |   4 -
 target/nios2/cpu.h                        |   2 -
 target/ppc/cpu.h                          |   2 -
 target/s390x/cpu.h                        |   5 -
 target/sh4/cpu.h                          |   2 -
 target/unicore32/cpu.h                    |   2 -
 target/xtensa/cpu.h                       |   4 -
 tcg/i386/tcg-target.h                     |   2 +-
 trace/mem-internal.h                      |  17 --
 accel/tcg/cpu-exec.c                      |   2 +-
 accel/tcg/cputlb.c                        | 315 ++++++++++++++++-----
 accel/tcg/tcg-runtime-gvec.c              |   2 +-
 accel/tcg/tcg-runtime.c                   |   1 +
 accel/tcg/translate-all.c                 |  39 +--
 accel/tcg/user-exec.c                     | 238 +++++++++++++++-
 bsd-user/main.c                           |   2 +-
 cpus.c                                    |   2 +-
 exec.c                                    |   2 +-
 linux-user/main.c                         |   2 +-
 linux-user/syscall.c                      |   1 +
 plugins/api.c                             |   1 +
 target/alpha/translate.c                  |   2 +-
 target/arm/helper-a64.c                   |   2 +-
 target/arm/sve_helper.c                   |   1 +
 target/arm/translate-a64.c                |   4 +-
 target/arm/translate-sve.c                |   6 +-
 target/arm/translate.c                    |   4 +-
 target/cris/translate.c                   |   2 +-
 target/hppa/translate.c                   |   2 +-
 target/i386/mem_helper.c                  |   2 +-
 target/i386/seg_helper.c                  |  56 ++--
 target/i386/translate.c                   |   2 +-
 target/lm32/translate.c                   |   2 +-
 target/m68k/op_helper.c                   |  77 ++++--
 target/m68k/translate.c                   |   2 +-
 target/microblaze/translate.c             |   2 +-
 target/mips/op_helper.c                   | 182 ++++--------
 target/mips/translate.c                   |   2 +-
 target/moxie/translate.c                  |   2 +-
 target/nios2/translate.c                  |   2 +-
 target/openrisc/translate.c               |   2 +-
 target/ppc/mem_helper.c                   |  13 +-
 target/ppc/translate.c                    |   4 +-
 target/riscv/cpu_helper.c                 |   2 +-
 target/riscv/translate.c                  |   2 +-
 target/s390x/mem_helper.c                 |  11 +-
 target/s390x/translate.c                  |   4 +-
 target/sh4/translate.c                    |   2 +-
 target/sparc/ldst_helper.c                |   2 +-
 target/sparc/translate.c                  |   2 +-
 target/tilegx/translate.c                 |   2 +-
 target/tricore/translate.c                |   2 +-
 target/unicore32/translate.c              |   2 +-
 target/xtensa/mmu_helper.c                |   5 +-
 target/xtensa/translate.c                 |   2 +-
 tcg/aarch64/tcg-target.inc.c              |   4 +-
 tcg/arm/tcg-target.inc.c                  |   4 +-
 tcg/i386/tcg-target.inc.c                 |   4 +-
 tcg/mips/tcg-target.inc.c                 |   2 +-
 tcg/optimize.c                            |   2 +-
 tcg/ppc/tcg-target.inc.c                  |   4 +-
 tcg/riscv/tcg-target.inc.c                |   4 +-
 tcg/s390/tcg-target.inc.c                 |   4 +-
 tcg/sparc/tcg-target.inc.c                |   2 +-
 tcg/tcg-common.c                          |   2 +-
 tcg/tcg-op-gvec.c                         |   8 +-
 tcg/tcg-op-vec.c                          |   6 +-
 tcg/tcg-op.c                              |   6 +-
 tcg/tcg.c                                 |   2 +-
 tcg/tci.c                                 |   2 +-
 MAINTAINERS                               |   4 +-
 configure                                 | 117 +++-----
 docs/devel/loads-stores.rst               | 215 ++++++++++----
 91 files changed, 1075 insertions(+), 1357 deletions(-)
 delete mode 100644 include/exec/cpu_ldst_template.h
 delete mode 100644 include/exec/cpu_ldst_useronly_template.h
 rename {tcg => include/tcg}/tcg-gvec-desc.h (100%)
 rename {tcg => include/tcg}/tcg-mo.h (100%)
 rename {tcg => include/tcg}/tcg-op-gvec.h (100%)
 rename {tcg => include/tcg}/tcg-op.h (99%)
 rename {tcg => include/tcg}/tcg-opc.h (100%)
 rename {tcg => include/tcg}/tcg.h (96%)

This adjustment was random and unnecessary.  The user mode
startup code in probe_guest_base() will choose a value for
guest_base that allows the host qemu binary to not conflict
with the guest binary.

With modern distributions, this isn't even used, as the default
is PIE, which does the same job in a more portable way.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Remove mention of config-host.ld from make distclean
---
 Makefile  |  2 +-
 configure | 47 -----------------------------------------------
 2 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/Makefile b/Makefile
index XXXXXXX..XXXXXXX 100644
--- a/Makefile
+++ b/Makefile
@@ -XXX,XX +XXX,XX @@ rm -f $(MANUAL_BUILDDIR)/$1/objects.inv $(MANUAL_BUILDDIR)/$1/searchindex.js $(M
 endef
 
 distclean: clean
-	rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi qemu-monitor-info.texi
+	rm -f config-host.mak config-host.h* $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi qemu-monitor-info.texi
 	rm -f tests/tcg/config-*.mak
 	rm -f config-all-devices.mak config-all-disas.mak config.status
 	rm -f $(SUBDIR_DEVICES_MAK)
diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if test "$cpu" = "s390x" ; then
   fi
 fi
 
-# Probe for the need for relocating the user-only binary.
-if ( [ "$linux_user" = yes ] || [ "$bsd_user" = yes ] ) && [ "$pie" = no ]; then
-  textseg_addr=
-  case "$cpu" in
-    arm | i386 | ppc* | s390* | sparc* | x86_64 | x32)
-      # ??? Rationale for choosing this address
-      textseg_addr=0x60000000
-      ;;
-    mips)
-      # A 256M aligned address, high in the address space, with enough
-      # room for the code_gen_buffer above it before the stack.
-      textseg_addr=0x60000000
-      ;;
-  esac
-  if [ -n "$textseg_addr" ]; then
-    cat > $TMPC <<EOF
-    int main(void) { return 0; }
-EOF
-    textseg_ldflags="-Wl,-Ttext-segment=$textseg_addr"
-    if ! compile_prog "" "$textseg_ldflags"; then
-      # In case ld does not support -Ttext-segment, edit the default linker
-      # script via sed to set the .text start addr.  This is needed on FreeBSD
-      # at least.
-      if ! $ld --verbose >/dev/null 2>&1; then
-        error_exit \
-            "We need to link the QEMU user mode binaries at a" \
-            "specific text address. Unfortunately your linker" \
-            "doesn't support either the -Ttext-segment option or" \
-            "printing the default linker script with --verbose." \
-            "If you don't want the user mode binaries, pass the" \
-            "--disable-user option to configure."
-      fi
-
-      $ld --verbose | sed \
-        -e '1,/==================================================/d' \
-        -e '/==================================================/,$d' \
-        -e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. = $textseg_addr + SIZEOF_HEADERS/" \
-        -e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld
-      textseg_ldflags="-Wl,-T../config-host.ld"
-    fi
-  fi
-fi
-
 # Check that the C++ compiler exists and works with the C compiler.
 # All the QEMU_CXXFLAGS are based on QEMU_CFLAGS. Keep this at the end to don't miss any other that could be added.
 if has $cxx; then
@@ -XXX,XX +XXX,XX @@ if test "$gprof" = "yes" ; then
   fi
 fi
 
-if test "$target_linux_user" = "yes" || test "$target_bsd_user" = "yes" ; then
-  ldflags="$ldflags $textseg_ldflags"
-fi
-
 # Newer kernels on s390 check for an S390_PGSTE program header and
 # enable the pgste page table extensions in that case. This makes
 # the vm.allocate_pgste sysctl unnecessary. We enable this program
-- 
2.20.1

The commentary talks about "in concert with the addresses
assigned in the relevant linker script", except there is no
linker script for softmmu, nor has there been for some time.

(Do not confuse the user-only linker script editing that was
removed in the previous patch, because user-only does not
use this code_gen_buffer allocation method.)

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/translate-all.c | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static inline void *alloc_code_gen_buffer(void)
 {
     int prot = PROT_WRITE | PROT_READ | PROT_EXEC;
     int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-    uintptr_t start = 0;
     size_t size = tcg_ctx->code_gen_buffer_size;
     void *buf;
 
-    /* Constrain the position of the buffer based on the host cpu.
-       Note that these addresses are chosen in concert with the
-       addresses assigned in the relevant linker script file.  */
-# if defined(__PIE__) || defined(__PIC__)
-    /* Don't bother setting a preferred location if we're building
-       a position-independent executable.  We're more likely to get
-       an address near the main executable if we let the kernel
-       choose the address.  */
-# elif defined(__x86_64__) && defined(MAP_32BIT)
-    /* Force the memory down into low memory with the executable.
-       Leave the choice of exact location with the kernel.  */
-    flags |= MAP_32BIT;
-    /* Cannot expect to map more than 800MB in low memory.  */
-    if (size > 800u * 1024 * 1024) {
-        tcg_ctx->code_gen_buffer_size = size = 800u * 1024 * 1024;
-    }
-# elif defined(__sparc__)
-    start = 0x40000000ul;
-# elif defined(__s390x__)
-    start = 0x90000000ul;
-# elif defined(__mips__)
-#  if _MIPS_SIM == _ABI64
-    start = 0x128000000ul;
-#  else
-    start = 0x08000000ul;
-#  endif
-# endif
-
-    buf = mmap((void *)start, size, prot, flags, -1, 0);
+    buf = mmap(NULL, size, prot, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
 
 #ifdef __mips__
     if (cross_256mb(buf, size)) {
-        /* Try again, with the original still mapped, to avoid re-acquiring
-           that 256mb crossing.  This time don't specify an address.  */
+        /*
+         * Try again, with the original still mapped, to avoid re-acquiring
+         * the same 256mb crossing.
+         */
         size_t size2;
         void *buf2 = mmap(NULL, size, prot, flags, -1, 0);
         switch ((int)(buf2 != MAP_FAILED)) {
-- 
2.20.1

PIE is supported on many other hosts besides x86.

The default for non-x86 is now the same as x86: pie is used
if supported, and may be forced via --enable/--disable-pie.

The original commit (40d6444e91c) said:

"Non-x86 are not changed, as they require TCG changes"

but I think that's wrong -- there's nothing about PIE that
affects TCG one way or another.

Tested on aarch64 (bionic) and ppc64le (centos 7) hosts.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure | 10 ----------
 1 file changed, 10 deletions(-)

The CFLAGS_NOPIE and LDFLAGS_NOPIE variables are used
in pc-bios/optionrom/Makefile, which has nothing to do
with the PIE setting of the main qemu executables.

This overrides any operating system default to build
all executables as PIE, which is important for ROMs.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if ! compile_prog "-Werror" "" ; then
 	"Thread-Local Storage (TLS). Please upgrade to a version that does."
 fi
 
-if test "$pie" != "no" ; then
-  cat > $TMPC << EOF
+cat > $TMPC << EOF
 
 #ifdef __linux__
 #  define THREAD __thread
 #else
 #  define THREAD
 #endif
-
 static THREAD int tls_var;
-
 int main(void) { return tls_var; }
-
 EOF
-  # check we support --no-pie first...
-  if compile_prog "-Werror -fno-pie" "-no-pie"; then
-    CFLAGS_NOPIE="-fno-pie"
-    LDFLAGS_NOPIE="-nopie"
-  fi
 
+# Check we support --no-pie first; we will need this for building ROMs.
+if compile_prog "-Werror -fno-pie" "-no-pie"; then
+  CFLAGS_NOPIE="-fno-pie"
+  LDFLAGS_NOPIE="-no-pie"
+fi
+
+if test "$pie" != "no" ; then
   if compile_prog "-fPIE -DPIE" "-pie"; then
     QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
     LDFLAGS="-pie $LDFLAGS"
-- 
2.20.1

There is nothing about these options that is related to PIE.
Use them unconditionally.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Fangrui Song <i@maskray.me>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Do not split into two tests.
---
 configure | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if test "$pie" != "no" ; then
     QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
     LDFLAGS="-pie $LDFLAGS"
     pie="yes"
-    if compile_prog "" "-Wl,-z,relro -Wl,-z,now" ; then
-      LDFLAGS="-Wl,-z,relro -Wl,-z,now $LDFLAGS"
-    fi
   else
     if test "$pie" = "yes"; then
       error_exit "PIE not available due to missing toolchain support"
@@ -XXX,XX +XXX,XX @@ if test "$pie" != "no" ; then
   fi
 fi
 
+# Detect support for PT_GNU_RELRO + DT_BIND_NOW.
+# The combination is known as "full relro", because .got.plt is read-only too.
+if compile_prog "" "-Wl,-z,relro -Wl,-z,now" ; then
+  LDFLAGS="-Wl,-z,relro -Wl,-z,now $LDFLAGS"
+fi
+
 ##########################################
 # __sync_fetch_and_and requires at least -march=i486. Many toolchains
 # use i686 as default anyway, but for those that don't, an explicit
-- 
2.20.1

Some distributions, e.g. Ubuntu 19.10, enable PIE by default.
If for some reason one wishes to build a non-pie binary, we
must provide additional options to override.

At the same time, reorg the code to an elif chain.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if compile_prog "-Werror -fno-pie" "-no-pie"; then
   LDFLAGS_NOPIE="-no-pie"
 fi
 
-if test "$pie" != "no" ; then
-  if compile_prog "-fPIE -DPIE" "-pie"; then
-    QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
-    LDFLAGS="-pie $LDFLAGS"
-    pie="yes"
-  else
-    if test "$pie" = "yes"; then
-      error_exit "PIE not available due to missing toolchain support"
-    else
-      echo "Disabling PIE due to missing toolchain support"
-      pie="no"
-    fi
-  fi
+if test "$pie" = "no"; then
+  QEMU_CFLAGS="$CFLAGS_NOPIE $QEMU_CFLAGS"
+  LDFLAGS="$LDFLAGS_NOPIE $LDFLAGS"
+elif compile_prog "-fPIE -DPIE" "-pie"; then
+  QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
+  LDFLAGS="-pie $LDFLAGS"
+  pie="yes"
+elif test "$pie" = "yes"; then
+  error_exit "PIE not available due to missing toolchain support"
+else
+  echo "Disabling PIE due to missing toolchain support"
+  pie="no"
 fi
 
 # Detect support for PT_GNU_RELRO + DT_BIND_NOW.
-- 
2.20.1

Recent toolchains support static and pie at the same time.

As with normal dynamic builds, allow --static to default to PIE
if supported by the toolchain.  Allow --enable/--disable-pie to
override the default.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Fix --disable-pie --static
---
 configure | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --static)
     static="yes"
-    LDFLAGS="-static $LDFLAGS"
     QEMU_PKG_CONFIG_FLAGS="--static $QEMU_PKG_CONFIG_FLAGS"
   ;;
   --mandir=*) mandir="$optarg"
@@ -XXX,XX +XXX,XX @@ if test "$static" = "yes" ; then
   if test "$modules" = "yes" ; then
     error_exit "static and modules are mutually incompatible"
   fi
-  if test "$pie" = "yes" ; then
-    error_exit "static and pie are mutually incompatible"
-  else
-    pie="no"
-  fi
 fi
 
 # Unconditional check for compiler __thread support
@@ -XXX,XX +XXX,XX @@ if compile_prog "-Werror -fno-pie" "-no-pie"; then
   LDFLAGS_NOPIE="-no-pie"
 fi
 
-if test "$pie" = "no"; then
+if test "$static" = "yes"; then
+  if test "$pie" != "no" && compile_prog "-fPIE -DPIE" "-static-pie"; then
+    QEMU_CFLAGS="-fPIE -DPIE $QEMU_CFLAGS"
+    LDFLAGS="-static-pie $LDFLAGS"
+    pie="yes"
+  elif test "$pie" = "yes"; then
+    error_exit "-static-pie not available due to missing toolchain support"
+  else
+    LDFLAGS="-static $LDFLAGS"
+    pie="no"
+  fi
+elif test "$pie" = "no"; then
   QEMU_CFLAGS="$CFLAGS_NOPIE $QEMU_CFLAGS"
   LDFLAGS="$LDFLAGS_NOPIE $LDFLAGS"
 elif compile_prog "-fPIE -DPIE" "-pie"; then
-- 
2.20.1

We don't actually need the result of the read, only to probe that the
memory mapping exists.  This is exactly what probe_access does.

This is also the only user of any cpu_ld*_code_ra function.
Removing this allows the interface to be removed shortly.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/xtensa/mmu_helper.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/xtensa/mmu_helper.c b/target/xtensa/mmu_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/mmu_helper.c
+++ b/target/xtensa/mmu_helper.c
@@ -XXX,XX +XXX,XX @@
 void HELPER(itlb_hit_test)(CPUXtensaState *env, uint32_t vaddr)
 {
     /*
-     * Attempt the memory load; we don't care about the result but
+     * Probe the memory; we don't care about the result but
      * only the side-effects (ie any MMU or other exception)
      */
-    cpu_ldub_code_ra(env, vaddr, GETPC());
+    probe_access(env, vaddr, 1, MMU_INST_FETCH,
+                 cpu_mmu_index(env, true), GETPC());
 }
 
 void HELPER(wsr_rasid)(CPUXtensaState *env, uint32_t v)
-- 
2.20.1

In the cpu_ldst templates, we already require a MemOp, and it
is cleaner and clearer to pass that instead of 3 separate
arguments describing the memory operation.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst_template.h          | 22 +++++++++++-----------
 include/exec/cpu_ldst_useronly_template.h | 12 ++++++------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst_template.h
+++ b/include/exec/cpu_ldst_template.h
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     RES_TYPE res;
     target_ulong addr;
     int mmu_idx = CPU_MMU_INDEX;
-    TCGMemOpIdx oi;
+    MemOp op = MO_TE | SHIFT;
 #if !defined(SOFTMMU_CODE_ACCESS)
-    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false, mmu_idx);
+    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 #endif
 
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     entry = tlb_entry(env, mmu_idx, addr);
     if (unlikely(entry->ADDR_READ !=
                  (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
         res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
-                                                            oi, retaddr);
+                                                               oi, retaddr);
     } else {
         uintptr_t hostaddr = addr + entry->addend;
         res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     int res;
     target_ulong addr;
     int mmu_idx = CPU_MMU_INDEX;
-    TCGMemOpIdx oi;
-#if !defined(SOFTMMU_CODE_ACCESS)
-    uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false, mmu_idx);
+    MemOp op = MO_TE | MO_SIGN | SHIFT;
+#ifndef SOFTMMU_CODE_ACCESS
+    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 #endif
 
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     entry = tlb_entry(env, mmu_idx, addr);
     if (unlikely(entry->ADDR_READ !=
                  (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+        TCGMemOpIdx oi = make_memop_idx(op & ~MO_SIGN, mmu_idx);
         res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
                                MMUSUFFIX)(env, addr, oi, retaddr);
     } else {
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     CPUTLBEntry *entry;
     target_ulong addr;
     int mmu_idx = CPU_MMU_INDEX;
-    TCGMemOpIdx oi;
+    MemOp op = MO_TE | SHIFT;
 #if !defined(SOFTMMU_CODE_ACCESS)
-    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true, mmu_idx);
+    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, true);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
 #endif
 
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     entry = tlb_entry(env, mmu_idx, addr);
     if (unlikely(tlb_addr_write(entry) !=
                  (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
         glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
                                                      retaddr);
     } else {
diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst_useronly_template.h
+++ b/include/exec/cpu_ldst_useronly_template.h
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
     ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
     clear_helper_retaddr();
 #else
-    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false,
-                                            MMU_USER_IDX);
+    MemOp op = MO_TE | SHIFT;
+    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
     ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
 #endif
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
     ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
     clear_helper_retaddr();
 #else
-    uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false,
-                                            MMU_USER_IDX);
+    MemOp op = MO_TE | MO_SIGN | SHIFT;
+    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
     ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
     qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
@@ -XXX,XX +XXX,XX @@ static inline void
 glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr,
                                       RES_TYPE v)
 {
-    uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true,
-                                            MMU_USER_IDX);
+    MemOp op = MO_TE | SHIFT;
+    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, true);
     trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
     glue(glue(st, SUFFIX), _p)(g2h(ptr), v);
     qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-- 
2.20.1

It is easy for the atomic helpers to use trace_mem_build_info
directly, without resorting to symbol pasting.  For this usage,
we cannot use trace_mem_get_info, because the MemOp does not
support 16-byte accesses.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/atomic_template.h | 67 +++++++++++++------------------------
 trace/mem-internal.h        | 17 ----------
 2 files changed, 24 insertions(+), 60 deletions(-)

diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/atomic_template.h
+++ b/accel/tcg/atomic_template.h
@@ -XXX,XX +XXX,XX @@
    the ATOMIC_NAME macro, and redefined below.  */
 #if DATA_SIZE == 1
 # define END
-# define MEND _be /* either le or be would be fine */
 #elif defined(HOST_WORDS_BIGENDIAN)
 # define END  _be
-# define MEND _be
 #else
 # define END  _le
-# define MEND _le
 #endif
 
 ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     DATA_TYPE ret;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_rmw_pre(env, addr, info);
 #if DATA_SIZE == 16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
 {
     ATOMIC_MMU_DECLS;
     DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_ld_pre(env, addr, info);
     val = atomic16_read(haddr);
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
 {
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true,
-                                                          ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, true,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_st_pre(env, addr, info);
     atomic16_set(haddr, val);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     DATA_TYPE ret;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
-                                                          ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_rmw_pre(env, addr, info);
     ret = atomic_xchg__nocheck(haddr, val);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
     ATOMIC_MMU_DECLS;                                               \
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
     DATA_TYPE ret;                                                  \
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
-                                                           false,   \
-                                                           ATOMIC_MMU_IDX); \
-                                                                    \
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,    \
+                                         ATOMIC_MMU_IDX);           \
     atomic_trace_rmw_pre(env, addr, info);                          \
     ret = atomic_##X(haddr, val);                                   \
     ATOMIC_MMU_CLEANUP;                                             \
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
     ATOMIC_MMU_DECLS;                                               \
     XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                          \
     XDATA_TYPE cmp, old, new, val = xval;                           \
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
-                                                           false,   \
-                                                           ATOMIC_MMU_IDX); \
-                                                                    \
+    uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,    \
+                                         ATOMIC_MMU_IDX);           \
     atomic_trace_rmw_pre(env, addr, info);                          \
     smp_mb();                                                       \
     cmp = atomic_read__nocheck(haddr);                              \
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX,  DATA_TYPE, new)
 #endif /* DATA SIZE >= 16 */
 
 #undef END
-#undef MEND
 
 #if DATA_SIZE > 1
 
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX,  DATA_TYPE, new)
    within the ATOMIC_NAME macro.  */
 #ifdef HOST_WORDS_BIGENDIAN
 # define END  _le
-# define MEND _le
 #else
 # define END  _be
-# define MEND _be
 #endif
 
 ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     DATA_TYPE ret;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
-                                                           false,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_rmw_pre(env, addr, info);
 #if DATA_SIZE == 16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
 {
     ATOMIC_MMU_DECLS;
     DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
-                                                           false,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_ld_pre(env, addr, info);
     val = atomic16_read(haddr);
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
 {
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
-                                                           true,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, true,
+                                         ATOMIC_MMU_IDX);
 
     val = BSWAP(val);
     atomic_trace_st_pre(env, addr, info);
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
     ATOMIC_MMU_DECLS;
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     ABI_TYPE ret;
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
-                                                           false,
-                                                           ATOMIC_MMU_IDX);
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+                                         ATOMIC_MMU_IDX);
 
     atomic_trace_rmw_pre(env, addr, info);
     ret = atomic_xchg__nocheck(haddr, BSWAP(val));
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
     ATOMIC_MMU_DECLS;                                               \
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
     DATA_TYPE ret;                                                  \
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
-                                                           false,   \
-                                                           ATOMIC_MMU_IDX); \
-                                                                    \
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP,    \
+                                         false, ATOMIC_MMU_IDX);    \
     atomic_trace_rmw_pre(env, addr, info);                          \
     ret = atomic_##X(haddr, BSWAP(val));                            \
     ATOMIC_MMU_CLEANUP;                                             \
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
     ATOMIC_MMU_DECLS;                                               \
     XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                          \
     XDATA_TYPE ldo, ldn, old, new, val = xval;                      \
-    uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,   \
-                                                           false,   \
-                                                           ATOMIC_MMU_IDX); \
-                                                                    \
+    uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP,    \
+                                         false, ATOMIC_MMU_IDX);    \
     atomic_trace_rmw_pre(env, addr, info);                          \
     smp_mb();                                                       \
     ldn = atomic_read__nocheck(haddr);                              \
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
 #endif /* DATA_SIZE >= 16 */
 
 #undef END
-#undef MEND
 #endif /* DATA_SIZE > 1 */
 
 #undef BSWAP
diff --git a/trace/mem-internal.h b/trace/mem-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/trace/mem-internal.h
+++ b/trace/mem-internal.h
@@ -XXX,XX +XXX,XX @@ static inline uint16_t trace_mem_get_info(MemOp op,
                                 mmu_idx);
 }
 
-/* Used by the atomic helpers */
-static inline
-uint16_t trace_mem_build_info_no_se_be(int size_shift, bool store,
-                                       TCGMemOpIdx oi)
-{
-    return trace_mem_build_info(size_shift, false, MO_BE, store,
-                                get_mmuidx(oi));
-}
-
-static inline
-uint16_t trace_mem_build_info_no_se_le(int size_shift, bool store,
-                                       TCGMemOpIdx oi)
-{
-    return trace_mem_build_info(size_shift, false, MO_LE, store,
-                                get_mmuidx(oi));
-}
-
 #endif /* TRACE__MEM_INTERNAL_H */
-- 
2.20.1

With the tracing hooks, the inline functions are no longer
so simple.  Once out-of-line, the current tlb_entry lookup
is redundant with the one in the main load/store_helper.

This also begins the introduction of a new target facing
interface, with suffix *_mmuidx_ra.  This is not yet
official because the interface is not done for user-only.

Use abi_ptr instead of target_ulong in preparation for
user-only; the two types are identical for softmmu.

What remains in cpu_ldst_template.h are the expansions
for _code, _data, and MMU_MODE<N>_SUFFIX.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h          |  25 ++++++-
 include/exec/cpu_ldst_template.h | 125 +++++++------------------------
 accel/tcg/cputlb.c               | 116 ++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 100 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
 
 #else
 
-/* The memory helpers for tcg-generated code need tcg_target_long etc.  */
+/* Needed for TCG_OVERSIZED_GUEST */
 #include "tcg.h"
 
 static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
     return &env_tlb(env)->f[mmu_idx].table[tlb_index(env, mmu_idx, addr)];
 }
 
+uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                            int mmu_idx, uintptr_t ra);
+uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                            int mmu_idx, uintptr_t ra);
+uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                           int mmu_idx, uintptr_t ra);
+uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                           int mmu_idx, uintptr_t ra);
+
+int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                       int mmu_idx, uintptr_t ra);
+int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                       int mmu_idx, uintptr_t ra);
+
+void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr);
+void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+                       int mmu_idx, uintptr_t retaddr);
+
 #ifdef MMU_MODE0_SUFFIX
 #define CPU_MMU_INDEX 0
 #define MEMSUFFIX MMU_MODE0_SUFFIX
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst_template.h
+++ b/include/exec/cpu_ldst_template.h
@@ -XXX,XX +XXX,XX @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
-#if !defined(SOFTMMU_CODE_ACCESS)
-#include "trace-root.h"
-#endif
-
-#include "qemu/plugin.h"
-#include "trace/mem.h"
-
 #if DATA_SIZE == 8
 #define SUFFIX q
 #define USUFFIX q
@@ -XXX,XX +XXX,XX @@
 #define RES_TYPE uint32_t
 #endif
 
+/* generic load/store macros */
+
 #ifdef SOFTMMU_CODE_ACCESS
-#define ADDR_READ addr_code
-#define MMUSUFFIX _cmmu
-#define URETSUFFIX USUFFIX
-#define SRETSUFFIX glue(s, SUFFIX)
-#else
-#define ADDR_READ addr_read
-#define MMUSUFFIX _mmu
-#define URETSUFFIX USUFFIX
-#define SRETSUFFIX glue(s, SUFFIX)
+
+static inline RES_TYPE
+glue(glue(cpu_ld, USUFFIX), _code)(CPUArchState *env, target_ulong ptr)
+{
+    TCGMemOpIdx oi = make_memop_idx(MO_TE | SHIFT, CPU_MMU_INDEX);
+    return glue(glue(helper_ret_ld, USUFFIX), _cmmu)(env, ptr, oi, 0);
+}
+
+#if DATA_SIZE <= 2
+static inline int
+glue(glue(cpu_lds, SUFFIX), _code)(CPUArchState *env, target_ulong ptr)
+{
+    return (DATA_STYPE)glue(glue(cpu_ld, USUFFIX), _code)(env, ptr);
+}
 #endif
 
-/* generic load/store macros */
+#else
 
 static inline RES_TYPE
 glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
                                                   uintptr_t retaddr)
 {
-    CPUTLBEntry *entry;
-    RES_TYPE res;
-    target_ulong addr;
-    int mmu_idx = CPU_MMU_INDEX;
-    MemOp op = MO_TE | SHIFT;
-#if !defined(SOFTMMU_CODE_ACCESS)
-    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
-    addr = ptr;
-    entry = tlb_entry(env, mmu_idx, addr);
-    if (unlikely(entry->ADDR_READ !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
-        res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
-                                                               oi, retaddr);
-    } else {
-        uintptr_t hostaddr = addr + entry->addend;
-        res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
-    }
-#ifndef SOFTMMU_CODE_ACCESS
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
-    return res;
+    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
+                                                   retaddr);
 }
 
 static inline RES_TYPE
 glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
 {
-    return glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
+    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
 }
 
 #if DATA_SIZE <= 2
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
                                                   uintptr_t retaddr)
 {
-    CPUTLBEntry *entry;
-    int res;
-    target_ulong addr;
-    int mmu_idx = CPU_MMU_INDEX;
-    MemOp op = MO_TE | MO_SIGN | SHIFT;
-#ifndef SOFTMMU_CODE_ACCESS
-    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, false);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
-    addr = ptr;
-    entry = tlb_entry(env, mmu_idx, addr);
-    if (unlikely(entry->ADDR_READ !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        TCGMemOpIdx oi = make_memop_idx(op & ~MO_SIGN, mmu_idx);
-        res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
-                               MMUSUFFIX)(env, addr, oi, retaddr);
-    } else {
-        uintptr_t hostaddr = addr + entry->addend;
-        res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr);
-    }
-#ifndef SOFTMMU_CODE_ACCESS
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
-    return res;
+    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
+                                                   retaddr);
 }
 
 static inline int
 glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
 {
-    return glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
+    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
 }
 #endif
 
-#ifndef SOFTMMU_CODE_ACCESS
-
 /* generic store macro */
 
 static inline void
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                  target_ulong ptr,
                                                  RES_TYPE v, uintptr_t retaddr)
 {
-    CPUTLBEntry *entry;
-    target_ulong addr;
-    int mmu_idx = CPU_MMU_INDEX;
-    MemOp op = MO_TE | SHIFT;
-#if !defined(SOFTMMU_CODE_ACCESS)
-    uint16_t meminfo = trace_mem_get_info(op, mmu_idx, true);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
-    addr = ptr;
-    entry = tlb_entry(env, mmu_idx, addr);
-    if (unlikely(tlb_addr_write(entry) !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        TCGMemOpIdx oi = make_memop_idx(op, mmu_idx);
-        glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
-                                                     retaddr);
-    } else {
-        uintptr_t hostaddr = addr + entry->addend;
-        glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v);
-    }
-#ifndef SOFTMMU_CODE_ACCESS
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
+    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX,
+                                           retaddr);
 }
 
 static inline void
 glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
                                       RES_TYPE v)
 {
-    glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(env, ptr, v, 0);
+    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
 }
 
 #endif /* !SOFTMMU_CODE_ACCESS */
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
 #undef SUFFIX
 #undef USUFFIX
 #undef DATA_SIZE
-#undef MMUSUFFIX
-#undef ADDR_READ
-#undef URETSUFFIX
-#undef SRETSUFFIX
 #undef SHIFT
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/atomic128.h"
 #include "translate-all.h"
+#include "trace-root.h"
+#include "qemu/plugin.h"
+#include "trace/mem.h"
 #ifdef CONFIG_PLUGIN
 #include "qemu/plugin-memory.h"
 #endif
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
     return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr);
 }
 
+/*
+ * Load helpers for cpu_ldst.h.
+ */
+
+static inline uint64_t cpu_load_helper(CPUArchState *env, abi_ptr addr,
+                                       int mmu_idx, uintptr_t retaddr,
+                                       MemOp op, FullLoadHelper *full_load)
+{
+    uint16_t meminfo;
+    TCGMemOpIdx oi;
+    uint64_t ret;
+
+    meminfo = trace_mem_get_info(op, mmu_idx, false);
+    trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
+
+    op &= ~MO_SIGN;
+    oi = make_memop_idx(op, mmu_idx);
+    ret = full_load(env, addr, oi, retaddr);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
+
+    return ret;
+}
+
+uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                            int mmu_idx, uintptr_t ra)
+{
+    return cpu_load_helper(env, addr, mmu_idx, ra, MO_UB, full_ldub_mmu);
+}
+
+int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                       int mmu_idx, uintptr_t ra)
+{
+    return (int8_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_SB,
+                                   full_ldub_mmu);
+}
+
+uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                            int mmu_idx, uintptr_t ra)
+{
+    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUW,
+                           MO_TE == MO_LE
+                           ? full_le_lduw_mmu : full_be_lduw_mmu);
+}
+
+int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                       int mmu_idx, uintptr_t ra)
+{
+    return (int16_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_TESW,
+                                    MO_TE == MO_LE
+                                    ? full_le_lduw_mmu : full_be_lduw_mmu);
+}
+
+uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                           int mmu_idx, uintptr_t ra)
+{
+    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUL,
+                           MO_TE == MO_LE
+                           ? full_le_ldul_mmu : full_be_ldul_mmu);
+}
+
+uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                           int mmu_idx, uintptr_t ra)
+{
+    return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEQ,
+                           MO_TE == MO_LE
+                           ? helper_le_ldq_mmu : helper_be_ldq_mmu);
+}
+
 /*
  * Store Helpers
  */
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
     store_helper(env, addr, val, oi, retaddr, MO_BEQ);
 }
 
+/*
+ * Store Helpers for cpu_ldst.h
+ */
+
+static inline void QEMU_ALWAYS_INLINE
+cpu_store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+                 int mmu_idx, uintptr_t retaddr, MemOp op)
+{
+    TCGMemOpIdx oi;
+    uint16_t meminfo;
+
+    meminfo = trace_mem_get_info(op, mmu_idx, true);
+    trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
+
+    oi = make_memop_idx(op, mmu_idx);
+    store_helper(env, addr, val, oi, retaddr, op);
+
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
+}
+
+void cpu_stb_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr)
+{
+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_UB);
+}
+
+void cpu_stw_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr)
+{
+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUW);
+}
+
+void cpu_stl_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+                       int mmu_idx, uintptr_t retaddr)
+{
+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUL);
+}
+
+void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
+                       int mmu_idx, uintptr_t retaddr)
+{
+    cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ);
+}
+
 /* First set of helpers allows passing in of OI and RETADDR.  This makes
    them callable from other helpers.  */
 
-- 
2.20.1

The DO_LOAD macros replicate the distinction already performed
by the cpu_ldst.h functions.  Use them.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h   | 11 ---------
 include/exec/translator.h | 48 +++++++++++----------------------------
 2 files changed, 13 insertions(+), 46 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
 #include "exec/cpu_ldst_useronly_template.h"
 #undef MEMSUFFIX
 
-/*
- * Code access is deprecated in favour of translator_ld* functions
- * (see translator.h). However there are still users that need to
- * converted so for now these stay.
- */
 #define MEMSUFFIX _code
 #define CODE_ACCESS
 #define DATA_SIZE 1
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
 #undef CPU_MMU_INDEX
 #undef MEMSUFFIX
 
-/*
- * Code access is deprecated in favour of translator_ld* functions
- * (see translator.h). However there are still users that need to
- * converted so for now these stay.
- */
-
 #define CPU_MMU_INDEX (cpu_mmu_index(env, true))
 #define MEMSUFFIX _code
 #define SOFTMMU_CODE_ACCESS
diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ void translator_loop_temp_check(DisasContextBase *db);
 /*
  * Translator Load Functions
  *
- * These are intended to replace the old cpu_ld*_code functions and
- * are mandatory for front-ends that have been migrated to the common
- * translator_loop. These functions are only intended to be called
- * from the translation stage and should not be called from helper
- * functions. Those functions should be converted to encode the
- * relevant information at translation time.
+ * These are intended to replace the direct usage of the cpu_ld*_code
+ * functions and are mandatory for front-ends that have been migrated
+ * to the common translator_loop. These functions are only intended
+ * to be called from the translation stage and should not be called
+ * from helper functions. Those functions should be converted to encode
+ * the relevant information at translation time.
  */
 
-#ifdef CONFIG_USER_ONLY
-
-#define DO_LOAD(type, name, shift)               \
-    do {                                         \
-        set_helper_retaddr(1);                   \
-        ret = name ## _p(g2h(pc));               \
-        clear_helper_retaddr();                  \
-    } while (0)
-
-#else
-
-#define DO_LOAD(type, name, shift)                          \
-    do {                                                    \
-        int mmu_idx = cpu_mmu_index(env, true);             \
-        TCGMemOpIdx oi = make_memop_idx(shift, mmu_idx);    \
-        ret = helper_ret_ ## name ## _cmmu(env, pc, oi, 0); \
-    } while (0)
-
-#endif
-
-#define GEN_TRANSLATOR_LD(fullname, name, type, shift, swap_fn)         \
+#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn)             \
     static inline type                                                  \
     fullname ## _swap(CPUArchState *env, abi_ptr pc, bool do_swap)      \
     {                                                                   \
-        type ret;                                                       \
-        DO_LOAD(type, name, shift);                                     \
-                                                                        \
+        type ret = load_fn(env, pc);                                    \
         if (do_swap) {                                                  \
             ret = swap_fn(ret);                                         \
         }                                                               \
@@ -XXX,XX +XXX,XX @@ void translator_loop_temp_check(DisasContextBase *db);
         return fullname ## _swap(env, pc, false);                       \
     }
 
-GEN_TRANSLATOR_LD(translator_ldub, ldub, uint8_t, 0, /* no swap */ )
-GEN_TRANSLATOR_LD(translator_ldsw, ldsw, int16_t, 1, bswap16)
-GEN_TRANSLATOR_LD(translator_lduw, lduw, uint16_t, 1, bswap16)
-GEN_TRANSLATOR_LD(translator_ldl, ldl, uint32_t, 2, bswap32)
-GEN_TRANSLATOR_LD(translator_ldq, ldq, uint64_t, 3, bswap64)
+GEN_TRANSLATOR_LD(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */)
+GEN_TRANSLATOR_LD(translator_ldsw, int16_t, cpu_ldsw_code, bswap16)
+GEN_TRANSLATOR_LD(translator_lduw, uint16_t, cpu_lduw_code, bswap16)
+GEN_TRANSLATOR_LD(translator_ldl, uint32_t, cpu_ldl_code, bswap32)
+GEN_TRANSLATOR_LD(translator_ldq, uint64_t, cpu_ldq_code, bswap64)
 #undef GEN_TRANSLATOR_LD
 
 #endif  /* EXEC__TRANSLATOR_H */
-- 
2.20.1

There are no uses of the *_cmmu names other than the bare wrapping
within the *_code inlines.  Therefore rename the functions so we
can drop the inlines.

Use abi_ptr instead of target_ulong in preparation for user-only;
the two types are identical for softmmu.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h          | 29 ++++------
 include/exec/cpu_ldst_template.h | 21 -------
 tcg/tcg.h                        | 29 ----------
 accel/tcg/cputlb.c               | 94 ++++++++------------------------
 docs/devel/loads-stores.rst      |  4 +-
 5 files changed, 36 insertions(+), 141 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
 #undef CPU_MMU_INDEX
 #undef MEMSUFFIX
 
-#define CPU_MMU_INDEX (cpu_mmu_index(env, true))
-#define MEMSUFFIX _code
-#define SOFTMMU_CODE_ACCESS
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr);
 
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
+static inline int cpu_ldsb_code(CPUArchState *env, abi_ptr addr)
+{
+    return (int8_t)cpu_ldub_code(env, addr);
+}
 
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#undef SOFTMMU_CODE_ACCESS
+static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr)
+{
+    return (int16_t)cpu_lduw_code(env, addr);
+}
 
 #endif /* defined(CONFIG_USER_ONLY) */
 
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst_template.h
+++ b/include/exec/cpu_ldst_template.h
@@ -XXX,XX +XXX,XX @@
 
 /* generic load/store macros */
 
-#ifdef SOFTMMU_CODE_ACCESS
-
-static inline RES_TYPE
-glue(glue(cpu_ld, USUFFIX), _code)(CPUArchState *env, target_ulong ptr)
-{
-    TCGMemOpIdx oi = make_memop_idx(MO_TE | SHIFT, CPU_MMU_INDEX);
-    return glue(glue(helper_ret_ld, USUFFIX), _cmmu)(env, ptr, oi, 0);
-}
-
-#if DATA_SIZE <= 2
-static inline int
-glue(glue(cpu_lds, SUFFIX), _code)(CPUArchState *env, target_ulong ptr)
-{
-    return (DATA_STYPE)glue(glue(cpu_ld, USUFFIX), _code)(env, ptr);
-}
-#endif
-
-#else
-
 static inline RES_TYPE
 glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
@@ -XXX,XX +XXX,XX @@ glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
     glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
 }
 
-#endif /* !SOFTMMU_CODE_ACCESS */
-
 #undef RES_TYPE
 #undef DATA_TYPE
 #undef DATA_STYPE
diff --git a/tcg/tcg.h b/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
 void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
                        TCGMemOpIdx oi, uintptr_t retaddr);
 
-uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                             TCGMemOpIdx oi, uintptr_t retaddr);
-int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
-                             TCGMemOpIdx oi, uintptr_t retaddr);
-uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                             TCGMemOpIdx oi, uintptr_t retaddr);
-int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
-                             TCGMemOpIdx oi, uintptr_t retaddr);
-uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr);
-
 /* Temporary aliases until backends are converted.  */
 #ifdef TARGET_WORDS_BIGENDIAN
 # define helper_ret_ldsw_mmu  helper_be_ldsw_mmu
@@ -XXX,XX +XXX,XX @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
 # define helper_ret_stw_mmu   helper_be_stw_mmu
 # define helper_ret_stl_mmu   helper_be_stl_mmu
 # define helper_ret_stq_mmu   helper_be_stq_mmu
-# define helper_ret_lduw_cmmu  helper_be_lduw_cmmu
-# define helper_ret_ldsw_cmmu  helper_be_ldsw_cmmu
-# define helper_ret_ldl_cmmu  helper_be_ldl_cmmu
-# define helper_ret_ldq_cmmu  helper_be_ldq_cmmu
 #else
 # define helper_ret_ldsw_mmu  helper_le_ldsw_mmu
 # define helper_ret_lduw_mmu  helper_le_lduw_mmu
@@ -XXX,XX +XXX,XX @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
 # define helper_ret_stw_mmu   helper_le_stw_mmu
 # define helper_ret_stl_mmu   helper_le_stl_mmu
 # define helper_ret_stq_mmu   helper_le_stq_mmu
-# define helper_ret_lduw_cmmu  helper_le_lduw_cmmu
-# define helper_ret_ldsw_cmmu  helper_le_ldsw_cmmu
-# define helper_ret_ldl_cmmu  helper_le_ldl_cmmu
-# define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
 #endif
 
 uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
 
 /* Code access functions.  */
 
-static uint64_t full_ldub_cmmu(CPUArchState *env, target_ulong addr,
+static uint64_t full_ldub_code(CPUArchState *env, target_ulong addr,
                                TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_cmmu);
+    return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_code);
 }
 
-uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr)
 {
-    return full_ldub_cmmu(env, addr, oi, retaddr);
+    TCGMemOpIdx oi = make_memop_idx(MO_UB, cpu_mmu_index(env, true));
+    return full_ldub_code(env, addr, oi, 0);
 }
 
-int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_lduw_code(CPUArchState *env, target_ulong addr,
+                               TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return (int8_t) full_ldub_cmmu(env, addr, oi, retaddr);
+    return load_helper(env, addr, oi, retaddr, MO_TEUW, true, full_lduw_code);
 }
 
-static uint64_t full_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                                  TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr)
 {
-    return load_helper(env, addr, oi, retaddr, MO_LEUW, true,
-                       full_le_lduw_cmmu);
+    TCGMemOpIdx oi = make_memop_idx(MO_TEUW, cpu_mmu_index(env, true));
+    return full_lduw_code(env, addr, oi, 0);
 }
 
-uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_ldl_code(CPUArchState *env, target_ulong addr,
+                              TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return full_le_lduw_cmmu(env, addr, oi, retaddr);
+    return load_helper(env, addr, oi, retaddr, MO_TEUL, true, full_ldl_code);
 }
 
-int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr)
 {
-    return (int16_t) full_le_lduw_cmmu(env, addr, oi, retaddr);
+    TCGMemOpIdx oi = make_memop_idx(MO_TEUL, cpu_mmu_index(env, true));
+    return full_ldl_code(env, addr, oi, 0);
 }
 
-static uint64_t full_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                                  TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_ldq_code(CPUArchState *env, target_ulong addr,
+                              TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return load_helper(env, addr, oi, retaddr, MO_BEUW, true,
-                       full_be_lduw_cmmu);
+    return load_helper(env, addr, oi, retaddr, MO_TEQ, true, full_ldq_code);
 }
 
-uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
 {
-    return full_be_lduw_cmmu(env, addr, oi, retaddr);
-}
-
-int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return (int16_t) full_be_lduw_cmmu(env, addr, oi, retaddr);
-}
-
-static uint64_t full_le_ldul_cmmu(CPUArchState *env, target_ulong addr,
-                                  TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return load_helper(env, addr, oi, retaddr, MO_LEUL, true,
-                       full_le_ldul_cmmu);
-}
-
-uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return full_le_ldul_cmmu(env, addr, oi, retaddr);
-}
-
-static uint64_t full_be_ldul_cmmu(CPUArchState *env, target_ulong addr,
-                                  TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return load_helper(env, addr, oi, retaddr, MO_BEUL, true,
-                       full_be_ldul_cmmu);
-}
-
-uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return full_be_ldul_cmmu(env, addr, oi, retaddr);
-}
-
-uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return load_helper(env, addr, oi, retaddr, MO_LEQ, true,
-                       helper_le_ldq_cmmu);
-}
-
-uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
-                            TCGMemOpIdx oi, uintptr_t retaddr)
-{
-    return load_helper(env, addr, oi, retaddr, MO_BEQ, true,
-                       helper_be_ldq_cmmu);
+    TCGMemOpIdx oi = make_memop_idx(MO_TEQ, cpu_mmu_index(env, true));
+    return full_ldq_code(env, addr, oi, 0);
 }
diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -XXX,XX +XXX,XX @@ more in line with the other memory access functions.
 
 load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
 
-load (code): ``helper_{endian}_ld{sign}{size}_cmmu(env, addr, opindex, retaddr)``
-
 store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
 
 ``sign``
@@ -XXX,XX +XXX,XX @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
  - ``ret`` : target endianness
 
 Regexes for git grep
- - ``\<helper_$le\|be\|ret$_ld[us]\?[bwlq]_c\?mmu\>``
+ - ``\<helper_$le\|be\|ret$_ld[us]\?[bwlq]_mmu\>``
  - ``\<helper_$le\|be\|ret$_st[bwlq]_mmu\>``
 
 ``address_space_*``
-- 
2.20.1

This finishes the new interface began with the previous patch.
Document the interface and deprecate MMU_MODE<N>_SUFFIX.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h     |  80 +++++++++++++-
 docs/devel/loads-stores.rst | 211 ++++++++++++++++++++++++++----------
 2 files changed, 230 insertions(+), 61 deletions(-)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@
  *
  * The syntax for the accessors is:
  *
- * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
+ * load:  cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
+ *        cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)
+ *        cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmu_idx, retaddr)
  *
- * store: cpu_st{sign}{size}_{mmusuffix}(env, ptr, val)
+ * store: cpu_st{size}_{mmusuffix}(env, ptr, val)
+ *        cpu_st{size}_{mmusuffix}_ra(env, ptr, val, retaddr)
+ *        cpu_st{size}_mmuidx_ra(env, ptr, val, mmu_idx, retaddr)
  *
  * sign is:
  * (empty): for 32 and 64 bit sizes
@@ -XXX,XX +XXX,XX @@
  *   l: 32 bits
  *   q: 64 bits
  *
- * mmusuffix is one of the generic suffixes "data" or "code", or
- * (for softmmu configs)  a target-specific MMU mode suffix as defined
- * in target cpu.h.
+ * mmusuffix is one of the generic suffixes "data" or "code", or "mmuidx".
+ * The "mmuidx" suffix carries an extra mmu_idx argument that specifies
+ * the index to use; the "data" and "code" suffixes take the index from
+ * cpu_mmu_index().
  */
 #ifndef CPU_LDST_H
 #define CPU_LDST_H
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
 #undef MEMSUFFIX
 #undef CODE_ACCESS
 
+/*
+ * Provide the same *_mmuidx_ra interface as for softmmu.
+ * The mmu_idx argument is ignored.
+ */
+
+static inline uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                          int mmu_idx, uintptr_t ra)
+{
+    return cpu_ldub_data_ra(env, addr, ra);
+}
+
+static inline uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                          int mmu_idx, uintptr_t ra)
+{
+    return cpu_lduw_data_ra(env, addr, ra);
+}
+
+static inline uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                         int mmu_idx, uintptr_t ra)
+{
+    return cpu_ldl_data_ra(env, addr, ra);
+}
+
+static inline uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                         int mmu_idx, uintptr_t ra)
+{
+    return cpu_ldq_data_ra(env, addr, ra);
+}
+
+static inline int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     int mmu_idx, uintptr_t ra)
+{
+    return cpu_ldsb_data_ra(env, addr, ra);
+}
+
+static inline int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     int mmu_idx, uintptr_t ra)
+{
+    return cpu_ldsw_data_ra(env, addr, ra);
+}
+
+static inline void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     uint32_t val, int mmu_idx, uintptr_t ra)
+{
+    cpu_stb_data_ra(env, addr, val, ra);
+}
+
+static inline void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     uint32_t val, int mmu_idx, uintptr_t ra)
+{
+    cpu_stw_data_ra(env, addr, val, ra);
+}
+
+static inline void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     uint32_t val, int mmu_idx, uintptr_t ra)
+{
+    cpu_stl_data_ra(env, addr, val, ra);
+}
+
+static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+                                     uint64_t val, int mmu_idx, uintptr_t ra)
+{
+    cpu_stq_data_ra(env, addr, val, ra);
+}
+
 #else
 
 /* Needed for TCG_OVERSIZED_GUEST */
diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -XXX,XX +XXX,XX @@ Regexes for git grep
  - ``\<ldn_$[hbl]e$?_p\>``
  - ``\<stn_$[hbl]e$?_p\>``
 
-``cpu_{ld,st}_*``
-~~~~~~~~~~~~~~~~~
+``cpu_{ld,st}*_mmuidx_ra``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-These functions operate on a guest virtual address. Be aware
-that these functions may cause a guest CPU exception to be
-taken (e.g. for an alignment fault or MMU fault) which will
-result in guest CPU state being updated and control longjumping
-out of the function call. They should therefore only be used
-in code that is implementing emulation of the target CPU.
+These functions operate on a guest virtual address plus a context,
+known as a "mmu index" or ``mmuidx``, which controls how that virtual
+address is translated.  The meaning of the indexes are target specific,
+but specifying a particular index might be necessary if, for instance,
+the helper requires an "always as non-privileged" access rather that
+the default access for the current state of the guest CPU.
 
-These functions may throw an exception (longjmp() back out
-to the top level TCG loop). This means they must only be used
-from helper functions where the translator has saved all
-necessary CPU state before generating the helper function call.
-It's usually better to use the ``_ra`` variants described below
-from helper functions, but these functions are the right choice
-for calls made from hooks like the CPU do_interrupt hook or
-when you know for certain that the translator had to save all
-the CPU state that ``cpu_restore_state()`` would restore anyway.
+These functions may cause a guest CPU exception to be taken
+(e.g. for an alignment fault or MMU fault) which will result in
+guest CPU state being updated and control longjmp'ing out of the
+function call.  They should therefore only be used in code that is
+implementing emulation of the guest CPU.
+
+The ``retaddr`` parameter is used to control unwinding of the
+guest CPU state in case of a guest CPU exception.  This is passed
+to ``cpu_restore_state()``.  Therefore the value should either be 0,
+to indicate that the guest CPU state is already synchronized, or
+the result of ``GETPC()`` from the top level ``HELPER(foo)``
+function, which is a return address into the generated code.
 
 Function names follow the pattern:
 
-load: ``cpu_ld{sign}{size}_{mmusuffix}(env, ptr)``
+load: ``cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmuidx, retaddr)``
 
-store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
+store: ``cpu_st{size}_mmuidx_ra(env, ptr, val, mmuidx, retaddr)``
 
 ``sign``
  - (empty) : for 32 or 64 bit sizes
@@ -XXX,XX +XXX,XX @@ store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
  - ``l`` : 32 bits
  - ``q`` : 64 bits
 
-``mmusuffix`` is one of the generic suffixes ``data`` or ``code``, or
-(for softmmu configs) a target-specific MMU mode suffix as defined
-in the target's ``cpu.h``.
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_mmuidx_ra\>``
+ - ``\<cpu_st[bwlq]_mmuidx_ra\>``
 
-Regexes for git grep
- - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+\>``
- - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+\>``
+``cpu_{ld,st}*_data_ra``
+~~~~~~~~~~~~~~~~~~~~~~~~
 
-``cpu_{ld,st}_*_ra``
-~~~~~~~~~~~~~~~~~~~~
-
-These functions work like the ``cpu_{ld,st}_*`` functions except
-that they also take a ``retaddr`` argument. This extra argument
-allows for correct unwinding of any exception that is taken,
-and should generally be the result of GETPC() called directly
-from the top level HELPER(foo) function (i.e. the return address
-in the generated code).
+These functions work like the ``cpu_{ld,st}_mmuidx_ra`` functions
+except that the ``mmuidx`` parameter is taken from the current mode
+of the guest CPU, as determined by ``cpu_mmu_index(env, false)``.
 
 These are generally the preferred way to do accesses by guest
-virtual address from helper functions; see the documentation
-of the non-``_ra`` variants for when those would be better.
-
-Calling these functions with a ``retaddr`` argument of 0 is
-equivalent to calling the non-``_ra`` version of the function.
+virtual address from helper functions, unless the access should
+be performed with a context other than the default.
 
 Function names follow the pattern:
 
-load: ``cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)``
+load: ``cpu_ld{sign}{size}_data_ra(env, ptr, ra)``
 
-store: ``cpu_st{sign}{size}_{mmusuffix}_ra(env, ptr, val, retaddr)``
+store: ``cpu_st{size}_data_ra(env, ptr, val, ra)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
+
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_data_ra\>``
+ - ``\<cpu_st[bwlq]_data_ra\>``
+
+``cpu_{ld,st}*_data``
+~~~~~~~~~~~~~~~~~~~~~
+
+These functions work like the ``cpu_{ld,st}_data_ra`` functions
+except that the ``retaddr`` parameter is 0, and thus does not
+unwind guest CPU state.
+
+This means they must only be used from helper functions where the
+translator has saved all necessary CPU state.  These functions are
+the right choice for calls made from hooks like the CPU ``do_interrupt``
+hook or when you know for certain that the translator had to save all
+the CPU state anyway.
+
+Function names follow the pattern:
+
+load: ``cpu_ld{sign}{size}_data(env, ptr)``
+
+store: ``cpu_st{size}_data(env, ptr, val)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
 
 Regexes for git grep
- - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+_ra\>``
- - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+_ra\>``
+ - ``\<cpu_ld[us]\?[bwlq]_data\>``
+ - ``\<cpu_st[bwlq]_data\+\>``
 
-``helper_*_{ld,st}*mmu``
-~~~~~~~~~~~~~~~~~~~~~~~~
+``cpu_ld*_code``
+~~~~~~~~~~~~~~~~
+
+These functions perform a read for instruction execution.  The ``mmuidx``
+parameter is taken from the current mode of the guest CPU, as determined
+by ``cpu_mmu_index(env, true)``.  The ``retaddr`` parameter is 0, and
+thus does not unwind guest CPU state, because CPU state is always
+synchronized while translating instructions.  Any guest CPU exception
+that is raised will indicate an instruction execution fault rather than
+a data read fault.
+
+In general these functions should not be used directly during translation.
+There are wrapper functions that are to be used which also take care of
+plugins for tracing.
+
+Function names follow the pattern:
+
+load: ``cpu_ld{sign}{size}_code(env, ptr)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
+
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_code\>``
+
+``translator_ld*``
+~~~~~~~~~~~~~~~~~~
+
+These functions are a wrapper for ``cpu_ld*_code`` which also perform
+any actions required by any tracing plugins.  They are only to be
+called during the translator callback ``translate_insn``.
+
+There is a set of functions ending in ``_swap`` which, if the parameter
+is true, returns the value in the endianness that is the reverse of
+the guest native endianness, as determined by ``TARGET_WORDS_BIGENDIAN``.
+
+Function names follow the pattern:
+
+load: ``translator_ld{sign}{size}(env, ptr)``
+
+swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
+
+Regexes for git grep
+ - ``\<translator_ld[us]\?[bwlq]$_swap$\?\>``
+
+``helper_*_{ld,st}*_mmu``
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
 These functions are intended primarily to be called by the code
 generated by the TCG backend. They may also be called by target
-CPU helper function code. Like the ``cpu_{ld,st}_*_ra`` functions
-they perform accesses by guest virtual address; the difference is
-that these functions allow you to specify an ``opindex`` parameter
-which encodes (among other things) the mmu index to use for the
-access. This is necessary if your helper needs to make an access
-via a specific mmu index (for instance, an "always as non-privileged"
-access) rather than using the default mmu index for the current state
-of the guest CPU.
+CPU helper function code. Like the ``cpu_{ld,st}_mmuidx_ra`` functions
+they perform accesses by guest virtual address, with a given ``mmuidx``.
 
-The ``opindex`` parameter should be created by calling ``make_memop_idx()``.
+These functions specify an ``opindex`` parameter which encodes
+(among other things) the mmu index to use for the access.  This parameter
+should be created by calling ``make_memop_idx()``.
 
 The ``retaddr`` parameter should be the result of GETPC() called directly
 from the top level HELPER(foo) function (or 0 if no guest CPU state
@@ -XXX,XX +XXX,XX @@ unwinding is required).
 
 **TODO** The names of these functions are a bit odd for historical
 reasons because they were originally expected to be called only from
-within generated code. We should rename them to bring them
-more in line with the other memory access functions.
+within generated code. We should rename them to bring them more in
+line with the other memory access functions. The explicit endianness
+is the only feature they have beyond ``*_mmuidx_ra``.
 
 load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
 
-- 
2.20.1

Do not use exec/cpu_ldst_{,useronly_}template.h directly,
but instead use the functional interface.

Cc: Eduardo Habkost <ehabkost@redhat.com>
Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/seg_helper.c | 56 ++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/seg_helper.c
+++ b/target/i386/seg_helper.c
@@ -XXX,XX +XXX,XX @@
 # define LOG_PCALL_STATE(cpu) do { } while (0)
 #endif
 
-#ifdef CONFIG_USER_ONLY
-#define MEMSUFFIX _kernel
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
+/*
+ * TODO: Convert callers to compute cpu_mmu_index_kernel once
+ * and use *_mmuidx_ra directly.
+ */
+#define cpu_ldub_kernel_ra(e, p, r) \
+    cpu_ldub_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_lduw_kernel_ra(e, p, r) \
+    cpu_lduw_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_ldl_kernel_ra(e, p, r) \
+    cpu_ldl_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_ldq_kernel_ra(e, p, r) \
+    cpu_ldq_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
 
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
+#define cpu_stb_kernel_ra(e, p, v, r) \
+    cpu_stb_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stw_kernel_ra(e, p, v, r) \
+    cpu_stw_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stl_kernel_ra(e, p, v, r) \
+    cpu_stl_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stq_kernel_ra(e, p, v, r) \
+    cpu_stq_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
 
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
+#define cpu_ldub_kernel(e, p)    cpu_ldub_kernel_ra(e, p, 0)
+#define cpu_lduw_kernel(e, p)    cpu_lduw_kernel_ra(e, p, 0)
+#define cpu_ldl_kernel(e, p)     cpu_ldl_kernel_ra(e, p, 0)
+#define cpu_ldq_kernel(e, p)     cpu_ldq_kernel_ra(e, p, 0)
 
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
-#else
-#define CPU_MMU_INDEX (cpu_mmu_index_kernel(env))
-#define MEMSUFFIX _kernel
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
+#define cpu_stb_kernel(e, p, v)  cpu_stb_kernel_ra(e, p, v, 0)
+#define cpu_stw_kernel(e, p, v)  cpu_stw_kernel_ra(e, p, v, 0)
+#define cpu_stl_kernel(e, p, v)  cpu_stl_kernel_ra(e, p, v, 0)
+#define cpu_stq_kernel(e, p, v)  cpu_stq_kernel_ra(e, p, v, 0)
 
 /* return non zero if error */
 static inline int load_segment_ra(CPUX86State *env, uint32_t *e1_ptr,
-- 
2.20.1

With the tracing hooks, the inline functions are no longer
so simple.  Reduce the amount of preprocessor obfuscation
by expanding the text of each of the functions generated.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h                   |  54 +++--
 include/exec/cpu_ldst_useronly_template.h | 159 ---------------
 accel/tcg/user-exec.c                     | 236 ++++++++++++++++++++++
 3 files changed, 262 insertions(+), 187 deletions(-)
 delete mode 100644 include/exec/cpu_ldst_useronly_template.h

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
 
 /* In user-only mode we provide only the _code and _data accessors. */
 
-#define MEMSUFFIX _data
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
+uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr);
+uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr);
+uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr);
+uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr);
+int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr);
+int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr);
 
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
+uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
 
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
+void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val);
 
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
-
-#define MEMSUFFIX _code
-#define CODE_ACCESS
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
-#undef CODE_ACCESS
+void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr);
+void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr);
+void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr);
+void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint64_t val, uintptr_t retaddr);
 
 /*
  * Provide the same *_mmuidx_ra interface as for softmmu.
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
 #undef CPU_MMU_INDEX
 #undef MEMSUFFIX
 
+#endif /* defined(CONFIG_USER_ONLY) */
+
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
@@ -XXX,XX +XXX,XX @@ static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr)
     return (int16_t)cpu_lduw_code(env, addr);
 }
 
-#endif /* defined(CONFIG_USER_ONLY) */
-
 /**
  * tlb_vaddr_to_host:
  * @env: CPUArchState
diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/include/exec/cpu_ldst_useronly_template.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/*
- *  User-only accessor function support
- *
- * Generate inline load/store functions for one data size.
- *
- * Generate a store function as well as signed and unsigned loads.
- *
- * Not used directly but included from cpu_ldst.h.
- *
- *  Copyright (c) 2015 Linaro Limited
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#if !defined(CODE_ACCESS)
-#include "trace-root.h"
-#endif
-
-#include "trace/mem.h"
-
-#if DATA_SIZE == 8
-#define SUFFIX q
-#define USUFFIX q
-#define DATA_TYPE uint64_t
-#define SHIFT 3
-#elif DATA_SIZE == 4
-#define SUFFIX l
-#define USUFFIX l
-#define DATA_TYPE uint32_t
-#define SHIFT 2
-#elif DATA_SIZE == 2
-#define SUFFIX w
-#define USUFFIX uw
-#define DATA_TYPE uint16_t
-#define DATA_STYPE int16_t
-#define SHIFT 1
-#elif DATA_SIZE == 1
-#define SUFFIX b
-#define USUFFIX ub
-#define DATA_TYPE uint8_t
-#define DATA_STYPE int8_t
-#define SHIFT 0
-#else
-#error unsupported data size
-#endif
-
-#if DATA_SIZE == 8
-#define RES_TYPE uint64_t
-#else
-#define RES_TYPE uint32_t
-#endif
-
-static inline RES_TYPE
-glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
-{
-    RES_TYPE ret;
-#ifdef CODE_ACCESS
-    set_helper_retaddr(1);
-    ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
-    clear_helper_retaddr();
-#else
-    MemOp op = MO_TE | SHIFT;
-    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-    ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
-#endif
-    return ret;
-}
-
-#ifndef CODE_ACCESS
-static inline RES_TYPE
-glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                  abi_ptr ptr,
-                                                  uintptr_t retaddr)
-{
-    RES_TYPE ret;
-    set_helper_retaddr(retaddr);
-    ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
-    clear_helper_retaddr();
-    return ret;
-}
-#endif
-
-#if DATA_SIZE <= 2
-static inline int
-glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
-{
-    int ret;
-#ifdef CODE_ACCESS
-    set_helper_retaddr(1);
-    ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
-    clear_helper_retaddr();
-#else
-    MemOp op = MO_TE | MO_SIGN | SHIFT;
-    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, false);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-    ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
-    return ret;
-}
-
-#ifndef CODE_ACCESS
-static inline int
-glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                  abi_ptr ptr,
-                                                  uintptr_t retaddr)
-{
-    int ret;
-    set_helper_retaddr(retaddr);
-    ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
-    clear_helper_retaddr();
-    return ret;
-}
-#endif /* CODE_ACCESS */
-#endif /* DATA_SIZE <= 2 */
-
-#ifndef CODE_ACCESS
-static inline void
-glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr,
-                                      RES_TYPE v)
-{
-    MemOp op = MO_TE | SHIFT;
-    uint16_t meminfo = trace_mem_get_info(op, MMU_USER_IDX, true);
-    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-    glue(glue(st, SUFFIX), _p)(g2h(ptr), v);
-    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-}
-
-static inline void
-glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                  abi_ptr ptr,
-                                                  RES_TYPE v,
-                                                  uintptr_t retaddr)
-{
-    set_helper_retaddr(retaddr);
-    glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v);
-    clear_helper_retaddr();
-}
-#endif
-
-#undef RES_TYPE
-#undef DATA_TYPE
-#undef DATA_STYPE
-#undef SUFFIX
-#undef USUFFIX
-#undef DATA_SIZE
-#undef SHIFT
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@
 #include "translate-all.h"
 #include "exec/helper-proto.h"
 #include "qemu/atomic128.h"
+#include "trace-root.h"
+#include "trace/mem.h"
 
 #undef EAX
 #undef ECX
@@ -XXX,XX +XXX,XX @@ int cpu_signal_handler(int host_signum, void *pinfo,
 
 /* The softmmu versions of these helpers are in cputlb.c.  */
 
+uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+    uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = ldub_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr)
+{
+    int ret;
+    uint16_t meminfo = trace_mem_get_info(MO_SB, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = ldsb_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+    uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = lduw_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr)
+{
+    int ret;
+    uint16_t meminfo = trace_mem_get_info(MO_TESW, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = ldsw_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+    uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = ldl_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr)
+{
+    uint64_t ret;
+    uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, false);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    ret = ldq_p(g2h(ptr));
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+    return ret;
+}
+
+uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_ldub_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    int ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_ldsb_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_lduw_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    int ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_ldsw_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_ldl_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+    uint64_t ret;
+
+    set_helper_retaddr(retaddr);
+    ret = cpu_ldq_data(env, ptr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+    uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, true);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    stb_p(g2h(ptr), val);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+    uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, true);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    stw_p(g2h(ptr), val);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+    uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, true);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    stl_p(g2h(ptr), val);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val)
+{
+    uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, true);
+
+    trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+    stq_p(g2h(ptr), val);
+    qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    set_helper_retaddr(retaddr);
+    cpu_stb_data(env, ptr, val);
+    clear_helper_retaddr();
+}
+
+void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    set_helper_retaddr(retaddr);
+    cpu_stw_data(env, ptr, val);
+    clear_helper_retaddr();
+}
+
+void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    set_helper_retaddr(retaddr);
+    cpu_stl_data(env, ptr, val);
+    clear_helper_retaddr();
+}
+
+void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
+                     uint64_t val, uintptr_t retaddr)
+{
+    set_helper_retaddr(retaddr);
+    cpu_stq_data(env, ptr, val);
+    clear_helper_retaddr();
+}
+
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(1);
+    ret = ldub_p(g2h(ptr));
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(1);
+    ret = lduw_p(g2h(ptr));
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr ptr)
+{
+    uint32_t ret;
+
+    set_helper_retaddr(1);
+    ret = ldl_p(g2h(ptr));
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
+{
+    uint64_t ret;
+
+    set_helper_retaddr(1);
+    ret = ldq_p(g2h(ptr));
+    clear_helper_retaddr();
+    return ret;
+}
+
 /* Do not allow unaligned operations to proceed.  Return the host address.  */
 static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
                                int size, uintptr_t retaddr)
-- 
2.20.1

The generated *_user functions are unused.  The *_kernel functions
have a couple of users in op_helper.c; use *_mmuidx_ra instead,
with MMU_KERNEL_IDX.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Use *_mmuidx_ra directly, without intermediate macros.
---
 target/m68k/cpu.h       |  2 --
 target/m68k/op_helper.c | 77 +++++++++++++++++++++++++----------------
 2 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.h
+++ b/target/m68k/cpu.h
@@ -XXX,XX +XXX,XX @@ enum {
 #define cpu_list m68k_cpu_list
 
 /* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
 #define MMU_KERNEL_IDX 0
 #define MMU_USER_IDX 1
 static inline int cpu_mmu_index (CPUM68KState *env, bool ifetch)
diff --git a/target/m68k/op_helper.c b/target/m68k/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/op_helper.c
+++ b/target/m68k/op_helper.c
@@ -XXX,XX +XXX,XX @@ static void cf_rte(CPUM68KState *env)
     uint32_t fmt;
 
     sp = env->aregs[7];
-    fmt = cpu_ldl_kernel(env, sp);
-    env->pc = cpu_ldl_kernel(env, sp + 4);
+    fmt = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
+    env->pc = cpu_ldl_mmuidx_ra(env, sp + 4, MMU_KERNEL_IDX, 0);
     sp |= (fmt >> 28) & 3;
     env->aregs[7] = sp + 8;
 
@@ -XXX,XX +XXX,XX @@ static void m68k_rte(CPUM68KState *env)
 
     sp = env->aregs[7];
 throwaway:
-    sr = cpu_lduw_kernel(env, sp);
+    sr = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
     sp += 2;
-    env->pc = cpu_ldl_kernel(env, sp);
+    env->pc = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
     sp += 4;
     if (m68k_feature(env, M68K_FEATURE_QUAD_MULDIV)) {
         /*  all except 68000 */
-        fmt = cpu_lduw_kernel(env, sp);
+        fmt = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
         sp += 2;
         switch (fmt >> 12) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static void cf_interrupt_all(CPUM68KState *env, int is_hw)
     /* ??? This could cause MMU faults.  */
     sp &= ~3;
     sp -= 4;
-    cpu_stl_kernel(env, sp, retaddr);
+    cpu_stl_mmuidx_ra(env, sp, retaddr, MMU_KERNEL_IDX, 0);
     sp -= 4;
-    cpu_stl_kernel(env, sp, fmt);
+    cpu_stl_mmuidx_ra(env, sp, fmt, MMU_KERNEL_IDX, 0);
     env->aregs[7] = sp;
     /* Jump to vector.  */
-    env->pc = cpu_ldl_kernel(env, env->vbr + vector);
+    env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
 }
 
 static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
@@ -XXX,XX +XXX,XX @@ static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
         switch (format) {
         case 4:
             *sp -= 4;
-            cpu_stl_kernel(env, *sp, env->pc);
+            cpu_stl_mmuidx_ra(env, *sp, env->pc, MMU_KERNEL_IDX, 0);
             *sp -= 4;
-            cpu_stl_kernel(env, *sp, addr);
+            cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
             break;
         case 3:
         case 2:
             *sp -= 4;
-            cpu_stl_kernel(env, *sp, addr);
+            cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
             break;
         }
         *sp -= 2;
-        cpu_stw_kernel(env, *sp, (format << 12) + (cs->exception_index << 2));
+        cpu_stw_mmuidx_ra(env, *sp, (format << 12) + (cs->exception_index << 2),
+                          MMU_KERNEL_IDX, 0);
     }
     *sp -= 4;
-    cpu_stl_kernel(env, *sp, retaddr);
+    cpu_stl_mmuidx_ra(env, *sp, retaddr, MMU_KERNEL_IDX, 0);
     *sp -= 2;
-    cpu_stw_kernel(env, *sp, sr);
+    cpu_stw_mmuidx_ra(env, *sp, sr, MMU_KERNEL_IDX, 0);
 }
 
 static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
@@ -XXX,XX +XXX,XX @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
             cpu_abort(cs, "DOUBLE MMU FAULT\n");
         }
         env->mmu.fault = true;
+        /* push data 3 */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* push data 3 */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* push data 2 */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* push data 2 */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* push data 1 */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* push data 1 */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 1 / push data 0 */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* write back 1 / push data 0 */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 1 address */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* write back 1 address */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 2 data */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* write back 2 data */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 2 address */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* write back 2 address */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 3 data */
         sp -= 4;
-        cpu_stl_kernel(env, sp, 0); /* write back 3 data */
+        cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 3 address */
         sp -= 4;
-        cpu_stl_kernel(env, sp, env->mmu.ar); /* write back 3 address */
+        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+        /* fault address */
         sp -= 4;
-        cpu_stl_kernel(env, sp, env->mmu.ar); /* fault address */
+        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+        /* write back 1 status */
         sp -= 2;
-        cpu_stw_kernel(env, sp, 0); /* write back 1 status */
+        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 2 status */
         sp -= 2;
-        cpu_stw_kernel(env, sp, 0); /* write back 2 status */
+        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* write back 3 status */
         sp -= 2;
-        cpu_stw_kernel(env, sp, 0); /* write back 3 status */
+        cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+        /* special status word */
         sp -= 2;
-        cpu_stw_kernel(env, sp, env->mmu.ssw); /* special status word */
+        cpu_stw_mmuidx_ra(env, sp, env->mmu.ssw, MMU_KERNEL_IDX, 0);
+        /* effective address */
         sp -= 4;
-        cpu_stl_kernel(env, sp, env->mmu.ar); /* effective address */
+        cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+
         do_stack_frame(env, &sp, 7, oldsr, 0, retaddr);
         env->mmu.fault = false;
         if (qemu_loglevel_mask(CPU_LOG_INT)) {
@@ -XXX,XX +XXX,XX @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
 
     env->aregs[7] = sp;
     /* Jump to vector.  */
-    env->pc = cpu_ldl_kernel(env, env->vbr + vector);
+    env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
 }
 
 static void do_interrupt_all(CPUM68KState *env, int is_hw)
-- 
2.20.1

The separate suffixed functions were used to construct
some do_##insn function switched on mmu_idx.  The interface
is exactly identical to the *_mmuidx_ra functions.  Replace
them directly and remove the constructions.

Cc: Aurelien Jarno <aurelien@aurel32.net>
Cc: Aleksandar Rikalo <aleksandar.rikalo@rt-rk.com>
Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/mips/cpu.h       |   4 -
 target/mips/op_helper.c | 182 +++++++++++++---------------------------
 2 files changed, 60 insertions(+), 126 deletions(-)

diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -XXX,XX +XXX,XX @@ extern uint32_t cpu_rddsp(uint32_t mask_num, CPUMIPSState *env);
  * MMU modes definitions. We carefully match the indices with our
  * hflags layout.
  */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _super
-#define MMU_MODE2_SUFFIX _user
-#define MMU_MODE3_SUFFIX _error
 #define MMU_USER_IDX 2
 
 static inline int hflags_mmu_index(uint32_t hflags)
diff --git a/target/mips/op_helper.c b/target/mips/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/op_helper.c
+++ b/target/mips/op_helper.c
@@ -XXX,XX +XXX,XX @@ static void raise_exception(CPUMIPSState *env, uint32_t exception)
     do_raise_exception(env, exception, 0);
 }
 
-#if defined(CONFIG_USER_ONLY)
-#define HELPER_LD(name, insn, type)                                     \
-static inline type do_##name(CPUMIPSState *env, target_ulong addr,      \
-                             int mem_idx, uintptr_t retaddr)            \
-{                                                                       \
-    return (type) cpu_##insn##_data_ra(env, addr, retaddr);             \
-}
-#else
-#define HELPER_LD(name, insn, type)                                     \
-static inline type do_##name(CPUMIPSState *env, target_ulong addr,      \
-                             int mem_idx, uintptr_t retaddr)            \
-{                                                                       \
-    switch (mem_idx) {                                                  \
-    case 0: return (type) cpu_##insn##_kernel_ra(env, addr, retaddr);   \
-    case 1: return (type) cpu_##insn##_super_ra(env, addr, retaddr);    \
-    default:                                                            \
-    case 2: return (type) cpu_##insn##_user_ra(env, addr, retaddr);     \
-    case 3: return (type) cpu_##insn##_error_ra(env, addr, retaddr);    \
-    }                                                                   \
-}
-#endif
-HELPER_LD(lw, ldl, int32_t)
-#if defined(TARGET_MIPS64)
-HELPER_LD(ld, ldq, int64_t)
-#endif
-#undef HELPER_LD
-
-#if defined(CONFIG_USER_ONLY)
-#define HELPER_ST(name, insn, type)                                     \
-static inline void do_##name(CPUMIPSState *env, target_ulong addr,      \
-                             type val, int mem_idx, uintptr_t retaddr)  \
-{                                                                       \
-    cpu_##insn##_data_ra(env, addr, val, retaddr);                      \
-}
-#else
-#define HELPER_ST(name, insn, type)                                     \
-static inline void do_##name(CPUMIPSState *env, target_ulong addr,      \
-                             type val, int mem_idx, uintptr_t retaddr)  \
-{                                                                       \
-    switch (mem_idx) {                                                  \
-    case 0:                                                             \
-        cpu_##insn##_kernel_ra(env, addr, val, retaddr);                \
-        break;                                                          \
-    case 1:                                                             \
-        cpu_##insn##_super_ra(env, addr, val, retaddr);                 \
-        break;                                                          \
-    default:                                                            \
-    case 2:                                                             \
-        cpu_##insn##_user_ra(env, addr, val, retaddr);                  \
-        break;                                                          \
-    case 3:                                                             \
-        cpu_##insn##_error_ra(env, addr, val, retaddr);                 \
-        break;                                                          \
-    }                                                                   \
-}
-#endif
-HELPER_ST(sb, stb, uint8_t)
-HELPER_ST(sw, stl, uint32_t)
-#if defined(TARGET_MIPS64)
-HELPER_ST(sd, stq, uint64_t)
-#endif
-#undef HELPER_ST
-
 /* 64 bits arithmetic for 32 bits hosts */
 static inline uint64_t get_HILO(CPUMIPSState *env)
 {
@@ -XXX,XX +XXX,XX @@ target_ulong helper_##name(CPUMIPSState *env, target_ulong arg, int mem_idx)  \
     }                                                                         \
     env->CP0_LLAddr = do_translate_address(env, arg, 0, GETPC());             \
     env->lladdr = arg;                                                        \
-    env->llval = do_##insn(env, arg, mem_idx, GETPC());                       \
+    env->llval = cpu_##insn##_mmuidx_ra(env, arg, mem_idx, GETPC());          \
     return env->llval;                                                        \
 }
-HELPER_LD_ATOMIC(ll, lw, 0x3)
+HELPER_LD_ATOMIC(ll, ldl, 0x3)
 #ifdef TARGET_MIPS64
-HELPER_LD_ATOMIC(lld, ld, 0x7)
+HELPER_LD_ATOMIC(lld, ldq, 0x7)
 #endif
 #undef HELPER_LD_ATOMIC
 #endif
@@ -XXX,XX +XXX,XX @@ HELPER_LD_ATOMIC(lld, ld, 0x7)
 void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
-    do_sb(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
 
     if (GET_LMASK(arg2) <= 2) {
-        do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK(arg2) <= 1) {
-        do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK(arg2) == 0) {
-        do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)arg1, mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)arg1,
+                          mem_idx, GETPC());
     }
 }
 
 void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
-    do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
 
     if (GET_LMASK(arg2) >= 1) {
-        do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK(arg2) >= 2) {
-        do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK(arg2) == 3) {
-        do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+                          mem_idx, GETPC());
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
 void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
-    do_sb(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
 
     if (GET_LMASK64(arg2) <= 6) {
-        do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 5) {
-        do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 4) {
-        do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 3) {
-        do_sb(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 2) {
-        do_sb(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 1) {
-        do_sb(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) <= 0) {
-        do_sb(env, GET_OFFSET(arg2, 7), (uint8_t)arg1, mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 7), (uint8_t)arg1,
+                          mem_idx, GETPC());
     }
 }
 
 void helper_sdr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
-    do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+    cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
 
     if (GET_LMASK64(arg2) >= 1) {
-        do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) >= 2) {
-        do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) >= 3) {
-        do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) >= 4) {
-        do_sb(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) >= 5) {
-        do_sb(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) >= 6) {
-        do_sb(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48),
+                          mem_idx, GETPC());
     }
 
     if (GET_LMASK64(arg2) == 7) {
-        do_sb(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56), mem_idx,
-              GETPC());
+        cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56),
+                          mem_idx, GETPC());
     }
 }
 #endif /* TARGET_MIPS64 */
@@ -XXX,XX +XXX,XX @@ void helper_lwm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
 
         for (i = 0; i < base_reglist; i++) {
             env->active_tc.gpr[multiple_regs[i]] =
-                (target_long)do_lw(env, addr, mem_idx, GETPC());
+                (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
             addr += 4;
         }
     }
 
     if (do_r31) {
-        env->active_tc.gpr[31] = (target_long)do_lw(env, addr, mem_idx,
-                                                    GETPC());
+        env->active_tc.gpr[31] =
+            (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void helper_swm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
         target_ulong i;
 
         for (i = 0; i < base_reglist; i++) {
-            do_sw(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
-                  GETPC());
+            cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
+                              mem_idx, GETPC());
             addr += 4;
         }
     }
 
     if (do_r31) {
-        do_sw(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
+        cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void helper_ldm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
         target_ulong i;
 
         for (i = 0; i < base_reglist; i++) {
-            env->active_tc.gpr[multiple_regs[i]] = do_ld(env, addr, mem_idx,
-                                                         GETPC());
+            env->active_tc.gpr[multiple_regs[i]] =
+                cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
             addr += 8;
         }
     }
 
     if (do_r31) {
-        env->active_tc.gpr[31] = do_ld(env, addr, mem_idx, GETPC());
+        env->active_tc.gpr[31] =
+            cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void helper_sdm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
         target_ulong i;
 
         for (i = 0; i < base_reglist; i++) {
-            do_sd(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
-                  GETPC());
+            cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
+                              mem_idx, GETPC());
             addr += 8;
         }
     }
 
     if (do_r31) {
-        do_sd(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
+        cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
     }
 }
 #endif
-- 
2.20.1

The generated functions aside from *_real are unused.
The *_real functions have a couple of users in mem_helper.c;
use *_mmuidx_ra instead, with MMU_REAL_IDX.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Use *_mmuidx_ra directly, without intermediate macros.
---
 target/s390x/cpu.h        |  5 -----
 target/s390x/mem_helper.c | 10 +++++-----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -XXX,XX +XXX,XX @@
 
 #define TARGET_INSN_START_EXTRA_WORDS 2
 
-#define MMU_MODE0_SUFFIX _primary
-#define MMU_MODE1_SUFFIX _secondary
-#define MMU_MODE2_SUFFIX _home
-#define MMU_MODE3_SUFFIX _real
-
 #define MMU_USER_IDX 0
 
 #define S390_MAX_CPUS 248
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/mem_helper.c
+++ b/target/s390x/mem_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(testblock)(CPUS390XState *env, uint64_t real_addr)
     real_addr = wrap_address(env, real_addr) & TARGET_PAGE_MASK;
 
     for (i = 0; i < TARGET_PAGE_SIZE; i += 8) {
-        cpu_stq_real_ra(env, real_addr + i, 0, ra);
+        cpu_stq_mmuidx_ra(env, real_addr + i, 0, MMU_REAL_IDX, ra);
     }
 
     return 0;
@@ -XXX,XX +XXX,XX @@ void HELPER(idte)(CPUS390XState *env, uint64_t r1, uint64_t r2, uint32_t m4)
         for (i = 0; i < entries; i++) {
             /* addresses are not wrapped in 24/31bit mode but table index is */
             raddr = table + ((index + i) & 0x7ff) * sizeof(entry);
-            entry = cpu_ldq_real_ra(env, raddr, ra);
+            entry = cpu_ldq_mmuidx_ra(env, raddr, MMU_REAL_IDX, ra);
             if (!(entry & REGION_ENTRY_I)) {
                 /* we are allowed to not store if already invalid */
                 entry |= REGION_ENTRY_I;
-                cpu_stq_real_ra(env, raddr, entry, ra);
+                cpu_stq_mmuidx_ra(env, raddr, entry, MMU_REAL_IDX, ra);
             }
         }
     }
@@ -XXX,XX +XXX,XX @@ void HELPER(ipte)(CPUS390XState *env, uint64_t pto, uint64_t vaddr,
     pte_addr += VADDR_PAGE_TX(vaddr) * 8;
 
     /* Mark the page table entry as invalid */
-    pte = cpu_ldq_real_ra(env, pte_addr, ra);
+    pte = cpu_ldq_mmuidx_ra(env, pte_addr, MMU_REAL_IDX, ra);
     pte |= PAGE_ENTRY_I;
-    cpu_stq_real_ra(env, pte_addr, pte, ra);
+    cpu_stq_mmuidx_ra(env, pte_addr, pte, MMU_REAL_IDX, ra);
 
     /* XXX we exploit the fact that Linux passes the exact virtual
        address here - it's not obliged to! */
-- 
2.20.1

There are only two uses.  Within dcbz_common, the local variable
mmu_idx already contains the epid computation, and we can avoid
repeating it for the store.  Within helper_icbiep, the usage is
trivially expanded using PPC_TLB_EPID_LOAD.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/cpu.h        |  2 --
 target/ppc/mem_helper.c | 11 ++---------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -XXX,XX +XXX,XX @@ struct ppc_radix_page_info {
  * + real/paged mode combinations. The other two modes are for
  * external PID load/store.
  */
-#define MMU_MODE8_SUFFIX _epl
-#define MMU_MODE9_SUFFIX _eps
 #define PPC_TLB_EPID_LOAD 8
 #define PPC_TLB_EPID_STORE 9
 
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -XXX,XX +XXX,XX @@ static void dcbz_common(CPUPPCState *env, target_ulong addr,
     } else {
         /* Slow path */
         for (i = 0; i < dcbz_size; i += 8) {
-            if (epid) {
-#if !defined(CONFIG_USER_ONLY)
-                /* Does not make sense on USER_ONLY config */
-                cpu_stq_eps_ra(env, addr + i, 0, retaddr);
-#endif
-            } else {
-                cpu_stq_data_ra(env, addr + i, 0, retaddr);
-            }
+            cpu_stq_mmuidx_ra(env, addr + i, 0, mmu_idx, retaddr);
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ void helper_icbiep(CPUPPCState *env, target_ulong addr)
 #if !defined(CONFIG_USER_ONLY)
     /* See comments above */
     addr &= ~(env->dcache_line_size - 1);
-    cpu_ldl_epl_ra(env, addr, GETPC());
+    cpu_ldl_mmuidx_ra(env, addr, PPC_TLB_EPID_LOAD, GETPC());
 #endif
 }
 
-- 
2.20.1

All users have now been converted to cpu_*_mmuidx_ra.

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
 void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
                        int mmu_idx, uintptr_t retaddr);
 
-#ifdef MMU_MODE0_SUFFIX
-#define CPU_MMU_INDEX 0
-#define MEMSUFFIX MMU_MODE0_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
-
-#if (NB_MMU_MODES >= 2) && defined(MMU_MODE1_SUFFIX)
-#define CPU_MMU_INDEX 1
-#define MEMSUFFIX MMU_MODE1_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
-
-#if (NB_MMU_MODES >= 3) && defined(MMU_MODE2_SUFFIX)
-
-#define CPU_MMU_INDEX 2
-#define MEMSUFFIX MMU_MODE2_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 3) */
-
-#if (NB_MMU_MODES >= 4) && defined(MMU_MODE3_SUFFIX)
-
-#define CPU_MMU_INDEX 3
-#define MEMSUFFIX MMU_MODE3_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 4) */
-
-#if (NB_MMU_MODES >= 5) && defined(MMU_MODE4_SUFFIX)
-
-#define CPU_MMU_INDEX 4
-#define MEMSUFFIX MMU_MODE4_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 5) */
-
-#if (NB_MMU_MODES >= 6) && defined(MMU_MODE5_SUFFIX)
-
-#define CPU_MMU_INDEX 5
-#define MEMSUFFIX MMU_MODE5_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 6) */
-
-#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX)
-
-#define CPU_MMU_INDEX 6
-#define MEMSUFFIX MMU_MODE6_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 7) */
-
-#if (NB_MMU_MODES >= 8) && defined(MMU_MODE7_SUFFIX)
-
-#define CPU_MMU_INDEX 7
-#define MEMSUFFIX MMU_MODE7_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 8) */
-
-#if (NB_MMU_MODES >= 9) && defined(MMU_MODE8_SUFFIX)
-
-#define CPU_MMU_INDEX 8
-#define MEMSUFFIX MMU_MODE8_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 9) */
-
-#if (NB_MMU_MODES >= 10) && defined(MMU_MODE9_SUFFIX)
-
-#define CPU_MMU_INDEX 9
-#define MEMSUFFIX MMU_MODE9_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 10) */
-
-#if (NB_MMU_MODES >= 11) && defined(MMU_MODE10_SUFFIX)
-
-#define CPU_MMU_INDEX 10
-#define MEMSUFFIX MMU_MODE10_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 11) */
-
-#if (NB_MMU_MODES >= 12) && defined(MMU_MODE11_SUFFIX)
-
-#define CPU_MMU_INDEX 11
-#define MEMSUFFIX MMU_MODE11_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 12) */
-
-#if (NB_MMU_MODES > 12)
-#error "NB_MMU_MODES > 12 is not supported for now"
-#endif /* (NB_MMU_MODES > 12) */
-
 /* these access are slower, they must be as rare as possible */
 #define CPU_MMU_INDEX (cpu_mmu_index(env, false))
 #define MEMSUFFIX _data
-- 
2.20.1

Reduce the amount of preprocessor obfuscation by expanding
the text of each of the functions generated.  The result is
only slightly smaller than the original.

Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h          |  67 +++++++-----------
 include/exec/cpu_ldst_template.h | 117 -------------------------------
 accel/tcg/cputlb.c               | 107 +++++++++++++++++++++++++++-
 3 files changed, 130 insertions(+), 161 deletions(-)
 delete mode 100644 include/exec/cpu_ldst_template.h

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ typedef target_ulong abi_ptr;
 #define TARGET_ABI_FMT_ptr TARGET_ABI_FMT_lx
 #endif
 
-#if defined(CONFIG_USER_ONLY)
-
-extern __thread uintptr_t helper_retaddr;
-
-static inline void set_helper_retaddr(uintptr_t ra)
-{
-    helper_retaddr = ra;
-    /*
-     * Ensure that this write is visible to the SIGSEGV handler that
-     * may be invoked due to a subsequent invalid memory operation.
-     */
-    signal_barrier();
-}
-
-static inline void clear_helper_retaddr(void)
-{
-    /*
-     * Ensure that previous memory operations have succeeded before
-     * removing the data visible to the signal handler.
-     */
-    signal_barrier();
-    helper_retaddr = 0;
-}
-
-/* In user-only mode we provide only the _code and _data accessors. */
-
 uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr);
 uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr);
 uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr);
@@ -XXX,XX +XXX,XX @@ void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
 void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
                      uint64_t val, uintptr_t retaddr);
 
+#if defined(CONFIG_USER_ONLY)
+
+extern __thread uintptr_t helper_retaddr;
+
+static inline void set_helper_retaddr(uintptr_t ra)
+{
+    helper_retaddr = ra;
+    /*
+     * Ensure that this write is visible to the SIGSEGV handler that
+     * may be invoked due to a subsequent invalid memory operation.
+     */
+    signal_barrier();
+}
+
+static inline void clear_helper_retaddr(void)
+{
+    /*
+     * Ensure that previous memory operations have succeeded before
+     * removing the data visible to the signal handler.
+     */
+    signal_barrier();
+    helper_retaddr = 0;
+}
+
 /*
  * Provide the same *_mmuidx_ra interface as for softmmu.
  * The mmu_idx argument is ignored.
@@ -XXX,XX +XXX,XX @@ void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
 void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
                        int mmu_idx, uintptr_t retaddr);
 
-/* these access are slower, they must be as rare as possible */
-#define CPU_MMU_INDEX (cpu_mmu_index(env, false))
-#define MEMSUFFIX _data
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-
 #endif /* defined(CONFIG_USER_ONLY) */
 
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/include/exec/cpu_ldst_template.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/*
- *  Software MMU support
- *
- * Generate inline load/store functions for one MMU mode and data
- * size.
- *
- * Generate a store function as well as signed and unsigned loads.
- *
- * Not used directly but included from cpu_ldst.h.
- *
- *  Copyright (c) 2003 Fabrice Bellard
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#if DATA_SIZE == 8
-#define SUFFIX q
-#define USUFFIX q
-#define DATA_TYPE uint64_t
-#define SHIFT 3
-#elif DATA_SIZE == 4
-#define SUFFIX l
-#define USUFFIX l
-#define DATA_TYPE uint32_t
-#define SHIFT 2
-#elif DATA_SIZE == 2
-#define SUFFIX w
-#define USUFFIX uw
-#define DATA_TYPE uint16_t
-#define DATA_STYPE int16_t
-#define SHIFT 1
-#elif DATA_SIZE == 1
-#define SUFFIX b
-#define USUFFIX ub
-#define DATA_TYPE uint8_t
-#define DATA_STYPE int8_t
-#define SHIFT 0
-#else
-#error unsupported data size
-#endif
-
-#if DATA_SIZE == 8
-#define RES_TYPE uint64_t
-#else
-#define RES_TYPE uint32_t
-#endif
-
-/* generic load/store macros */
-
-static inline RES_TYPE
-glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                  target_ulong ptr,
-                                                  uintptr_t retaddr)
-{
-    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
-                                                   retaddr);
-}
-
-static inline RES_TYPE
-glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
-{
-    return glue(glue(cpu_ld, USUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
-}
-
-#if DATA_SIZE <= 2
-static inline int
-glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                  target_ulong ptr,
-                                                  uintptr_t retaddr)
-{
-    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX,
-                                                   retaddr);
-}
-
-static inline int
-glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
-{
-    return glue(glue(cpu_lds, SUFFIX), _mmuidx_ra)(env, ptr, CPU_MMU_INDEX, 0);
-}
-#endif
-
-/* generic store macro */
-
-static inline void
-glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
-                                                 target_ulong ptr,
-                                                 RES_TYPE v, uintptr_t retaddr)
-{
-    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX,
-                                           retaddr);
-}
-
-static inline void
-glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
-                                      RES_TYPE v)
-{
-    glue(glue(cpu_st, SUFFIX), _mmuidx_ra)(env, ptr, v, CPU_MMU_INDEX, 0);
-}
-
-#undef RES_TYPE
-#undef DATA_TYPE
-#undef DATA_STYPE
-#undef SUFFIX
-#undef USUFFIX
-#undef DATA_SIZE
-#undef SHIFT
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic128.h"
 #include "translate-all.h"
 #include "trace-root.h"
-#include "qemu/plugin.h"
 #include "trace/mem.h"
 #ifdef CONFIG_PLUGIN
 #include "qemu/plugin-memory.h"
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
                            ? helper_le_ldq_mmu : helper_be_ldq_mmu);
 }
 
+uint32_t cpu_ldub_data_ra(CPUArchState *env, target_ulong ptr,
+                          uintptr_t retaddr)
+{
+    return cpu_ldub_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+int cpu_ldsb_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+    return cpu_ldsb_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_lduw_data_ra(CPUArchState *env, target_ulong ptr,
+                          uintptr_t retaddr)
+{
+    return cpu_lduw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+int cpu_ldsw_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+    return cpu_ldsw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_ldl_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+    return cpu_ldl_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint64_t cpu_ldq_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+    return cpu_ldq_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_ldub_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_ldub_data_ra(env, ptr, 0);
+}
+
+int cpu_ldsb_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_ldsb_data_ra(env, ptr, 0);
+}
+
+uint32_t cpu_lduw_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_lduw_data_ra(env, ptr, 0);
+}
+
+int cpu_ldsw_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_ldsw_data_ra(env, ptr, 0);
+}
+
+uint32_t cpu_ldl_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_ldl_data_ra(env, ptr, 0);
+}
+
+uint64_t cpu_ldq_data(CPUArchState *env, target_ulong ptr)
+{
+    return cpu_ldq_data_ra(env, ptr, 0);
+}
+
 /*
  * Store Helpers
  */
@@ -XXX,XX +XXX,XX @@ void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
     cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ);
 }
 
+void cpu_stb_data_ra(CPUArchState *env, target_ulong ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    cpu_stb_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stw_data_ra(CPUArchState *env, target_ulong ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    cpu_stw_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stl_data_ra(CPUArchState *env, target_ulong ptr,
+                     uint32_t val, uintptr_t retaddr)
+{
+    cpu_stl_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stq_data_ra(CPUArchState *env, target_ulong ptr,
+                     uint64_t val, uintptr_t retaddr)
+{
+    cpu_stq_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stb_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+    cpu_stb_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stw_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+    cpu_stw_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stl_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+    cpu_stl_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stq_data(CPUArchState *env, target_ulong ptr, uint64_t val)
+{
+    cpu_stq_data_ra(env, ptr, val, 0);
+}
+
 /* First set of helpers allows passing in of OI and RETADDR.  This makes
    them callable from other helpers.  */
 
-- 
2.20.1

From: Philippe Mathieu-Daudé <philmd@redhat.com>

We currently search both the root and the tcg/ directories for tcg
files:

$ git grep '#include "tcg/' | wc -l
  28

$ git grep '#include "tcg[^/]' | wc -l
  94

To simplify the preprocessor search path, unify by expliciting the
tcg/ directory.

Patch created mechanically by running:

$ for x in \
      tcg.h tcg-mo.h tcg-op.h tcg-opc.h \
      tcg-op-gvec.h tcg-gvec-desc.h; do \
    sed -i "s,#include \"$x\",#include \"tcg/$x\"," \
      $(git grep -l "#include \"$x\""); \
    done

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
 #else
 
 /* Needed for TCG_OVERSIZED_GUEST */
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
 {
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
  * The x86 has a pretty strong memory ordering which only really
  * allows for some stores to be re-ordered after loads.
  */
-#include "tcg-mo.h"
+#include "tcg/tcg-mo.h"
 
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
 
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_TCG_OP_H
 #define TCG_TCG_OP_H
 
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/bitops.h"
 #include "qemu/plugin.h"
 #include "qemu/queue.h"
-#include "tcg-mo.h"
+#include "tcg/tcg-mo.h"
 #include "tcg-target.h"
 #include "qemu/int128.h"
 
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
 
 typedef enum TCGOpcode {
 #define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name,
-#include "tcg-opc.h"
+#include "tcg/tcg-opc.h"
 #undef DEF
     NB_OPS,
 } TCGOpcode;
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
 #include "qemu/timer.h"
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/host-utils.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-gvec-desc.h"
 
 
 /* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/tb-lookup.h"
 #include "disas/disas.h"
 #include "exec/log.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 /* 32-bit helpers */
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #if defined(CONFIG_USER_ONLY)
 #include "qemu.h"
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "qemu/bitops.h"
 #include "exec/cpu_ldst.h"
 #include "translate-all.h"
diff --git a/bsd-user/main.c b/bsd-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/module.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
 #include "exec/log.h"
diff --git a/cpus.c b/cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/cpus.c
+++ b/cpus.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/bitmap.h"
 #include "qemu/seqlock.h"
 #include "qemu/guest-random.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "hw/nmi.h"
 #include "sysemu/replay.h"
 #include "sysemu/runstate.h"
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/target_page.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "hw/qdev-core.h"
 #include "hw/qdev-properties.h"
 #if !defined(CONFIG_USER_ONLY)
diff --git a/linux-user/main.c b/linux-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/plugin.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
 #include "qemu/guest-random.h"
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -XXX,XX +XXX,XX @@
 #include "user/syscall-trace.h"
 #include "qapi/error.h"
 #include "fd-trans.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 #ifndef CLONE_IO
 #define CLONE_IO                0x80000000      /* Clone io context */
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "disas/disas.h"
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/cpu_ldst.h"
 #include "qemu/int128.h"
 #include "qemu/atomic128.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "fpu/softfloat.h"
 #include <zlib.h> /* For crc32 */
 
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
 #include "fpu/softfloat.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 
 /* Note that vector data is stored in host-endian 64-bit chunks,
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@
 
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "qemu/log.h"
 #include "arm_ldst.h"
 #include "translate.h"
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "tcg/tcg-gvec-desc.h"
 #include "qemu/log.h"
 #include "arm_ldst.h"
 #include "translate.h"
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "internals.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "qemu/log.h"
 #include "qemu/bitops.h"
 #include "arm_ldst.h"
diff --git a/target/cris/translate.c b/target/cris/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/helper-proto.h"
 #include "mmu.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "disas/disas.h"
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/mem_helper.c
+++ b/target/i386/mem_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/cpu_ldst.h"
 #include "qemu/int128.h"
 #include "qemu/atomic128.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
 {
diff --git a/target/i386/translate.c b/target/i386/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "exec/translator.h"
 
diff --git a/target/lm32/translate.c b/target/lm32/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/translate.c
+++ b/target/lm32/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "exec/translator.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "qemu/qemu-print.h"
 
 #include "exec/cpu_ldst.h"
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "qemu/log.h"
 #include "qemu/qemu-print.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/helper-proto.h"
 #include "microblaze-decode.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/mips/translate.c b/target/mips/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "internal.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "hw/mips/cpudevs.h"
 
diff --git a/target/moxie/translate.c b/target/moxie/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/moxie/translate.c
+++ b/target/moxie/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "qemu/qemu-print.h"
 
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/translate.c
+++ b/target/nios2/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/exec-all.h"
 #include "disas/disas.h"
 #include "exec/helper-proto.h"
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "qemu/log.h"
 #include "qemu/bitops.h"
 #include "qemu/qemu-print.h"
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "helper_regs.h"
 #include "exec/cpu_ldst.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "internal.h"
 #include "qemu/atomic128.h"
 
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "internal.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "qemu/host-utils.h"
 #include "qemu/main-loop.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "trace.h"
 
 int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "cpu.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "disas/disas.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/mem_helper.c
+++ b/target/s390x/mem_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/cpu_ldst.h"
 #include "qemu/int128.h"
 #include "qemu/atomic128.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 
 #if !defined(CONFIG_USER_ONLY)
 #include "hw/s390x/storage-keys.h"
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/translate.c
+++ b/target/s390x/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "internal.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "qemu/log.h"
 #include "qemu/host-utils.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/ldst_helper.c
+++ b/target/sparc/ldst_helper.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "disas/disas.h"
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 
 #include "exec/helper-gen.h"
diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tilegx/translate.c
+++ b/target/tilegx/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "linux-user/syscall_defs.h"
 
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "exec/cpu_ldst.h"
 #include "qemu/qemu-print.h"
 
diff --git a/target/unicore32/translate.c b/target/unicore32/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/unicore32/translate.c
+++ b/target/unicore32/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "qemu/log.h"
 #include "exec/cpu_ldst.h"
 #include "exec/translator.h"
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 #include "qemu/log.h"
 #include "qemu/qemu-print.h"
 #include "exec/cpu_ldst.h"
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 
 #define CASE_OP_32_64(x)                        \
         glue(glue(case INDEX_op_, x), _i32):    \
diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-common.c
+++ b/tcg/tcg-common.c
@@ -XXX,XX +XXX,XX @@ uintptr_t tci_tb_ptr;
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) \
          { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
-#include "tcg-opc.h"
+#include "tcg/tcg-opc.h"
 #undef DEF
 };
 const size_t tcg_op_defs_max = ARRAY_SIZE(tcg_op_defs);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "qemu/main-loop.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-gvec-desc.h"
 
 #define MAX_UNROLL  4
 
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-mo.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-mo.h"
 
 /* Reduce the number of ifdefs below.  This assumes that all uses of
    TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-mo.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-mo.h"
 #include "trace-tcg.h"
 #include "trace/mem.h"
 #include "exec/plugin-gen.h"
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 #endif
 
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 
 #if UINTPTR_MAX == UINT32_MAX
 # define ELF_CLASS  ELFCLASS32
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */
 #include "exec/cpu_ldst.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
 
 /* Marker for missing code. */
 #define TODO() \
-- 
2.20.1

From: Philippe Mathieu-Daudé <philmd@redhat.com>

All the *.inc.c files included by tcg/$TARGET/tcg-target.inc.c
are in tcg/, their parent directory. To simplify the preprocessor
search path, include the relative parent path: '..'.

Patch created mechanically by running:

$ for x in tcg-pool.inc.c tcg-ldst.inc.c; do \
    sed -i "s,#include \"$x\",#include \"../$x\"," \
      $(git grep -l "#include \"$x\""); \
    done

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  * See the COPYING file in the top-level directory for details.
  */
 
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 #include "qemu/bitops.h"
 
 /* We're going to re-use TCGType in setting of the SF bit, which controls
@@ -XXX,XX +XXX,XX @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
 }
 
 #ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     TCGMemOpIdx oi, uintptr_t ra)
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "elf.h"
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 
 int arm_arch = __ARM_ARCH;
 
@@ -XXX,XX +XXX,XX @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args,
 }
 
 #ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nopn(TCGContext *s, int n)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "elf.h"
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 
 #if defined _CALL_DARWIN || defined __APPLE__
 #define TCG_TARGET_CALL_DARWIN
@@ -XXX,XX +XXX,XX @@ static const uint32_t qemu_exts_opc[4] = {
 };
 
 #if defined (CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  *                                 int mmu_idx, uintptr_t ra)
diff --git a/tcg/riscv/tcg-target.inc.c b/tcg/riscv/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.inc.c
+++ b/tcg/riscv/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
  */
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     TCGMemOpIdx oi, uintptr_t ra)
diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.inc.c
+++ b/tcg/s390/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
 #error "unsupported code generation mode"
 #endif
 
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 #include "elf.h"
 
 /* ??? The translation blocks produced by TCG are generally small enough to
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data,
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
 
 /* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.inc.c
+++ b/tcg/sparc/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
 
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-- 
2.20.1

From: Philippe Mathieu-Daudé <philmd@redhat.com>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-Id: <20200101112303.20724-4-philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 {tcg => include/tcg}/tcg-gvec-desc.h | 0
 {tcg => include/tcg}/tcg-mo.h        | 0
 {tcg => include/tcg}/tcg-op-gvec.h   | 0
 {tcg => include/tcg}/tcg-op.h        | 0
 {tcg => include/tcg}/tcg-opc.h       | 0
 {tcg => include/tcg}/tcg.h           | 0
 MAINTAINERS                          | 1 +
 7 files changed, 1 insertion(+)
 rename {tcg => include/tcg}/tcg-gvec-desc.h (100%)
 rename {tcg => include/tcg}/tcg-mo.h (100%)
 rename {tcg => include/tcg}/tcg-op-gvec.h (100%)
 rename {tcg => include/tcg}/tcg-op.h (100%)
 rename {tcg => include/tcg}/tcg-opc.h (100%)
 rename {tcg => include/tcg}/tcg.h (100%)

diff --git a/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h
similarity index 100%
rename from tcg/tcg-gvec-desc.h
rename to include/tcg/tcg-gvec-desc.h
diff --git a/tcg/tcg-mo.h b/include/tcg/tcg-mo.h
similarity index 100%
rename from tcg/tcg-mo.h
rename to include/tcg/tcg-mo.h
diff --git a/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
similarity index 100%
rename from tcg/tcg-op-gvec.h
rename to include/tcg/tcg-op-gvec.h
diff --git a/tcg/tcg-op.h b/include/tcg/tcg-op.h
similarity index 100%
rename from tcg/tcg-op.h
rename to include/tcg/tcg-op.h
diff --git a/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
similarity index 100%
rename from tcg/tcg-opc.h
rename to include/tcg/tcg-opc.h
diff --git a/tcg/tcg.h b/include/tcg/tcg.h
similarity index 100%
rename from tcg/tcg.h
rename to include/tcg/tcg.h
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ Common TCG code
 M: Richard Henderson <rth@twiddle.net>
 S: Maintained
 F: tcg/
+F: include/tcg/
 
 TCG Plugins
 M: Alex Bennée <alex.bennee@linaro.org>
-- 
2.20.1

From: Philippe Mathieu-Daudé <philmd@redhat.com>

All tcg includes are relative to the repository root directory,
we can safely remove the tcg/ directory from the include search
path list.

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ elif test "$ARCH" = "riscv32" || test "$ARCH" = "riscv64" ; then
 else
   QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg/\$(ARCH) $QEMU_INCLUDES"
 fi
-QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg $QEMU_INCLUDES"
 
 echo "TOOLS=$tools" >> $config_host_mak
 echo "ROMS=$roms" >> $config_host_mak
-- 
2.20.1

The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd  ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *  the SoftFloat-2a license
  *  the BSD license
- *  GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu.  */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR.  */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu.  */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

These will be used to implement new decimal floating point
instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns the remainder.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-         } else {
-            (*plow)++;
-         }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd  ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
 tests/unit/meson.build   |   1 +
 2 files changed, 198 insertions(+)
 create mode 100644 tests/unit/test-div128.c

diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Test 128-bit division functions
+ *
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+typedef struct {
+    uint64_t high;
+    uint64_t low;
+    uint64_t rhigh;
+    uint64_t rlow;
+    uint64_t divisor;
+    uint64_t remainder;
+} test_data_unsigned;
+
+typedef struct {
+    int64_t high;
+    uint64_t low;
+    int64_t rhigh;
+    uint64_t rlow;
+    int64_t divisor;
+    int64_t remainder;
+} test_data_signed;
+
+static const test_data_unsigned test_table_unsigned[] = {
+    /* Dividend fits in 64 bits */
+    { 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000003ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000002ULL, 0x0000000000000001ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0xa000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000002ULL,
+      0x4000000000000000ULL, 0x2000000000000000ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x8000000000000000ULL, 0x0000000000000000ULL},
+
+    /* Dividend > 64 bits, with MSB 0 */
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x000000000000000dULL,
+      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
+      0x0000000000000010ULL, 0x0000000000000001ULL},
+
+    /* Dividend > 64 bits, with MSB 1 */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
+      0x0000000000000010ULL, 0x000000000000000fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
+      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
+
+    /**
+     * Divisor == 64 bits, with MSB 1
+     * and high 64 bits of dividend >= divisor
+     * (for testing normalization)
+     */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0xfddbb9977553310aULL,
+      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
+
+    /* Dividend > 64 bits, divisor almost as big */
+    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
+      0x0000000000000000ULL, 0x000000000000000fULL,
+      0x123456789abcdefeULL, 0x123456789abcde1fULL},
+};
+
+static const test_data_signed test_table_signed[] = {
+    /* Positive dividend, positive/negative divisors */
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0x0000000000000008LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0xfffffffffffffff8LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0x0000000000000237LL, 0x0000000000000183LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0xfffffffffffffdc9LL, 0x0000000000000183LL},
+
+    /* Negative dividend, positive/negative divisors */
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0x0000000000000008LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0x0000000000000237LL, 0xfffffffffffffe7dLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
+};
+
+static void test_divu128(void)
+{
+    int i;
+    uint64_t rem;
+    test_data_unsigned tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
+        tmp = test_table_unsigned[i];
+
+        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+static void test_divs128(void)
+{
+    int i;
+    int64_t rem;
+    test_data_signed tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
+        tmp = test_table_signed[i];
+
+        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/host-utils/test_divu128", test_divu128);
+    g_test_add_func("/host-utils/test_divs128", test_divs128);
+    return g_test_run();
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
   # all code tested by test-x86-cpuid is inside topology.h
   'test-x86-cpuid': [],
   'test-cutils': [],
+  'test-div128': [],
   'test-shift128': [],
   'test-mul64': [],
   # all code tested by test-int128 is inside int128.h
-- 
2.25.1

Prepare for tracking different masks by renaming this one.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     uint64_t val;
-    uint64_t mask;
+    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->next_copy = ts;
     ti->prev_copy = ts;
     ti->is_const = false;
-    ti->mask = -1;
+    ti->z_mask = -1;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     if (ts->kind == TEMP_CONST) {
         ti->is_const = true;
         ti->val = ts->val;
-        ti->mask = ts->val;
+        ti->z_mask = ts->val;
         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
             /* High bits of a 32-bit quantity are garbage.  */
-            ti->mask |= ~0xffffffffull;
+            ti->z_mask |= ~0xffffffffull;
         }
     } else {
         ti->is_const = false;
-        ti->mask = -1;
+        ti->z_mask = -1;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t mask;
+    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    mask = si->mask;
+    z_mask = si->z_mask;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
         /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
+        z_mask |= ~0xffffffffull;
     }
-    di->mask = mask;
+    di->z_mask = z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t mask, partmask, affected, tmp;
+        uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
-        mask = -1;
+        z_mask = -1;
         affected = -1;
         switch (opc) {
         CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext8u):
-            mask = 0xff;
+            z_mask = 0xff;
             goto and_const;
         CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             goto and_const;
         case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_ext32u_i64:
-            mask = 0xffffffffU;
+            z_mask = 0xffffffffU;
             goto and_const;
 
         CASE_OP_32_64(and):
-            mask = arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[2])->z_mask;
             if (arg_is_const(op->args[2])) {
         and_const:
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
-            mask = arg_info(op->args[1])->mask & mask;
+            z_mask = arg_info(op->args[1])->z_mask & z_mask;
             break;
 
         case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_extu_i32_i64:
             /* We do not compute affected as it is a size changing op.  */
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
 
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                op->args[2] is constant, we can't infer anything from it.  */
             if (arg_is_const(op->args[2])) {
-                mask = ~arg_info(op->args[2])->mask;
+                z_mask = ~arg_info(op->args[2])->z_mask;
                 goto and_const;
             }
             /* But we certainly know nothing outside args[1] may be set. */
-            mask = arg_info(op->args[1])->mask;
+            z_mask = arg_info(op->args[1])->z_mask;
             break;
 
         case INDEX_op_sar_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_extrl_i64_i32:
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
         case INDEX_op_extrh_i64_i32:
-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
+            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                mask = arg_info(op->args[1])->mask << tmp;
+                z_mask = arg_info(op->args[1])->z_mask << tmp;
             }
             break;
 
         CASE_OP_32_64(neg):
             /* Set to 1 all bits to the left of the rightmost.  */
-            mask = -(arg_info(op->args[1])->mask
-                     & -arg_info(op->args[1])->mask);
+            z_mask = -(arg_info(op->args[1])->z_mask
+                       & -arg_info(op->args[1])->z_mask);
             break;
 
         CASE_OP_32_64(deposit):
-            mask = deposit64(arg_info(op->args[1])->mask,
-                             op->args[3], op->args[4],
-                             arg_info(op->args[2])->mask);
+            z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                               op->args[3], op->args[4],
+                               arg_info(op->args[2])->z_mask);
             break;
 
         CASE_OP_32_64(extract):
-            mask = extract64(arg_info(op->args[1])->mask,
-                             op->args[2], op->args[3]);
+            z_mask = extract64(arg_info(op->args[1])->z_mask,
+                               op->args[2], op->args[3]);
             if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
         CASE_OP_32_64(sextract):
-            mask = sextract64(arg_info(op->args[1])->mask,
-                              op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+            z_mask = sextract64(arg_info(op->args[1])->z_mask,
+                                op->args[2], op->args[3]);
+            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
 
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[1])->z_mask
+                   | arg_info(op->args[2])->z_mask;
             break;
 
         case INDEX_op_clz_i32:
         case INDEX_op_ctz_i32:
-            mask = arg_info(op->args[2])->mask | 31;
+            z_mask = arg_info(op->args[2])->z_mask | 31;
             break;
 
         case INDEX_op_clz_i64:
         case INDEX_op_ctz_i64:
-            mask = arg_info(op->args[2])->mask | 63;
+            z_mask = arg_info(op->args[2])->z_mask | 63;
             break;
 
         case INDEX_op_ctpop_i32:
-            mask = 32 | 31;
+            z_mask = 32 | 31;
             break;
         case INDEX_op_ctpop_i64:
-            mask = 64 | 63;
+            z_mask = 64 | 63;
             break;
 
         CASE_OP_32_64(setcond):
         case INDEX_op_setcond2_i32:
-            mask = 1;
+            z_mask = 1;
             break;
 
         CASE_OP_32_64(movcond):
-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
+            z_mask = arg_info(op->args[3])->z_mask
+                   | arg_info(op->args[4])->z_mask;
             break;
 
         CASE_OP_32_64(ld8u):
-            mask = 0xff;
+            z_mask = 0xff;
             break;
         CASE_OP_32_64(ld16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             break;
         case INDEX_op_ld32u_i64:
-            mask = 0xffffffffu;
+            z_mask = 0xffffffffu;
             break;
 
         CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
             }
             break;
 
         CASE_OP_32_64(bswap16):
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffff) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffff) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap16(mask);
+            z_mask = bswap16(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int16_t)mask;
+                z_mask = (int16_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(16, 48);
+                z_mask |= MAKE_64BIT_MASK(16, 48);
                 break;
             }
             break;
 
         case INDEX_op_bswap32_i64:
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffffffffu) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffffffffu) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap32(mask);
+            z_mask = bswap32(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int32_t)mask;
+                z_mask = (int32_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(32, 32);
+                z_mask |= MAKE_64BIT_MASK(32, 32);
                 break;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         /* 32-bit ops generate 32-bit results.  For the result is zero test
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
-        partmask = mask;
+        partmask = z_mask;
         if (!(def->flags & TCG_OPF_64BIT)) {
-            mask |= ~(tcg_target_ulong)0xffffffffu;
+            z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                    vs the high word of the input.  */
             do_setcond_high:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             /* Default case: we know nothing about operation (or were unable
                to compute the operation result) so no propagation is done.
                We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "mask" is
+               block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
                 memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
                     if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
+                        arg_info(op->args[i])->z_mask = z_mask;
                     }
                 }
             }
-- 
2.25.1

Provide what will become a larger context for splitting
the very large tcg_optimize function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
+typedef struct OptContext {
+    TCGTempSet temps_used;
+} OptContext;
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     TempOptInfo *ti;
 
-    if (test_bit(idx, temps_used->l)) {
+    if (test_bit(idx, ctx->temps_used.l)) {
         return;
     }
-    set_bit(idx, temps_used->l);
+    set_bit(idx, ctx->temps_used.l);
 
     ti = ts->state_ptr;
     if (ti == NULL) {
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(OptContext *ctx, TCGArg arg)
 {
-    init_ts_info(temps_used, arg_temp(arg));
+    init_ts_info(ctx, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                              TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
-    init_ts_info(temps_used, tv);
+    init_ts_info(ctx, tv);
     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TCGTempSet temps_used;
+    OptContext ctx = {};
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(&temps_used, ts);
+                    init_ts_info(&ctx, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&temps_used, op->args[i]);
+                init_arg_info(&ctx, op->args[i]);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, temps_used.l)) {
+                    if (test_bit(i, ctx.temps_used.l)) {
                         reset_ts(&s->temps[i]);
                     }
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Break the final cleanup clause out of the main switch
statement.  When fully folding an opcode to mov/movi,
use "continue" to process the next opcode, else break
to fall into the final cleanup.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 96 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-            break;
+            continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
-                break;
+                continue;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
                 nb_iargs = 1;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(deposit):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract):
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(sextract):
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                     ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(brcond):
             tmp = do_constant_folding_cond(opc, op->args[0],
                                            op->args[1], op->args[2]);
-            if (tmp != 2) {
-                if (tmp) {
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[3];
-                } else {
-                    tcg_op_remove(s, op);
-                }
+            switch (tmp) {
+            case 0:
+                tcg_op_remove(s, op);
+                continue;
+            case 1:
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[3];
                 break;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
-                break;
+                continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                 uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-                    goto do_default;
+                    break;
                 }
                 op->args[3] = cond;
                 op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  : INDEX_op_setcond_i64);
                 nb_iargs = 2;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_mulu2_i32:
             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                             op->args[4]);
-            if (tmp != 2) {
-                if (tmp) {
-            do_brcond_true:
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[5];
-                } else {
+            if (tmp == 0) {
             do_brcond_false:
-                    tcg_op_remove(s, op);
-                }
-            } else if ((op->args[4] == TCG_COND_LT
-                        || op->args[4] == TCG_COND_GE)
-                       && arg_is_const(op->args[2])
-                       && arg_info(op->args[2])->val == 0
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0) {
+                tcg_op_remove(s, op);
+                continue;
+            }
+            if (tmp == 1) {
+            do_brcond_true:
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[5];
+                break;
+            }
+            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+                 && arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
+                op->opc = opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_brcond_false;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_brcond_low:
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_brcond_true;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
             do_setcond_const:
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-            } else if ((op->args[5] == TCG_COND_LT
-                        || op->args[5] == TCG_COND_GE)
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0
-                       && arg_is_const(op->args[4])
-                       && arg_info(op->args[4])->val == 0) {
+                continue;
+            }
+            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0
+                 && arg_is_const(op->args[4])
+                 && arg_info(op->args[4])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_setcond_high;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_setcond_const;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
-        case INDEX_op_call:
-            if (!(tcg_call_flags(op)
+        default:
+            break;
+        }
+
+        /* Some of the folding above can change opc. */
+        opc = op->opc;
+        def = &tcg_op_defs[opc];
+        if (def->flags & TCG_OPF_BB_END) {
+            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+        } else {
+            if (opc == INDEX_op_call &&
+                !(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
 
-        default:
-        do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "z_mask" is
-               the non-zero bits mask for the first output arg.  */
-            if (def->flags & TCG_OPF_BB_END) {
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->z_mask = z_mask;
-                    }
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    arg_info(op->args[i])->z_mask = z_mask;
                 }
             }
-            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-- 
2.25.1

Adjust the interface to take the OptContext parameter instead
of TCGContext or both.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 } TempOptInfo;
 
 typedef struct OptContext {
+    TCGContext *tcg;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
-        tcg_op_remove(s, op);
+        tcg_op_remove(ctx->tcg, op);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-                             TCGOp *op, TCGArg dst, uint64_t val)
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                             TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    OptContext ctx = {};
+    OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(or):
         CASE_OP_32_64_VEC(and):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 } else {
-                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                 }
                 continue;
             }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-- 
2.25.1

This will expose the variable to subroutines that
will be broken out of tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 
 typedef struct OptContext {
     TCGContext *tcg;
+    TCGOp *prev_mb;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
-    TCGOp *op, *op_next, *prev_mb = NULL;
+    TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-        if (prev_mb) {
+        if (ctx.prev_mb) {
             switch (opc) {
             case INDEX_op_mb:
                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  * barrier.  This is stricter than specified but for
                  * the purposes of TCG is better than not optimizing.
                  */
-                prev_mb->args[0] |= op->args[0];
+                ctx.prev_mb->args[0] |= op->args[0];
                 tcg_op_remove(s, op);
                 break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
-                prev_mb = NULL;
+                ctx.prev_mb = NULL;
                 break;
             }
         } else if (opc == INDEX_op_mb) {
-            prev_mb = op;
+            ctx.prev_mb = op;
         }
     }
 }
-- 
2.25.1

There was no real reason for calls to have separate code here.
Unify init for calls vs non-calls using the call path, which
handles TCG_CALL_DUMMY_ARG.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(OptContext *ctx, TCGArg arg)
-{
-    init_ts_info(ctx, arg_temp(arg));
-}
-
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
     TCGTemp *i, *g, *l;
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+{
+    for (int i = 0; i < nb_args; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts) {
+            init_ts_info(ctx, ts);
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (opc == INDEX_op_call) {
             nb_oargs = TCGOP_CALLO(op);
             nb_iargs = TCGOP_CALLI(op);
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                TCGTemp *ts = arg_temp(op->args[i]);
-                if (ts) {
-                    init_ts_info(&ctx, ts);
-                }
-            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&ctx, op->args[i]);
-            }
         }
+        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-- 
2.25.1

Continue splitting tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
     }
 }
 
+static void copy_propagate(OptContext *ctx, TCGOp *op,
+                           int nb_oargs, int nb_iargs)
+{
+    TCGContext *s = ctx->tcg;
+
+    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts && ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(s, ts));
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             nb_iargs = def->nb_iargs;
         }
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-
-        /* Do copy propagation */
-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            TCGTemp *ts = arg_temp(op->args[i]);
-            if (ts && ts_is_copy(ts)) {
-                op->args[i] = temp_arg(find_better_copy(s, ts));
-            }
-        }
+        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
-- 
2.25.1

Calls are special in that they have a variable number
of arguments, and need to be able to clobber globals.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static bool fold_call(OptContext *ctx, TCGOp *op)
+{
+    TCGContext *s = ctx->tcg;
+    int nb_oargs = TCGOP_CALLO(op);
+    int nb_iargs = TCGOP_CALLI(op);
+    int flags, i;
+
+    init_arguments(ctx, op, nb_oargs + nb_iargs);
+    copy_propagate(ctx, op, nb_oargs, nb_iargs);
+
+    /* If the function reads or writes globals, reset temp data. */
+    flags = tcg_call_flags(op);
+    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+        int nb_globals = s->nb_globals;
+
+        for (i = 0; i < nb_globals; i++) {
+            if (test_bit(i, ctx->temps_used.l)) {
+                reset_ts(&ctx->tcg->temps[i]);
+            }
+        }
+    }
+
+    /* Reset temp data for outputs. */
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+    }
+
+    /* Stop optimizing MB across calls. */
+    ctx->prev_mb = NULL;
+    return true;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals, i;
+    int nb_temps, i;
     TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
        available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
-    nb_globals = s->nb_globals;
-
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
 
-        /* Count the arguments, and initialize the temps that are
-           going to be used */
+        /* Calls are special. */
         if (opc == INDEX_op_call) {
-            nb_oargs = TCGOP_CALLO(op);
-            nb_iargs = TCGOP_CALLI(op);
-        } else {
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
+            fold_call(&ctx, op);
+            continue;
         }
+
+        def = &tcg_op_defs[opc];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
-            if (opc == INDEX_op_call &&
-                !(tcg_call_flags(op)
-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-                for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, ctx.temps_used.l)) {
-                        reset_ts(&s->temps[i]);
-                    }
-                }
-            }
-
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
-            case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
                 ctx.prev_mb = NULL;
                 break;
-- 
2.25.1

Rather than try to keep these up-to-date across folding,
re-read nb_oargs at the end, after re-reading the opcode.

A couple of asserts need dropping, but that will take care
of itself as we split the function further.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t z_mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         def = &tcg_op_defs[opc];
-        nb_oargs = def->nb_oargs;
-        nb_iargs = def->nb_iargs;
-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
+        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(qemu_ld):
             {
-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         if (partmask == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
-                nb_iargs = 1;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = opc = (opc == INDEX_op_movcond_i32
                                  ? INDEX_op_setcond_i32
                                  : INDEX_op_setcond_i64);
-                nb_iargs = 2;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
+            int nb_oargs = def->nb_oargs;
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
-- 
2.25.1

Return -1 instead of 2 for failure, so that we can
use comparisons against 0 for all cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 71 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
     }
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                       TCGArg y, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                    TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         case TCG_COND_GEU:
             return 1;
         default:
-            return 2;
+            return -1;
         }
     }
-    return 2;
+    return -1;
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 {
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
         return do_constant_folding_cond_eq(c);
     }
-    return 2;
+    return -1;
 }
 
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[3]);
-            if (tmp != 2) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[3]);
+            if (i >= 0) {
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             break;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(opc, op->args[0],
-                                           op->args[1], op->args[2]);
-            switch (tmp) {
-            case 0:
+            i = do_constant_folding_cond(opc, op->args[0],
+                                         op->args[1], op->args[2]);
+            if (i == 0) {
                 tcg_op_remove(s, op);
                 continue;
-            case 1:
+            } else if (i > 0) {
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[5]);
-            if (tmp != 2) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[5]);
+            if (i >= 0) {
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_brcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                            op->args[4]);
-            if (tmp == 0) {
+            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                          op->args[4]);
+            if (i == 0) {
             do_brcond_false:
                 tcg_op_remove(s, op);
                 continue;
             }
-            if (tmp == 1) {
+            if (i > 0) {
             do_brcond_true:
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
             }
             break;
 
         case INDEX_op_setcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                            op->args[5]);
-            if (tmp != 2) {
+            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+                                          op->args[5]);
+            if (i >= 0) {
             do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_const;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
             }
-- 
2.25.1

This will allow callers to tail call to these functions
and return true indicating processing complete.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 
     if (ts_are_copies(dst_ts, src_ts)) {
         tcg_op_remove(ctx->tcg, op);
-        return;
+        return true;
     }
 
     reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
         di->is_const = si->is_const;
         di->val = si->val;
     }
+    return true;
 }
 
-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
-- 
2.25.1

Copy z_mask into OptContext, for writeback to the
first output within the new function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGContext *tcg;
     TCGOp *prev_mb;
     TCGTempSet temps_used;
+
+    /* In flight values from optimization. */
+    uint64_t z_mask;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_folding(OptContext *ctx, TCGOp *op)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    int i, nb_oargs;
+
+    /*
+     * For an opcode that ends a BB, reset all temp data.
+     * We do no cross-BB optimization.
+     */
+    if (def->flags & TCG_OPF_BB_END) {
+        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+        ctx->prev_mb = NULL;
+        return;
+    }
+
+    nb_oargs = def->nb_oargs;
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+        /*
+         * Save the corresponding known-zero bits mask for the
+         * first output argument (only one supported so far).
+         */
+        if (i == 0) {
+            arg_info(op->args[i])->z_mask = ctx->z_mask;
+        }
+    }
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
+        ctx.z_mask = z_mask;
 
         if (partmask == 0) {
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Some of the folding above can change opc. */
-        opc = op->opc;
-        def = &tcg_op_defs[opc];
-        if (def->flags & TCG_OPF_BB_END) {
-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-        } else {
-            int nb_oargs = def->nb_oargs;
-            for (i = 0; i < nb_oargs; i++) {
-                reset_temp(op->args[i]);
-                /* Save the corresponding known-zero bits mask for the
-                   first output argument (only one supported so far). */
-                if (i == 0) {
-                    arg_info(op->args[i])->z_mask = z_mask;
-                }
-            }
-        }
+        finish_folding(&ctx, op);
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
+        bool done = false;
 
         /* Calls are special. */
         if (opc == INDEX_op_call) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
+            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        finish_folding(&ctx, op);
+        if (!done) {
+            finish_folding(&ctx, op);
+        }
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

This puts the separate mb optimization into the same framework
as the others.  While fold_qemu_{ld,st} are currently identical,
that won't last as more code gets moved.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mb(OptContext *ctx, TCGOp *op)
+{
+    /* Eliminate duplicate and redundant fence instructions.  */
+    if (ctx->prev_mb) {
+        /*
+         * Merge two barriers of the same type into one,
+         * or a weaker barrier into a stronger one,
+         * or two weaker barriers into a stronger one.
+         *   mb X; mb Y => mb X|Y
+         *   mb; strl => mb; st
+         *   ldaq; mb => ld; mb
+         *   ldaq; strl => ld; mb; st
+         * Other combinations are also merged into a strong
+         * barrier.  This is stricter than specified but for
+         * the purposes of TCG is better than not optimizing.
+         */
+        ctx->prev_mb->args[0] |= op->args[0];
+        tcg_op_remove(ctx->tcg, op);
+    } else {
+        ctx->prev_mb = op;
+    }
+    return true;
+}
+
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        case INDEX_op_mb:
+            done = fold_mb(&ctx, op);
+            break;
+        case INDEX_op_qemu_ld_i32:
+        case INDEX_op_qemu_ld_i64:
+            done = fold_qemu_ld(&ctx, op);
+            break;
+        case INDEX_op_qemu_st_i32:
+        case INDEX_op_qemu_st8_i32:
+        case INDEX_op_qemu_st_i64:
+            done = fold_qemu_st(&ctx, op);
+            break;
+
         default:
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (!done) {
             finish_folding(&ctx, op);
         }
-
-        /* Eliminate duplicate and redundant fence instructions.  */
-        if (ctx.prev_mb) {
-            switch (opc) {
-            case INDEX_op_mb:
-                /* Merge two barriers of the same type into one,
-                 * or a weaker barrier into a stronger one,
-                 * or two weaker barriers into a stronger one.
-                 *   mb X; mb Y => mb X|Y
-                 *   mb; strl => mb; st
-                 *   ldaq; mb => ld; mb
-                 *   ldaq; strl => ld; mb; st
-                 * Other combinations are also merged into a strong
-                 * barrier.  This is stricter than specified but for
-                 * the purposes of TCG is better than not optimizing.
-                 */
-                ctx.prev_mb->args[0] |= op->args[0];
-                tcg_op_remove(s, op);
-                break;
-
-            default:
-                /* Opcodes that end the block stop the optimization.  */
-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-                    break;
-                }
-                /* fallthru */
-            case INDEX_op_qemu_ld_i32:
-            case INDEX_op_qemu_ld_i64:
-            case INDEX_op_qemu_st_i32:
-            case INDEX_op_qemu_st8_i32:
-            case INDEX_op_qemu_st_i64:
-                /* Opcodes that touch guest memory stop the optimization.  */
-                ctx.prev_mb = NULL;
-                break;
-            }
-        } else if (opc == INDEX_op_mb) {
-            ctx.prev_mb = op;
-        }
     }
 }
-- 
2.25.1

Split out a whole bunch of placeholder functions, which are
currently identical.  That won't last as more code gets moved.

Use CASE_32_64_VEC for some logical operators that previously
missed the addition of vectors.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 219 insertions(+), 52 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
     }
 }
 
+/*
+ * The fold_* functions return true when processing is complete,
+ * usually by folding the operation to a constant or to a copy,
+ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
+ * like collect information about the value produced, for use in
+ * optimizing a subsequent operation.
+ *
+ * These first fold_* functions are all helpers, used by other
+ * folders for more specific operations.
+ */
+
+static bool fold_const1(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = do_constant_folding(op->opc, t, 0);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_const2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = do_constant_folding(op->opc, t1, t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
+/*
+ * These outermost fold_<op> functions are sorted alphabetically.
+ */
+
+static bool fold_add(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_and(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_andc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_divide(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_exts(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_extu(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
 static bool fold_mb(OptContext *ctx, TCGOp *op)
 {
     /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mul(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_nand(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_neg(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_nor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_not(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_or(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_orc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_shift(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_xor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(not):
-        CASE_OP_32_64(neg):
-        CASE_OP_32_64(ext8s):
-        CASE_OP_32_64(ext8u):
-        CASE_OP_32_64(ext16s):
-        CASE_OP_32_64(ext16u):
-        CASE_OP_32_64(ctpop):
-        case INDEX_op_ext32s_i64:
-        case INDEX_op_ext32u_i64:
-        case INDEX_op_ext_i32_i64:
-        case INDEX_op_extu_i32_i64:
-        case INDEX_op_extrl_i64_i32:
-        case INDEX_op_extrh_i64_i32:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
         case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(add):
-        CASE_OP_32_64(sub):
-        CASE_OP_32_64(mul):
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(and):
-        CASE_OP_32_64(xor):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-        CASE_OP_32_64(andc):
-        CASE_OP_32_64(orc):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-        CASE_OP_32_64(div):
-        CASE_OP_32_64(divu):
-        CASE_OP_32_64(rem):
-        CASE_OP_32_64(remu):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        default:
+            break;
+
+        /* ---------------------------------------------------------- */
+        /* Sorted alphabetically by opcode as much as possible. */
+
+        CASE_OP_32_64_VEC(add):
+            done = fold_add(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(and):
+            done = fold_and(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(andc):
+            done = fold_andc(&ctx, op);
+            break;
+        CASE_OP_32_64(ctpop):
+            done = fold_ctpop(&ctx, op);
+            break;
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+            done = fold_divide(&ctx, op);
+            break;
+        CASE_OP_32_64(eqv):
+            done = fold_eqv(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext16s):
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext_i32_i64:
+            done = fold_exts(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16u):
+        case INDEX_op_ext32u_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            done = fold_extu(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(mul):
+            done = fold_mul(&ctx, op);
+            break;
+        CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(muluh):
+            done = fold_mul_highpart(&ctx, op);
+            break;
+        CASE_OP_32_64(nand):
+            done = fold_nand(&ctx, op);
+            break;
+        CASE_OP_32_64(neg):
+            done = fold_neg(&ctx, op);
+            break;
+        CASE_OP_32_64(nor):
+            done = fold_nor(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(not):
+            done = fold_not(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(or):
+            done = fold_or(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(orc):
+            done = fold_orc(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_i32:
         case INDEX_op_qemu_ld_i64:
             done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_qemu_st_i64:
             done = fold_qemu_st(&ctx, op);
             break;
-
-        default:
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
+            done = fold_remainder(&ctx, op);
+            break;
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+            done = fold_shift(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(sub):
+            done = fold_sub(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(xor):
+            done = fold_xor(&ctx, op);
             break;
         }
 
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_setcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
+            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
+            goto do_setcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            goto do_setcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+                                     op->args[4], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            op->args[2] = op->args[3];
+            op->args[3] = cond;
+            op->opc = INDEX_op_setcond_i32;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_setcond_high:
+        op->args[1] = op->args[2];
+        op->args[2] = op->args[4];
+        op->args[3] = cond;
+        op->opc = INDEX_op_setcond_i32;
+        break;
+    }
+    return false;
+
+ do_setcond_const:
+    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_setcond2_i32:
-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                          op->args[5]);
-            if (i >= 0) {
-            do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-                continue;
-            }
-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0
-                 && arg_is_const(op->args[4])
-                 && arg_info(op->args[4])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_setcond_high:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_const;
-                } else if (i > 0) {
-                    goto do_setcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i < 0) {
-                    break;
-                }
-            do_setcond_low:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[2] = op->args[3];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_low;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(shr):
             done = fold_shift(&ctx, op);
             break;
+        case INDEX_op_setcond2_i32:
+            done = fold_setcond2(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 78 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[4];
+    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
+    TCGArg label = op->args[5];
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_brcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
+            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
+            goto do_brcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+                                     op->args[2], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            goto do_brcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            op->opc = INDEX_op_brcond_i32;
+            op->args[1] = op->args[2];
+            op->args[2] = cond;
+            op->args[3] = label;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_brcond_high:
+        op->opc = INDEX_op_brcond_i32;
+        op->args[0] = op->args[1];
+        op->args[1] = op->args[3];
+        op->args[2] = cond;
+        op->args[3] = label;
+        break;
+
+    do_brcond_const:
+        if (i == 0) {
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+        op->opc = INDEX_op_br;
+        op->args[0] = label;
+        break;
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_brcond2_i32:
-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                          op->args[4]);
-            if (i == 0) {
-            do_brcond_false:
-                tcg_op_remove(s, op);
-                continue;
-            }
-            if (i > 0) {
-            do_brcond_true:
-                op->opc = opc = INDEX_op_br;
-                op->args[0] = op->args[5];
-                break;
-            }
-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
-                 && arg_is_const(op->args[2])
-                 && arg_info(op->args[2])->val == 0
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_brcond_high:
-                op->opc = opc = INDEX_op_brcond_i32;
-                op->args[0] = op->args[1];
-                op->args[1] = op->args[3];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i > 0) {
-                    goto do_brcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i < 0) {
-                    break;
-                }
-            do_brcond_low:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_high;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_low;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(andc):
             done = fold_andc(&ctx, op);
             break;
+        case INDEX_op_brcond2_i32:
+            done = fold_brcond2(&ctx, op);
+            break;
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint32_t a = arg_info(op->args[2])->val;
+        uint32_t b = arg_info(op->args[3])->val;
+        uint64_t r = (uint64_t)a * b;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+        return true;
+    }
+    return false;
+}
+
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_mulu2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-                uint32_t a = arg_info(op->args[2])->val;
-                uint32_t b = arg_info(op->args[3])->val;
-                uint64_t r = (uint64_t)a * b;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
+        case INDEX_op_mulu2_i32:
+            done = fold_mulu2_i32(&ctx, op);
+            break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
             break;
-- 
2.25.1

Add two additional helpers, fold_add2_i32 and fold_sub2_i32
which will not be simple wrappers forever.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
+        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+        uint32_t al = arg_info(op->args[2])->val;
+        uint32_t ah = arg_info(op->args[3])->val;
+        uint32_t bl = arg_info(op->args[4])->val;
+        uint32_t bh = arg_info(op->args[5])->val;
+        uint64_t a = ((uint64_t)ah << 32) | al;
+        uint64_t b = ((uint64_t)bh << 32) | bl;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        if (add) {
+            a += b;
+        } else {
+            a -= b;
+        }
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+        return true;
+    }
+    return false;
+}
+
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, true);
+}
+
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, false);
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_add2_i32:
-        case INDEX_op_sub2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-                uint32_t al = arg_info(op->args[2])->val;
-                uint32_t ah = arg_info(op->args[3])->val;
-                uint32_t bl = arg_info(op->args[4])->val;
-                uint32_t bh = arg_info(op->args[5])->val;
-                uint64_t a = ((uint64_t)ah << 32) | al;
-                uint64_t b = ((uint64_t)bh << 32) | bl;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                if (opc == INDEX_op_add2_i32) {
-                    a += b;
-                } else {
-                    a -= b;
-                }
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-                continue;
-            }
-            break;
 
         default:
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add2_i32:
+            done = fold_add2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub2_i32:
+            done = fold_sub2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode opc = op->opc;
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+
+    if (i >= 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
+    }
+
+    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+        uint64_t tv = arg_info(op->args[3])->val;
+        uint64_t fv = arg_info(op->args[4])->val;
+
+        opc = (opc == INDEX_op_movcond_i32
+               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+
+        if (tv == 1 && fv == 0) {
+            op->opc = opc;
+            op->args[3] = cond;
+        } else if (fv == 1 && tv == 0) {
+            op->opc = opc;
+            op->args[3] = tcg_invert_cond(cond);
+        }
+    }
+    return false;
+}
+
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(movcond):
-            i = do_constant_folding_cond(opc, op->args[1],
-                                         op->args[2], op->args[5]);
-            if (i >= 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-                continue;
-            }
-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                uint64_t tv = arg_info(op->args[3])->val;
-                uint64_t fv = arg_info(op->args[4])->val;
-                TCGCond cond = op->args[5];
-
-                if (fv == 1 && tv == 0) {
-                    cond = tcg_invert_cond(cond);
-                } else if (!(tv == 1 && fv == 0)) {
-                    break;
-                }
-                op->args[3] = cond;
-                op->opc = opc = (opc == INDEX_op_movcond_i32
-                                 ? INDEX_op_setcond_i32
-                                 : INDEX_op_setcond_i64);
-            }
-            break;
-
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(movcond):
+            done = fold_movcond(&ctx, op);
+            break;
         CASE_OP_32_64(mul):
             done = fold_mul(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t v1 = arg_info(op->args[1])->val;
+        uint64_t v2 = arg_info(op->args[2])->val;
+        int shr = op->args[3];
+
+        if (op->opc == INDEX_op_extract2_i64) {
+            v1 >>= shr;
+            v2 <<= 64 - shr;
+        } else {
+            v1 = (uint32_t)v1 >> shr;
+            v2 = (int32_t)v2 << (32 - shr);
+        }
+        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+    }
+    return false;
+}
+
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
     return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract2):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                uint64_t v1 = arg_info(op->args[1])->val;
-                uint64_t v2 = arg_info(op->args[2])->val;
-                int shr = op->args[3];
-
-                if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-                } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-                                    ((uint32_t)v2 << (32 - shr)));
-                }
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract2):
+            done = fold_extract2(&ctx, op);
+            break;
         CASE_OP_32_64(ext8s):
         CASE_OP_32_64(ext16s):
         case INDEX_op_ext32s_i64:
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = extract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = sextract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract):
-            if (arg_is_const(op->args[1])) {
-                tmp = extract64(arg_info(op->args[1])->val,
-                                op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        CASE_OP_32_64(sextract):
-            if (arg_is_const(op->args[1])) {
-                tmp = sextract64(arg_info(op->args[1])->val,
-                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract):
+            done = fold_extract(&ctx, op);
+            break;
         CASE_OP_32_64(extract2):
             done = fold_extract2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_setcond2_i32:
             done = fold_setcond2(&ctx, op);
             break;
+        CASE_OP_32_64(sextract):
+            done = fold_sextract(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     return fold_const1(ctx, op);
 }
 
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = deposit64(t1, op->args[3], op->args[4], t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
 static bool fold_divide(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(deposit):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = deposit64(arg_info(op->args[1])->val,
-                                op->args[3], op->args[4],
-                                arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
+        CASE_OP_32_64(deposit):
+            done = fold_deposit(&ctx, op);
+            break;
         CASE_OP_32_64(div):
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+
+        t = do_constant_folding(op->opc, t, op->args[2]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(bswap16):
-        CASE_OP_32_64(bswap32):
-        case INDEX_op_bswap64_i64:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          op->args[2]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_brcond2_i32:
             done = fold_brcond2(&ctx, op);
             break;
+        CASE_OP_32_64(bswap16):
+        CASE_OP_32_64(bswap32):
+        case INDEX_op_bswap64_i64:
+            done = fold_bswap(&ctx, op);
+            break;
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             done = fold_count_zeros(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_dup(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+        t = dup_const(TCGOP_VECE(op), t);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
+                               arg_info(op->args[2])->val);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+
+    if (args_are_copies(op->args[1], op->args[2])) {
+        op->opc = INDEX_op_dup_vec;
+        TCGOP_VECE(op) = MO_32;
+    }
+    return false;
+}
+
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             break;
 
-        case INDEX_op_dup_vec:
-            if (arg_is_const(op->args[1])) {
-                tmp = arg_info(op->args[1])->val;
-                tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        case INDEX_op_dup2_vec:
-            assert(TCG_TARGET_REG_BITS == 32);
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0],
-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
-                                           arg_info(op->args[2])->val));
-                continue;
-            } else if (args_are_copies(op->args[1], op->args[2])) {
-                op->opc = INDEX_op_dup_vec;
-                TCGOP_VECE(op) = MO_32;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
             break;
+        case INDEX_op_dup_vec:
+            done = fold_dup(&ctx, op);
+            break;
+        case INDEX_op_dup2_vec:
+            done = fold_dup2(&ctx, op);
+            break;
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
-- 
2.25.1

This is the final entry in the main switch that was in a
different form.  After this, we have the option to convert
the switch into a function dispatch table.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mov(OptContext *ctx, TCGOp *op)
+{
+    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+}
+
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /*
+         * Process each opcode.
+         * Sorted alphabetically by opcode as much as possible.
+         */
         switch (opc) {
-        CASE_OP_32_64_VEC(mov):
-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            break;
-
-        default:
-            break;
-
-        /* ---------------------------------------------------------- */
-        /* Sorted alphabetically by opcode as much as possible. */
-
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64_VEC(mov):
+            done = fold_mov(&ctx, op);
+            break;
         CASE_OP_32_64(movcond):
             done = fold_movcond(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        default:
+            break;
         }
 
         if (!done) {
-- 
2.25.1

Pull the "op r, a, a => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to @i. */
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(xor):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, a => mov r, a" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to identity. */
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
+ *
+ * The ordering of the transformations should be:
+ *   1) those that produce a constant
+ *   2) those that produce a copy
+ *   3) those that produce information about the result value.
  */
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(and):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, 0 => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to @i. */
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             continue;
         }
 
-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            if (arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Compute the type of the operation early.

There are at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;
+    TCGType type;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
-    const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
     uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     reset_ts(dst_ts);
     di = ts_info(dst_ts);
     si = ts_info(src_ts);
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_mov_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_mov_i64;
-    } else {
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
         new_op = INDEX_op_mov_i32;
+        break;
+    case TCG_TYPE_I64:
+        new_op = INDEX_op_mov_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
+        new_op = INDEX_op_mov_vec;
+        break;
+    default:
+        g_assert_not_reached();
     }
     op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
     op->args[0] = dst;
     op->args[1] = src;
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    const TCGOpDef *def = &tcg_op_defs[op->opc];
-    TCGType type;
-    TCGTemp *tv;
-
-    if (def->flags & TCG_OPF_VECTOR) {
-        type = TCGOP_VECL(op) + TCG_TYPE_V64;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        type = TCG_TYPE_I64;
-    } else {
-        type = TCG_TYPE_I32;
-    }
-
     /* Convert movi to mov with constant temp. */
-    tv = tcg_constant_internal(type, val);
+    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     }
 }
 
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
+                                    uint64_t x, uint64_t y)
 {
-    const TCGOpDef *def = &tcg_op_defs[op];
     uint64_t res = do_constant_folding_2(op, x, y);
-    if (!(def->flags & TCG_OPF_64BIT)) {
+    if (type == TCG_TYPE_I32) {
         res = (int32_t)res;
     }
     return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
  * Return -1 if the condition can't be simplified,
  * and the result of the condition (0 or 1) if it can.
  */
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+static int do_constant_folding_cond(TCGType type, TCGArg x,
                                     TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
 
     if (arg_is_const(x) && arg_is_const(y)) {
-        const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-        if (def->flags & TCG_OPF_64BIT) {
-            return do_constant_folding_cond_64(xv, yv, c);
-        } else {
+        switch (type) {
+        case TCG_TYPE_I32:
             return do_constant_folding_cond_32(xv, yv, c);
+        case TCG_TYPE_I64:
+            return do_constant_folding_cond_64(xv, yv, c);
+        default:
+            /* Only scalar comparisons are optimizable */
+            return -1;
         }
     } else if (args_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = do_constant_folding(op->opc, t, 0);
+        t = do_constant_folding(op->opc, ctx->type, t, 0);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
         uint64_t t1 = arg_info(op->args[1])->val;
         uint64_t t2 = arg_info(op->args[2])->val;
 
-        t1 = do_constant_folding(op->opc, t1, t2);
+        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                      op->args[2], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
             goto do_brcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
-        t = do_constant_folding(op->opc, t, op->args[2]);
+        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         uint64_t t = arg_info(op->args[1])->val;
 
         if (t != 0) {
-            t = do_constant_folding(op->opc, t, 0);
+            t = do_constant_folding(op->opc, ctx->type, t, 0);
             return tcg_opt_gen_movi(ctx, op, op->args[0], t);
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
-    TCGOpcode opc = op->opc;
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
+        TCGOpcode opc;
 
-        opc = (opc == INDEX_op_movcond_i32
-               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+        switch (ctx->type) {
+        case TCG_TYPE_I32:
+            opc = INDEX_op_setcond_i32;
+            break;
+        case TCG_TYPE_I64:
+            opc = INDEX_op_setcond_i64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         if (tv == 1 && fv == 0) {
             op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
             goto do_setcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                      op->args[4], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
         copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
+        /* Pre-compute the type of the operation. */
+        if (def->flags & TCG_OPF_VECTOR) {
+            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
+        } else if (def->flags & TCG_OPF_64BIT) {
+            ctx.type = TCG_TYPE_I64;
+        } else {
+            ctx.type = TCG_TYPE_I32;
+        }
+
         /* For commutative operations make constant second argument */
         switch (opc) {
         CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (opc == INDEX_op_sub_i32) {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
-                } else if (opc == INDEX_op_sub_i64) {
+                    break;
+                case TCG_TYPE_I64:
                     neg_op = INDEX_op_neg_i64;
                     have_neg = TCG_TARGET_HAS_neg_i64;
-                } else if (TCG_TARGET_HAS_neg_vec) {
-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
-                    unsigned vece = TCGOP_VECE(op);
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-                } else {
                     break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    neg_op = INDEX_op_neg_vec;
+                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+                                                   TCGOP_VECE(op)) > 0;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_neg) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode not_op;
                 bool have_not;
 
-                if (def->flags & TCG_OPF_VECTOR) {
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                } else if (def->flags & TCG_OPF_64BIT) {
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                } else {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     not_op = INDEX_op_not_i32;
                     have_not = TCG_TARGET_HAS_not_i32;
+                    break;
+                case TCG_TYPE_I64:
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                    break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    not_op = INDEX_op_not_vec;
+                    have_not = TCG_TARGET_HAS_not_vec;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_not) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
         partmask = z_mask;
-        if (!(def->flags & TCG_OPF_64BIT)) {
+        if (ctx.type == TCG_TYPE_I32) {
             z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
-- 
2.25.1

Split out the conditional conversion from a more complex logical
operation to a simple NOT.  Create a couple more helpers to make
this easy for the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Convert @op to NOT, if NOT is supported by the host.
+ * Return true f the conversion is successful, which will still
+ * indicate that the processing is complete.
+ */
+static bool fold_not(OptContext *ctx, TCGOp *op);
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+{
+    TCGOpcode not_op;
+    bool have_not;
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        not_op = INDEX_op_not_i32;
+        have_not = TCG_TARGET_HAS_not_i32;
+        break;
+    case TCG_TYPE_I64:
+        not_op = INDEX_op_not_i64;
+        have_not = TCG_TARGET_HAS_not_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        not_op = INDEX_op_not_vec;
+        have_not = TCG_TARGET_HAS_not_vec;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_not) {
+        op->opc = not_op;
+        op->args[1] = op->args[idx];
+        return fold_not(ctx, op);
+    }
+    return false;
+}
+
+/* If the binary operation has first argument @i, fold to NOT. */
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return fold_to_not(ctx, op, 2);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to @i. */
 static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to NOT. */
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return fold_to_not(ctx, op, 1);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, -1)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    /* Because of fold_to_not, we want to always return true, via finish. */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_ix_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             }
             break;
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(nand):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64(nor):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(andc):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == -1) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        try_not:
-            {
-                TCGOpcode not_op;
-                bool have_not;
-
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    not_op = INDEX_op_not_i32;
-                    have_not = TCG_TARGET_HAS_not_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_not) {
-                    break;
-                }
-                op->opc = not_op;
-                reset_temp(op->args[0]);
-                op->args[1] = op->args[i];
-                continue;
-            }
         default:
             break;
         }
-- 
2.25.1

Even though there is only one user, place this more complex
conversion into its own helper.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+    /*
+     * Because of fold_sub_to_neg, we want to always return true,
+     * via finish_folding.
+     */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode neg_op;
+    bool have_neg;
+
+    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
+        return false;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        neg_op = INDEX_op_neg_i32;
+        have_neg = TCG_TARGET_HAS_neg_i32;
+        break;
+    case TCG_TYPE_I64:
+        neg_op = INDEX_op_neg_i64;
+        have_neg = TCG_TARGET_HAS_neg_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        neg_op = INDEX_op_neg_vec;
+        have_neg = (TCG_TARGET_HAS_neg_vec &&
+                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_neg) {
+        op->opc = neg_op;
+        op->args[1] = op->args[2];
+        return fold_neg(ctx, op);
+    }
+    return false;
+}
+
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 continue;
             }
             break;
-        CASE_OP_32_64_VEC(sub):
-            {
-                TCGOpcode neg_op;
-                bool have_neg;
-
-                if (arg_is_const(op->args[2])) {
-                    /* Proceed with possible constant folding. */
-                    break;
-                }
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    neg_op = INDEX_op_neg_i32;
-                    have_neg = TCG_TARGET_HAS_neg_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    neg_op = INDEX_op_neg_i64;
-                    have_neg = TCG_TARGET_HAS_neg_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-                                                   TCGOP_VECE(op)) > 0;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_neg) {
-                    break;
-                }
-                if (arg_is_const(op->args[1])
-                    && arg_info(op->args[1])->val == 0) {
-                    op->opc = neg_op;
-                    reset_temp(op->args[0]);
-                    op->args[1] = op->args[2];
-                    continue;
-                }
-            }
-            break;
         default:
             break;
         }
-- 
2.25.1

Pull the "op r, a, i => mov r, a" optimization into a function,
and use them in the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to identity. */
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to NOT. */
 static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Pull the "op r, 0, b => movi r, 0" optimization into a function,
and use it in fold_shift.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
     return false;
 }
 
+/* If the binary operation has first argument @i, fold to @i. */
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has first argument @i, fold to NOT. */
 static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
-           and "sub r, 0, a => neg r, a" case.  */
-        switch (opc) {
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Move all of the known-zero optimizations into the per-opcode
functions.  Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
 1 file changed, 294 insertions(+), 251 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGTempSet temps_used;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;
+    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
+    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    uint64_t a_mask = ctx->a_mask;
+    uint64_t z_mask = ctx->z_mask;
+
+    /*
+     * 32-bit ops generate 32-bit results.  For the result is zero test
+     * below, we can ignore high bits, but for further optimizations we
+     * need to record that the high bits contain garbage.
+     */
+    if (ctx->type == TCG_TYPE_I32) {
+        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+        a_mask &= MAKE_64BIT_MASK(0, 32);
+        z_mask &= MAKE_64BIT_MASK(0, 32);
+    }
+
+    if (z_mask == 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
+    }
+    if (a_mask == 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * Convert @op to NOT, if NOT is supported by the host.
  * Return true f the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1, z2;
+
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+    z2 = arg_info(op->args[2])->z_mask;
+    ctx->z_mask = z1 & z2;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer affected bits from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        ctx->a_mask = z1 & ~z2;
+    }
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer anything from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+        ctx->a_mask = z1 & ~z2;
+        z1 &= z2;
+    }
+    ctx->z_mask = z1;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, sign;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask = arg_info(op->args[1])->z_mask;
+    switch (op->opc) {
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+        z_mask = bswap16(z_mask);
+        sign = INT16_MIN;
+        break;
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+        z_mask = bswap32(z_mask);
+        sign = INT32_MIN;
+        break;
+    case INDEX_op_bswap64_i64:
+        z_mask = bswap64(z_mask);
+        sign = INT64_MIN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+    case TCG_BSWAP_OZ:
+        break;
+    case TCG_BSWAP_OS:
+        /* If the sign bit may be 1, force all the bits above to 1. */
+        if (z_mask & sign) {
+            z_mask |= sign;
+        }
+        break;
+    default:
+        /* The high bits are undefined: force all bits above the sign to 1. */
+        z_mask |= sign << 1;
+        break;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
     }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        z_mask = 31;
+        break;
+    case TCG_TYPE_I64:
+        z_mask = 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+
     return false;
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        ctx->z_mask = 32 | 31;
+        break;
+    case TCG_TYPE_I64:
+        ctx->z_mask = 64 | 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         t1 = deposit64(t1, op->args[3], op->args[4], t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
+
+    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                            op->args[3], op->args[4],
+                            arg_info(op->args[2])->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         t = extract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask, sign;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8s):
+        sign = INT8_MIN;
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16s):
+        sign = INT16_MIN;
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_ext_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32s_i64:
+        sign = INT32_MIN;
+        z_mask = (uint32_t)z_mask;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (z_mask & sign) {
+        z_mask |= sign;
+    } else if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8u):
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16u):
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_extrl_i64_i32:
+    case INDEX_op_extu_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32u_i64:
+        z_mask = (uint32_t)z_mask;
+        break;
+    case INDEX_op_extrh_i64_i32:
+        type_change = true;
+        z_mask >>= 32;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ctx->z_mask = z_mask;
+    if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    return fold_masks(ctx, op);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
+    ctx->z_mask = arg_info(op->args[3])->z_mask
+                | arg_info(op->args[4])->z_mask;
+
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
+
+    /* Set to 1 all bits to the left of the rightmost.  */
+    z_mask = arg_info(op->args[1])->z_mask;
+    ctx->z_mask = -(z_mask & -z_mask);
+
     /*
      * Because of fold_sub_to_neg, we want to always return true,
      * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+    MemOp mop = get_memop(oi);
+    int width = 8 * memop_size(mop);
+
+    if (!(mop & MO_SIGN) && width < 64) {
+        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    }
+
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
+
+    ctx->z_mask = 1;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_setcond_i32;
         break;
     }
+
+    ctx->z_mask = 1;
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
+    int64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
         t = sextract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0 && z_mask >= 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
+
+    if (arg_is_const(op->args[2])) {
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
+                                          arg_info(op->args[1])->z_mask,
+                                          arg_info(op->args[2])->val);
+        return fold_masks(ctx, op);
+    }
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
     return fold_addsub2_i32(ctx, op, false);
 }
 
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+{
+    /* We can't do any folding with a load, but we can record bits. */
+    switch (op->opc) {
+    CASE_OP_32_64(ld8u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        break;
+    CASE_OP_32_64(ld16u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        break;
+    case INDEX_op_ld32u_i64:
+        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
         bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify using known-zero bits. Currently only ops with a single
-           output argument is supported. */
-        z_mask = -1;
-        affected = -1;
-        switch (opc) {
-        CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext8u):
-            z_mask = 0xff;
-            goto and_const;
-        CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext16u):
-            z_mask = 0xffff;
-            goto and_const;
-        case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_ext32u_i64:
-            z_mask = 0xffffffffU;
-            goto and_const;
-
-        CASE_OP_32_64(and):
-            z_mask = arg_info(op->args[2])->z_mask;
-            if (arg_is_const(op->args[2])) {
-        and_const:
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-            break;
-
-        case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_extu_i32_i64:
-            /* We do not compute affected as it is a size changing op.  */
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-
-        CASE_OP_32_64(andc):
-            /* Known-zeros does not imply known-ones.  Therefore unless
-               op->args[2] is constant, we can't infer anything from it.  */
-            if (arg_is_const(op->args[2])) {
-                z_mask = ~arg_info(op->args[2])->z_mask;
-                goto and_const;
-            }
-            /* But we certainly know nothing outside args[1] may be set. */
-            z_mask = arg_info(op->args[1])->z_mask;
-            break;
-
-        case INDEX_op_sar_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_sar_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_shr_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_shr_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_extrl_i64_i32:
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-        case INDEX_op_extrh_i64_i32:
-            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-            break;
-
-        CASE_OP_32_64(shl):
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                z_mask = arg_info(op->args[1])->z_mask << tmp;
-            }
-            break;
-
-        CASE_OP_32_64(neg):
-            /* Set to 1 all bits to the left of the rightmost.  */
-            z_mask = -(arg_info(op->args[1])->z_mask
-                       & -arg_info(op->args[1])->z_mask);
-            break;
-
-        CASE_OP_32_64(deposit):
-            z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                               op->args[3], op->args[4],
-                               arg_info(op->args[2])->z_mask);
-            break;
-
-        CASE_OP_32_64(extract):
-            z_mask = extract64(arg_info(op->args[1])->z_mask,
-                               op->args[2], op->args[3]);
-            if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-        CASE_OP_32_64(sextract):
-            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-                                op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(xor):
-            z_mask = arg_info(op->args[1])->z_mask
-                   | arg_info(op->args[2])->z_mask;
-            break;
-
-        case INDEX_op_clz_i32:
-        case INDEX_op_ctz_i32:
-            z_mask = arg_info(op->args[2])->z_mask | 31;
-            break;
-
-        case INDEX_op_clz_i64:
-        case INDEX_op_ctz_i64:
-            z_mask = arg_info(op->args[2])->z_mask | 63;
-            break;
-
-        case INDEX_op_ctpop_i32:
-            z_mask = 32 | 31;
-            break;
-        case INDEX_op_ctpop_i64:
-            z_mask = 64 | 63;
-            break;
-
-        CASE_OP_32_64(setcond):
-        case INDEX_op_setcond2_i32:
-            z_mask = 1;
-            break;
-
-        CASE_OP_32_64(movcond):
-            z_mask = arg_info(op->args[3])->z_mask
-                   | arg_info(op->args[4])->z_mask;
-            break;
-
-        CASE_OP_32_64(ld8u):
-            z_mask = 0xff;
-            break;
-        CASE_OP_32_64(ld16u):
-            z_mask = 0xffff;
-            break;
-        case INDEX_op_ld32u_i64:
-            z_mask = 0xffffffffu;
-            break;
-
-        CASE_OP_32_64(qemu_ld):
-            {
-                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-                MemOp mop = get_memop(oi);
-                if (!(mop & MO_SIGN)) {
-                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-                }
-            }
-            break;
-
-        CASE_OP_32_64(bswap16):
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffff) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap16(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int16_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(16, 48);
-                break;
-            }
-            break;
-
-        case INDEX_op_bswap32_i64:
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffffffffu) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap32(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int32_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(32, 32);
-                break;
-            }
-            break;
-
-        default:
-            break;
-        }
-
-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-           below, we can ignore high bits, but for further optimizations we
-           need to record that the high bits contain garbage.  */
-        partmask = z_mask;
-        if (ctx.type == TCG_TYPE_I32) {
-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-            partmask &= 0xffffffffu;
-            affected &= 0xffffffffu;
-        }
-        ctx.z_mask = z_mask;
-
-        if (partmask == 0) {
-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-            continue;
-        }
-        if (affected == 0) {
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
-        }
+        /* Assume all bits affected, and no bits known zero. */
+        ctx.a_mask = -1;
+        ctx.z_mask = -1;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32u_i64:
+            done = fold_tcg_ld(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
-- 
2.25.1

Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
and muls2_i64.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint32_t a = arg_info(op->args[2])->val;
-        uint32_t b = arg_info(op->args[3])->val;
-        uint64_t r = (uint64_t)a * b;
+        uint64_t a = arg_info(op->args[2])->val;
+        uint64_t b = arg_info(op->args[3])->val;
+        uint64_t h, l;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
+
+        switch (op->opc) {
+        case INDEX_op_mulu2_i32:
+            l = (uint64_t)(uint32_t)a * (uint32_t)b;
+            h = (int32_t)(l >> 32);
+            l = (int32_t)l;
+            break;
+        case INDEX_op_muls2_i32:
+            l = (int64_t)(int32_t)a * (int32_t)b;
+            h = l >> 32;
+            l = (int32_t)l;
+            break;
+        case INDEX_op_mulu2_i64:
+            mulu64(&l, &h, a, b);
+            break;
+        case INDEX_op_muls2_i64:
+            muls64(&l, &h, a, b);
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, l);
+        tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
-        case INDEX_op_mulu2_i32:
-            done = fold_mulu2_i32(&ctx, op);
+        CASE_OP_32_64(muls2):
+        CASE_OP_32_64(mulu2):
+            done = fold_multiply2(&ctx, op);
             break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
-- 
2.25.1

Rename to fold_addsub2.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/int128.h"
 #include "tcg/tcg-op.h"
 #include "tcg-internal.h"
 
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-        uint32_t al = arg_info(op->args[2])->val;
-        uint32_t ah = arg_info(op->args[3])->val;
-        uint32_t bl = arg_info(op->args[4])->val;
-        uint32_t bh = arg_info(op->args[5])->val;
-        uint64_t a = ((uint64_t)ah << 32) | al;
-        uint64_t b = ((uint64_t)bh << 32) | bl;
+        uint64_t al = arg_info(op->args[2])->val;
+        uint64_t ah = arg_info(op->args[3])->val;
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
 
-        if (add) {
-            a += b;
+        if (ctx->type == TCG_TYPE_I32) {
+            uint64_t a = deposit64(al, 32, 32, ah);
+            uint64_t b = deposit64(bl, 32, 32, bh);
+
+            if (add) {
+                a += b;
+            } else {
+                a -= b;
+            }
+
+            al = sextract64(a, 0, 32);
+            ah = sextract64(a, 32, 32);
         } else {
-            a -= b;
+            Int128 a = int128_make128(al, ah);
+            Int128 b = int128_make128(bl, bh);
+
+            if (add) {
+                a = int128_add(a, b);
+            } else {
+                a = int128_sub(a, b);
+            }
+
+            al = int128_getlo(a);
+            ah = int128_gethi(a);
         }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, al);
+        tcg_opt_gen_movi(ctx, op2, rh, ah);
         return true;
     }
     return false;
 }
 
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, true);
+    return fold_addsub2(ctx, op, true);
 }
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, false);
+    return fold_addsub2(ctx, op, false);
 }
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
-        case INDEX_op_add2_i32:
-            done = fold_add2_i32(&ctx, op);
+        CASE_OP_32_64(add2):
+            done = fold_add2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-        case INDEX_op_sub2_i32:
-            done = fold_sub2_i32(&ctx, op);
+        CASE_OP_32_64(sub2):
+            done = fold_sub2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
-- 
2.25.1

Most of these are handled by creating a fold_const2_commutative
to handle all of the binary operators.  The rest were already
handled on a case-by-case basis in the switch, and have their
own fold function in which to place the call.

We now have only one major switch on TCGOpcode.

Introduce NO_DEST and a block comment for swap_commutative in
order to make the handling of brcond and movcond opcodes cleaner.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     return -1;
 }
 
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
     return fold_addsub2(ctx, op, true);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     uint64_t z1, z2;
 
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     if (i >= 0) {
         goto do_brcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
         uint64_t a = arg_info(op->args[2])->val;
         uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
     if (i >= 0) {
         goto do_setcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Assume all bits affected, and no bits known zero. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
-- 
2.25.1

This "garbage" setting pre-dates the addition of the type
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
and INDEX_op_extr{l,h}_i64_i32.

So now we have a definitive points at which to adjust z_mask
to eliminate such bits from the 32-bit operands.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *src_ts = arg_temp(src);
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+    TCGTemp *tv;
 
+    if (ctx->type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(ctx->type, val);
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     uint64_t z_mask = ctx->z_mask;
 
     /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        ctx->z_mask = z_mask;
     }
 
     if (z_mask == 0) {
-- 
2.25.1

Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
-- 
2.25.1

Sign repetitions are perforce all identical, whether they are 1 or 0.
Bitwise operations preserve the relative quantity of the repetitions.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
     z2 = arg_info(op->args[2])->z_mask;
     ctx->z_mask = z1 & z2;
 
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     }
     ctx->z_mask = z1;
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[3])->z_mask
                 | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
 
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
         return true;
     }
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
     /* Because of fold_to_not, we want to always return true, via finish. */
     finish_folding(ctx, op);
     return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
-- 
2.25.1

For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

ext32s_i64  t, in
    sar_i64     t, t, v
    ext32s_i64  out, t

allowing the final extend to be eliminated.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
     return ~(~0ull >> rep);
 }
 
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask */
+    return smask_from_zmask(~smask);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask, z_mask, sign;
+
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
 
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
     if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
         return fold_masks(ctx, op);
     }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduced the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
     return false;
 }
 
-- 
2.25.1