The following changes since commit 05de778b5b8ab0b402996769117b88c7ea5c7c61:

  Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging (2021-07-09 14:30:01 +0100)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20210710

for you to fetch changes up to ad1a706f386c2281adb0b09257d892735e405834:

  cpu: Add breakpoint tracepoints (2021-07-09 21:31:11 -0700)

----------------------------------------------------------------
Add translator_use_goto_tb.
Cleanups in prep of breakpoint fixes.
Misc fixes.

----------------------------------------------------------------
Liren Wei (2):
      accel/tcg: Hoist tcg_tb_insert() up above tb_link_page()
      tcg: Bake tb_destroy() into tcg_region_tree

Philippe Mathieu-Daudé (1):
      tcg: Avoid including 'trace-tcg.h' in target translate.c

Richard Henderson (38):
      tcg: Add separator in INDEX_op_call dump
      tcg: Move tb_phys_invalidate_count to tb_ctx
      accel/tcg: Introduce translator_use_goto_tb
      target/alpha: Remove use_exit_tb
      target/alpha: Remove in_superpage
      target/alpha: Use translator_use_goto_tb
      target/arm: Use DISAS_TOO_MANY for ISB and SB
      target/arm: Use translator_use_goto_tb for aarch64
      target/arm: Use translator_use_goto_tb for aarch32
      target/avr: Use translator_use_goto_tb
      target/avr: Mark some helpers noreturn
      target/cris: Use translator_use_goto_tb
      target/hppa: Use translator_use_goto_tb
      target/i386: Use translator_use_goto_tb
      target/m68k: Use translator_use_goto_tb
      target/microblaze: Use translator_use_goto_tb
      target/mips: Use translator_use_goto_tb
      target/mips: Fix missing else in gen_goto_tb
      target/nios2: Use translator_use_goto_tb
      target/openrisc: Use translator_use_goto_tb
      target/ppc: Use translator_use_goto_tb
      target/riscv: Use translator_use_goto_tb
      target/rx: Use translator_use_goto_tb
      target/s390x: Use translator_use_goto_tb
      target/s390x: Remove use_exit_tb
      target/sh4: Use translator_use_goto_tb
      target/sparc: Use translator_use_goto_tb
      target/tricore: Use translator_use_goto_tb
      target/tricore: Use tcg_gen_lookup_and_goto_ptr
      target/xtensa: Use translator_use_goto_tb
      tcg: Fix prologue disassembly
      target/i386: Use cpu_breakpoint_test in breakpoint_handler
      accel/tcg: Move helper_lookup_tb_ptr to cpu-exec.c
      accel/tcg: Move tb_lookup to cpu-exec.c
      accel/tcg: Split out log_cpu_exec
      accel/tcg: Log tb->cflags with -d exec
      tcg: Remove TCG_TARGET_HAS_goto_ptr
      cpu: Add breakpoint tracepoints

 accel/tcg/tb-context.h | 1 +
 accel/tcg/tb-lookup.h | 49 ----------------
 include/exec/translator.h | 10 ++++
 include/tcg/tcg-opc.h | 3 +-
 include/tcg/tcg.h | 4 --
 target/avr/helper.h | 8 +--
 tcg/aarch64/tcg-target.h | 1 -
 tcg/arm/tcg-target.h | 1 -
 tcg/i386/tcg-target.h | 1 -
 tcg/mips/tcg-target.h | 1 -
 tcg/ppc/tcg-target.h | 1 -
 tcg/riscv/tcg-target.h | 1 -
 tcg/s390/tcg-target.h | 1 -
 tcg/sparc/tcg-target.h | 1 -
 tcg/tci/tcg-target.h | 1 -
 accel/tcg/cpu-exec.c | 112 ++++++++++++++++++++++++++++--------
 accel/tcg/tcg-runtime.c | 22 -------
 accel/tcg/translate-all.c | 23 ++++----
 accel/tcg/translator.c | 11 ++++
 cpu.c | 13 +++--
 target/alpha/translate.c | 47 ++-------------
 target/arm/translate-a64.c | 26 ++-------
 target/arm/translate-sve.c | 1 -
 target/arm/translate.c | 17 +-----
 target/avr/translate.c | 9 ++-
 target/cris/translate.c | 6 +-
 target/hppa/translate.c | 6 +-
 target/i386/tcg/sysemu/bpt_helper.c | 12 +---
 target/i386/tcg/translate.c | 15 +----
 target/m68k/translate.c | 13 +----
 target/microblaze/translate.c | 12 +---
 target/mips/tcg/translate.c | 21 ++-----
 target/nios2/translate.c | 15 +----
 target/openrisc/translate.c | 16 +++---
 target/ppc/translate.c | 11 +---
 target/riscv/translate.c | 20 +------
 target/rx/translate.c | 12 +---
 target/s390x/translate.c | 19 +-----
 target/sh4/translate.c | 12 +---
 target/sparc/translate.c | 20 ++-----
 target/tricore/translate.c | 20 ++-----
 target/xtensa/translate.c | 7 +--
 tcg/region.c | 33 +++--------
 tcg/tcg-op.c | 2 +-
 tcg/tcg.c | 14 ++---
 trace-events | 5 ++
 46 files changed, 217 insertions(+), 439 deletions(-)
 delete mode 100644 accel/tcg/tb-lookup.h

The following changes since commit aa33508196f4e2da04625bee36e1f7be5b9267e7:

  Merge tag 'mem-2023-05-23' of https://github.com/davidhildenbrand/qemu into staging (2023-05-23 10:57:25 -0700)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230523

for you to fetch changes up to 30d56836f98c7ed2d309bff1dde8854f3d0b5634:

  tcg: Remove USE_TCG_OPTIMIZATIONS (2023-05-23 16:52:39 -0700)

----------------------------------------------------------------
util: Host cpu detection for x86 and aa64
util: Use cpu detection for bufferiszero
migration: Use cpu detection for xbzrle
tcg: Replace and remove cpu_atomic_{ld,st}o*
host/include: Split qemu/atomic128.h
tcg: Remove DEBUG_DISAS
tcg: Remove USE_TCG_OPTIMIZATIONS

----------------------------------------------------------------
Richard Henderson (28):
      util: Introduce host-specific cpuinfo.h
      util: Add cpuinfo-i386.c
      util: Add i386 CPUINFO_ATOMIC_VMOVDQU
      tcg/i386: Use host/cpuinfo.h
      util/bufferiszero: Use i386 host/cpuinfo.h
      migration/xbzrle: Shuffle function order
      migration/xbzrle: Use i386 host/cpuinfo.h
      migration: Build migration_files once
      util: Add cpuinfo-aarch64.c
      include/host: Split out atomic128-cas.h
      include/host: Split out atomic128-ldst.h
      meson: Fix detect atomic128 support with optimization
      include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h
      target/ppc: Use tcg_gen_qemu_{ld,st}_i128 for LQARX, LQ, STQ
      target/s390x: Use tcg_gen_qemu_{ld,st}_i128 for LPQ, STPQ
      accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu
      target/s390x: Use cpu_{ld,st}*_mmu in do_csst
      target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu in do_csst
      accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu
      accel/tcg: Remove prot argument to atomic_mmu_lookup
      accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128
      qemu/atomic128: Split atomic16_read
      accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc
      tcg: Split out tcg/debug-assert.h
      qemu/atomic128: Improve cmpxchg fallback for atomic16_set
      qemu/atomic128: Add runtime test for FEAT_LSE2
      tcg: Remove DEBUG_DISAS
      tcg: Remove USE_TCG_OPTIMIZATIONS

 accel/tcg/atomic_template.h | 93 +-----
 host/include/aarch64/host/atomic128-cas.h | 45 +++
 host/include/aarch64/host/atomic128-ldst.h | 79 +++++
 host/include/aarch64/host/cpuinfo.h | 22 ++
 host/include/generic/host/atomic128-cas.h | 47 +++
 host/include/generic/host/atomic128-ldst.h | 81 +++++
 host/include/generic/host/cpuinfo.h | 4 +
 host/include/i386/host/cpuinfo.h | 39 +++
 host/include/x86_64/host/cpuinfo.h | 1 +
 include/exec/cpu_ldst.h | 67 +----
 include/exec/exec-all.h | 3 -
 include/qemu/atomic128.h | 146 ++-------
 include/tcg/debug-assert.h | 17 ++
 include/tcg/tcg.h | 9 +-
 migration/xbzrle.h | 5 +-
 target/ppc/cpu.h | 1 -
 target/ppc/helper.h | 9 -
 target/s390x/cpu.h | 3 -
 target/s390x/helper.h | 4 -
 tcg/aarch64/tcg-target.h | 6 +-
 tcg/i386/tcg-target.h | 28 +-
 accel/tcg/cpu-exec.c | 2 -
 accel/tcg/cputlb.c | 211 ++++---------
 accel/tcg/translate-all.c | 2 -
 accel/tcg/translator.c | 2 -
 accel/tcg/user-exec.c | 332 ++++++--------------
 migration/ram.c | 34 +--
 migration/xbzrle.c | 268 +++++++++--------
 target/arm/tcg/m_helper.c | 4 +-
 target/ppc/mem_helper.c | 48 ---
 target/ppc/translate.c | 34 +--
 target/s390x/tcg/mem_helper.c | 137 ++-------
 target/s390x/tcg/translate.c | 30 +-
 target/sh4/translate.c | 2 -
 target/sparc/ldst_helper.c | 18 +-
 target/sparc/translate.c | 2 -
 tcg/tcg.c | 14 +-
 tests/bench/xbzrle-bench.c | 469 -----------------------------
 tests/unit/test-xbzrle.c | 49 +--
 util/bufferiszero.c | 127 +++-----
 util/cpuinfo-aarch64.c | 67 +++++
 util/cpuinfo-i386.c | 99 ++++++
 MAINTAINERS | 3 +
 accel/tcg/atomic_common.c.inc | 14 -
 accel/tcg/ldst_atomicity.c.inc | 135 ++-------
 accel/tcg/ldst_common.c.inc | 24 +-
 meson.build | 12 +-
 migration/meson.build | 1 -
 target/ppc/translate/fixedpoint-impl.c.inc | 51 +---
 target/s390x/tcg/insn-data.h.inc | 2 +-
 tcg/aarch64/tcg-target.c.inc | 40 ---
 tcg/i386/tcg-target.c.inc | 123 +-------
 tests/bench/meson.build | 6 -
 util/meson.build | 6 +
 54 files changed, 1035 insertions(+), 2042 deletions(-)
 create mode 100644 host/include/aarch64/host/atomic128-cas.h
 create mode 100644 host/include/aarch64/host/atomic128-ldst.h
 create mode 100644 host/include/aarch64/host/cpuinfo.h
 create mode 100644 host/include/generic/host/atomic128-cas.h
 create mode 100644 host/include/generic/host/atomic128-ldst.h
 create mode 100644 host/include/generic/host/cpuinfo.h
 create mode 100644 host/include/i386/host/cpuinfo.h
 create mode 100644 host/include/x86_64/host/cpuinfo.h
 create mode 100644 include/tcg/debug-assert.h
 delete mode 100644 tests/bench/xbzrle-bench.c
 create mode 100644 util/cpuinfo-aarch64.c
 create mode 100644 util/cpuinfo-i386.c
1
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
1
The entire contents of the header is host-specific, but the
2
existence of such a header is not, which could prevent some
3
host specific ifdefs at the top of the file for the include.
4
5
Add host/include/{arch,generic} to the project arguments.
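The effect can be sketched with a hypothetical consumer (the helper below is illustrative and not part of this patch; CPUINFO_POPCNT and cpuinfo_init() are only introduced by later patches in this series): every file includes the same header name, the -iquote ordering resolves it to the host-specific version when one exists, and it falls back to the empty generic stub otherwise, so no per-host #ifdef is needed around the #include itself.

    /* Hypothetical consumer, for illustration only. */
    #include "qemu/osdep.h"
    #include "host/cpuinfo.h"   /* host/include/<arch>/ if present, else generic */

    static bool host_has_popcnt(void)
    {
    #ifdef CPUINFO_POPCNT
        return cpuinfo_init() & CPUINFO_POPCNT;
    #else
        return false;           /* the generic stub defines no feature bits */
    #endif
    }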
6
7
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
9
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
11
---
4
cpu.c | 13 +++++++++----
12
host/include/generic/host/cpuinfo.h | 4 ++++
5
trace-events | 5 +++++
13
meson.build | 10 ++++++++++
6
2 files changed, 14 insertions(+), 4 deletions(-)
14
2 files changed, 14 insertions(+)
15
create mode 100644 host/include/generic/host/cpuinfo.h
7
16
8
diff --git a/cpu.c b/cpu.c
17
diff --git a/host/include/generic/host/cpuinfo.h b/host/include/generic/host/cpuinfo.h
18
new file mode 100644
19
index XXXXXXX..XXXXXXX
20
--- /dev/null
21
+++ b/host/include/generic/host/cpuinfo.h
22
@@ -XXX,XX +XXX,XX @@
23
+/*
24
+ * No host specific cpu identification.
25
+ * SPDX-License-Identifier: GPL-2.0-or-later
26
+ */
27
diff --git a/meson.build b/meson.build
9
index XXXXXXX..XXXXXXX 100644
28
index XXXXXXX..XXXXXXX 100644
10
--- a/cpu.c
29
--- a/meson.build
11
+++ b/cpu.c
30
+++ b/meson.build
12
@@ -XXX,XX +XXX,XX @@
31
@@ -XXX,XX +XXX,XX @@ add_project_arguments('-iquote', '.',
13
#include "exec/translate-all.h"
32
'-iquote', meson.current_source_dir() / 'include',
14
#include "exec/log.h"
33
language: all_languages)
15
#include "hw/core/accel-cpu.h"
34
16
+#include "trace/trace-root.h"
35
+# If a host-specific include directory exists, list that first...
17
36
+host_include = meson.current_source_dir() / 'host/include/'
18
uintptr_t qemu_host_page_size;
37
+if fs.is_dir(host_include / host_arch)
19
intptr_t qemu_host_page_mask;
38
+ add_project_arguments('-iquote', host_include / host_arch,
20
@@ -XXX,XX +XXX,XX @@ int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
39
+ language: all_languages)
21
if (breakpoint) {
40
+endif
22
*breakpoint = bp;
41
+# ... followed by the generic fallback.
23
}
42
+add_project_arguments('-iquote', host_include / 'generic',
43
+ language: all_languages)
24
+
44
+
25
+ trace_breakpoint_insert(cpu->cpu_index, pc, flags);
45
sparse = find_program('cgcc', required: get_option('sparse'))
26
return 0;
46
if sparse.found()
27
}
47
run_target('sparse',
28
29
@@ -XXX,XX +XXX,XX @@ int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
30
}
31
32
/* Remove a specific breakpoint by reference. */
33
-void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
34
+void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *bp)
35
{
36
- QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
37
+ QTAILQ_REMOVE(&cpu->breakpoints, bp, entry);
38
39
- breakpoint_invalidate(cpu, breakpoint->pc);
40
+ breakpoint_invalidate(cpu, bp->pc);
41
42
- g_free(breakpoint);
43
+ trace_breakpoint_remove(cpu->cpu_index, bp->pc, bp->flags);
44
+ g_free(bp);
45
}
46
47
/* Remove all matching breakpoints. */
48
@@ -XXX,XX +XXX,XX @@ void cpu_single_step(CPUState *cpu, int enabled)
49
/* XXX: only flush what is necessary */
50
tb_flush(cpu);
51
}
52
+ trace_breakpoint_singlestep(cpu->cpu_index, enabled);
53
}
54
}
55
56
diff --git a/trace-events b/trace-events
57
index XXXXXXX..XXXXXXX 100644
58
--- a/trace-events
59
+++ b/trace-events
60
@@ -XXX,XX +XXX,XX @@
61
#
62
# The <format-string> should be a sprintf()-compatible format string.
63
64
+# cpu.c
65
+breakpoint_insert(int cpu_index, uint64_t pc, int flags) "cpu=%d pc=0x%" PRIx64 " flags=0x%x"
66
+breakpoint_remove(int cpu_index, uint64_t pc, int flags) "cpu=%d pc=0x%" PRIx64 " flags=0x%x"
67
+breakpoint_singlestep(int cpu_index, int enabled) "cpu=%d enable=%d"
68
+
69
# dma-helpers.c
70
dma_blk_io(void *dbs, void *bs, int64_t offset, bool to_dev) "dbs=%p bs=%p offset=%" PRId64 " to_dev=%d"
71
dma_aio_cancel(void *dbs) "dbs=%p"
72
--
48
--
73
2.25.1
49
2.34.1
74
50
75
51
1
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
1
Add cpuinfo.h for i386 and x86_64, and the initialization
2
for that in util/. Populate that with a slightly altered
3
copy of the tcg host probing code. Other uses of cpuid.h
4
will be adjusted one patch at a time.
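One point worth illustrating is the constructor discipline: because C constructor ordering across translation units is unspecified, any other constructor that needs the data calls cpuinfo_init() rather than reading the cpuinfo variable directly. A sketch of such a consumer (the names my_module_init and use_avx2_fast_path are made up for illustration):

    #include "qemu/osdep.h"
    #include "host/cpuinfo.h"

    static bool use_avx2_fast_path;   /* illustrative */

    static void __attribute__((constructor)) my_module_init(void)
    {
        /* Safe even if this runs before the constructor in cpuinfo-i386.c. */
        use_avx2_fast_path = !!(cpuinfo_init() & CPUINFO_AVX2);
    }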
5
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
9
---
4
accel/tcg/cpu-exec.c | 6 +++---
10
host/include/i386/host/cpuinfo.h | 38 ++++++++++++
5
1 file changed, 3 insertions(+), 3 deletions(-)
11
host/include/x86_64/host/cpuinfo.h | 1 +
6
12
util/cpuinfo-i386.c | 97 ++++++++++++++++++++++++++++++
7
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
13
MAINTAINERS | 2 +
14
util/meson.build | 4 ++
15
5 files changed, 142 insertions(+)
16
create mode 100644 host/include/i386/host/cpuinfo.h
17
create mode 100644 host/include/x86_64/host/cpuinfo.h
18
create mode 100644 util/cpuinfo-i386.c
19
20
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
21
new file mode 100644
22
index XXXXXXX..XXXXXXX
23
--- /dev/null
24
+++ b/host/include/i386/host/cpuinfo.h
25
@@ -XXX,XX +XXX,XX @@
26
+/*
27
+ * SPDX-License-Identifier: GPL-2.0-or-later
28
+ * Host specific cpu identification for x86.
29
+ */
30
+
31
+#ifndef HOST_CPUINFO_H
32
+#define HOST_CPUINFO_H
33
+
34
+/* Digested version of <cpuid.h> */
35
+
36
+#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
37
+#define CPUINFO_CMOV (1u << 1)
38
+#define CPUINFO_MOVBE (1u << 2)
39
+#define CPUINFO_LZCNT (1u << 3)
40
+#define CPUINFO_POPCNT (1u << 4)
41
+#define CPUINFO_BMI1 (1u << 5)
42
+#define CPUINFO_BMI2 (1u << 6)
43
+#define CPUINFO_SSE2 (1u << 7)
44
+#define CPUINFO_SSE4 (1u << 8)
45
+#define CPUINFO_AVX1 (1u << 9)
46
+#define CPUINFO_AVX2 (1u << 10)
47
+#define CPUINFO_AVX512F (1u << 11)
48
+#define CPUINFO_AVX512VL (1u << 12)
49
+#define CPUINFO_AVX512BW (1u << 13)
50
+#define CPUINFO_AVX512DQ (1u << 14)
51
+#define CPUINFO_AVX512VBMI2 (1u << 15)
52
+#define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
53
+
54
+/* Initialized with a constructor. */
55
+extern unsigned cpuinfo;
56
+
57
+/*
58
+ * We cannot rely on constructor ordering, so other constructors must
59
+ * use the function interface rather than the variable above.
60
+ */
61
+unsigned cpuinfo_init(void);
62
+
63
+#endif /* HOST_CPUINFO_H */
64
diff --git a/host/include/x86_64/host/cpuinfo.h b/host/include/x86_64/host/cpuinfo.h
65
new file mode 100644
66
index XXXXXXX..XXXXXXX
67
--- /dev/null
68
+++ b/host/include/x86_64/host/cpuinfo.h
69
@@ -0,0 +1 @@
70
+#include "host/include/i386/host/cpuinfo.h"
71
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
72
new file mode 100644
73
index XXXXXXX..XXXXXXX
74
--- /dev/null
75
+++ b/util/cpuinfo-i386.c
76
@@ -XXX,XX +XXX,XX @@
77
+/*
78
+ * SPDX-License-Identifier: GPL-2.0-or-later
79
+ * Host specific cpu identification for x86.
80
+ */
81
+
82
+#include "qemu/osdep.h"
83
+#include "host/cpuinfo.h"
84
+#ifdef CONFIG_CPUID_H
85
+# include "qemu/cpuid.h"
86
+#endif
87
+
88
+unsigned cpuinfo;
89
+
90
+/* Called both as constructor and (possibly) via other constructors. */
91
+unsigned __attribute__((constructor)) cpuinfo_init(void)
92
+{
93
+ unsigned info = cpuinfo;
94
+
95
+ if (info) {
96
+ return info;
97
+ }
98
+
99
+#ifdef CONFIG_CPUID_H
100
+ unsigned max, a, b, c, d, b7 = 0, c7 = 0;
101
+
102
+ max = __get_cpuid_max(0, 0);
103
+
104
+ if (max >= 7) {
105
+ __cpuid_count(7, 0, a, b7, c7, d);
106
+ info |= (b7 & bit_BMI ? CPUINFO_BMI1 : 0);
107
+ info |= (b7 & bit_BMI2 ? CPUINFO_BMI2 : 0);
108
+ }
109
+
110
+ if (max >= 1) {
111
+ __cpuid(1, a, b, c, d);
112
+
113
+ info |= (d & bit_CMOV ? CPUINFO_CMOV : 0);
114
+ info |= (d & bit_SSE2 ? CPUINFO_SSE2 : 0);
115
+ info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
116
+ info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
117
+ info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
118
+
119
+ /* For AVX features, we must check available and usable. */
120
+ if ((c & bit_AVX) && (c & bit_OSXSAVE)) {
121
+ unsigned bv = xgetbv_low(0);
122
+
123
+ if ((bv & 6) == 6) {
124
+ info |= CPUINFO_AVX1;
125
+ info |= (b7 & bit_AVX2 ? CPUINFO_AVX2 : 0);
126
+
127
+ if ((bv & 0xe0) == 0xe0) {
128
+ info |= (b7 & bit_AVX512F ? CPUINFO_AVX512F : 0);
129
+ info |= (b7 & bit_AVX512VL ? CPUINFO_AVX512VL : 0);
130
+ info |= (b7 & bit_AVX512BW ? CPUINFO_AVX512BW : 0);
131
+ info |= (b7 & bit_AVX512DQ ? CPUINFO_AVX512DQ : 0);
132
+ info |= (c7 & bit_AVX512VBMI2 ? CPUINFO_AVX512VBMI2 : 0);
133
+ }
134
+
135
+ /*
136
+ * The Intel SDM has added:
137
+ * Processors that enumerate support for Intel® AVX
138
+ * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
139
+ * guarantee that the 16-byte memory operations performed
140
+ * by the following instructions will always be carried
141
+ * out atomically:
142
+ * - MOVAPD, MOVAPS, and MOVDQA.
143
+ * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
144
+ * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
145
+ * with EVEX.128 and k0 (masking disabled).
146
+ * Note that these instructions require the linear addresses
147
+ * of their memory operands to be 16-byte aligned.
148
+ *
149
+ * AMD has provided an even stronger guarantee that processors
150
+ * with AVX provide 16-byte atomicity for all cachable,
151
+ * naturally aligned single loads and stores, e.g. MOVDQU.
152
+ *
153
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
154
+ */
155
+ __cpuid(0, a, b, c, d);
156
+ if (c == signature_INTEL_ecx || c == signature_AMD_ecx) {
157
+ info |= CPUINFO_ATOMIC_VMOVDQA;
158
+ }
159
+ }
160
+ }
161
+ }
162
+
163
+ max = __get_cpuid_max(0x8000000, 0);
164
+ if (max >= 1) {
165
+ __cpuid(0x80000001, a, b, c, d);
166
+ info |= (c & bit_LZCNT ? CPUINFO_LZCNT : 0);
167
+ }
168
+#endif
169
+
170
+ info |= CPUINFO_ALWAYS;
171
+ cpuinfo = info;
172
+ return info;
173
+}
174
diff --git a/MAINTAINERS b/MAINTAINERS
8
index XXXXXXX..XXXXXXX 100644
175
index XXXXXXX..XXXXXXX 100644
9
--- a/accel/tcg/cpu-exec.c
176
--- a/MAINTAINERS
10
+++ b/accel/tcg/cpu-exec.c
177
+++ b/MAINTAINERS
11
@@ -XXX,XX +XXX,XX @@ static inline void log_cpu_exec(target_ulong pc, CPUState *cpu,
178
@@ -XXX,XX +XXX,XX @@ F: include/exec/helper*.h
12
179
F: include/sysemu/cpus.h
13
qemu_log_mask(CPU_LOG_EXEC,
180
F: include/sysemu/tcg.h
14
"Trace %d: %p [" TARGET_FMT_lx
181
F: include/hw/core/tcg-cpu-ops.h
15
- "/" TARGET_FMT_lx "/%#x] %s\n",
182
+F: host/include/*/host/cpuinfo.h
16
- cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc, tb->flags,
183
+F: util/cpuinfo-*.c
17
- lookup_symbol(pc));
184
18
+ "/" TARGET_FMT_lx "/%08x/%08x] %s\n",
185
FPU emulation
19
+ cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
186
M: Aurelien Jarno <aurelien@aurel32.net>
20
+ tb->flags, tb->cflags, lookup_symbol(pc));
187
diff --git a/util/meson.build b/util/meson.build
21
188
index XXXXXXX..XXXXXXX 100644
22
#if defined(DEBUG_DISAS)
189
--- a/util/meson.build
23
if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
190
+++ b/util/meson.build
191
@@ -XXX,XX +XXX,XX @@ if have_block
192
endif
193
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
194
endif
195
+
196
+if cpu in ['x86', 'x86_64']
197
+ util_ss.add(files('cpuinfo-i386.c'))
198
+endif
24
--
199
--
25
2.25.1
200
2.34.1
26
201
27
202
We have not needed to end a TB for I/O since ba3e7926691
("icount: clean up cpu_can_io at the entry to the block").
We do not need to use exit_tb for singlestep, which only
means generate one insn per TB.

Which leaves only singlestep_enabled, which means raise a
debug trap after every TB, which does not use exit_tb,
which would leave the function mis-named.

Add a bit to indicate when VMOVDQU is also atomic if aligned.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
10
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
12
---
6
---
13
target/alpha/translate.c | 15 ++-------------
7
host/include/i386/host/cpuinfo.h | 1 +
14
1 file changed, 2 insertions(+), 13 deletions(-)
8
util/cpuinfo-i386.c | 4 +++-
9
2 files changed, 4 insertions(+), 1 deletion(-)
15
10
16
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
11
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
17
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
18
--- a/target/alpha/translate.c
13
--- a/host/include/i386/host/cpuinfo.h
19
+++ b/target/alpha/translate.c
14
+++ b/host/include/i386/host/cpuinfo.h
20
@@ -XXX,XX +XXX,XX @@ static bool in_superpage(DisasContext *ctx, int64_t addr)
15
@@ -XXX,XX +XXX,XX @@
21
#endif
16
#define CPUINFO_AVX512DQ (1u << 14)
22
}
17
#define CPUINFO_AVX512VBMI2 (1u << 15)
23
18
#define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
24
-static bool use_exit_tb(DisasContext *ctx)
19
+#define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
25
-{
20
26
- return ((tb_cflags(ctx->base.tb) & CF_LAST_IO)
21
/* Initialized with a constructor. */
27
- || ctx->base.singlestep_enabled
22
extern unsigned cpuinfo;
28
- || singlestep);
23
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
29
-}
24
index XXXXXXX..XXXXXXX 100644
30
-
25
--- a/util/cpuinfo-i386.c
31
static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
26
+++ b/util/cpuinfo-i386.c
32
{
27
@@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
33
- /* Suppress goto_tb in the case of single-steping and IO. */
28
* See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
34
- if (unlikely(use_exit_tb(ctx))) {
29
*/
35
- return false;
30
__cpuid(0, a, b, c, d);
36
- }
31
- if (c == signature_INTEL_ecx || c == signature_AMD_ecx) {
37
#ifndef CONFIG_USER_ONLY
32
+ if (c == signature_INTEL_ecx) {
38
/* If the destination is in the superpage, the page perms can't change. */
33
info |= CPUINFO_ATOMIC_VMOVDQA;
39
if (in_superpage(ctx, dest)) {
34
+ } else if (c == signature_AMD_ecx) {
40
@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_call_pal(DisasContext *ctx, int palcode)
35
+ info |= CPUINFO_ATOMIC_VMOVDQA | CPUINFO_ATOMIC_VMOVDQU;
41
need the page permissions check. We'll see the existence of
36
}
42
the page when we create the TB, and we'll flush all TBs if
37
}
43
we change the PAL base register. */
44
- if (!use_exit_tb(ctx)) {
45
+ if (!ctx->base.singlestep_enabled) {
46
tcg_gen_goto_tb(0);
47
tcg_gen_movi_i64(cpu_pc, entry);
48
tcg_gen_exit_tb(ctx->base.tb, 0);
49
@@ -XXX,XX +XXX,XX @@ static void alpha_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
50
tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
51
/* FALLTHRU */
52
case DISAS_PC_UPDATED:
53
- if (!use_exit_tb(ctx)) {
54
+ if (!ctx->base.singlestep_enabled) {
55
tcg_gen_lookup_and_goto_ptr();
56
break;
57
}
38
}
58
--
39
--
59
2.25.1
40
2.34.1
60
41
61
42
Since 6eea04347eb6, all tcg backends support goto_ptr.
Remove the conditional, making support mandatory.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

Use the CPUINFO_* bits instead of the individual boolean
variables that we had been using. Remove all of the init
code that was moved over to cpuinfo-i386.c.

Note that have_avx512* check both AVX512{F,VL}, as we had
previously done during tcg_target_init.
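The shape of the replacement, shown here with two of the bits as an example (this mirrors the tcg/i386/tcg-target.h hunk below): a feature test becomes a macro over the shared bitmask, so there is no per-feature bool left to initialize in tcg_target_init().

    #include "host/cpuinfo.h"

    #define have_bmi1   (cpuinfo & CPUINFO_BMI1)
    #define have_movbe  (cpuinfo & CPUINFO_MOVBE)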
7
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
11
---
7
include/tcg/tcg-opc.h | 3 +--
12
tcg/i386/tcg-target.h | 28 +++++----
8
tcg/aarch64/tcg-target.h | 1 -
13
tcg/i386/tcg-target.c.inc | 123 ++------------------------------------
9
tcg/arm/tcg-target.h | 1 -
14
2 files changed, 22 insertions(+), 129 deletions(-)
10
tcg/i386/tcg-target.h | 1 -
15
11
tcg/mips/tcg-target.h | 1 -
12
tcg/ppc/tcg-target.h | 1 -
13
tcg/riscv/tcg-target.h | 1 -
14
tcg/s390/tcg-target.h | 1 -
15
tcg/sparc/tcg-target.h | 1 -
16
tcg/tci/tcg-target.h | 1 -
17
tcg/tcg-op.c | 2 +-
18
tcg/tcg.c | 8 ++------
19
12 files changed, 4 insertions(+), 18 deletions(-)
20
21
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/include/tcg/tcg-opc.h
24
+++ b/include/tcg/tcg-opc.h
25
@@ -XXX,XX +XXX,XX @@ DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
26
TCG_OPF_NOT_PRESENT)
27
DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
28
DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
29
-DEF(goto_ptr, 0, 1, 0,
30
- TCG_OPF_BB_EXIT | TCG_OPF_BB_END | IMPL(TCG_TARGET_HAS_goto_ptr))
31
+DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
32
33
DEF(plugin_cb_start, 0, 0, 3, TCG_OPF_NOT_PRESENT)
34
DEF(plugin_cb_end, 0, 0, 0, TCG_OPF_NOT_PRESENT)
35
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
36
index XXXXXXX..XXXXXXX 100644
37
--- a/tcg/aarch64/tcg-target.h
38
+++ b/tcg/aarch64/tcg-target.h
39
@@ -XXX,XX +XXX,XX @@ typedef enum {
40
#define TCG_TARGET_HAS_mulsh_i32 0
41
#define TCG_TARGET_HAS_extrl_i64_i32 0
42
#define TCG_TARGET_HAS_extrh_i64_i32 0
43
-#define TCG_TARGET_HAS_goto_ptr 1
44
#define TCG_TARGET_HAS_qemu_st8_i32 0
45
46
#define TCG_TARGET_HAS_div_i64 1
47
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
48
index XXXXXXX..XXXXXXX 100644
49
--- a/tcg/arm/tcg-target.h
50
+++ b/tcg/arm/tcg-target.h
51
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
52
#define TCG_TARGET_HAS_mulsh_i32 0
53
#define TCG_TARGET_HAS_div_i32 use_idiv_instructions
54
#define TCG_TARGET_HAS_rem_i32 0
55
-#define TCG_TARGET_HAS_goto_ptr 1
56
#define TCG_TARGET_HAS_direct_jump 0
57
#define TCG_TARGET_HAS_qemu_st8_i32 0
58
59
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
16
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
60
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
61
--- a/tcg/i386/tcg-target.h
18
--- a/tcg/i386/tcg-target.h
62
+++ b/tcg/i386/tcg-target.h
19
+++ b/tcg/i386/tcg-target.h
63
@@ -XXX,XX +XXX,XX @@ extern bool have_movbe;
20
@@ -XXX,XX +XXX,XX @@
64
#define TCG_TARGET_HAS_muls2_i32 1
21
#ifndef I386_TCG_TARGET_H
65
#define TCG_TARGET_HAS_muluh_i32 0
22
#define I386_TCG_TARGET_H
66
#define TCG_TARGET_HAS_mulsh_i32 0
23
67
-#define TCG_TARGET_HAS_goto_ptr 1
24
+#include "host/cpuinfo.h"
68
#define TCG_TARGET_HAS_direct_jump 1
25
+
69
26
#define TCG_TARGET_INSN_UNIT_SIZE 1
27
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
28
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
30
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
31
#endif
32
33
-extern bool have_bmi1;
34
-extern bool have_popcnt;
35
-extern bool have_avx1;
36
-extern bool have_avx2;
37
-extern bool have_avx512bw;
38
-extern bool have_avx512dq;
39
-extern bool have_avx512vbmi2;
40
-extern bool have_avx512vl;
41
-extern bool have_movbe;
42
-extern bool have_atomic16;
43
+#define have_bmi1 (cpuinfo & CPUINFO_BMI1)
44
+#define have_popcnt (cpuinfo & CPUINFO_POPCNT)
45
+#define have_avx1 (cpuinfo & CPUINFO_AVX1)
46
+#define have_avx2 (cpuinfo & CPUINFO_AVX2)
47
+#define have_movbe (cpuinfo & CPUINFO_MOVBE)
48
+#define have_atomic16 (cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
49
+
50
+/*
51
+ * There are interesting instructions in AVX512, so long as we have AVX512VL,
52
+ * which indicates support for EVEX on sizes smaller than 512 bits.
53
+ */
54
+#define have_avx512vl ((cpuinfo & CPUINFO_AVX512VL) && \
55
+ (cpuinfo & CPUINFO_AVX512F))
56
+#define have_avx512bw ((cpuinfo & CPUINFO_AVX512BW) && have_avx512vl)
57
+#define have_avx512dq ((cpuinfo & CPUINFO_AVX512DQ) && have_avx512vl)
58
+#define have_avx512vbmi2 ((cpuinfo & CPUINFO_AVX512VBMI2) && have_avx512vl)
59
60
/* optional instructions */
61
#define TCG_TARGET_HAS_div2_i32 1
62
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
63
index XXXXXXX..XXXXXXX 100644
64
--- a/tcg/i386/tcg-target.c.inc
65
+++ b/tcg/i386/tcg-target.c.inc
66
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
67
# define SOFTMMU_RESERVE_REGS 0
68
#endif
69
70
-/* The host compiler should supply <cpuid.h> to enable runtime features
71
- detection, as we're not going to go so far as our own inline assembly.
72
- If not available, default values will be assumed. */
73
-#if defined(CONFIG_CPUID_H)
74
-#include "qemu/cpuid.h"
75
-#endif
76
-
77
/* For 64-bit, we always know that CMOV is available. */
70
#if TCG_TARGET_REG_BITS == 64
78
#if TCG_TARGET_REG_BITS == 64
71
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
79
-# define have_cmov 1
72
index XXXXXXX..XXXXXXX 100644
80
-#elif defined(CONFIG_CPUID_H)
73
--- a/tcg/mips/tcg-target.h
81
-static bool have_cmov;
74
+++ b/tcg/mips/tcg-target.h
82
+# define have_cmov true
75
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
83
#else
76
#define TCG_TARGET_HAS_muluh_i32 1
84
-# define have_cmov 0
77
#define TCG_TARGET_HAS_mulsh_i32 1
85
-#endif
78
#define TCG_TARGET_HAS_bswap32_i32 1
86
-
79
-#define TCG_TARGET_HAS_goto_ptr 1
87
-/* We need these symbols in tcg-target.h, and we can't properly conditionalize
80
#define TCG_TARGET_HAS_direct_jump 1
88
- it there. Therefore we always define the variable. */
81
89
-bool have_bmi1;
82
#if TCG_TARGET_REG_BITS == 64
90
-bool have_popcnt;
83
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
91
-bool have_avx1;
84
index XXXXXXX..XXXXXXX 100644
92
-bool have_avx2;
85
--- a/tcg/ppc/tcg-target.h
93
-bool have_avx512bw;
86
+++ b/tcg/ppc/tcg-target.h
94
-bool have_avx512dq;
87
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
95
-bool have_avx512vbmi2;
88
#define TCG_TARGET_HAS_muls2_i32 0
96
-bool have_avx512vl;
89
#define TCG_TARGET_HAS_muluh_i32 1
97
-bool have_movbe;
90
#define TCG_TARGET_HAS_mulsh_i32 1
98
-bool have_atomic16;
91
-#define TCG_TARGET_HAS_goto_ptr 1
99
-
92
#define TCG_TARGET_HAS_direct_jump 1
100
-#ifdef CONFIG_CPUID_H
93
#define TCG_TARGET_HAS_qemu_st8_i32 0
101
-static bool have_bmi2;
94
102
-static bool have_lzcnt;
95
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
103
-#else
96
index XXXXXXX..XXXXXXX 100644
104
-# define have_bmi2 0
97
--- a/tcg/riscv/tcg-target.h
105
-# define have_lzcnt 0
98
+++ b/tcg/riscv/tcg-target.h
106
+# define have_cmov (cpuinfo & CPUINFO_CMOV)
99
@@ -XXX,XX +XXX,XX @@ typedef enum {
107
#endif
100
#define TCG_TARGET_CALL_STACK_OFFSET 0
108
+#define have_bmi2 (cpuinfo & CPUINFO_BMI2)
101
109
+#define have_lzcnt (cpuinfo & CPUINFO_LZCNT)
102
/* optional instructions */
110
103
-#define TCG_TARGET_HAS_goto_ptr 1
111
static const tcg_insn_unit *tb_ret_addr;
104
#define TCG_TARGET_HAS_movcond_i32 0
112
105
#define TCG_TARGET_HAS_div_i32 1
113
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
106
#define TCG_TARGET_HAS_rem_i32 1
114
107
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
115
static void tcg_target_init(TCGContext *s)
108
index XXXXXXX..XXXXXXX 100644
109
--- a/tcg/s390/tcg-target.h
110
+++ b/tcg/s390/tcg-target.h
111
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities;
112
#define TCG_TARGET_HAS_mulsh_i32 0
113
#define TCG_TARGET_HAS_extrl_i64_i32 0
114
#define TCG_TARGET_HAS_extrh_i64_i32 0
115
-#define TCG_TARGET_HAS_goto_ptr 1
116
#define TCG_TARGET_HAS_direct_jump (s390_facilities & FACILITY_GEN_INST_EXT)
117
#define TCG_TARGET_HAS_qemu_st8_i32 0
118
119
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
120
index XXXXXXX..XXXXXXX 100644
121
--- a/tcg/sparc/tcg-target.h
122
+++ b/tcg/sparc/tcg-target.h
123
@@ -XXX,XX +XXX,XX @@ extern bool use_vis3_instructions;
124
#define TCG_TARGET_HAS_muls2_i32 1
125
#define TCG_TARGET_HAS_muluh_i32 0
126
#define TCG_TARGET_HAS_mulsh_i32 0
127
-#define TCG_TARGET_HAS_goto_ptr 1
128
#define TCG_TARGET_HAS_direct_jump 1
129
#define TCG_TARGET_HAS_qemu_st8_i32 0
130
131
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
132
index XXXXXXX..XXXXXXX 100644
133
--- a/tcg/tci/tcg-target.h
134
+++ b/tcg/tci/tcg-target.h
135
@@ -XXX,XX +XXX,XX @@
136
#define TCG_TARGET_HAS_muls2_i32 1
137
#define TCG_TARGET_HAS_muluh_i32 0
138
#define TCG_TARGET_HAS_mulsh_i32 0
139
-#define TCG_TARGET_HAS_goto_ptr 1
140
#define TCG_TARGET_HAS_direct_jump 0
141
#define TCG_TARGET_HAS_qemu_st8_i32 0
142
143
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
144
index XXXXXXX..XXXXXXX 100644
145
--- a/tcg/tcg-op.c
146
+++ b/tcg/tcg-op.c
147
@@ -XXX,XX +XXX,XX @@ void tcg_gen_goto_tb(unsigned idx)
148
149
void tcg_gen_lookup_and_goto_ptr(void)
150
{
116
{
151
- if (TCG_TARGET_HAS_goto_ptr && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
117
-#ifdef CONFIG_CPUID_H
152
+ if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
118
- unsigned a, b, c, d, b7 = 0, c7 = 0;
153
TCGv_ptr ptr;
119
- unsigned max = __get_cpuid_max(0, 0);
154
120
-
155
plugin_gen_disable_mem_helpers();
121
- if (max >= 7) {
156
diff --git a/tcg/tcg.c b/tcg/tcg.c
122
- /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
157
index XXXXXXX..XXXXXXX 100644
123
- __cpuid_count(7, 0, a, b7, c7, d);
158
--- a/tcg/tcg.c
124
- have_bmi1 = (b7 & bit_BMI) != 0;
159
+++ b/tcg/tcg.c
125
- have_bmi2 = (b7 & bit_BMI2) != 0;
160
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
161
* For tci, we use NULL as the signal to return from the interpreter,
162
* so skip this check.
163
*/
164
- if (TCG_TARGET_HAS_goto_ptr) {
165
- tcg_debug_assert(tcg_code_gen_epilogue != NULL);
166
- }
126
- }
167
+ tcg_debug_assert(tcg_code_gen_epilogue != NULL);
127
-
168
#endif
128
- if (max >= 1) {
169
129
- __cpuid(1, a, b, c, d);
170
tcg_region_prologue_set(s);
130
-#ifndef have_cmov
171
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
131
- /* For 32-bit, 99% certainty that we're running on hardware that
172
case INDEX_op_insn_start:
132
- supports cmov, but we still need to check. In case cmov is not
173
case INDEX_op_exit_tb:
133
- available, we'll use a small forward branch. */
174
case INDEX_op_goto_tb:
134
- have_cmov = (d & bit_CMOV) != 0;
175
+ case INDEX_op_goto_ptr:
135
-#endif
176
case INDEX_op_qemu_ld_i32:
136
-
177
case INDEX_op_qemu_st_i32:
137
- /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
178
case INDEX_op_qemu_ld_i64:
138
- need to probe for it. */
179
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
139
- have_movbe = (c & bit_MOVBE) != 0;
180
case INDEX_op_qemu_st8_i32:
140
- have_popcnt = (c & bit_POPCNT) != 0;
181
return TCG_TARGET_HAS_qemu_st8_i32;
141
-
182
142
- /* There are a number of things we must check before we can be
183
- case INDEX_op_goto_ptr:
143
- sure of not hitting invalid opcode. */
184
- return TCG_TARGET_HAS_goto_ptr;
144
- if (c & bit_OSXSAVE) {
185
-
145
- unsigned bv = xgetbv_low(0);
186
case INDEX_op_mov_i32:
146
-
187
case INDEX_op_setcond_i32:
147
- if ((bv & 6) == 6) {
188
case INDEX_op_brcond_i32:
148
- have_avx1 = (c & bit_AVX) != 0;
149
- have_avx2 = (b7 & bit_AVX2) != 0;
150
-
151
- /*
152
- * There are interesting instructions in AVX512, so long
153
- * as we have AVX512VL, which indicates support for EVEX
154
- * on sizes smaller than 512 bits. We are required to
155
- * check that OPMASK and all extended ZMM state are enabled
156
- * even if we're not using them -- the insns will fault.
157
- */
158
- if ((bv & 0xe0) == 0xe0
159
- && (b7 & bit_AVX512F)
160
- && (b7 & bit_AVX512VL)) {
161
- have_avx512vl = true;
162
- have_avx512bw = (b7 & bit_AVX512BW) != 0;
163
- have_avx512dq = (b7 & bit_AVX512DQ) != 0;
164
- have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
165
- }
166
-
167
- /*
168
- * The Intel SDM has added:
169
- * Processors that enumerate support for Intel® AVX
170
- * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
171
- * guarantee that the 16-byte memory operations performed
172
- * by the following instructions will always be carried
173
- * out atomically:
174
- * - MOVAPD, MOVAPS, and MOVDQA.
175
- * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
176
- * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
177
- * with EVEX.128 and k0 (masking disabled).
178
- * Note that these instructions require the linear addresses
179
- * of their memory operands to be 16-byte aligned.
180
- *
181
- * AMD has provided an even stronger guarantee that processors
182
- * with AVX provide 16-byte atomicity for all cachable,
183
- * naturally aligned single loads and stores, e.g. MOVDQU.
184
- *
185
- * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
186
- */
187
- if (have_avx1) {
188
- __cpuid(0, a, b, c, d);
189
- have_atomic16 = (c == signature_INTEL_ecx ||
190
- c == signature_AMD_ecx);
191
- }
192
- }
193
- }
194
- }
195
-
196
- max = __get_cpuid_max(0x8000000, 0);
197
- if (max >= 1) {
198
- __cpuid(0x80000001, a, b, c, d);
199
- /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
200
- have_lzcnt = (c & bit_LZCNT) != 0;
201
- }
202
-#endif /* CONFIG_CPUID_H */
203
-
204
tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
205
if (TCG_TARGET_REG_BITS == 64) {
206
tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
189
--
207
--
190
2.25.1
208
2.34.1
191
209
192
210
1
Reviewed-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
1
Use cpuinfo_init() during init_accel(), and the variable cpuinfo
2
during test_buffer_is_zero_next_accel(). Adjust the logic that
3
cycles through the set of accelerators for testing.
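For context, a caller cycles through the accelerators roughly like this (a sketch loosely modelled on the unit test, not part of this patch; buf, len and expected are assumed to be set up by the caller):

    /* Re-run the same checks once per available accelerator. */
    do {
        g_assert(buffer_is_zero(buf, len) == expected);
    } while (test_buffer_is_zero_next_accel());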
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
7
---
4
target/sparc/translate.c | 19 +++++--------------
8
util/bufferiszero.c | 127 ++++++++++++++++----------------------------
5
1 file changed, 5 insertions(+), 14 deletions(-)
9
1 file changed, 46 insertions(+), 81 deletions(-)
6
10
7
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
11
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
8
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
9
--- a/target/sparc/translate.c
13
--- a/util/bufferiszero.c
10
+++ b/target/sparc/translate.c
14
+++ b/util/bufferiszero.c
11
@@ -XXX,XX +XXX,XX @@ static inline TCGv gen_dest_gpr(DisasContext *dc, int reg)
15
@@ -XXX,XX +XXX,XX @@
12
}
16
#include "qemu/osdep.h"
17
#include "qemu/cutils.h"
18
#include "qemu/bswap.h"
19
+#include "host/cpuinfo.h"
20
21
static bool
22
buffer_zero_int(const void *buf, size_t len)
23
@@ -XXX,XX +XXX,XX @@ buffer_zero_avx512(const void *buf, size_t len)
13
}
24
}
14
25
#endif /* CONFIG_AVX512F_OPT */
15
-static inline bool use_goto_tb(DisasContext *s, target_ulong pc,
26
16
- target_ulong npc)
27
-
17
+static bool use_goto_tb(DisasContext *s, target_ulong pc, target_ulong npc)
28
-/* Note that for test_buffer_is_zero_next_accel, the most preferred
29
- * ISA must have the least significant bit.
30
- */
31
-#define CACHE_AVX512F 1
32
-#define CACHE_AVX2 2
33
-#define CACHE_SSE4 4
34
-#define CACHE_SSE2 8
35
-
36
-/* Make sure that these variables are appropriately initialized when
37
+/*
38
+ * Make sure that these variables are appropriately initialized when
39
* SSE2 is enabled on the compiler command-line, but the compiler is
40
* too old to support CONFIG_AVX2_OPT.
41
*/
42
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
43
-# define INIT_CACHE 0
44
-# define INIT_ACCEL buffer_zero_int
45
+# define INIT_USED 0
46
+# define INIT_LENGTH 0
47
+# define INIT_ACCEL buffer_zero_int
48
#else
49
# ifndef __SSE2__
50
# error "ISA selection confusion"
51
# endif
52
-# define INIT_CACHE CACHE_SSE2
53
-# define INIT_ACCEL buffer_zero_sse2
54
+# define INIT_USED CPUINFO_SSE2
55
+# define INIT_LENGTH 64
56
+# define INIT_ACCEL buffer_zero_sse2
57
#endif
58
59
-static unsigned cpuid_cache = INIT_CACHE;
60
+static unsigned used_accel = INIT_USED;
61
+static unsigned length_to_accel = INIT_LENGTH;
62
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
63
-static int length_to_accel = 64;
64
65
-static void init_accel(unsigned cache)
66
+static unsigned __attribute__((noinline))
67
+select_accel_cpuinfo(unsigned info)
18
{
68
{
19
- if (unlikely(s->base.singlestep_enabled || singlestep)) {
69
- bool (*fn)(const void *, size_t) = buffer_zero_int;
70
- if (cache & CACHE_SSE2) {
71
- fn = buffer_zero_sse2;
72
- length_to_accel = 64;
73
- }
74
-#ifdef CONFIG_AVX2_OPT
75
- if (cache & CACHE_SSE4) {
76
- fn = buffer_zero_sse4;
77
- length_to_accel = 64;
78
- }
79
- if (cache & CACHE_AVX2) {
80
- fn = buffer_zero_avx2;
81
- length_to_accel = 128;
82
- }
83
-#endif
84
+ /* Array is sorted in order of algorithm preference. */
85
+ static const struct {
86
+ unsigned bit;
87
+ unsigned len;
88
+ bool (*fn)(const void *, size_t);
89
+ } all[] = {
90
#ifdef CONFIG_AVX512F_OPT
91
- if (cache & CACHE_AVX512F) {
92
- fn = buffer_zero_avx512;
93
- length_to_accel = 256;
94
- }
95
+ { CPUINFO_AVX512F, 256, buffer_zero_avx512 },
96
#endif
97
- buffer_accel = fn;
98
+#ifdef CONFIG_AVX2_OPT
99
+ { CPUINFO_AVX2, 128, buffer_zero_avx2 },
100
+ { CPUINFO_SSE4, 64, buffer_zero_sse4 },
101
+#endif
102
+ { CPUINFO_SSE2, 64, buffer_zero_sse2 },
103
+ { CPUINFO_ALWAYS, 0, buffer_zero_int },
104
+ };
105
+
106
+ for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
107
+ if (info & all[i].bit) {
108
+ length_to_accel = all[i].len;
109
+ buffer_accel = all[i].fn;
110
+ return all[i].bit;
111
+ }
112
+ }
113
+ return 0;
114
}
115
116
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
117
-#include "qemu/cpuid.h"
118
-
119
-static void __attribute__((constructor)) init_cpuid_cache(void)
120
+static void __attribute__((constructor)) init_accel(void)
121
{
122
- unsigned max = __get_cpuid_max(0, NULL);
123
- int a, b, c, d;
124
- unsigned cache = 0;
125
-
126
- if (max >= 1) {
127
- __cpuid(1, a, b, c, d);
128
- if (d & bit_SSE2) {
129
- cache |= CACHE_SSE2;
130
- }
131
- if (c & bit_SSE4_1) {
132
- cache |= CACHE_SSE4;
133
- }
134
-
135
- /* We must check that AVX is not just available, but usable. */
136
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
137
- unsigned bv = xgetbv_low(0);
138
- __cpuid_count(7, 0, a, b, c, d);
139
- if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
140
- cache |= CACHE_AVX2;
141
- }
142
- /* 0xe6:
143
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
144
- * and ZMM16-ZMM31 state are enabled by OS)
145
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
146
- */
147
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
148
- cache |= CACHE_AVX512F;
149
- }
150
- }
151
- }
152
- cpuid_cache = cache;
153
- init_accel(cache);
154
+ used_accel = select_accel_cpuinfo(cpuinfo_init());
155
}
156
#endif /* CONFIG_AVX2_OPT */
157
158
bool test_buffer_is_zero_next_accel(void)
159
{
160
- /* If no bits set, we just tested buffer_zero_int, and there
161
- are no more acceleration options to test. */
162
- if (cpuid_cache == 0) {
20
- return false;
163
- return false;
21
- }
164
- }
22
-
165
- /* Disable the accelerator we used before and select a new one. */
23
-#ifndef CONFIG_USER_ONLY
166
- cpuid_cache &= cpuid_cache - 1;
24
- return (pc & TARGET_PAGE_MASK) == (s->base.tb->pc & TARGET_PAGE_MASK) &&
167
- init_accel(cpuid_cache);
25
- (npc & TARGET_PAGE_MASK) == (s->base.tb->pc & TARGET_PAGE_MASK);
26
-#else
27
- return true;
168
- return true;
28
-#endif
169
+ /*
29
+ return translator_use_goto_tb(&s->base, pc) &&
170
+ * Accumulate the accelerators that we've already tested, and
30
+ translator_use_goto_tb(&s->base, npc);
171
+ * remove them from the set to test this round. We'll get back
172
+ * a zero from select_accel_cpuinfo when there are no more.
173
+ */
174
+ unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
175
+ used_accel |= used;
176
+ return used;
31
}
177
}
32
178
33
-static inline void gen_goto_tb(DisasContext *s, int tb_num,
179
static bool select_accel_fn(const void *buf, size_t len)
34
- target_ulong pc, target_ulong npc)
35
+static void gen_goto_tb(DisasContext *s, int tb_num,
36
+ target_ulong pc, target_ulong npc)
37
{
38
if (use_goto_tb(s, pc, npc)) {
39
/* jump to same page: we can use a direct jump */
40
--
180
--
41
2.25.1
181
2.34.1
42
182
43
183
Place the CONFIG_AVX512BW_OPT block at the top,
which will aid function selection in the next patch.

Just use translator_use_goto_tb directly at the one call site,
rather than maintaining a local wrapper.
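For reference, the helper being called has roughly this shape (a reconstruction for illustration, not quoted from the series): suppress goto_tb while single-stepping, and otherwise allow it only when the destination stays on the same guest page as the start of the TB.

    /* Illustrative sketch only; see accel/tcg/translator.c for the real code. */
    bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
    {
        /* Suppress goto_tb when single-stepping. */
        if (db->singlestep_enabled || singlestep) {
            return false;
        }
        /* Direct chaining may only stay within the same guest page. */
        return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
    }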
3
3
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Juan Quintela <quintela@redhat.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
target/nios2/translate.c | 15 +--------------
8
migration/xbzrle.c | 244 ++++++++++++++++++++++-----------------------
8
1 file changed, 1 insertion(+), 14 deletions(-)
9
1 file changed, 122 insertions(+), 122 deletions(-)
9
10
10
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
11
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
11
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
12
--- a/target/nios2/translate.c
13
--- a/migration/xbzrle.c
13
+++ b/target/nios2/translate.c
14
+++ b/migration/xbzrle.c
14
@@ -XXX,XX +XXX,XX @@ static void t_gen_helper_raise_exception(DisasContext *dc,
15
@@ -XXX,XX +XXX,XX @@
15
dc->base.is_jmp = DISAS_NORETURN;
16
#include "qemu/host-utils.h"
17
#include "xbzrle.h"
18
19
+#if defined(CONFIG_AVX512BW_OPT)
20
+#include <immintrin.h>
21
+
22
+int __attribute__((target("avx512bw")))
23
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
24
+ uint8_t *dst, int dlen)
25
+{
26
+ uint32_t zrun_len = 0, nzrun_len = 0;
27
+ int d = 0, i = 0, num = 0;
28
+ uint8_t *nzrun_start = NULL;
29
+ /* add 1 to include residual part in main loop */
30
+ uint32_t count512s = (slen >> 6) + 1;
31
+ /* countResidual is tail of data, i.e., countResidual = slen % 64 */
32
+ uint32_t count_residual = slen & 0b111111;
33
+ bool never_same = true;
34
+ uint64_t mask_residual = 1;
35
+ mask_residual <<= count_residual;
36
+ mask_residual -= 1;
37
+ __m512i r = _mm512_set1_epi32(0);
38
+
39
+ while (count512s) {
40
+ int bytes_to_check = 64;
41
+ uint64_t mask = 0xffffffffffffffff;
42
+ if (count512s == 1) {
43
+ bytes_to_check = count_residual;
44
+ mask = mask_residual;
45
+ }
46
+ __m512i old_data = _mm512_mask_loadu_epi8(r,
47
+ mask, old_buf + i);
48
+ __m512i new_data = _mm512_mask_loadu_epi8(r,
49
+ mask, new_buf + i);
50
+ uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
51
+ count512s--;
52
+
53
+ bool is_same = (comp & 0x1);
54
+ while (bytes_to_check) {
55
+ if (d + 2 > dlen) {
56
+ return -1;
57
+ }
58
+ if (is_same) {
59
+ if (nzrun_len) {
60
+ d += uleb128_encode_small(dst + d, nzrun_len);
61
+ if (d + nzrun_len > dlen) {
62
+ return -1;
63
+ }
64
+ nzrun_start = new_buf + i - nzrun_len;
65
+ memcpy(dst + d, nzrun_start, nzrun_len);
66
+ d += nzrun_len;
67
+ nzrun_len = 0;
68
+ }
69
+ /* 64 data at a time for speed */
70
+ if (count512s && (comp == 0xffffffffffffffff)) {
71
+ i += 64;
72
+ zrun_len += 64;
73
+ break;
74
+ }
75
+ never_same = false;
76
+ num = ctz64(~comp);
77
+ num = (num < bytes_to_check) ? num : bytes_to_check;
78
+ zrun_len += num;
79
+ bytes_to_check -= num;
80
+ comp >>= num;
81
+ i += num;
82
+ if (bytes_to_check) {
83
+ /* still has different data after same data */
84
+ d += uleb128_encode_small(dst + d, zrun_len);
85
+ zrun_len = 0;
86
+ } else {
87
+ break;
88
+ }
89
+ }
90
+ if (never_same || zrun_len) {
91
+ /*
92
+ * never_same only acts if
93
+ * data begins with diff in first count512s
94
+ */
95
+ d += uleb128_encode_small(dst + d, zrun_len);
96
+ zrun_len = 0;
97
+ never_same = false;
98
+ }
99
+ /* has diff, 64 data at a time for speed */
100
+ if ((bytes_to_check == 64) && (comp == 0x0)) {
101
+ i += 64;
102
+ nzrun_len += 64;
103
+ break;
104
+ }
105
+ num = ctz64(comp);
106
+ num = (num < bytes_to_check) ? num : bytes_to_check;
107
+ nzrun_len += num;
108
+ bytes_to_check -= num;
109
+ comp >>= num;
110
+ i += num;
111
+ if (bytes_to_check) {
112
+ /* mask like 111000 */
113
+ d += uleb128_encode_small(dst + d, nzrun_len);
114
+ /* overflow */
115
+ if (d + nzrun_len > dlen) {
116
+ return -1;
117
+ }
118
+ nzrun_start = new_buf + i - nzrun_len;
119
+ memcpy(dst + d, nzrun_start, nzrun_len);
120
+ d += nzrun_len;
121
+ nzrun_len = 0;
122
+ is_same = true;
123
+ }
124
+ }
125
+ }
126
+
127
+ if (nzrun_len != 0) {
128
+ d += uleb128_encode_small(dst + d, nzrun_len);
129
+ /* overflow */
130
+ if (d + nzrun_len > dlen) {
131
+ return -1;
132
+ }
133
+ nzrun_start = new_buf + i - nzrun_len;
134
+ memcpy(dst + d, nzrun_start, nzrun_len);
135
+ d += nzrun_len;
136
+ }
137
+ return d;
138
+}
139
+#endif
140
+
141
/*
142
page = zrun nzrun
143
| zrun nzrun page
144
@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
145
146
return d;
16
}
147
}
17
148
-
18
-static bool use_goto_tb(DisasContext *dc, uint32_t dest)
149
-#if defined(CONFIG_AVX512BW_OPT)
150
-#include <immintrin.h>
151
-
152
-int __attribute__((target("avx512bw")))
153
-xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
154
- uint8_t *dst, int dlen)
19
-{
155
-{
20
- if (unlikely(dc->base.singlestep_enabled)) {
156
- uint32_t zrun_len = 0, nzrun_len = 0;
21
- return false;
157
- int d = 0, i = 0, num = 0;
158
- uint8_t *nzrun_start = NULL;
159
- /* add 1 to include residual part in main loop */
160
- uint32_t count512s = (slen >> 6) + 1;
161
- /* countResidual is tail of data, i.e., countResidual = slen % 64 */
162
- uint32_t count_residual = slen & 0b111111;
163
- bool never_same = true;
164
- uint64_t mask_residual = 1;
165
- mask_residual <<= count_residual;
166
- mask_residual -= 1;
167
- __m512i r = _mm512_set1_epi32(0);
168
-
169
- while (count512s) {
170
- int bytes_to_check = 64;
171
- uint64_t mask = 0xffffffffffffffff;
172
- if (count512s == 1) {
173
- bytes_to_check = count_residual;
174
- mask = mask_residual;
175
- }
176
- __m512i old_data = _mm512_mask_loadu_epi8(r,
177
- mask, old_buf + i);
178
- __m512i new_data = _mm512_mask_loadu_epi8(r,
179
- mask, new_buf + i);
180
- uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
181
- count512s--;
182
-
183
- bool is_same = (comp & 0x1);
184
- while (bytes_to_check) {
185
- if (d + 2 > dlen) {
186
- return -1;
187
- }
188
- if (is_same) {
189
- if (nzrun_len) {
190
- d += uleb128_encode_small(dst + d, nzrun_len);
191
- if (d + nzrun_len > dlen) {
192
- return -1;
193
- }
194
- nzrun_start = new_buf + i - nzrun_len;
195
- memcpy(dst + d, nzrun_start, nzrun_len);
196
- d += nzrun_len;
197
- nzrun_len = 0;
198
- }
199
- /* 64 data at a time for speed */
200
- if (count512s && (comp == 0xffffffffffffffff)) {
201
- i += 64;
202
- zrun_len += 64;
203
- break;
204
- }
205
- never_same = false;
206
- num = ctz64(~comp);
207
- num = (num < bytes_to_check) ? num : bytes_to_check;
208
- zrun_len += num;
209
- bytes_to_check -= num;
210
- comp >>= num;
211
- i += num;
212
- if (bytes_to_check) {
213
- /* still has different data after same data */
214
- d += uleb128_encode_small(dst + d, zrun_len);
215
- zrun_len = 0;
216
- } else {
217
- break;
218
- }
219
- }
220
- if (never_same || zrun_len) {
221
- /*
222
- * never_same only acts if
223
- * data begins with diff in first count512s
224
- */
225
- d += uleb128_encode_small(dst + d, zrun_len);
226
- zrun_len = 0;
227
- never_same = false;
228
- }
229
- /* has diff, 64 data at a time for speed */
230
- if ((bytes_to_check == 64) && (comp == 0x0)) {
231
- i += 64;
232
- nzrun_len += 64;
233
- break;
234
- }
235
- num = ctz64(comp);
236
- num = (num < bytes_to_check) ? num : bytes_to_check;
237
- nzrun_len += num;
238
- bytes_to_check -= num;
239
- comp >>= num;
240
- i += num;
241
- if (bytes_to_check) {
242
- /* mask like 111000 */
243
- d += uleb128_encode_small(dst + d, nzrun_len);
244
- /* overflow */
245
- if (d + nzrun_len > dlen) {
246
- return -1;
247
- }
248
- nzrun_start = new_buf + i - nzrun_len;
249
- memcpy(dst + d, nzrun_start, nzrun_len);
250
- d += nzrun_len;
251
- nzrun_len = 0;
252
- is_same = true;
253
- }
254
- }
22
- }
255
- }
23
-
256
-
24
-#ifndef CONFIG_USER_ONLY
257
- if (nzrun_len != 0) {
25
- return (dc->base.pc_first & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
258
- d += uleb128_encode_small(dst + d, nzrun_len);
26
-#else
259
- /* overflow */
27
- return true;
260
- if (d + nzrun_len > dlen) {
261
- return -1;
262
- }
263
- nzrun_start = new_buf + i - nzrun_len;
264
- memcpy(dst + d, nzrun_start, nzrun_len);
265
- d += nzrun_len;
266
- }
267
- return d;
268
-}
28
-#endif
269
-#endif
29
-}
30
-
31
static void gen_goto_tb(DisasContext *dc, int n, uint32_t dest)
32
{
33
const TranslationBlock *tb = dc->base.tb;
34
35
- if (use_goto_tb(dc, dest)) {
36
+ if (translator_use_goto_tb(&dc->base, dest)) {
37
tcg_gen_goto_tb(n);
38
tcg_gen_movi_tl(cpu_R[R_PC], dest);
39
tcg_gen_exit_tb(tb, n);
40
--
270
--
41
2.25.1
271
2.34.1
42
272
43
273
Now that we've moved helper_lookup_tb_ptr, the only user
of tb-lookup.h is cpu-exec.c; merge the contents in.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

Perform the function selection once, and only if CONFIG_AVX512_OPT
is enabled. Centralize the selection to xbzrle.c, instead of
spreading the init across 3 files.

Remove xbzrle-bench.c. The benefit of being able to benchmark
the different implementations is less important than not peeking
into the internals of the implementation.
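The centralized selection then reduces to a single indirection, as in the xbzrle.c hunk below; callers keep calling xbzrle_encode_buffer() and never see which implementation the constructor picked.

    static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);

    int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                             uint8_t *dst, int dlen)
    {
        return accel_func(old_buf, new_buf, slen, dst, dlen);
    }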
8
9
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
10
Reviewed-by: Juan Quintela <quintela@redhat.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
12
---
7
accel/tcg/tb-lookup.h | 49 -------------------------------------------
13
migration/xbzrle.h | 5 +-
8
accel/tcg/cpu-exec.c | 31 ++++++++++++++++++++++++++-
14
migration/ram.c | 34 +--
9
2 files changed, 30 insertions(+), 50 deletions(-)
15
migration/xbzrle.c | 26 +-
10
delete mode 100644 accel/tcg/tb-lookup.h
16
tests/bench/xbzrle-bench.c | 469 -------------------------------------
17
tests/unit/test-xbzrle.c | 49 +---
18
tests/bench/meson.build | 6 -
19
6 files changed, 39 insertions(+), 550 deletions(-)
20
delete mode 100644 tests/bench/xbzrle-bench.c
11
21
12
diff --git a/accel/tcg/tb-lookup.h b/accel/tcg/tb-lookup.h
22
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
23
index XXXXXXX..XXXXXXX 100644
24
--- a/migration/xbzrle.h
25
+++ b/migration/xbzrle.h
26
@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
27
uint8_t *dst, int dlen);
28
29
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
30
-#if defined(CONFIG_AVX512BW_OPT)
31
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
32
- uint8_t *dst, int dlen);
33
-#endif
34
+
35
#endif
36
diff --git a/migration/ram.c b/migration/ram.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/migration/ram.c
39
+++ b/migration/ram.c
40
@@ -XXX,XX +XXX,XX @@
41
#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
42
/* We can't use any flag that is bigger than 0x200 */
43
44
-int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
45
- uint8_t *, int) = xbzrle_encode_buffer;
46
-#if defined(CONFIG_AVX512BW_OPT)
47
-#include "qemu/cpuid.h"
48
-static void __attribute__((constructor)) init_cpu_flag(void)
49
-{
50
- unsigned max = __get_cpuid_max(0, NULL);
51
- int a, b, c, d;
52
- if (max >= 1) {
53
- __cpuid(1, a, b, c, d);
54
- /* We must check that AVX is not just available, but usable. */
55
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
56
- int bv;
57
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
58
- __cpuid_count(7, 0, a, b, c, d);
59
- /* 0xe6:
60
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
61
- * and ZMM16-ZMM31 state are enabled by OS)
62
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
63
- */
64
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
65
- xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
66
- }
67
- }
68
- }
69
-}
70
-#endif
71
-
72
XBZRLECacheStats xbzrle_counters;
73
74
/* used by the search for pages to send */
75
@@ -XXX,XX +XXX,XX @@ static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
76
memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
77
78
/* XBZRLE encoding (if there is no overflow) */
79
- encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
80
- TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
81
- TARGET_PAGE_SIZE);
82
+ encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
83
+ TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
84
+ TARGET_PAGE_SIZE);
85
86
/*
87
* Update the cache contents, so that it corresponds to the data
88
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
89
index XXXXXXX..XXXXXXX 100644
90
--- a/migration/xbzrle.c
91
+++ b/migration/xbzrle.c
92
@@ -XXX,XX +XXX,XX @@
93
94
#if defined(CONFIG_AVX512BW_OPT)
95
#include <immintrin.h>
96
+#include "host/cpuinfo.h"
97
98
-int __attribute__((target("avx512bw")))
99
+static int __attribute__((target("avx512bw")))
100
xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
101
uint8_t *dst, int dlen)
102
{
103
@@ -XXX,XX +XXX,XX @@ xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
104
}
105
return d;
106
}
107
+
108
+static int xbzrle_encode_buffer_int(uint8_t *old_buf, uint8_t *new_buf,
109
+ int slen, uint8_t *dst, int dlen);
110
+
111
+static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);
112
+
113
+static void __attribute__((constructor)) init_accel(void)
114
+{
115
+ unsigned info = cpuinfo_init();
116
+ if (info & CPUINFO_AVX512BW) {
117
+ accel_func = xbzrle_encode_buffer_avx512;
118
+ } else {
119
+ accel_func = xbzrle_encode_buffer_int;
120
+ }
121
+}
122
+
123
+int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
124
+ uint8_t *dst, int dlen)
125
+{
126
+ return accel_func(old_buf, new_buf, slen, dst, dlen);
127
+}
128
+
129
+#define xbzrle_encode_buffer xbzrle_encode_buffer_int
130
#endif
131
132
/*
133
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
13
deleted file mode 100644
134
deleted file mode 100644
14
index XXXXXXX..XXXXXXX
135
index XXXXXXX..XXXXXXX
15
--- a/accel/tcg/tb-lookup.h
136
--- a/tests/bench/xbzrle-bench.c
16
+++ /dev/null
137
+++ /dev/null
17
@@ -XXX,XX +XXX,XX @@
138
@@ -XXX,XX +XXX,XX @@
18
-/*
139
-/*
19
- * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
140
- * Xor Based Zero Run Length Encoding unit tests.
20
- *
141
- *
21
- * License: GNU GPL, version 2 or later.
142
- * Copyright 2013 Red Hat, Inc. and/or its affiliates
22
- * See the COPYING file in the top-level directory.
143
- *
144
- * Authors:
145
- * Orit Wasserman <owasserm@redhat.com>
146
- *
147
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
148
- * See the COPYING file in the top-level directory.
149
- *
23
- */
150
- */
24
-#ifndef EXEC_TB_LOOKUP_H
151
-#include "qemu/osdep.h"
25
-#define EXEC_TB_LOOKUP_H
152
-#include "qemu/cutils.h"
26
-
153
-#include "../migration/xbzrle.h"
27
-#ifdef NEED_CPU_H
154
-
28
-#include "cpu.h"
155
-#if defined(CONFIG_AVX512BW_OPT)
29
-#else
156
-#define XBZRLE_PAGE_SIZE 4096
30
-#include "exec/poison.h"
157
-static bool is_cpu_support_avx512bw;
158
-#include "qemu/cpuid.h"
159
-static void __attribute__((constructor)) init_cpu_flag(void)
160
-{
161
- unsigned max = __get_cpuid_max(0, NULL);
162
- int a, b, c, d;
163
- is_cpu_support_avx512bw = false;
164
- if (max >= 1) {
165
- __cpuid(1, a, b, c, d);
166
- /* We must check that AVX is not just available, but usable. */
167
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
168
- int bv;
169
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
170
- __cpuid_count(7, 0, a, b, c, d);
171
- /* 0xe6:
172
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
173
- * and ZMM16-ZMM31 state are enabled by OS)
174
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
175
- */
176
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
177
- is_cpu_support_avx512bw = true;
178
- }
179
- }
180
- }
181
- return ;
182
-}
183
-
184
-struct ResTime {
185
- float t_raw;
186
- float t_512;
187
-};
188
-
189
-
190
-/* Function prototypes
191
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
192
- uint8_t *dst, int dlen);
193
-*/
194
-static void encode_decode_zero(struct ResTime *res)
195
-{
196
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
197
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
198
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
199
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
200
- int i = 0;
201
- int dlen = 0, dlen512 = 0;
202
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
203
-
204
- for (i = diff_len; i > 0; i--) {
205
- buffer[1000 + i] = i;
206
- buffer512[1000 + i] = i;
207
- }
208
-
209
- buffer[1000 + diff_len + 3] = 103;
210
- buffer[1000 + diff_len + 5] = 105;
211
-
212
- buffer512[1000 + diff_len + 3] = 103;
213
- buffer512[1000 + diff_len + 5] = 105;
214
-
215
- /* encode zero page */
216
- time_t t_start, t_end, t_start512, t_end512;
217
- t_start = clock();
218
- dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
219
- XBZRLE_PAGE_SIZE);
220
- t_end = clock();
221
- float time_val = difftime(t_end, t_start);
222
- g_assert(dlen == 0);
223
-
224
- t_start512 = clock();
225
- dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
226
- compressed512, XBZRLE_PAGE_SIZE);
227
- t_end512 = clock();
228
- float time_val512 = difftime(t_end512, t_start512);
229
- g_assert(dlen512 == 0);
230
-
231
- res->t_raw = time_val;
232
- res->t_512 = time_val512;
233
-
234
- g_free(buffer);
235
- g_free(compressed);
236
- g_free(buffer512);
237
- g_free(compressed512);
238
-
239
-}
240
-
241
-static void test_encode_decode_zero_avx512(void)
242
-{
243
- int i;
244
- float time_raw = 0.0, time_512 = 0.0;
245
- struct ResTime res;
246
- for (i = 0; i < 10000; i++) {
247
- encode_decode_zero(&res);
248
- time_raw += res.t_raw;
249
- time_512 += res.t_512;
250
- }
251
- printf("Zero test:\n");
252
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
253
- printf("512 xbzrle_encode time is %f ms\n", time_512);
254
-}
255
-
256
-static void encode_decode_unchanged(struct ResTime *res)
257
-{
258
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
259
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
260
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
261
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
262
- int i = 0;
263
- int dlen = 0, dlen512 = 0;
264
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
265
-
266
- for (i = diff_len; i > 0; i--) {
267
- test[1000 + i] = i + 4;
268
- test512[1000 + i] = i + 4;
269
- }
270
-
271
- test[1000 + diff_len + 3] = 107;
272
- test[1000 + diff_len + 5] = 109;
273
-
274
- test512[1000 + diff_len + 3] = 107;
275
- test512[1000 + diff_len + 5] = 109;
276
-
277
- /* test unchanged buffer */
278
- time_t t_start, t_end, t_start512, t_end512;
279
- t_start = clock();
280
- dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
281
- XBZRLE_PAGE_SIZE);
282
- t_end = clock();
283
- float time_val = difftime(t_end, t_start);
284
- g_assert(dlen == 0);
285
-
286
- t_start512 = clock();
287
- dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE,
288
- compressed512, XBZRLE_PAGE_SIZE);
289
- t_end512 = clock();
290
- float time_val512 = difftime(t_end512, t_start512);
291
- g_assert(dlen512 == 0);
292
-
293
- res->t_raw = time_val;
294
- res->t_512 = time_val512;
295
-
296
- g_free(test);
297
- g_free(compressed);
298
- g_free(test512);
299
- g_free(compressed512);
300
-
301
-}
302
-
303
-static void test_encode_decode_unchanged_avx512(void)
304
-{
305
- int i;
306
- float time_raw = 0.0, time_512 = 0.0;
307
- struct ResTime res;
308
- for (i = 0; i < 10000; i++) {
309
- encode_decode_unchanged(&res);
310
- time_raw += res.t_raw;
311
- time_512 += res.t_512;
312
- }
313
- printf("Unchanged test:\n");
314
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
315
- printf("512 xbzrle_encode time is %f ms\n", time_512);
316
-}
317
-
318
-static void encode_decode_1_byte(struct ResTime *res)
319
-{
320
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
321
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
322
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
323
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
324
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
325
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
326
- int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0;
327
- uint8_t buf[2];
328
- uint8_t buf512[2];
329
-
330
- test[XBZRLE_PAGE_SIZE - 1] = 1;
331
- test512[XBZRLE_PAGE_SIZE - 1] = 1;
332
-
333
- time_t t_start, t_end, t_start512, t_end512;
334
- t_start = clock();
335
- dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
336
- XBZRLE_PAGE_SIZE);
337
- t_end = clock();
338
- float time_val = difftime(t_end, t_start);
339
- g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
340
-
341
- rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
342
- g_assert(rc == XBZRLE_PAGE_SIZE);
343
- g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
344
-
345
- t_start512 = clock();
346
- dlen512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
347
- compressed512, XBZRLE_PAGE_SIZE);
348
- t_end512 = clock();
349
- float time_val512 = difftime(t_end512, t_start512);
350
- g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2));
351
-
352
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512,
353
- XBZRLE_PAGE_SIZE);
354
- g_assert(rc512 == XBZRLE_PAGE_SIZE);
355
- g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
356
-
357
- res->t_raw = time_val;
358
- res->t_512 = time_val512;
359
-
360
- g_free(buffer);
361
- g_free(compressed);
362
- g_free(test);
363
- g_free(buffer512);
364
- g_free(compressed512);
365
- g_free(test512);
366
-
367
-}
368
-
369
-static void test_encode_decode_1_byte_avx512(void)
370
-{
371
- int i;
372
- float time_raw = 0.0, time_512 = 0.0;
373
- struct ResTime res;
374
- for (i = 0; i < 10000; i++) {
375
- encode_decode_1_byte(&res);
376
- time_raw += res.t_raw;
377
- time_512 += res.t_512;
378
- }
379
- printf("1 byte test:\n");
380
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
381
- printf("512 xbzrle_encode time is %f ms\n", time_512);
382
-}
383
-
384
-static void encode_decode_overflow(struct ResTime *res)
385
-{
386
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
387
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
388
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
389
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
390
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
391
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
392
- int i = 0, rc = 0, rc512 = 0;
393
-
394
- for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) {
395
- test[i * 2] = 1;
396
- test512[i * 2] = 1;
397
- }
398
-
399
- /* encode overflow */
400
- time_t t_start, t_end, t_start512, t_end512;
401
- t_start = clock();
402
- rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
403
- XBZRLE_PAGE_SIZE);
404
- t_end = clock();
405
- float time_val = difftime(t_end, t_start);
406
- g_assert(rc == -1);
407
-
408
- t_start512 = clock();
409
- rc512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
410
- compressed512, XBZRLE_PAGE_SIZE);
411
- t_end512 = clock();
412
- float time_val512 = difftime(t_end512, t_start512);
413
- g_assert(rc512 == -1);
414
-
415
- res->t_raw = time_val;
416
- res->t_512 = time_val512;
417
-
418
- g_free(buffer);
419
- g_free(compressed);
420
- g_free(test);
421
- g_free(buffer512);
422
- g_free(compressed512);
423
- g_free(test512);
424
-
425
-}
426
-
427
-static void test_encode_decode_overflow_avx512(void)
428
-{
429
- int i;
430
- float time_raw = 0.0, time_512 = 0.0;
431
- struct ResTime res;
432
- for (i = 0; i < 10000; i++) {
433
- encode_decode_overflow(&res);
434
- time_raw += res.t_raw;
435
- time_512 += res.t_512;
436
- }
437
- printf("Overflow test:\n");
438
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
439
- printf("512 xbzrle_encode time is %f ms\n", time_512);
440
-}
441
-
442
-static void encode_decode_range_avx512(struct ResTime *res)
443
-{
444
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
445
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
446
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
447
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
448
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
449
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
450
- int i = 0, rc = 0, rc512 = 0;
451
- int dlen = 0, dlen512 = 0;
452
-
453
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
454
-
455
- for (i = diff_len; i > 0; i--) {
456
- buffer[1000 + i] = i;
457
- test[1000 + i] = i + 4;
458
- buffer512[1000 + i] = i;
459
- test512[1000 + i] = i + 4;
460
- }
461
-
462
- buffer[1000 + diff_len + 3] = 103;
463
- test[1000 + diff_len + 3] = 107;
464
-
465
- buffer[1000 + diff_len + 5] = 105;
466
- test[1000 + diff_len + 5] = 109;
467
-
468
- buffer512[1000 + diff_len + 3] = 103;
469
- test512[1000 + diff_len + 3] = 107;
470
-
471
- buffer512[1000 + diff_len + 5] = 105;
472
- test512[1000 + diff_len + 5] = 109;
473
-
474
- /* test encode/decode */
475
- time_t t_start, t_end, t_start512, t_end512;
476
- t_start = clock();
477
- dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
478
- XBZRLE_PAGE_SIZE);
479
- t_end = clock();
480
- float time_val = difftime(t_end, t_start);
481
- rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
482
- g_assert(rc < XBZRLE_PAGE_SIZE);
483
- g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
484
-
485
- t_start512 = clock();
486
- dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
487
- compressed512, XBZRLE_PAGE_SIZE);
488
- t_end512 = clock();
489
- float time_val512 = difftime(t_end512, t_start512);
490
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
491
- g_assert(rc512 < XBZRLE_PAGE_SIZE);
492
- g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
493
-
494
- res->t_raw = time_val;
495
- res->t_512 = time_val512;
496
-
497
- g_free(buffer);
498
- g_free(compressed);
499
- g_free(test);
500
- g_free(buffer512);
501
- g_free(compressed512);
502
- g_free(test512);
503
-
504
-}
505
-
506
-static void test_encode_decode_avx512(void)
507
-{
508
- int i;
509
- float time_raw = 0.0, time_512 = 0.0;
510
- struct ResTime res;
511
- for (i = 0; i < 10000; i++) {
512
- encode_decode_range_avx512(&res);
513
- time_raw += res.t_raw;
514
- time_512 += res.t_512;
515
- }
516
- printf("Encode decode test:\n");
517
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
518
- printf("512 xbzrle_encode time is %f ms\n", time_512);
519
-}
520
-
521
-static void encode_decode_random(struct ResTime *res)
522
-{
523
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
524
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
525
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
526
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
527
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
528
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
529
- int i = 0, rc = 0, rc512 = 0;
530
- int dlen = 0, dlen512 = 0;
531
-
532
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
533
- /* store the index of diff */
534
- int dirty_index[diff_len];
535
- for (int j = 0; j < diff_len; j++) {
536
- dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
537
- }
538
- for (i = diff_len - 1; i >= 0; i--) {
539
- buffer[dirty_index[i]] = i;
540
- test[dirty_index[i]] = i + 4;
541
- buffer512[dirty_index[i]] = i;
542
- test512[dirty_index[i]] = i + 4;
543
- }
544
-
545
- time_t t_start, t_end, t_start512, t_end512;
546
- t_start = clock();
547
- dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
548
- XBZRLE_PAGE_SIZE);
549
- t_end = clock();
550
- float time_val = difftime(t_end, t_start);
551
- rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
552
- g_assert(rc < XBZRLE_PAGE_SIZE);
553
-
554
- t_start512 = clock();
555
- dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
556
- compressed512, XBZRLE_PAGE_SIZE);
557
- t_end512 = clock();
558
- float time_val512 = difftime(t_end512, t_start512);
559
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
560
- g_assert(rc512 < XBZRLE_PAGE_SIZE);
561
-
562
- res->t_raw = time_val;
563
- res->t_512 = time_val512;
564
-
565
- g_free(buffer);
566
- g_free(compressed);
567
- g_free(test);
568
- g_free(buffer512);
569
- g_free(compressed512);
570
- g_free(test512);
571
-
572
-}
573
-
574
-static void test_encode_decode_random_avx512(void)
575
-{
576
- int i;
577
- float time_raw = 0.0, time_512 = 0.0;
578
- struct ResTime res;
579
- for (i = 0; i < 10000; i++) {
580
- encode_decode_random(&res);
581
- time_raw += res.t_raw;
582
- time_512 += res.t_512;
583
- }
584
- printf("Random test:\n");
585
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
586
- printf("512 xbzrle_encode time is %f ms\n", time_512);
587
-}
31
-#endif
588
-#endif
32
-
589
-
33
-#include "exec/exec-all.h"
590
-int main(int argc, char **argv)
34
-#include "tb-hash.h"
591
-{
35
-
592
- g_test_init(&argc, &argv, NULL);
36
-/* Might cause an exception, so have a longjmp destination ready */
593
- g_test_rand_int();
37
-static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
594
- #if defined(CONFIG_AVX512BW_OPT)
38
- target_ulong cs_base,
595
- if (likely(is_cpu_support_avx512bw)) {
39
- uint32_t flags, uint32_t cflags)
596
- g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_avx512);
40
-{
597
- g_test_add_func("/xbzrle/encode_decode_unchanged",
41
- TranslationBlock *tb;
598
- test_encode_decode_unchanged_avx512);
42
- uint32_t hash;
599
- g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_avx512);
43
-
600
- g_test_add_func("/xbzrle/encode_decode_overflow",
44
- /* we should never be trying to look up an INVALID tb */
601
- test_encode_decode_overflow_avx512);
45
- tcg_debug_assert(!(cflags & CF_INVALID));
602
- g_test_add_func("/xbzrle/encode_decode", test_encode_decode_avx512);
46
-
603
- g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random_avx512);
47
- hash = tb_jmp_cache_hash_func(pc);
604
- }
48
- tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
605
- #endif
49
-
606
- return g_test_run();
50
- if (likely(tb &&
607
-}
51
- tb->pc == pc &&
608
diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
52
- tb->cs_base == cs_base &&
53
- tb->flags == flags &&
54
- tb->trace_vcpu_dstate == *cpu->trace_dstate &&
55
- tb_cflags(tb) == cflags)) {
56
- return tb;
57
- }
58
- tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags);
59
- if (tb == NULL) {
60
- return NULL;
61
- }
62
- qatomic_set(&cpu->tb_jmp_cache[hash], tb);
63
- return tb;
64
-}
65
-
66
-#endif /* EXEC_TB_LOOKUP_H */
67
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
68
index XXXXXXX..XXXXXXX 100644
609
index XXXXXXX..XXXXXXX 100644
69
--- a/accel/tcg/cpu-exec.c
610
--- a/tests/unit/test-xbzrle.c
70
+++ b/accel/tcg/cpu-exec.c
611
+++ b/tests/unit/test-xbzrle.c
71
@@ -XXX,XX +XXX,XX @@
612
@@ -XXX,XX +XXX,XX @@
72
#include "sysemu/replay.h"
613
73
#include "exec/helper-proto.h"
614
#define XBZRLE_PAGE_SIZE 4096
74
#include "tb-hash.h"
615
75
-#include "tb-lookup.h"
616
-int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
76
#include "tb-context.h"
617
- uint8_t *, int) = xbzrle_encode_buffer;
77
#include "internal.h"
618
-#if defined(CONFIG_AVX512BW_OPT)
78
619
-#include "qemu/cpuid.h"
79
@@ -XXX,XX +XXX,XX @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
620
-static void __attribute__((constructor)) init_cpu_flag(void)
80
}
621
-{
81
#endif /* CONFIG USER ONLY */
622
- unsigned max = __get_cpuid_max(0, NULL);
82
623
- int a, b, c, d;
83
+/* Might cause an exception, so have a longjmp destination ready */
624
- if (max >= 1) {
84
+static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
625
- __cpuid(1, a, b, c, d);
85
+ target_ulong cs_base,
626
- /* We must check that AVX is not just available, but usable. */
86
+ uint32_t flags, uint32_t cflags)
627
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
87
+{
628
- int bv;
88
+ TranslationBlock *tb;
629
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
89
+ uint32_t hash;
630
- __cpuid_count(7, 0, a, b, c, d);
90
+
631
- /* 0xe6:
91
+ /* we should never be trying to look up an INVALID tb */
632
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
92
+ tcg_debug_assert(!(cflags & CF_INVALID));
633
- * and ZMM16-ZMM31 state are enabled by OS)
93
+
634
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
94
+ hash = tb_jmp_cache_hash_func(pc);
635
- */
95
+ tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
636
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
96
+
637
- xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
97
+ if (likely(tb &&
638
- }
98
+ tb->pc == pc &&
639
- }
99
+ tb->cs_base == cs_base &&
640
- }
100
+ tb->flags == flags &&
641
- return ;
101
+ tb->trace_vcpu_dstate == *cpu->trace_dstate &&
642
-}
102
+ tb_cflags(tb) == cflags)) {
643
-#endif
103
+ return tb;
644
-
104
+ }
645
static void test_uleb(void)
105
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags);
646
{
106
+ if (tb == NULL) {
647
uint32_t i, val;
107
+ return NULL;
648
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_zero(void)
108
+ }
649
buffer[1000 + diff_len + 5] = 105;
109
+ qatomic_set(&cpu->tb_jmp_cache[hash], tb);
650
110
+ return tb;
651
/* encode zero page */
111
+}
652
- dlen = xbzrle_encode_buffer_func(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
112
+
653
- XBZRLE_PAGE_SIZE);
113
/**
654
+ dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE,
114
* helper_lookup_tb_ptr: quick check for next tb
655
+ compressed, XBZRLE_PAGE_SIZE);
115
* @env: current cpu state
656
g_assert(dlen == 0);
657
658
g_free(buffer);
659
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_unchanged(void)
660
test[1000 + diff_len + 5] = 109;
661
662
/* test unchanged buffer */
663
- dlen = xbzrle_encode_buffer_func(test, test, XBZRLE_PAGE_SIZE, compressed,
664
- XBZRLE_PAGE_SIZE);
665
+ dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE,
666
+ compressed, XBZRLE_PAGE_SIZE);
667
g_assert(dlen == 0);
668
669
g_free(test);
670
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_1_byte(void)
671
672
test[XBZRLE_PAGE_SIZE - 1] = 1;
673
674
- dlen = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
675
- XBZRLE_PAGE_SIZE);
676
+ dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE,
677
+ compressed, XBZRLE_PAGE_SIZE);
678
g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
679
680
rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
681
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_overflow(void)
682
}
683
684
/* encode overflow */
685
- rc = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
686
- XBZRLE_PAGE_SIZE);
687
+ rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE,
688
+ compressed, XBZRLE_PAGE_SIZE);
689
g_assert(rc == -1);
690
691
g_free(buffer);
692
@@ -XXX,XX +XXX,XX @@ static void encode_decode_range(void)
693
test[1000 + diff_len + 5] = 109;
694
695
/* test encode/decode */
696
- dlen = xbzrle_encode_buffer_func(test, buffer, XBZRLE_PAGE_SIZE, compressed,
697
- XBZRLE_PAGE_SIZE);
698
+ dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE,
699
+ compressed, XBZRLE_PAGE_SIZE);
700
701
rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
702
g_assert(rc < XBZRLE_PAGE_SIZE);
703
diff --git a/tests/bench/meson.build b/tests/bench/meson.build
704
index XXXXXXX..XXXXXXX 100644
705
--- a/tests/bench/meson.build
706
+++ b/tests/bench/meson.build
707
@@ -XXX,XX +XXX,XX @@ qht_bench = executable('qht-bench',
708
sources: 'qht-bench.c',
709
dependencies: [qemuutil])
710
711
-if have_system
712
-xbzrle_bench = executable('xbzrle-bench',
713
- sources: 'xbzrle-bench.c',
714
- dependencies: [qemuutil,migration])
715
-endif
716
-
717
qtree_bench = executable('qtree-bench',
718
sources: 'qtree-bench.c',
719
dependencies: [qemuutil])
116
--
720
--
117
2.25.1
721
2.34.1
118
722
119
723
1
Split out CPU_LOG_EXEC and CPU_LOG_TB_CPU logging from
1
The items in migration_files are built for libmigration and included
2
cpu_tb_exec to a new function. Perform only one pc
2
into softmmu_ss from there; no need to also include them directly.
3
range check after a combined mask check.
4
3
5
Use the new function in lookup_tb_ptr. This enables
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
CPU_LOG_TB_CPU between indirectly chained tbs.
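
In miniature, the combined check reads like this (the full log_cpu_exec() helper is in the hunk below; this is just the entry condition):

    static void log_cpu_exec_sketch(target_ulong pc)
    {
        if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC))
            && qemu_log_in_addr_range(pc)) {
            /* emit the CPU_LOG_EXEC trace line and, when CPU_LOG_TB_CPU
             * is also enabled, dump the CPU state */
        }
    }
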
5
Reviewed-by: Juan Quintela <quintela@redhat.com>
7
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
7
---
10
accel/tcg/cpu-exec.c | 61 ++++++++++++++++++++++++--------------------
8
migration/meson.build | 1 -
11
1 file changed, 34 insertions(+), 27 deletions(-)
9
1 file changed, 1 deletion(-)
12
10
13
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
11
diff --git a/migration/meson.build b/migration/meson.build
14
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
15
--- a/accel/tcg/cpu-exec.c
13
--- a/migration/meson.build
16
+++ b/accel/tcg/cpu-exec.c
14
+++ b/migration/meson.build
17
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
15
@@ -XXX,XX +XXX,XX @@ migration_files = files(
18
return tb;
16
'qemu-file.c',
19
}
17
'yank_functions.c',
20
18
)
21
+static inline void log_cpu_exec(target_ulong pc, CPUState *cpu,
19
-softmmu_ss.add(migration_files)
22
+ const TranslationBlock *tb)
20
23
+{
21
softmmu_ss.add(files(
24
+ if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC))
22
'block-dirty-bitmap.c',
25
+ && qemu_log_in_addr_range(pc)) {
26
+
27
+ qemu_log_mask(CPU_LOG_EXEC,
28
+ "Trace %d: %p [" TARGET_FMT_lx
29
+ "/" TARGET_FMT_lx "/%#x] %s\n",
30
+ cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc, tb->flags,
31
+ lookup_symbol(pc));
32
+
33
+#if defined(DEBUG_DISAS)
34
+ if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
35
+ FILE *logfile = qemu_log_lock();
36
+ int flags = 0;
37
+
38
+ if (qemu_loglevel_mask(CPU_LOG_TB_FPU)) {
39
+ flags |= CPU_DUMP_FPU;
40
+ }
41
+#if defined(TARGET_I386)
42
+ flags |= CPU_DUMP_CCOP;
43
+#endif
44
+ log_cpu_state(cpu, flags);
45
+ qemu_log_unlock(logfile);
46
+ }
47
+#endif /* DEBUG_DISAS */
48
+ }
49
+}
50
+
51
/**
52
* helper_lookup_tb_ptr: quick check for next tb
53
* @env: current cpu state
54
@@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
55
if (tb == NULL) {
56
return tcg_code_gen_epilogue;
57
}
58
- qemu_log_mask_and_addr(CPU_LOG_EXEC, pc,
59
- "Chain %d: %p ["
60
- TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
61
- cpu->cpu_index, tb->tc.ptr, cs_base, pc, flags,
62
- lookup_symbol(pc));
63
+
64
+ log_cpu_exec(pc, cpu, tb);
65
+
66
return tb->tc.ptr;
67
}
68
69
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
70
TranslationBlock *last_tb;
71
const void *tb_ptr = itb->tc.ptr;
72
73
- qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
74
- "Trace %d: %p ["
75
- TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
76
- cpu->cpu_index, itb->tc.ptr,
77
- itb->cs_base, itb->pc, itb->flags,
78
- lookup_symbol(itb->pc));
79
-
80
-#if defined(DEBUG_DISAS)
81
- if (qemu_loglevel_mask(CPU_LOG_TB_CPU)
82
- && qemu_log_in_addr_range(itb->pc)) {
83
- FILE *logfile = qemu_log_lock();
84
- int flags = 0;
85
- if (qemu_loglevel_mask(CPU_LOG_TB_FPU)) {
86
- flags |= CPU_DUMP_FPU;
87
- }
88
-#if defined(TARGET_I386)
89
- flags |= CPU_DUMP_CCOP;
90
-#endif
91
- log_cpu_state(cpu, flags);
92
- qemu_log_unlock(logfile);
93
- }
94
-#endif /* DEBUG_DISAS */
95
+ log_cpu_exec(itb->pc, cpu, itb);
96
97
qemu_thread_jit_execute();
98
ret = tcg_qemu_tb_exec(env, tb_ptr);
99
--
23
--
100
2.25.1
24
2.34.1
101
25
102
26
1
Just use translator_use_goto_tb directly at the one call site,
1
Move the code from tcg/. The only use of these bits so far
2
rather than maintaining a local wrapper.
2
is with respect to the atomicity of tcg operations.
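
For context, a consumer of the new interface queries the bits roughly as follows (illustrative helper, not part of the patch):

    #include <stdbool.h>
    #include "host/cpuinfo.h"

    /* Test for FEAT_LSE2; use cpuinfo_init() rather than the bare variable,
     * so this is also safe when called from another constructor. */
    static bool host_has_lse2(void)
    {
        return (cpuinfo_init() & CPUINFO_LSE2) != 0;
    }
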
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
target/rx/translate.c | 11 +----------
8
host/include/aarch64/host/cpuinfo.h | 22 ++++++++++
8
1 file changed, 1 insertion(+), 10 deletions(-)
9
tcg/aarch64/tcg-target.h | 6 ++-
9
10
util/cpuinfo-aarch64.c | 67 +++++++++++++++++++++++++++++
10
diff --git a/target/rx/translate.c b/target/rx/translate.c
11
tcg/aarch64/tcg-target.c.inc | 40 -----------------
12
util/meson.build | 4 +-
13
5 files changed, 96 insertions(+), 43 deletions(-)
14
create mode 100644 host/include/aarch64/host/cpuinfo.h
15
create mode 100644 util/cpuinfo-aarch64.c
16
17
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
18
new file mode 100644
19
index XXXXXXX..XXXXXXX
20
--- /dev/null
21
+++ b/host/include/aarch64/host/cpuinfo.h
22
@@ -XXX,XX +XXX,XX @@
23
+/*
24
+ * SPDX-License-Identifier: GPL-2.0-or-later
25
+ * Host specific cpu identification for AArch64.
26
+ */
27
+
28
+#ifndef HOST_CPUINFO_H
29
+#define HOST_CPUINFO_H
30
+
31
+#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
32
+#define CPUINFO_LSE (1u << 1)
33
+#define CPUINFO_LSE2 (1u << 2)
34
+
35
+/* Initialized with a constructor. */
36
+extern unsigned cpuinfo;
37
+
38
+/*
39
+ * We cannot rely on constructor ordering, so other constructors must
40
+ * use the function interface rather than the variable above.
41
+ */
42
+unsigned cpuinfo_init(void);
43
+
44
+#endif /* HOST_CPUINFO_H */
45
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
11
index XXXXXXX..XXXXXXX 100644
46
index XXXXXXX..XXXXXXX 100644
12
--- a/target/rx/translate.c
47
--- a/tcg/aarch64/tcg-target.h
13
+++ b/target/rx/translate.c
48
+++ b/tcg/aarch64/tcg-target.h
14
@@ -XXX,XX +XXX,XX @@ void rx_cpu_dump_state(CPUState *cs, FILE *f, int flags)
49
@@ -XXX,XX +XXX,XX @@
50
#ifndef AARCH64_TCG_TARGET_H
51
#define AARCH64_TCG_TARGET_H
52
53
+#include "host/cpuinfo.h"
54
+
55
#define TCG_TARGET_INSN_UNIT_SIZE 4
56
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
57
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
58
@@ -XXX,XX +XXX,XX @@ typedef enum {
59
#define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN
60
#define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
61
62
-extern bool have_lse;
63
-extern bool have_lse2;
64
+#define have_lse (cpuinfo & CPUINFO_LSE)
65
+#define have_lse2 (cpuinfo & CPUINFO_LSE2)
66
67
/* optional instructions */
68
#define TCG_TARGET_HAS_div_i32 1
69
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
70
new file mode 100644
71
index XXXXXXX..XXXXXXX
72
--- /dev/null
73
+++ b/util/cpuinfo-aarch64.c
74
@@ -XXX,XX +XXX,XX @@
75
+/*
76
+ * SPDX-License-Identifier: GPL-2.0-or-later
77
+ * Host specific cpu identification for AArch64.
78
+ */
79
+
80
+#include "qemu/osdep.h"
81
+#include "host/cpuinfo.h"
82
+
83
+#ifdef CONFIG_LINUX
84
+# ifdef CONFIG_GETAUXVAL
85
+# include <sys/auxv.h>
86
+# else
87
+# include <asm/hwcap.h>
88
+# include "elf.h"
89
+# endif
90
+#endif
91
+#ifdef CONFIG_DARWIN
92
+# include <sys/sysctl.h>
93
+#endif
94
+
95
+unsigned cpuinfo;
96
+
97
+#ifdef CONFIG_DARWIN
98
+static bool sysctl_for_bool(const char *name)
99
+{
100
+ int val = 0;
101
+ size_t len = sizeof(val);
102
+
103
+ if (sysctlbyname(name, &val, &len, NULL, 0) == 0) {
104
+ return val != 0;
105
+ }
106
+
107
+ /*
108
+ * We might in the future ask for properties not present in older kernels,
109
+ * but we're only asking about static properties, all of which should be
110
+ * 'int'. So we shouln't see ENOMEM (val too small), or any of the other
111
+ * more exotic errors.
112
+ */
113
+ assert(errno == ENOENT);
114
+ return false;
115
+}
116
+#endif
117
+
118
+/* Called both as constructor and (possibly) via other constructors. */
119
+unsigned __attribute__((constructor)) cpuinfo_init(void)
120
+{
121
+ unsigned info = cpuinfo;
122
+
123
+ if (info) {
124
+ return info;
125
+ }
126
+
127
+ info = CPUINFO_ALWAYS;
128
+
129
+#ifdef CONFIG_LINUX
130
+ unsigned long hwcap = qemu_getauxval(AT_HWCAP);
131
+ info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
132
+ info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
133
+#endif
134
+#ifdef CONFIG_DARWIN
135
+ info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
136
+ info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
137
+#endif
138
+
139
+ cpuinfo = info;
140
+ return info;
141
+}
142
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
143
index XXXXXXX..XXXXXXX 100644
144
--- a/tcg/aarch64/tcg-target.c.inc
145
+++ b/tcg/aarch64/tcg-target.c.inc
146
@@ -XXX,XX +XXX,XX @@
147
#include "../tcg-ldst.c.inc"
148
#include "../tcg-pool.c.inc"
149
#include "qemu/bitops.h"
150
-#ifdef __linux__
151
-#include <asm/hwcap.h>
152
-#endif
153
-#ifdef CONFIG_DARWIN
154
-#include <sys/sysctl.h>
155
-#endif
156
157
/* We're going to re-use TCGType in setting of the SF bit, which controls
158
the size of the operation performed. If we know the values match, it
159
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
160
return TCG_REG_X0 + slot;
161
}
162
163
-bool have_lse;
164
-bool have_lse2;
165
-
166
#define TCG_REG_TMP TCG_REG_X30
167
#define TCG_VEC_TMP TCG_REG_V31
168
169
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
15
}
170
}
16
}
171
}
17
172
18
-static bool use_goto_tb(DisasContext *dc, target_ulong dest)
173
-#ifdef CONFIG_DARWIN
174
-static bool sysctl_for_bool(const char *name)
19
-{
175
-{
20
- if (unlikely(dc->base.singlestep_enabled)) {
176
- int val = 0;
21
- return false;
177
- size_t len = sizeof(val);
22
- } else {
178
-
23
- return true;
179
- if (sysctlbyname(name, &val, &len, NULL, 0) == 0) {
180
- return val != 0;
24
- }
181
- }
182
-
183
- /*
184
- * We might in the future ask for properties not present in older kernels,
185
- * but we're only asking about static properties, all of which should be
186
- * 'int'. So we shouln't see ENOMEM (val too small), or any of the other
187
- * more exotic errors.
188
- */
189
- assert(errno == ENOENT);
190
- return false;
25
-}
191
-}
26
-
192
-#endif
27
static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
193
-
194
static void tcg_target_init(TCGContext *s)
28
{
195
{
29
- if (use_goto_tb(dc, dest)) {
196
-#ifdef __linux__
30
+ if (translator_use_goto_tb(&dc->base, dest)) {
197
- unsigned long hwcap = qemu_getauxval(AT_HWCAP);
31
tcg_gen_goto_tb(n);
198
- have_lse = hwcap & HWCAP_ATOMICS;
32
tcg_gen_movi_i32(cpu_pc, dest);
199
- have_lse2 = hwcap & HWCAP_USCAT;
33
tcg_gen_exit_tb(dc->base.tb, n);
200
-#endif
201
-#ifdef CONFIG_DARWIN
202
- have_lse = sysctl_for_bool("hw.optional.arm.FEAT_LSE");
203
- have_lse2 = sysctl_for_bool("hw.optional.arm.FEAT_LSE2");
204
-#endif
205
-
206
tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
207
tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
208
tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
209
diff --git a/util/meson.build b/util/meson.build
210
index XXXXXXX..XXXXXXX 100644
211
--- a/util/meson.build
212
+++ b/util/meson.build
213
@@ -XXX,XX +XXX,XX @@ if have_block
214
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
215
endif
216
217
-if cpu in ['x86', 'x86_64']
218
+if cpu == 'aarch64'
219
+ util_ss.add(files('cpuinfo-aarch64.c'))
220
+elif cpu in ['x86', 'x86_64']
221
util_ss.add(files('cpuinfo-i386.c'))
222
endif
34
--
223
--
35
2.25.1
224
2.34.1
36
225
37
226
1
Reviewed-by: Max Filippov <jcmvbkbc@gmail.com>
1
Separates the aarch64-specific portion into its own file.
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
5
---
4
target/xtensa/translate.c | 6 +-----
6
host/include/aarch64/host/atomic128-cas.h | 43 ++++++++++++++++++
5
1 file changed, 1 insertion(+), 5 deletions(-)
7
host/include/generic/host/atomic128-cas.h | 43 ++++++++++++++++++
8
include/qemu/atomic128.h | 55 +----------------------
9
3 files changed, 87 insertions(+), 54 deletions(-)
10
create mode 100644 host/include/aarch64/host/atomic128-cas.h
11
create mode 100644 host/include/generic/host/atomic128-cas.h
6
12
7
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
13
diff --git a/host/include/aarch64/host/atomic128-cas.h b/host/include/aarch64/host/atomic128-cas.h
14
new file mode 100644
15
index XXXXXXX..XXXXXXX
16
--- /dev/null
17
+++ b/host/include/aarch64/host/atomic128-cas.h
18
@@ -XXX,XX +XXX,XX @@
19
+/*
20
+ * SPDX-License-Identifier: GPL-2.0-or-later
21
+ * Compare-and-swap for 128-bit atomic operations, AArch64 version.
22
+ *
23
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
24
+ *
25
+ * See docs/devel/atomics.rst for discussion about the guarantees each
26
+ * atomic primitive is meant to provide.
27
+ */
28
+
29
+#ifndef AARCH64_ATOMIC128_CAS_H
30
+#define AARCH64_ATOMIC128_CAS_H
31
+
32
+/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
33
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
34
+#include "host/include/generic/host/atomic128-cas.h"
35
+#else
36
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
37
+{
38
+ uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
39
+ uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
40
+ uint64_t oldl, oldh;
41
+ uint32_t tmp;
42
+
43
+ asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
44
+ "cmp %[oldl], %[cmpl]\n\t"
45
+ "ccmp %[oldh], %[cmph], #0, eq\n\t"
46
+ "b.ne 1f\n\t"
47
+ "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
48
+ "cbnz %w[tmp], 0b\n"
49
+ "1:"
50
+ : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
51
+ [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
52
+ : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
53
+ [newl] "r"(newl), [newh] "r"(newh)
54
+ : "memory", "cc");
55
+
56
+ return int128_make128(oldl, oldh);
57
+}
58
+# define HAVE_CMPXCHG128 1
59
+#endif
60
+
61
+#endif /* AARCH64_ATOMIC128_CAS_H */
62
diff --git a/host/include/generic/host/atomic128-cas.h b/host/include/generic/host/atomic128-cas.h
63
new file mode 100644
64
index XXXXXXX..XXXXXXX
65
--- /dev/null
66
+++ b/host/include/generic/host/atomic128-cas.h
67
@@ -XXX,XX +XXX,XX @@
68
+/*
69
+ * SPDX-License-Identifier: GPL-2.0-or-later
70
+ * Compare-and-swap for 128-bit atomic operations, generic version.
71
+ *
72
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
73
+ *
74
+ * See docs/devel/atomics.rst for discussion about the guarantees each
75
+ * atomic primitive is meant to provide.
76
+ */
77
+
78
+#ifndef HOST_ATOMIC128_CAS_H
79
+#define HOST_ATOMIC128_CAS_H
80
+
81
+#if defined(CONFIG_ATOMIC128)
82
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
83
+{
84
+ Int128Alias r, c, n;
85
+
86
+ c.s = cmp;
87
+ n.s = new;
88
+ r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
89
+ return r.s;
90
+}
91
+# define HAVE_CMPXCHG128 1
92
+#elif defined(CONFIG_CMPXCHG128)
93
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
94
+{
95
+ Int128Alias r, c, n;
96
+
97
+ c.s = cmp;
98
+ n.s = new;
99
+ r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
100
+ return r.s;
101
+}
102
+# define HAVE_CMPXCHG128 1
103
+#else
104
+/* Fallback definition that must be optimized away, or error. */
105
+Int128 QEMU_ERROR("unsupported atomic")
106
+ atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new);
107
+# define HAVE_CMPXCHG128 0
108
+#endif
109
+
110
+#endif /* HOST_ATOMIC128_CAS_H */
111
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
8
index XXXXXXX..XXXXXXX 100644
112
index XXXXXXX..XXXXXXX 100644
9
--- a/target/xtensa/translate.c
113
--- a/include/qemu/atomic128.h
10
+++ b/target/xtensa/translate.c
114
+++ b/include/qemu/atomic128.h
11
@@ -XXX,XX +XXX,XX @@ static void gen_jump(DisasContext *dc, TCGv dest)
115
@@ -XXX,XX +XXX,XX @@
12
116
* Therefore, special case each platform.
13
static int adjust_jump_slot(DisasContext *dc, uint32_t dest, int slot)
117
*/
14
{
118
15
- if (((dc->base.pc_first ^ dest) & TARGET_PAGE_MASK) != 0) {
119
-#if defined(CONFIG_ATOMIC128)
16
- return -1;
120
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
17
- } else {
121
-{
18
- return slot;
122
- Int128Alias r, c, n;
19
- }
123
-
20
+ return translator_use_goto_tb(&dc->base, dest) ? slot : -1;
124
- c.s = cmp;
21
}
125
- n.s = new;
22
126
- r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
23
static void gen_jumpi(DisasContext *dc, uint32_t dest, int slot)
127
- return r.s;
128
-}
129
-# define HAVE_CMPXCHG128 1
130
-#elif defined(CONFIG_CMPXCHG128)
131
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
132
-{
133
- Int128Alias r, c, n;
134
-
135
- c.s = cmp;
136
- n.s = new;
137
- r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
138
- return r.s;
139
-}
140
-# define HAVE_CMPXCHG128 1
141
-#elif defined(__aarch64__)
142
-/* Through gcc 8, aarch64 has no support for 128-bit at all. */
143
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
144
-{
145
- uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
146
- uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
147
- uint64_t oldl, oldh;
148
- uint32_t tmp;
149
-
150
- asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
151
- "cmp %[oldl], %[cmpl]\n\t"
152
- "ccmp %[oldh], %[cmph], #0, eq\n\t"
153
- "b.ne 1f\n\t"
154
- "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
155
- "cbnz %w[tmp], 0b\n"
156
- "1:"
157
- : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
158
- [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
159
- : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
160
- [newl] "r"(newl), [newh] "r"(newh)
161
- : "memory", "cc");
162
-
163
- return int128_make128(oldl, oldh);
164
-}
165
-# define HAVE_CMPXCHG128 1
166
-#else
167
-/* Fallback definition that must be optimized away, or error. */
168
-Int128 QEMU_ERROR("unsupported atomic")
169
- atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new);
170
-# define HAVE_CMPXCHG128 0
171
-#endif /* Some definition for HAVE_CMPXCHG128 */
172
-
173
+#include "host/atomic128-cas.h"
174
175
#if defined(CONFIG_ATOMIC128)
176
static inline Int128 atomic16_read(Int128 *ptr)
24
--
177
--
25
2.25.1
178
2.34.1
26
179
27
180
1
All of these helpers end with cpu_loop_exit.
1
Separates the aarch64-specific portion into its own file.
2
2
3
Reviewed-by: Michael Rolnik <mrolnik@gmail.com>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
5
---
7
target/avr/helper.h | 8 ++++----
6
host/include/aarch64/host/atomic128-ldst.h | 49 ++++++++++++++
8
1 file changed, 4 insertions(+), 4 deletions(-)
7
host/include/generic/host/atomic128-ldst.h | 57 +++++++++++++++++
9
8
include/qemu/atomic128.h | 74 +---------------------
10
diff --git a/target/avr/helper.h b/target/avr/helper.h
9
3 files changed, 107 insertions(+), 73 deletions(-)
10
create mode 100644 host/include/aarch64/host/atomic128-ldst.h
11
create mode 100644 host/include/generic/host/atomic128-ldst.h
12
13
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
14
new file mode 100644
15
index XXXXXXX..XXXXXXX
16
--- /dev/null
17
+++ b/host/include/aarch64/host/atomic128-ldst.h
18
@@ -XXX,XX +XXX,XX @@
19
+/*
20
+ * SPDX-License-Identifier: GPL-2.0-or-later
21
+ * Load/store for 128-bit atomic operations, AArch64 version.
22
+ *
23
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
24
+ *
25
+ * See docs/devel/atomics.rst for discussion about the guarantees each
26
+ * atomic primitive is meant to provide.
27
+ */
28
+
29
+#ifndef AARCH64_ATOMIC128_LDST_H
30
+#define AARCH64_ATOMIC128_LDST_H
31
+
32
+/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
33
+#if !defined(CONFIG_ATOMIC128) && !defined(CONFIG_USER_ONLY)
34
+/* We can do better than cmpxchg for AArch64. */
35
+static inline Int128 atomic16_read(Int128 *ptr)
36
+{
37
+ uint64_t l, h;
38
+ uint32_t tmp;
39
+
40
+ /* The load must be paired with the store to guarantee not tearing. */
41
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
42
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
43
+ "cbnz %w[tmp], 0b"
44
+ : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
45
+
46
+ return int128_make128(l, h);
47
+}
48
+
49
+static inline void atomic16_set(Int128 *ptr, Int128 val)
50
+{
51
+ uint64_t l = int128_getlo(val), h = int128_gethi(val);
52
+ uint64_t t1, t2;
53
+
54
+ /* Load into temporaries to acquire the exclusive access lock. */
55
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
56
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
57
+ "cbnz %w[t1], 0b"
58
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
59
+ : [l] "r"(l), [h] "r"(h));
60
+}
61
+
62
+# define HAVE_ATOMIC128 1
63
+#else
64
+#include "host/include/generic/host/atomic128-ldst.h"
65
+#endif
66
+
67
+#endif /* AARCH64_ATOMIC128_LDST_H */
68
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
69
new file mode 100644
70
index XXXXXXX..XXXXXXX
71
--- /dev/null
72
+++ b/host/include/generic/host/atomic128-ldst.h
73
@@ -XXX,XX +XXX,XX @@
74
+/*
75
+ * SPDX-License-Identifier: GPL-2.0-or-later
76
+ * Load/store for 128-bit atomic operations, generic version.
77
+ *
78
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
79
+ *
80
+ * See docs/devel/atomics.rst for discussion about the guarantees each
81
+ * atomic primitive is meant to provide.
82
+ */
83
+
84
+#ifndef HOST_ATOMIC128_LDST_H
85
+#define HOST_ATOMIC128_LDST_H
86
+
87
+#if defined(CONFIG_ATOMIC128)
88
+static inline Int128 atomic16_read(Int128 *ptr)
89
+{
90
+ Int128Alias r;
91
+
92
+ r.i = qatomic_read__nocheck((__int128_t *)ptr);
93
+ return r.s;
94
+}
95
+
96
+static inline void atomic16_set(Int128 *ptr, Int128 val)
97
+{
98
+ Int128Alias v;
99
+
100
+ v.s = val;
101
+ qatomic_set__nocheck((__int128_t *)ptr, v.i);
102
+}
103
+
104
+# define HAVE_ATOMIC128 1
105
+#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
106
+static inline Int128 atomic16_read(Int128 *ptr)
107
+{
108
+ /* Maybe replace 0 with 0, returning the old value. */
109
+ Int128 z = int128_make64(0);
110
+ return atomic16_cmpxchg(ptr, z, z);
111
+}
112
+
113
+static inline void atomic16_set(Int128 *ptr, Int128 val)
114
+{
115
+ Int128 old = *ptr, cmp;
116
+ do {
117
+ cmp = old;
118
+ old = atomic16_cmpxchg(ptr, cmp, val);
119
+ } while (int128_ne(old, cmp));
120
+}
121
+
122
+# define HAVE_ATOMIC128 1
123
+#else
124
+/* Fallback definitions that must be optimized away, or error. */
125
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
126
+void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
127
+# define HAVE_ATOMIC128 0
128
+#endif
129
+
130
+#endif /* HOST_ATOMIC128_LDST_H */
131
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
11
index XXXXXXX..XXXXXXX 100644
132
index XXXXXXX..XXXXXXX 100644
12
--- a/target/avr/helper.h
133
--- a/include/qemu/atomic128.h
13
+++ b/target/avr/helper.h
134
+++ b/include/qemu/atomic128.h
14
@@ -XXX,XX +XXX,XX @@
135
@@ -XXX,XX +XXX,XX @@
15
*/
136
*/
16
137
17
DEF_HELPER_1(wdr, void, env)
138
#include "host/atomic128-cas.h"
18
-DEF_HELPER_1(debug, void, env)
139
-
19
-DEF_HELPER_1(break, void, env)
140
-#if defined(CONFIG_ATOMIC128)
20
-DEF_HELPER_1(sleep, void, env)
141
-static inline Int128 atomic16_read(Int128 *ptr)
21
-DEF_HELPER_1(unsupported, void, env)
142
-{
22
+DEF_HELPER_1(debug, noreturn, env)
143
- Int128Alias r;
23
+DEF_HELPER_1(break, noreturn, env)
144
-
24
+DEF_HELPER_1(sleep, noreturn, env)
145
- r.i = qatomic_read__nocheck((__int128_t *)ptr);
25
+DEF_HELPER_1(unsupported, noreturn, env)
146
- return r.s;
26
DEF_HELPER_3(outb, void, env, i32, i32)
147
-}
27
DEF_HELPER_2(inb, tl, env, i32)
148
-
28
DEF_HELPER_3(fullwr, void, env, i32, i32)
149
-static inline void atomic16_set(Int128 *ptr, Int128 val)
150
-{
151
- Int128Alias v;
152
-
153
- v.s = val;
154
- qatomic_set__nocheck((__int128_t *)ptr, v.i);
155
-}
156
-
157
-# define HAVE_ATOMIC128 1
158
-#elif !defined(CONFIG_USER_ONLY) && defined(__aarch64__)
159
-/* We can do better than cmpxchg for AArch64. */
160
-static inline Int128 atomic16_read(Int128 *ptr)
161
-{
162
- uint64_t l, h;
163
- uint32_t tmp;
164
-
165
- /* The load must be paired with the store to guarantee not tearing. */
166
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
167
- "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
168
- "cbnz %w[tmp], 0b"
169
- : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
170
-
171
- return int128_make128(l, h);
172
-}
173
-
174
-static inline void atomic16_set(Int128 *ptr, Int128 val)
175
-{
176
- uint64_t l = int128_getlo(val), h = int128_gethi(val);
177
- uint64_t t1, t2;
178
-
179
- /* Load into temporaries to acquire the exclusive access lock. */
180
- asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
181
- "stxp %w[t1], %[l], %[h], %[mem]\n\t"
182
- "cbnz %w[t1], 0b"
183
- : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
184
- : [l] "r"(l), [h] "r"(h));
185
-}
186
-
187
-# define HAVE_ATOMIC128 1
188
-#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
189
-static inline Int128 atomic16_read(Int128 *ptr)
190
-{
191
- /* Maybe replace 0 with 0, returning the old value. */
192
- Int128 z = int128_make64(0);
193
- return atomic16_cmpxchg(ptr, z, z);
194
-}
195
-
196
-static inline void atomic16_set(Int128 *ptr, Int128 val)
197
-{
198
- Int128 old = *ptr, cmp;
199
- do {
200
- cmp = old;
201
- old = atomic16_cmpxchg(ptr, cmp, val);
202
- } while (int128_ne(old, cmp));
203
-}
204
-
205
-# define HAVE_ATOMIC128 1
206
-#else
207
-/* Fallback definitions that must be optimized away, or error. */
208
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
209
-void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
210
-# define HAVE_ATOMIC128 0
211
-#endif /* Some definition for HAVE_ATOMIC128 */
212
+#include "host/atomic128-ldst.h"
213
214
#endif /* QEMU_ATOMIC128_H */
29
--
215
--
30
2.25.1
216
2.34.1
31
217
32
218
1
We lost the ',' following the called function name.
1
Silly typo: sizeof(16) != 16.
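
To spell the sizeof(16) mistake out: an integer literal has type int, so the old expression only promised int-sized (typically 4-byte) alignment. A one-liner makes that visible (illustrative, not part of the patch):

    /* sizeof(16) is sizeof(int), typically 4 -- not 16. */
    _Static_assert(sizeof(16) == sizeof(int), "a literal 16 has type int");
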
2
2
3
Fixes: 3e92aa34434
3
Fixes: e61f1efeb730 ("meson: Detect atomic128 support with optimization")
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
tcg/tcg.c | 2 +-
8
meson.build | 2 +-
8
1 file changed, 1 insertion(+), 1 deletion(-)
9
1 file changed, 1 insertion(+), 1 deletion(-)
9
10
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
11
diff --git a/meson.build b/meson.build
11
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/tcg.c
13
--- a/meson.build
13
+++ b/tcg/tcg.c
14
+++ b/meson.build
14
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, bool have_prefs)
15
@@ -XXX,XX +XXX,XX @@ if has_int128
15
col += qemu_log("plugin(%p)", func);
16
# __alignof(unsigned __int128) for the host.
16
}
17
atomic_test_128 = '''
17
18
int main(int ac, char **av) {
18
- col += qemu_log("$0x%x,$%d", info->flags, nb_oargs);
19
- unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], sizeof(16));
19
+ col += qemu_log(",$0x%x,$%d", info->flags, nb_oargs);
20
+ unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16);
20
for (i = 0; i < nb_oargs; i++) {
21
p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED);
21
col += qemu_log(",%s", tcg_get_arg_str(s, buf, sizeof(buf),
22
__atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED);
22
op->args[i]));
23
__atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
23
--
24
--
24
2.25.1
25
2.34.1
25
26
26
27
diff view generated by jsdifflib
1
Add a generic version of the common use_goto_tb test.
1
Not only the routines in ldst_atomicity.c.inc need markup,
2
but also the ones in the headers.
2
3
3
Various targets avoid the page crossing test for CONFIG_USER_ONLY,
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
but that is wrong: mmap and mprotect can change page permissions.
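
In outline, the generic use_goto_tb test has this shape (a sketch consistent with the per-target versions replaced elsewhere in this series; the helper actually added by the patch may differ in detail):

    bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
    {
        /* Suppress goto_tb while single-stepping. */
        if (db->singlestep_enabled) {
            return false;
        }
        /* Only chain when the destination stays on the page being
         * translated -- for user-only builds too, since mmap and
         * mprotect can change page permissions. */
        return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
    }
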
5
6
Reviewed-by: Max Filippov <jcmvbkbc@gmail.com>
7
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
6
---
10
include/exec/translator.h | 10 ++++++++++
7
host/include/generic/host/atomic128-cas.h | 12 ++++++++----
11
accel/tcg/translator.c | 11 +++++++++++
8
host/include/generic/host/atomic128-ldst.h | 18 ++++++++++++------
12
2 files changed, 21 insertions(+)
9
include/qemu/atomic128.h | 17 +++++++++++++++++
10
accel/tcg/ldst_atomicity.c.inc | 17 -----------------
11
4 files changed, 37 insertions(+), 27 deletions(-)
13
12
14
diff --git a/include/exec/translator.h b/include/exec/translator.h
13
diff --git a/host/include/generic/host/atomic128-cas.h b/host/include/generic/host/atomic128-cas.h
15
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
16
--- a/include/exec/translator.h
15
--- a/host/include/generic/host/atomic128-cas.h
17
+++ b/include/exec/translator.h
16
+++ b/host/include/generic/host/atomic128-cas.h
18
@@ -XXX,XX +XXX,XX @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
17
@@ -XXX,XX +XXX,XX @@
19
18
#define HOST_ATOMIC128_CAS_H
20
void translator_loop_temp_check(DisasContextBase *db);
19
21
20
#if defined(CONFIG_ATOMIC128)
22
+/**
21
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
23
+ * translator_use_goto_tb
22
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
24
+ * @db: Disassembly context
23
+atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
25
+ * @dest: target pc of the goto
24
{
26
+ *
25
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
27
+ * Return true if goto_tb is allowed between the current TB
26
Int128Alias r, c, n;
28
+ * and the destination PC.
27
28
c.s = cmp;
29
n.s = new;
30
- r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
31
+ r.i = qatomic_cmpxchg__nocheck(ptr_align, c.i, n.i);
32
return r.s;
33
}
34
# define HAVE_CMPXCHG128 1
35
#elif defined(CONFIG_CMPXCHG128)
36
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
37
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
38
+atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
39
{
40
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
41
Int128Alias r, c, n;
42
43
c.s = cmp;
44
n.s = new;
45
- r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
46
+ r.i = __sync_val_compare_and_swap_16(ptr_align, c.i, n.i);
47
return r.s;
48
}
49
# define HAVE_CMPXCHG128 1
50
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
51
index XXXXXXX..XXXXXXX 100644
52
--- a/host/include/generic/host/atomic128-ldst.h
53
+++ b/host/include/generic/host/atomic128-ldst.h
54
@@ -XXX,XX +XXX,XX @@
55
#define HOST_ATOMIC128_LDST_H
56
57
#if defined(CONFIG_ATOMIC128)
58
-static inline Int128 atomic16_read(Int128 *ptr)
59
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
60
+atomic16_read(Int128 *ptr)
61
{
62
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
63
Int128Alias r;
64
65
- r.i = qatomic_read__nocheck((__int128_t *)ptr);
66
+ r.i = qatomic_read__nocheck(ptr_align);
67
return r.s;
68
}
69
70
-static inline void atomic16_set(Int128 *ptr, Int128 val)
71
+static inline void ATTRIBUTE_ATOMIC128_OPT
72
+atomic16_set(Int128 *ptr, Int128 val)
73
{
74
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
75
Int128Alias v;
76
77
v.s = val;
78
- qatomic_set__nocheck((__int128_t *)ptr, v.i);
79
+ qatomic_set__nocheck(ptr_align, v.i);
80
}
81
82
# define HAVE_ATOMIC128 1
83
#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
84
-static inline Int128 atomic16_read(Int128 *ptr)
85
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
86
+atomic16_read(Int128 *ptr)
87
{
88
/* Maybe replace 0 with 0, returning the old value. */
89
Int128 z = int128_make64(0);
90
return atomic16_cmpxchg(ptr, z, z);
91
}
92
93
-static inline void atomic16_set(Int128 *ptr, Int128 val)
94
+static inline void ATTRIBUTE_ATOMIC128_OPT
95
+atomic16_set(Int128 *ptr, Int128 val)
96
{
97
Int128 old = *ptr, cmp;
98
do {
99
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/include/qemu/atomic128.h
102
+++ b/include/qemu/atomic128.h
103
@@ -XXX,XX +XXX,XX @@
104
105
#include "qemu/int128.h"
106
107
+/*
108
+ * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
109
+ * that are supported by the host, e.g. s390x. We can force the pointer to
110
+ * have our known alignment with __builtin_assume_aligned, however prior to
111
+ * GCC 13 that was only reliable with optimization enabled. See
112
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
29
+ */
113
+ */
30
+bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest);
114
+#if defined(CONFIG_ATOMIC128_OPT)
115
+# if !defined(__OPTIMIZE__)
116
+# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1")))
117
+# endif
118
+# define CONFIG_ATOMIC128
119
+#endif
120
+#ifndef ATTRIBUTE_ATOMIC128_OPT
121
+# define ATTRIBUTE_ATOMIC128_OPT
122
+#endif
31
+
123
+
32
/*
124
/*
33
* Translator Load Functions
125
* GCC is a house divided about supporting large atomic operations.
34
*
126
*
35
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
127
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
36
index XXXXXXX..XXXXXXX 100644
128
index XXXXXXX..XXXXXXX 100644
37
--- a/accel/tcg/translator.c
129
--- a/accel/tcg/ldst_atomicity.c.inc
38
+++ b/accel/tcg/translator.c
130
+++ b/accel/tcg/ldst_atomicity.c.inc
39
@@ -XXX,XX +XXX,XX @@ void translator_loop_temp_check(DisasContextBase *db)
131
@@ -XXX,XX +XXX,XX @@
40
}
132
#endif
41
}
133
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
42
134
43
+bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
135
-/*
44
+{
136
- * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
45
+ /* Suppress goto_tb in the case of single-stepping. */
137
- * that are supported by the host, e.g. s390x. We can force the pointer to
46
+ if (db->singlestep_enabled || singlestep) {
138
- * have our known alignment with __builtin_assume_aligned, however prior to
47
+ return false;
139
- * GCC 13 that was only reliable with optimization enabled. See
48
+ }
140
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
49
+
141
- */
50
+ /* Check for the dest on the same page as the start of the TB. */
142
-#if defined(CONFIG_ATOMIC128_OPT)
51
+ return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
143
-# if !defined(__OPTIMIZE__)
52
+}
144
-# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1")))
53
+
145
-# endif
54
void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
146
-# define CONFIG_ATOMIC128
55
CPUState *cpu, TranslationBlock *tb, int max_insns)
147
-#endif
56
{
148
-#ifndef ATTRIBUTE_ATOMIC128_OPT
149
-# define ATTRIBUTE_ATOMIC128_OPT
150
-#endif
151
-
152
#if defined(CONFIG_ATOMIC128)
153
# define HAVE_al16_fast true
154
#else
57
--
155
--
58
2.25.1
156
2.34.1
59
157
60
158
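The markup added above can be sketched in isolation as follows. The function name load16 is illustrative only (the real routines are atomic16_read/atomic16_cmpxchg and go through the qatomic_* wrappers); the point is the combination of __builtin_assume_aligned with the conditional optimize attribute:

#if defined(CONFIG_ATOMIC128_OPT) && !defined(__OPTIMIZE__)
/* Prior to GCC 13 the alignment hint is only reliable when optimizing
 * (gcc bug 107389), so force at least -O1 for these functions. */
# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1")))
#else
# define ATTRIBUTE_ATOMIC128_OPT
#endif

static inline __int128 ATTRIBUTE_ATOMIC128_OPT
load16(__int128 *ptr)
{
    /* __alignof(__int128) may be only 8 (e.g. s390x); promise 16 so the
     * compiler inlines the atomic instead of calling out to libatomic. */
    __int128 *ptr_align = __builtin_assume_aligned(ptr, 16);
    return __atomic_load_n(ptr_align, __ATOMIC_RELAXED);
}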
1
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
1
No need to roll our own, as this is now provided by tcg.
2
This was the last use of retxl, so remove that too.
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/ppc/translate.c | 10 +---------
7
target/ppc/cpu.h | 1 -
5
1 file changed, 1 insertion(+), 9 deletions(-)
8
target/ppc/helper.h | 9 ----
6
9
target/ppc/mem_helper.c | 48 --------------------
10
target/ppc/translate.c | 34 ++-------------
11
target/ppc/translate/fixedpoint-impl.c.inc | 51 +++-------------------
12
5 files changed, 11 insertions(+), 132 deletions(-)
13
14
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/target/ppc/cpu.h
17
+++ b/target/ppc/cpu.h
18
@@ -XXX,XX +XXX,XX @@ struct CPUArchState {
19
/* used to speed-up TLB assist handlers */
20
21
target_ulong nip; /* next instruction pointer */
22
- uint64_t retxh; /* high part of 128-bit helper return */
23
24
/* when a memory exception occurs, the access type is stored here */
25
int access_type;
26
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
27
index XXXXXXX..XXXXXXX 100644
28
--- a/target/ppc/helper.h
29
+++ b/target/ppc/helper.h
30
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_4(DSCLIQ, void, env, fprp, fprp, i32)
31
32
DEF_HELPER_1(tbegin, void, env)
33
DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
34
-
35
-#ifdef TARGET_PPC64
36
-DEF_HELPER_FLAGS_3(lq_le_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
37
-DEF_HELPER_FLAGS_3(lq_be_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
38
-DEF_HELPER_FLAGS_5(stq_le_parallel, TCG_CALL_NO_WG,
39
- void, env, tl, i64, i64, i32)
40
-DEF_HELPER_FLAGS_5(stq_be_parallel, TCG_CALL_NO_WG,
41
- void, env, tl, i64, i64, i32)
42
-#endif
43
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
44
index XXXXXXX..XXXXXXX 100644
45
--- a/target/ppc/mem_helper.c
46
+++ b/target/ppc/mem_helper.c
47
@@ -XXX,XX +XXX,XX @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
48
return i;
49
}
50
51
-#ifdef TARGET_PPC64
52
-uint64_t helper_lq_le_parallel(CPUPPCState *env, target_ulong addr,
53
- uint32_t opidx)
54
-{
55
- Int128 ret;
56
-
57
- /* We will have raised EXCP_ATOMIC from the translator. */
58
- assert(HAVE_ATOMIC128);
59
- ret = cpu_atomic_ldo_le_mmu(env, addr, opidx, GETPC());
60
- env->retxh = int128_gethi(ret);
61
- return int128_getlo(ret);
62
-}
63
-
64
-uint64_t helper_lq_be_parallel(CPUPPCState *env, target_ulong addr,
65
- uint32_t opidx)
66
-{
67
- Int128 ret;
68
-
69
- /* We will have raised EXCP_ATOMIC from the translator. */
70
- assert(HAVE_ATOMIC128);
71
- ret = cpu_atomic_ldo_be_mmu(env, addr, opidx, GETPC());
72
- env->retxh = int128_gethi(ret);
73
- return int128_getlo(ret);
74
-}
75
-
76
-void helper_stq_le_parallel(CPUPPCState *env, target_ulong addr,
77
- uint64_t lo, uint64_t hi, uint32_t opidx)
78
-{
79
- Int128 val;
80
-
81
- /* We will have raised EXCP_ATOMIC from the translator. */
82
- assert(HAVE_ATOMIC128);
83
- val = int128_make128(lo, hi);
84
- cpu_atomic_sto_le_mmu(env, addr, val, opidx, GETPC());
85
-}
86
-
87
-void helper_stq_be_parallel(CPUPPCState *env, target_ulong addr,
88
- uint64_t lo, uint64_t hi, uint32_t opidx)
89
-{
90
- Int128 val;
91
-
92
- /* We will have raised EXCP_ATOMIC from the translator. */
93
- assert(HAVE_ATOMIC128);
94
- val = int128_make128(lo, hi);
95
- cpu_atomic_sto_be_mmu(env, addr, val, opidx, GETPC());
96
-}
97
-#endif
98
-
99
/*****************************************************************************/
100
/* Altivec extension helpers */
101
#if HOST_BIG_ENDIAN
7
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
102
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
8
index XXXXXXX..XXXXXXX 100644
103
index XXXXXXX..XXXXXXX 100644
9
--- a/target/ppc/translate.c
104
--- a/target/ppc/translate.c
10
+++ b/target/ppc/translate.c
105
+++ b/target/ppc/translate.c
11
@@ -XXX,XX +XXX,XX @@ static inline void gen_update_cfar(DisasContext *ctx, target_ulong nip)
106
@@ -XXX,XX +XXX,XX @@ static void gen_lqarx(DisasContext *ctx)
12
13
static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
14
{
107
{
15
- if (unlikely(ctx->singlestep_enabled)) {
108
int rd = rD(ctx->opcode);
16
- return false;
109
TCGv EA, hi, lo;
110
+ TCGv_i128 t16;
111
112
if (unlikely((rd & 1) || (rd == rA(ctx->opcode)) ||
113
(rd == rB(ctx->opcode)))) {
114
@@ -XXX,XX +XXX,XX @@ static void gen_lqarx(DisasContext *ctx)
115
lo = cpu_gpr[rd + 1];
116
hi = cpu_gpr[rd];
117
118
- if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
119
- if (HAVE_ATOMIC128) {
120
- TCGv_i32 oi = tcg_temp_new_i32();
121
- if (ctx->le_mode) {
122
- tcg_gen_movi_i32(oi, make_memop_idx(MO_LE | MO_128 | MO_ALIGN,
123
- ctx->mem_idx));
124
- gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
125
- } else {
126
- tcg_gen_movi_i32(oi, make_memop_idx(MO_BE | MO_128 | MO_ALIGN,
127
- ctx->mem_idx));
128
- gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
129
- }
130
- tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
131
- } else {
132
- /* Restart with exclusive lock. */
133
- gen_helper_exit_atomic(cpu_env);
134
- ctx->base.is_jmp = DISAS_NORETURN;
135
- return;
136
- }
137
- } else if (ctx->le_mode) {
138
- tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEUQ | MO_ALIGN_16);
139
- tcg_gen_mov_tl(cpu_reserve, EA);
140
- gen_addr_add(ctx, EA, EA, 8);
141
- tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_LEUQ);
142
- } else {
143
- tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_BEUQ | MO_ALIGN_16);
144
- tcg_gen_mov_tl(cpu_reserve, EA);
145
- gen_addr_add(ctx, EA, EA, 8);
146
- tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_BEUQ);
17
- }
147
- }
18
-
148
+ t16 = tcg_temp_new_i128();
19
-#ifndef CONFIG_USER_ONLY
149
+ tcg_gen_qemu_ld_i128(t16, EA, ctx->mem_idx, DEF_MEMOP(MO_128 | MO_ALIGN));
20
- return (ctx->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
150
+ tcg_gen_extr_i128_i64(lo, hi, t16);
21
-#else
151
22
- return true;
152
tcg_gen_st_tl(hi, cpu_env, offsetof(CPUPPCState, reserve_val));
23
-#endif
153
tcg_gen_st_tl(lo, cpu_env, offsetof(CPUPPCState, reserve_val2));
24
+ return translator_use_goto_tb(&ctx->base, dest);
154
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc b/target/ppc/translate/fixedpoint-impl.c.inc
25
}
155
index XXXXXXX..XXXXXXX 100644
26
156
--- a/target/ppc/translate/fixedpoint-impl.c.inc
27
static void gen_lookup_and_goto_ptr(DisasContext *ctx)
157
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
158
@@ -XXX,XX +XXX,XX @@ static bool do_ldst_quad(DisasContext *ctx, arg_D *a, bool store, bool prefixed)
159
#if defined(TARGET_PPC64)
160
TCGv ea;
161
TCGv_i64 low_addr_gpr, high_addr_gpr;
162
- MemOp mop;
163
+ TCGv_i128 t16;
164
165
REQUIRE_INSNS_FLAGS(ctx, 64BX);
166
167
@@ -XXX,XX +XXX,XX @@ static bool do_ldst_quad(DisasContext *ctx, arg_D *a, bool store, bool prefixed)
168
low_addr_gpr = cpu_gpr[a->rt + 1];
169
high_addr_gpr = cpu_gpr[a->rt];
170
}
171
+ t16 = tcg_temp_new_i128();
172
173
- if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
174
- if (HAVE_ATOMIC128) {
175
- mop = DEF_MEMOP(MO_128);
176
- TCGv_i32 oi = tcg_constant_i32(make_memop_idx(mop, ctx->mem_idx));
177
- if (store) {
178
- if (ctx->le_mode) {
179
- gen_helper_stq_le_parallel(cpu_env, ea, low_addr_gpr,
180
- high_addr_gpr, oi);
181
- } else {
182
- gen_helper_stq_be_parallel(cpu_env, ea, high_addr_gpr,
183
- low_addr_gpr, oi);
184
-
185
- }
186
- } else {
187
- if (ctx->le_mode) {
188
- gen_helper_lq_le_parallel(low_addr_gpr, cpu_env, ea, oi);
189
- tcg_gen_ld_i64(high_addr_gpr, cpu_env,
190
- offsetof(CPUPPCState, retxh));
191
- } else {
192
- gen_helper_lq_be_parallel(high_addr_gpr, cpu_env, ea, oi);
193
- tcg_gen_ld_i64(low_addr_gpr, cpu_env,
194
- offsetof(CPUPPCState, retxh));
195
- }
196
- }
197
- } else {
198
- /* Restart with exclusive lock. */
199
- gen_helper_exit_atomic(cpu_env);
200
- ctx->base.is_jmp = DISAS_NORETURN;
201
- }
202
+ if (store) {
203
+ tcg_gen_concat_i64_i128(t16, low_addr_gpr, high_addr_gpr);
204
+ tcg_gen_qemu_st_i128(t16, ea, ctx->mem_idx, DEF_MEMOP(MO_128));
205
} else {
206
- mop = DEF_MEMOP(MO_UQ);
207
- if (store) {
208
- tcg_gen_qemu_st_i64(low_addr_gpr, ea, ctx->mem_idx, mop);
209
- } else {
210
- tcg_gen_qemu_ld_i64(low_addr_gpr, ea, ctx->mem_idx, mop);
211
- }
212
-
213
- gen_addr_add(ctx, ea, ea, 8);
214
-
215
- if (store) {
216
- tcg_gen_qemu_st_i64(high_addr_gpr, ea, ctx->mem_idx, mop);
217
- } else {
218
- tcg_gen_qemu_ld_i64(high_addr_gpr, ea, ctx->mem_idx, mop);
219
- }
220
+ tcg_gen_qemu_ld_i128(t16, ea, ctx->mem_idx, DEF_MEMOP(MO_128));
221
+ tcg_gen_extr_i128_i64(low_addr_gpr, high_addr_gpr, t16);
222
}
223
#else
224
qemu_build_not_reached();
28
--
225
--
29
2.25.1
226
2.34.1
30
227
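The key simplification in the LQARX/LQ/STQ conversion above is that one 128-bit TCG memory operation now supplies both the alignment check and, under CF_PARALLEL, the atomicity that previously required the *_parallel helpers and the retxh side channel. As a fragment (it assumes the DisasContext *ctx and the EA, lo, hi locals of gen_lqarx and is not standalone), the new load sequence is:

    TCGv_i128 t16 = tcg_temp_new_i128();
    /* One aligned, atomic 16-byte load in guest endianness... */
    tcg_gen_qemu_ld_i128(t16, EA, ctx->mem_idx, DEF_MEMOP(MO_128 | MO_ALIGN));
    /* ...then split into the low and high 64-bit halves of the GPR pair. */
    tcg_gen_extr_i128_i64(lo, hi, t16);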
31
228
diff view generated by jsdifflib
1
The number of links across (normal) pages using this is low,
1
No need to roll our own, as this is now provided by tcg.
2
and it will shortly violate the contract for breakpoints.
2
This was the last use of retxl, so remove that too.
3
3
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: David Hildenbrand <david@redhat.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
target/alpha/translate.c | 24 ++----------------------
8
target/s390x/cpu.h | 3 --
8
1 file changed, 2 insertions(+), 22 deletions(-)
9
target/s390x/helper.h | 4 ---
10
target/s390x/tcg/mem_helper.c | 61 --------------------------------
11
target/s390x/tcg/translate.c | 30 +++++-----------
12
target/s390x/tcg/insn-data.h.inc | 2 +-
13
5 files changed, 9 insertions(+), 91 deletions(-)
9
14
10
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
15
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
11
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
12
--- a/target/alpha/translate.c
17
--- a/target/s390x/cpu.h
13
+++ b/target/alpha/translate.c
18
+++ b/target/s390x/cpu.h
14
@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_store_conditional(DisasContext *ctx, int ra, int rb,
19
@@ -XXX,XX +XXX,XX @@ struct CPUArchState {
20
21
float_status fpu_status; /* passed to softfloat lib */
22
23
- /* The low part of a 128-bit return, or remainder of a divide. */
24
- uint64_t retxl;
25
-
26
PSW psw;
27
28
S390CrashReason crash_reason;
29
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
30
index XXXXXXX..XXXXXXX 100644
31
--- a/target/s390x/helper.h
32
+++ b/target/s390x/helper.h
33
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_2(sfas, TCG_CALL_NO_WG, void, env, i64)
34
DEF_HELPER_FLAGS_2(srnm, TCG_CALL_NO_WG, void, env, i64)
35
DEF_HELPER_FLAGS_1(popcnt, TCG_CALL_NO_RWG_SE, i64, i64)
36
DEF_HELPER_2(stfle, i32, env, i64)
37
-DEF_HELPER_FLAGS_2(lpq, TCG_CALL_NO_WG, i64, env, i64)
38
-DEF_HELPER_FLAGS_2(lpq_parallel, TCG_CALL_NO_WG, i64, env, i64)
39
-DEF_HELPER_FLAGS_4(stpq, TCG_CALL_NO_WG, void, env, i64, i64, i64)
40
-DEF_HELPER_FLAGS_4(stpq_parallel, TCG_CALL_NO_WG, void, env, i64, i64, i64)
41
DEF_HELPER_4(mvcos, i32, env, i64, i64, i64)
42
DEF_HELPER_4(cu12, i32, env, i32, i32, i32)
43
DEF_HELPER_4(cu14, i32, env, i32, i32, i32)
44
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
45
index XXXXXXX..XXXXXXX 100644
46
--- a/target/s390x/tcg/mem_helper.c
47
+++ b/target/s390x/tcg/mem_helper.c
48
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(lra)(CPUS390XState *env, uint64_t addr)
49
}
50
#endif
51
52
-/* load pair from quadword */
53
-uint64_t HELPER(lpq)(CPUS390XState *env, uint64_t addr)
54
-{
55
- uintptr_t ra = GETPC();
56
- uint64_t hi, lo;
57
-
58
- check_alignment(env, addr, 16, ra);
59
- hi = cpu_ldq_data_ra(env, addr + 0, ra);
60
- lo = cpu_ldq_data_ra(env, addr + 8, ra);
61
-
62
- env->retxl = lo;
63
- return hi;
64
-}
65
-
66
-uint64_t HELPER(lpq_parallel)(CPUS390XState *env, uint64_t addr)
67
-{
68
- uintptr_t ra = GETPC();
69
- uint64_t hi, lo;
70
- int mem_idx;
71
- MemOpIdx oi;
72
- Int128 v;
73
-
74
- assert(HAVE_ATOMIC128);
75
-
76
- mem_idx = cpu_mmu_index(env, false);
77
- oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
78
- v = cpu_atomic_ldo_be_mmu(env, addr, oi, ra);
79
- hi = int128_gethi(v);
80
- lo = int128_getlo(v);
81
-
82
- env->retxl = lo;
83
- return hi;
84
-}
85
-
86
-/* store pair to quadword */
87
-void HELPER(stpq)(CPUS390XState *env, uint64_t addr,
88
- uint64_t low, uint64_t high)
89
-{
90
- uintptr_t ra = GETPC();
91
-
92
- check_alignment(env, addr, 16, ra);
93
- cpu_stq_data_ra(env, addr + 0, high, ra);
94
- cpu_stq_data_ra(env, addr + 8, low, ra);
95
-}
96
-
97
-void HELPER(stpq_parallel)(CPUS390XState *env, uint64_t addr,
98
- uint64_t low, uint64_t high)
99
-{
100
- uintptr_t ra = GETPC();
101
- int mem_idx;
102
- MemOpIdx oi;
103
- Int128 v;
104
-
105
- assert(HAVE_ATOMIC128);
106
-
107
- mem_idx = cpu_mmu_index(env, false);
108
- oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
109
- v = int128_make128(low, high);
110
- cpu_atomic_sto_be_mmu(env, addr, v, oi, ra);
111
-}
112
-
113
/* Execute instruction. This instruction executes an insn modified with
114
the contents of r1. It does not change the executed instruction in memory;
115
it does not change the program counter.
116
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/target/s390x/tcg/translate.c
119
+++ b/target/s390x/tcg/translate.c
120
@@ -XXX,XX +XXX,XX @@ static void store_freg32_i64(int reg, TCGv_i64 v)
121
tcg_gen_st32_i64(v, cpu_env, freg32_offset(reg));
122
}
123
124
-static void return_low128(TCGv_i64 dest)
125
-{
126
- tcg_gen_ld_i64(dest, cpu_env, offsetof(CPUS390XState, retxl));
127
-}
128
-
129
static void update_psw_addr(DisasContext *s)
130
{
131
/* psw.addr */
132
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_lpd(DisasContext *s, DisasOps *o)
133
134
static DisasJumpType op_lpq(DisasContext *s, DisasOps *o)
135
{
136
- if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
137
- gen_helper_lpq(o->out, cpu_env, o->in2);
138
- } else if (HAVE_ATOMIC128) {
139
- gen_helper_lpq_parallel(o->out, cpu_env, o->in2);
140
- } else {
141
- gen_helper_exit_atomic(cpu_env);
142
- return DISAS_NORETURN;
143
- }
144
- return_low128(o->out2);
145
+ o->out_128 = tcg_temp_new_i128();
146
+ tcg_gen_qemu_ld_i128(o->out_128, o->in2, get_mem_index(s),
147
+ MO_TE | MO_128 | MO_ALIGN);
15
return DISAS_NEXT;
148
return DISAS_NEXT;
16
}
149
}
17
150
18
-static bool in_superpage(DisasContext *ctx, int64_t addr)
151
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_stmh(DisasContext *s, DisasOps *o)
19
-{
152
20
-#ifndef CONFIG_USER_ONLY
153
static DisasJumpType op_stpq(DisasContext *s, DisasOps *o)
21
- return ((ctx->tbflags & ENV_FLAG_PS_USER) == 0
22
- && addr >> TARGET_VIRT_ADDR_SPACE_BITS == -1
23
- && ((addr >> 41) & 3) == 2);
24
-#else
25
- return false;
26
-#endif
27
-}
28
-
29
static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
30
{
154
{
31
#ifndef CONFIG_USER_ONLY
155
- if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
32
- /* If the destination is in the superpage, the page perms can't change. */
156
- gen_helper_stpq(cpu_env, o->in2, o->out2, o->out);
33
- if (in_superpage(ctx, dest)) {
157
- } else if (HAVE_ATOMIC128) {
34
- return true;
158
- gen_helper_stpq_parallel(cpu_env, o->in2, o->out2, o->out);
159
- } else {
160
- gen_helper_exit_atomic(cpu_env);
161
- return DISAS_NORETURN;
35
- }
162
- }
36
/* Check for the dest on the same page as the start of the TB. */
163
+ TCGv_i128 t16 = tcg_temp_new_i128();
37
return ((ctx->base.tb->pc ^ dest) & TARGET_PAGE_MASK) == 0;
164
+
38
#else
165
+ tcg_gen_concat_i64_i128(t16, o->out2, o->out);
39
@@ -XXX,XX +XXX,XX @@ static void alpha_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
166
+ tcg_gen_qemu_st_i128(t16, o->in2, get_mem_index(s),
40
{
167
+ MO_TE | MO_128 | MO_ALIGN);
41
DisasContext *ctx = container_of(dcbase, DisasContext, base);
168
return DISAS_NEXT;
42
CPUAlphaState *env = cpu->env_ptr;
43
- int64_t bound, mask;
44
+ int64_t bound;
45
46
ctx->tbflags = ctx->base.tb->flags;
47
ctx->mem_idx = cpu_mmu_index(env, false);
48
@@ -XXX,XX +XXX,XX @@ static void alpha_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
49
ctx->lit = NULL;
50
51
/* Bound the number of insns to execute to those left on the page. */
52
- if (in_superpage(ctx, ctx->base.pc_first)) {
53
- mask = -1ULL << 41;
54
- } else {
55
- mask = TARGET_PAGE_MASK;
56
- }
57
- bound = -(ctx->base.pc_first | mask) / 4;
58
+ bound = -(ctx->base.pc_first | TARGET_PAGE_MASK) / 4;
59
ctx->base.max_insns = MIN(ctx->base.max_insns, bound);
60
}
169
}
61
170
171
diff --git a/target/s390x/tcg/insn-data.h.inc b/target/s390x/tcg/insn-data.h.inc
172
index XXXXXXX..XXXXXXX 100644
173
--- a/target/s390x/tcg/insn-data.h.inc
174
+++ b/target/s390x/tcg/insn-data.h.inc
175
@@ -XXX,XX +XXX,XX @@
176
D(0xc804, LPD, SSF, ILA, 0, 0, new_P, r3_P32, lpd, 0, MO_TEUL)
177
D(0xc805, LPDG, SSF, ILA, 0, 0, new_P, r3_P64, lpd, 0, MO_TEUQ)
178
/* LOAD PAIR FROM QUADWORD */
179
- C(0xe38f, LPQ, RXY_a, Z, 0, a2, r1_P, 0, lpq, 0)
180
+ C(0xe38f, LPQ, RXY_a, Z, 0, a2, 0, r1_D64, lpq, 0)
181
/* LOAD POSITIVE */
182
C(0x1000, LPR, RR_a, Z, 0, r2_32s, new, r1_32, abs, abs32)
183
C(0xb900, LPGR, RRE, Z, 0, r2, r1, 0, abs, abs64)
62
--
184
--
63
2.25.1
185
2.34.1
64
186
65
187
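The s390x conversion above takes the same shape in the store direction: the old helper's explicit check_alignment(env, addr, 16, ra) is subsumed by MO_ALIGN, and the CF_PARALLEL/exit_atomic branching disappears because the i128 store already provides the required atomicity. A fragment, assuming the DisasOps *o and DisasContext *s of op_stpq:

    TCGv_i128 t16 = tcg_temp_new_i128();
    /* Low half first, then high, matching the old stpq helper arguments. */
    tcg_gen_concat_i64_i128(t16, o->out2, o->out);
    tcg_gen_qemu_st_i128(t16, o->in2, get_mem_index(s),
                         MO_TE | MO_128 | MO_ALIGN);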
1
This will allow additional code sharing.
1
With the current structure of cputlb.c, there is no difference
2
No functional change.
2
between the little-endian and big-endian entry points, aside
3
from the assert. Unify the pairs of functions.
3
4
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
The only use of the functions with explicit endianness was in
6
target/sparc64, and that was only to satisfy the assert: the
7
correct endianness is already built into memop.
8
9
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
12
---
7
accel/tcg/cpu-exec.c | 30 ++++++++++++++++++++++++++++++
13
include/exec/cpu_ldst.h | 58 ++-----
8
accel/tcg/tcg-runtime.c | 22 ----------------------
14
accel/tcg/cputlb.c | 122 +++-----------
9
2 files changed, 30 insertions(+), 22 deletions(-)
15
accel/tcg/user-exec.c | 322 ++++++++++--------------------------
16
target/arm/tcg/m_helper.c | 4 +-
17
target/sparc/ldst_helper.c | 18 +-
18
accel/tcg/ldst_common.c.inc | 24 +--
19
6 files changed, 137 insertions(+), 411 deletions(-)
10
20
11
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
21
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
12
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/cpu-exec.c
23
--- a/include/exec/cpu_ldst.h
14
+++ b/accel/tcg/cpu-exec.c
24
+++ b/include/exec/cpu_ldst.h
15
@@ -XXX,XX +XXX,XX @@
25
@@ -XXX,XX +XXX,XX @@ void cpu_stq_le_mmuidx_ra(CPUArchState *env, abi_ptr ptr, uint64_t val,
16
#include "exec/cpu-all.h"
26
int mmu_idx, uintptr_t ra);
17
#include "sysemu/cpu-timers.h"
27
18
#include "sysemu/replay.h"
28
uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
19
+#include "exec/helper-proto.h"
29
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr ptr,
20
#include "tb-hash.h"
30
- MemOpIdx oi, uintptr_t ra);
21
#include "tb-lookup.h"
31
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr ptr,
22
#include "tb-context.h"
32
- MemOpIdx oi, uintptr_t ra);
23
@@ -XXX,XX +XXX,XX @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
33
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr ptr,
24
}
34
- MemOpIdx oi, uintptr_t ra);
25
#endif /* CONFIG USER ONLY */
35
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr ptr,
26
36
- MemOpIdx oi, uintptr_t ra);
27
+/**
37
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr,
28
+ * helper_lookup_tb_ptr: quick check for next tb
38
- MemOpIdx oi, uintptr_t ra);
29
+ * @env: current cpu state
39
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr,
30
+ *
40
- MemOpIdx oi, uintptr_t ra);
31
+ * Look for an existing TB matching the current cpu state.
41
-
32
+ * If found, return the code pointer. If not found, return
42
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
33
+ * the tcg epilogue so that we return into cpu_tb_exec.
43
- MemOpIdx oi, uintptr_t ra);
34
+ */
44
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
35
+const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
45
- MemOpIdx oi, uintptr_t ra);
46
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
47
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
48
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
49
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra);
50
51
void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val,
52
MemOpIdx oi, uintptr_t ra);
53
-void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
54
- MemOpIdx oi, uintptr_t ra);
55
-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
56
- MemOpIdx oi, uintptr_t ra);
57
-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
58
- MemOpIdx oi, uintptr_t ra);
59
-void cpu_stw_le_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
60
- MemOpIdx oi, uintptr_t ra);
61
-void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
62
- MemOpIdx oi, uintptr_t ra);
63
-void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
64
- MemOpIdx oi, uintptr_t ra);
65
-
66
-void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
67
- MemOpIdx oi, uintptr_t ra);
68
-void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
69
- MemOpIdx oi, uintptr_t ra);
70
+void cpu_stw_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
71
+ MemOpIdx oi, uintptr_t ra);
72
+void cpu_stl_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
73
+ MemOpIdx oi, uintptr_t ra);
74
+void cpu_stq_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
75
+ MemOpIdx oi, uintptr_t ra);
76
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
77
+ MemOpIdx oi, uintptr_t ra);
78
79
uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
80
uint32_t cmpv, uint32_t newv,
81
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
82
# define cpu_ldsw_mmuidx_ra cpu_ldsw_be_mmuidx_ra
83
# define cpu_ldl_mmuidx_ra cpu_ldl_be_mmuidx_ra
84
# define cpu_ldq_mmuidx_ra cpu_ldq_be_mmuidx_ra
85
-# define cpu_ldw_mmu cpu_ldw_be_mmu
86
-# define cpu_ldl_mmu cpu_ldl_be_mmu
87
-# define cpu_ldq_mmu cpu_ldq_be_mmu
88
# define cpu_stw_data cpu_stw_be_data
89
# define cpu_stl_data cpu_stl_be_data
90
# define cpu_stq_data cpu_stq_be_data
91
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
92
# define cpu_stw_mmuidx_ra cpu_stw_be_mmuidx_ra
93
# define cpu_stl_mmuidx_ra cpu_stl_be_mmuidx_ra
94
# define cpu_stq_mmuidx_ra cpu_stq_be_mmuidx_ra
95
-# define cpu_stw_mmu cpu_stw_be_mmu
96
-# define cpu_stl_mmu cpu_stl_be_mmu
97
-# define cpu_stq_mmu cpu_stq_be_mmu
98
#else
99
# define cpu_lduw_data cpu_lduw_le_data
100
# define cpu_ldsw_data cpu_ldsw_le_data
101
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
102
# define cpu_ldsw_mmuidx_ra cpu_ldsw_le_mmuidx_ra
103
# define cpu_ldl_mmuidx_ra cpu_ldl_le_mmuidx_ra
104
# define cpu_ldq_mmuidx_ra cpu_ldq_le_mmuidx_ra
105
-# define cpu_ldw_mmu cpu_ldw_le_mmu
106
-# define cpu_ldl_mmu cpu_ldl_le_mmu
107
-# define cpu_ldq_mmu cpu_ldq_le_mmu
108
# define cpu_stw_data cpu_stw_le_data
109
# define cpu_stl_data cpu_stl_le_data
110
# define cpu_stq_data cpu_stq_le_data
111
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
112
# define cpu_stw_mmuidx_ra cpu_stw_le_mmuidx_ra
113
# define cpu_stl_mmuidx_ra cpu_stl_le_mmuidx_ra
114
# define cpu_stq_mmuidx_ra cpu_stq_le_mmuidx_ra
115
-# define cpu_stw_mmu cpu_stw_le_mmu
116
-# define cpu_stl_mmu cpu_stl_le_mmu
117
-# define cpu_stq_mmu cpu_stq_le_mmu
118
#endif
119
120
uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
121
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/accel/tcg/cputlb.c
124
+++ b/accel/tcg/cputlb.c
125
@@ -XXX,XX +XXX,XX @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra)
126
return ret;
127
}
128
129
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
130
- MemOpIdx oi, uintptr_t ra)
131
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
132
+ MemOpIdx oi, uintptr_t ra)
133
{
134
uint16_t ret;
135
136
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW);
137
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
138
ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
139
plugin_load_cb(env, addr, oi);
140
return ret;
141
}
142
143
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
144
- MemOpIdx oi, uintptr_t ra)
145
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
146
+ MemOpIdx oi, uintptr_t ra)
147
{
148
uint32_t ret;
149
150
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL);
151
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
152
ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
153
plugin_load_cb(env, addr, oi);
154
return ret;
155
}
156
157
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
158
- MemOpIdx oi, uintptr_t ra)
159
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
160
+ MemOpIdx oi, uintptr_t ra)
161
{
162
uint64_t ret;
163
164
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUQ);
165
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
166
ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
167
plugin_load_cb(env, addr, oi);
168
return ret;
169
}
170
171
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
172
- MemOpIdx oi, uintptr_t ra)
173
-{
174
- uint16_t ret;
175
-
176
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW);
177
- ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
178
- plugin_load_cb(env, addr, oi);
179
- return ret;
180
-}
181
-
182
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
183
- MemOpIdx oi, uintptr_t ra)
184
-{
185
- uint32_t ret;
186
-
187
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL);
188
- ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
189
- plugin_load_cb(env, addr, oi);
190
- return ret;
191
-}
192
-
193
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
194
- MemOpIdx oi, uintptr_t ra)
195
-{
196
- uint64_t ret;
197
-
198
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ);
199
- ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
200
- plugin_load_cb(env, addr, oi);
201
- return ret;
202
-}
203
-
204
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
205
- MemOpIdx oi, uintptr_t ra)
206
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
207
+ MemOpIdx oi, uintptr_t ra)
208
{
209
Int128 ret;
210
211
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128));
212
- ret = do_ld16_mmu(env, addr, oi, ra);
213
- plugin_load_cb(env, addr, oi);
214
- return ret;
215
-}
216
-
217
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
218
- MemOpIdx oi, uintptr_t ra)
219
-{
220
- Int128 ret;
221
-
222
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128));
223
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
224
ret = do_ld16_mmu(env, addr, oi, ra);
225
plugin_load_cb(env, addr, oi);
226
return ret;
227
@@ -XXX,XX +XXX,XX @@ void cpu_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
228
plugin_store_cb(env, addr, oi);
229
}
230
231
-void cpu_stw_be_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
232
- MemOpIdx oi, uintptr_t retaddr)
233
+void cpu_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
234
+ MemOpIdx oi, uintptr_t retaddr)
235
{
236
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW);
237
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
238
do_st2_mmu(env, addr, val, oi, retaddr);
239
plugin_store_cb(env, addr, oi);
240
}
241
242
-void cpu_stl_be_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
243
+void cpu_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
244
MemOpIdx oi, uintptr_t retaddr)
245
{
246
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL);
247
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
248
do_st4_mmu(env, addr, val, oi, retaddr);
249
plugin_store_cb(env, addr, oi);
250
}
251
252
-void cpu_stq_be_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
253
- MemOpIdx oi, uintptr_t retaddr)
254
+void cpu_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
255
+ MemOpIdx oi, uintptr_t retaddr)
256
{
257
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUQ);
258
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
259
do_st8_mmu(env, addr, val, oi, retaddr);
260
plugin_store_cb(env, addr, oi);
261
}
262
263
-void cpu_stw_le_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
264
- MemOpIdx oi, uintptr_t retaddr)
265
+void cpu_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
266
+ MemOpIdx oi, uintptr_t retaddr)
267
{
268
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW);
269
- do_st2_mmu(env, addr, val, oi, retaddr);
270
- plugin_store_cb(env, addr, oi);
271
-}
272
-
273
-void cpu_stl_le_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
274
- MemOpIdx oi, uintptr_t retaddr)
275
-{
276
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL);
277
- do_st4_mmu(env, addr, val, oi, retaddr);
278
- plugin_store_cb(env, addr, oi);
279
-}
280
-
281
-void cpu_stq_le_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
282
- MemOpIdx oi, uintptr_t retaddr)
283
-{
284
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ);
285
- do_st8_mmu(env, addr, val, oi, retaddr);
286
- plugin_store_cb(env, addr, oi);
287
-}
288
-
289
-void cpu_st16_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
290
- MemOpIdx oi, uintptr_t retaddr)
291
-{
292
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128));
293
- do_st16_mmu(env, addr, val, oi, retaddr);
294
- plugin_store_cb(env, addr, oi);
295
-}
296
-
297
-void cpu_st16_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
298
- MemOpIdx oi, uintptr_t retaddr)
299
-{
300
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128));
301
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
302
do_st16_mmu(env, addr, val, oi, retaddr);
303
plugin_store_cb(env, addr, oi);
304
}
305
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
306
index XXXXXXX..XXXXXXX 100644
307
--- a/accel/tcg/user-exec.c
308
+++ b/accel/tcg/user-exec.c
309
@@ -XXX,XX +XXX,XX @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
310
return ret;
311
}
312
313
-static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
314
- MemOp mop, uintptr_t ra)
315
+static uint16_t do_ld2_mmu(CPUArchState *env, abi_ptr addr,
316
+ MemOp mop, uintptr_t ra)
317
{
318
void *haddr;
319
uint16_t ret;
320
@@ -XXX,XX +XXX,XX @@ static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
321
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
322
ret = load_atom_2(env, ra, haddr, mop);
323
clear_helper_retaddr();
324
+
325
+ if (mop & MO_BSWAP) {
326
+ ret = bswap16(ret);
327
+ }
328
return ret;
329
}
330
331
tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
332
MemOpIdx oi, uintptr_t ra)
333
{
334
- MemOp mop = get_memop(oi);
335
- uint16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
336
-
337
- if (mop & MO_BSWAP) {
338
- ret = bswap16(ret);
339
- }
340
- return ret;
341
+ return do_ld2_mmu(env, addr, get_memop(oi), ra);
342
}
343
344
tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr,
345
MemOpIdx oi, uintptr_t ra)
346
{
347
- MemOp mop = get_memop(oi);
348
- int16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
349
+ return (int16_t)do_ld2_mmu(env, addr, get_memop(oi), ra);
350
+}
351
352
- if (mop & MO_BSWAP) {
353
- ret = bswap16(ret);
354
- }
355
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
356
+ MemOpIdx oi, uintptr_t ra)
36
+{
357
+{
37
+ CPUState *cpu = env_cpu(env);
358
+ uint16_t ret = do_ld2_mmu(env, addr, get_memop(oi), ra);
38
+ TranslationBlock *tb;
359
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
39
+ target_ulong cs_base, pc;
360
return ret;
40
+ uint32_t flags;
361
}
362
363
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
364
- MemOpIdx oi, uintptr_t ra)
365
-{
366
- MemOp mop = get_memop(oi);
367
- uint16_t ret;
368
-
369
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
370
- ret = do_ld2_he_mmu(env, addr, mop, ra);
371
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
372
- return cpu_to_be16(ret);
373
-}
374
-
375
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
376
- MemOpIdx oi, uintptr_t ra)
377
-{
378
- MemOp mop = get_memop(oi);
379
- uint16_t ret;
380
-
381
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
382
- ret = do_ld2_he_mmu(env, addr, mop, ra);
383
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
384
- return cpu_to_le16(ret);
385
-}
386
-
387
-static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr,
388
- MemOp mop, uintptr_t ra)
389
+static uint32_t do_ld4_mmu(CPUArchState *env, abi_ptr addr,
390
+ MemOp mop, uintptr_t ra)
391
{
392
void *haddr;
393
uint32_t ret;
394
@@ -XXX,XX +XXX,XX @@ static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr,
395
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
396
ret = load_atom_4(env, ra, haddr, mop);
397
clear_helper_retaddr();
41
+
398
+
42
+ cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
399
+ if (mop & MO_BSWAP) {
43
+
400
+ ret = bswap32(ret);
44
+ tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags(cpu));
45
+ if (tb == NULL) {
46
+ return tcg_code_gen_epilogue;
47
+ }
401
+ }
48
+ qemu_log_mask_and_addr(CPU_LOG_EXEC, pc,
402
return ret;
49
+ "Chain %d: %p ["
403
}
50
+ TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
404
51
+ cpu->cpu_index, tb->tc.ptr, cs_base, pc, flags,
405
tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
52
+ lookup_symbol(pc));
406
MemOpIdx oi, uintptr_t ra)
53
+ return tb->tc.ptr;
407
{
408
- MemOp mop = get_memop(oi);
409
- uint32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
410
-
411
- if (mop & MO_BSWAP) {
412
- ret = bswap32(ret);
413
- }
414
- return ret;
415
+ return do_ld4_mmu(env, addr, get_memop(oi), ra);
416
}
417
418
tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr,
419
MemOpIdx oi, uintptr_t ra)
420
{
421
- MemOp mop = get_memop(oi);
422
- int32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
423
+ return (int32_t)do_ld4_mmu(env, addr, get_memop(oi), ra);
424
+}
425
426
- if (mop & MO_BSWAP) {
427
- ret = bswap32(ret);
428
- }
429
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
430
+ MemOpIdx oi, uintptr_t ra)
431
+{
432
+ uint32_t ret = do_ld4_mmu(env, addr, get_memop(oi), ra);
433
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
434
return ret;
435
}
436
437
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
438
- MemOpIdx oi, uintptr_t ra)
439
-{
440
- MemOp mop = get_memop(oi);
441
- uint32_t ret;
442
-
443
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
444
- ret = do_ld4_he_mmu(env, addr, mop, ra);
445
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
446
- return cpu_to_be32(ret);
447
-}
448
-
449
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
450
- MemOpIdx oi, uintptr_t ra)
451
-{
452
- MemOp mop = get_memop(oi);
453
- uint32_t ret;
454
-
455
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
456
- ret = do_ld4_he_mmu(env, addr, mop, ra);
457
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
458
- return cpu_to_le32(ret);
459
-}
460
-
461
-static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr,
462
- MemOp mop, uintptr_t ra)
463
+static uint64_t do_ld8_mmu(CPUArchState *env, abi_ptr addr,
464
+ MemOp mop, uintptr_t ra)
465
{
466
void *haddr;
467
uint64_t ret;
468
@@ -XXX,XX +XXX,XX @@ static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr,
469
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
470
ret = load_atom_8(env, ra, haddr, mop);
471
clear_helper_retaddr();
472
- return ret;
473
-}
474
-
475
-uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
476
- MemOpIdx oi, uintptr_t ra)
477
-{
478
- MemOp mop = get_memop(oi);
479
- uint64_t ret = do_ld8_he_mmu(env, addr, mop, ra);
480
481
if (mop & MO_BSWAP) {
482
ret = bswap64(ret);
483
@@ -XXX,XX +XXX,XX @@ uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
484
return ret;
485
}
486
487
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
488
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
489
MemOpIdx oi, uintptr_t ra)
490
{
491
- MemOp mop = get_memop(oi);
492
- uint64_t ret;
493
-
494
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
495
- ret = do_ld8_he_mmu(env, addr, mop, ra);
496
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
497
- return cpu_to_be64(ret);
498
+ return do_ld8_mmu(env, addr, get_memop(oi), ra);
499
}
500
501
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
502
- MemOpIdx oi, uintptr_t ra)
503
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
504
+ MemOpIdx oi, uintptr_t ra)
505
{
506
- MemOp mop = get_memop(oi);
507
- uint64_t ret;
508
-
509
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
510
- ret = do_ld8_he_mmu(env, addr, mop, ra);
511
+ uint64_t ret = do_ld8_mmu(env, addr, get_memop(oi), ra);
512
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
513
- return cpu_to_le64(ret);
514
+ return ret;
515
}
516
517
-static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr,
518
- MemOp mop, uintptr_t ra)
519
+static Int128 do_ld16_mmu(CPUArchState *env, abi_ptr addr,
520
+ MemOp mop, uintptr_t ra)
521
{
522
void *haddr;
523
Int128 ret;
524
@@ -XXX,XX +XXX,XX @@ static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr,
525
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
526
ret = load_atom_16(env, ra, haddr, mop);
527
clear_helper_retaddr();
528
- return ret;
529
-}
530
-
531
-Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
532
- MemOpIdx oi, uintptr_t ra)
533
-{
534
- MemOp mop = get_memop(oi);
535
- Int128 ret = do_ld16_he_mmu(env, addr, mop, ra);
536
537
if (mop & MO_BSWAP) {
538
ret = bswap128(ret);
539
@@ -XXX,XX +XXX,XX @@ Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
540
return ret;
541
}
542
543
+Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
544
+ MemOpIdx oi, uintptr_t ra)
545
+{
546
+ return do_ld16_mmu(env, addr, get_memop(oi), ra);
54
+}
547
+}
55
+
548
+
56
/* Execute a TB, and fix up the CPU state afterwards if necessary */
549
Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi)
57
/*
550
{
58
* Disable CFI checks.
551
return helper_ld16_mmu(env, addr, oi, GETPC());
59
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
552
}
553
554
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
555
- MemOpIdx oi, uintptr_t ra)
556
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
557
+ MemOpIdx oi, uintptr_t ra)
558
{
559
- MemOp mop = get_memop(oi);
560
- Int128 ret;
561
-
562
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
563
- ret = do_ld16_he_mmu(env, addr, mop, ra);
564
+ Int128 ret = do_ld16_mmu(env, addr, get_memop(oi), ra);
565
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
566
- if (!HOST_BIG_ENDIAN) {
567
- ret = bswap128(ret);
568
- }
569
- return ret;
570
-}
571
-
572
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
573
- MemOpIdx oi, uintptr_t ra)
574
-{
575
- MemOp mop = get_memop(oi);
576
- Int128 ret;
577
-
578
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
579
- ret = do_ld16_he_mmu(env, addr, mop, ra);
580
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
581
- if (HOST_BIG_ENDIAN) {
582
- ret = bswap128(ret);
583
- }
584
return ret;
585
}
586
587
@@ -XXX,XX +XXX,XX @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
588
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
589
}
590
591
-static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
592
- MemOp mop, uintptr_t ra)
593
+static void do_st2_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
594
+ MemOp mop, uintptr_t ra)
595
{
596
void *haddr;
597
598
tcg_debug_assert((mop & MO_SIZE) == MO_16);
599
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
600
+
601
+ if (mop & MO_BSWAP) {
602
+ val = bswap16(val);
603
+ }
604
store_atom_2(env, ra, haddr, mop, val);
605
clear_helper_retaddr();
606
}
607
@@ -XXX,XX +XXX,XX @@ static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
608
void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
609
MemOpIdx oi, uintptr_t ra)
610
{
611
- MemOp mop = get_memop(oi);
612
-
613
- if (mop & MO_BSWAP) {
614
- val = bswap16(val);
615
- }
616
- do_st2_he_mmu(env, addr, val, mop, ra);
617
+ do_st2_mmu(env, addr, val, get_memop(oi), ra);
618
}
619
620
-void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
621
+void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
622
MemOpIdx oi, uintptr_t ra)
623
{
624
- MemOp mop = get_memop(oi);
625
-
626
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
627
- do_st2_he_mmu(env, addr, be16_to_cpu(val), mop, ra);
628
+ do_st2_mmu(env, addr, val, get_memop(oi), ra);
629
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
630
}
631
632
-void cpu_stw_le_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
633
- MemOpIdx oi, uintptr_t ra)
634
-{
635
- MemOp mop = get_memop(oi);
636
-
637
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
638
- do_st2_he_mmu(env, addr, le16_to_cpu(val), mop, ra);
639
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
640
-}
641
-
642
-static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
643
- MemOp mop, uintptr_t ra)
644
+static void do_st4_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
645
+ MemOp mop, uintptr_t ra)
646
{
647
void *haddr;
648
649
tcg_debug_assert((mop & MO_SIZE) == MO_32);
650
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
651
+
652
+ if (mop & MO_BSWAP) {
653
+ val = bswap32(val);
654
+ }
655
store_atom_4(env, ra, haddr, mop, val);
656
clear_helper_retaddr();
657
}
658
@@ -XXX,XX +XXX,XX @@ static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
659
void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
660
MemOpIdx oi, uintptr_t ra)
661
{
662
- MemOp mop = get_memop(oi);
663
-
664
- if (mop & MO_BSWAP) {
665
- val = bswap32(val);
666
- }
667
- do_st4_he_mmu(env, addr, val, mop, ra);
668
+ do_st4_mmu(env, addr, val, get_memop(oi), ra);
669
}
670
671
-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
672
- MemOpIdx oi, uintptr_t ra)
673
+void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
674
+ MemOpIdx oi, uintptr_t ra)
675
{
676
- MemOp mop = get_memop(oi);
677
-
678
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
679
- do_st4_he_mmu(env, addr, be32_to_cpu(val), mop, ra);
680
+ do_st4_mmu(env, addr, val, get_memop(oi), ra);
681
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
682
}
683
684
-void cpu_stl_le_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
685
- MemOpIdx oi, uintptr_t ra)
686
-{
687
- MemOp mop = get_memop(oi);
688
-
689
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
690
- do_st4_he_mmu(env, addr, le32_to_cpu(val), mop, ra);
691
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
692
-}
693
-
694
-static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
695
- MemOp mop, uintptr_t ra)
696
+static void do_st8_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
697
+ MemOp mop, uintptr_t ra)
698
{
699
void *haddr;
700
701
tcg_debug_assert((mop & MO_SIZE) == MO_64);
702
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
703
+
704
+ if (mop & MO_BSWAP) {
705
+ val = bswap64(val);
706
+ }
707
store_atom_8(env, ra, haddr, mop, val);
708
clear_helper_retaddr();
709
}
710
@@ -XXX,XX +XXX,XX @@ static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
711
void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val,
712
MemOpIdx oi, uintptr_t ra)
713
{
714
- MemOp mop = get_memop(oi);
715
-
716
- if (mop & MO_BSWAP) {
717
- val = bswap64(val);
718
- }
719
- do_st8_he_mmu(env, addr, val, mop, ra);
720
+ do_st8_mmu(env, addr, val, get_memop(oi), ra);
721
}
722
723
-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
724
+void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
725
MemOpIdx oi, uintptr_t ra)
726
{
727
- MemOp mop = get_memop(oi);
728
-
729
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
730
- do_st8_he_mmu(env, addr, cpu_to_be64(val), mop, ra);
731
+ do_st8_mmu(env, addr, val, get_memop(oi), ra);
732
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
733
}
734
735
-void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
736
- MemOpIdx oi, uintptr_t ra)
737
-{
738
- MemOp mop = get_memop(oi);
739
-
740
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
741
- do_st8_he_mmu(env, addr, cpu_to_le64(val), mop, ra);
742
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
743
-}
744
-
745
-static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
746
- MemOp mop, uintptr_t ra)
747
+static void do_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
748
+ MemOp mop, uintptr_t ra)
749
{
750
void *haddr;
751
752
tcg_debug_assert((mop & MO_SIZE) == MO_128);
753
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
754
+
755
+ if (mop & MO_BSWAP) {
756
+ val = bswap128(val);
757
+ }
758
store_atom_16(env, ra, haddr, mop, val);
759
clear_helper_retaddr();
760
}
761
@@ -XXX,XX +XXX,XX @@ static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
762
void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val,
763
MemOpIdx oi, uintptr_t ra)
764
{
765
- MemOp mop = get_memop(oi);
766
-
767
- if (mop & MO_BSWAP) {
768
- val = bswap128(val);
769
- }
770
- do_st16_he_mmu(env, addr, val, mop, ra);
771
+ do_st16_mmu(env, addr, val, get_memop(oi), ra);
772
}
773
774
void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
775
@@ -XXX,XX +XXX,XX @@ void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
776
helper_st16_mmu(env, addr, val, oi, GETPC());
777
}
778
779
-void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr,
780
- Int128 val, MemOpIdx oi, uintptr_t ra)
781
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr,
782
+ Int128 val, MemOpIdx oi, uintptr_t ra)
783
{
784
- MemOp mop = get_memop(oi);
785
-
786
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
787
- if (!HOST_BIG_ENDIAN) {
788
- val = bswap128(val);
789
- }
790
- do_st16_he_mmu(env, addr, val, mop, ra);
791
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
792
-}
793
-
794
-void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr,
795
- Int128 val, MemOpIdx oi, uintptr_t ra)
796
-{
797
- MemOp mop = get_memop(oi);
798
-
799
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
800
- if (HOST_BIG_ENDIAN) {
801
- val = bswap128(val);
802
- }
803
- do_st16_he_mmu(env, addr, val, mop, ra);
804
+ do_st16_mmu(env, addr, val, get_memop(oi), ra);
805
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
806
}
807
808
diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c
60
index XXXXXXX..XXXXXXX 100644
809
index XXXXXXX..XXXXXXX 100644
61
--- a/accel/tcg/tcg-runtime.c
810
--- a/target/arm/tcg/m_helper.c
62
+++ b/accel/tcg/tcg-runtime.c
811
+++ b/target/arm/tcg/m_helper.c
63
@@ -XXX,XX +XXX,XX @@
812
@@ -XXX,XX +XXX,XX @@ static bool do_v7m_function_return(ARMCPU *cpu)
64
#include "disas/disas.h"
813
*/
65
#include "exec/log.h"
814
mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
66
#include "tcg/tcg.h"
815
oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx));
67
-#include "tb-lookup.h"
816
- newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0);
68
817
- newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0);
69
/* 32-bit helpers */
818
+ newpc = cpu_ldl_mmu(env, frameptr, oi, 0);
70
819
+ newpsr = cpu_ldl_mmu(env, frameptr + 4, oi, 0);
71
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(ctpop_i64)(uint64_t arg)
820
72
return ctpop64(arg);
821
/* Consistency checks on new IPSR */
73
}
822
newpsr_exc = newpsr & XPSR_EXCP;
74
823
diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
75
-const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
824
index XXXXXXX..XXXXXXX 100644
76
-{
825
--- a/target/sparc/ldst_helper.c
77
- CPUState *cpu = env_cpu(env);
826
+++ b/target/sparc/ldst_helper.c
78
- TranslationBlock *tb;
827
@@ -XXX,XX +XXX,XX @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
79
- target_ulong cs_base, pc;
828
ret = cpu_ldb_mmu(env, addr, oi, GETPC());
80
- uint32_t flags;
829
break;
81
-
830
case 2:
82
- cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
831
- if (asi & 8) {
83
-
832
- ret = cpu_ldw_le_mmu(env, addr, oi, GETPC());
84
- tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags(cpu));
833
- } else {
85
- if (tb == NULL) {
834
- ret = cpu_ldw_be_mmu(env, addr, oi, GETPC());
86
- return tcg_code_gen_epilogue;
835
- }
87
- }
836
+ ret = cpu_ldw_mmu(env, addr, oi, GETPC());
88
- qemu_log_mask_and_addr(CPU_LOG_EXEC, pc,
837
break;
89
- "Chain %d: %p ["
838
case 4:
90
- TARGET_FMT_lx "/" TARGET_FMT_lx "/%#x] %s\n",
839
- if (asi & 8) {
91
- cpu->cpu_index, tb->tc.ptr, cs_base, pc, flags,
840
- ret = cpu_ldl_le_mmu(env, addr, oi, GETPC());
92
- lookup_symbol(pc));
841
- } else {
93
- return tb->tc.ptr;
842
- ret = cpu_ldl_be_mmu(env, addr, oi, GETPC());
94
-}
843
- }
95
-
844
+ ret = cpu_ldl_mmu(env, addr, oi, GETPC());
96
void HELPER(exit_atomic)(CPUArchState *env)
845
break;
97
{
846
case 8:
98
cpu_loop_exit_atomic(env_cpu(env), GETPC());
847
- if (asi & 8) {
848
- ret = cpu_ldq_le_mmu(env, addr, oi, GETPC());
849
- } else {
850
- ret = cpu_ldq_be_mmu(env, addr, oi, GETPC());
851
- }
852
+ ret = cpu_ldq_mmu(env, addr, oi, GETPC());
853
break;
854
default:
855
g_assert_not_reached();
856
diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
857
index XXXXXXX..XXXXXXX 100644
858
--- a/accel/tcg/ldst_common.c.inc
859
+++ b/accel/tcg/ldst_common.c.inc
860
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_lduw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
861
int mmu_idx, uintptr_t ra)
862
{
863
MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
864
- return cpu_ldw_be_mmu(env, addr, oi, ra);
865
+ return cpu_ldw_mmu(env, addr, oi, ra);
866
}
867
868
int cpu_ldsw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
869
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_ldl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
870
int mmu_idx, uintptr_t ra)
871
{
872
MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
873
- return cpu_ldl_be_mmu(env, addr, oi, ra);
874
+ return cpu_ldl_mmu(env, addr, oi, ra);
875
}
876
877
uint64_t cpu_ldq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
878
int mmu_idx, uintptr_t ra)
879
{
880
MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
881
- return cpu_ldq_be_mmu(env, addr, oi, ra);
882
+ return cpu_ldq_mmu(env, addr, oi, ra);
883
}
884
885
uint32_t cpu_lduw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
886
int mmu_idx, uintptr_t ra)
887
{
888
MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
889
- return cpu_ldw_le_mmu(env, addr, oi, ra);
890
+ return cpu_ldw_mmu(env, addr, oi, ra);
891
}
892
893
int cpu_ldsw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
894
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_ldl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
895
int mmu_idx, uintptr_t ra)
896
{
897
MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
898
- return cpu_ldl_le_mmu(env, addr, oi, ra);
899
+ return cpu_ldl_mmu(env, addr, oi, ra);
900
}
901
902
uint64_t cpu_ldq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
903
int mmu_idx, uintptr_t ra)
904
{
905
MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
906
- return cpu_ldq_le_mmu(env, addr, oi, ra);
907
+ return cpu_ldq_mmu(env, addr, oi, ra);
908
}
909
910
void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
911
@@ -XXX,XX +XXX,XX @@ void cpu_stw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
912
int mmu_idx, uintptr_t ra)
913
{
914
MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
915
- cpu_stw_be_mmu(env, addr, val, oi, ra);
916
+ cpu_stw_mmu(env, addr, val, oi, ra);
917
}
918
919
void cpu_stl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
920
int mmu_idx, uintptr_t ra)
921
{
922
MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
923
- cpu_stl_be_mmu(env, addr, val, oi, ra);
924
+ cpu_stl_mmu(env, addr, val, oi, ra);
925
}
926
927
void cpu_stq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
928
int mmu_idx, uintptr_t ra)
929
{
930
MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
931
- cpu_stq_be_mmu(env, addr, val, oi, ra);
932
+ cpu_stq_mmu(env, addr, val, oi, ra);
933
}
934
935
void cpu_stw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
936
int mmu_idx, uintptr_t ra)
937
{
938
MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
939
- cpu_stw_le_mmu(env, addr, val, oi, ra);
940
+ cpu_stw_mmu(env, addr, val, oi, ra);
941
}
942
943
void cpu_stl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
944
int mmu_idx, uintptr_t ra)
945
{
946
MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
947
- cpu_stl_le_mmu(env, addr, val, oi, ra);
948
+ cpu_stl_mmu(env, addr, val, oi, ra);
949
}
950
951
void cpu_stq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
952
int mmu_idx, uintptr_t ra)
953
{
954
MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
955
- cpu_stq_le_mmu(env, addr, val, oi, ra);
956
+ cpu_stq_mmu(env, addr, val, oi, ra);
957
}
958
959
/*--------------------------*/
99
--
960
--
100
2.25.1
961
2.34.1
101
962
102
963
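
To illustrate the shape of the cpu_ld*_mmu unification above: once the byte order travels
in the memory-operation descriptor rather than in the helper's name, one load routine can
serve both orders. A minimal standalone sketch (toy names only; TOY_LE/TOY_BE and toy_ldl
are made up for the example and are not QEMU's MemOp encoding or API):

    /* build: cc -O2 toy_ldl.c && ./a.out */
    #include <stdint.h>
    #include <stdio.h>

    typedef enum { TOY_LE, TOY_BE } ToyEndian;   /* stand-in for the byte-order bit */

    /* One entry point; the descriptor, not the function name, selects byte order. */
    static uint32_t toy_ldl(const uint8_t *p, ToyEndian e)
    {
        if (e == TOY_BE) {
            return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
        }
        return (uint32_t)p[3] << 24 | p[2] << 16 | p[1] << 8 | p[0];
    }

    int main(void)
    {
        uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
        printf("BE 0x%08x, LE 0x%08x\n", toy_ldl(buf, TOY_BE), toy_ldl(buf, TOY_LE));
        return 0;
    }

This is the same collapse visible in the sparc helper_ld_asi hunk above, where the
asi & 8 selection between a _le and a _be helper disappears once the order is carried
by the oi descriptor.
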
1
We have not needed to end a TB for I/O since ba3e7926691
1
Use cpu_ld16_mmu and cpu_st16_mmu to eliminate the special case,
2
("icount: clean up cpu_can_io at the entry to the block").
2
and change all of the *_data_ra functions to match.
3
3
4
In use_goto_tb, the check for singlestep_enabled is in the
4
Note that we check the alignment of both compare and store
5
generic translator_use_goto_tb. In s390x_tr_tb_stop, the
5
pointers at the top of the function, so MO_ALIGN* may be
6
check for singlestep_enabled is in the preceding do_debug test.
6
safely removed from the individual memory operations.
7
7
8
Which leaves only FLAG_MASK_PER: fold that test alone into
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
the two callers of use_exit_tb.
10
11
Reviewed-by: David Hildenbrand <david@redhat.com>
9
Reviewed-by: David Hildenbrand <david@redhat.com>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
11
---
14
target/s390x/translate.c | 11 ++---------
12
target/s390x/tcg/mem_helper.c | 66 ++++++++++++++---------------------
15
1 file changed, 2 insertions(+), 9 deletions(-)
13
1 file changed, 27 insertions(+), 39 deletions(-)
16
14
17
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
15
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
18
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
19
--- a/target/s390x/translate.c
17
--- a/target/s390x/tcg/mem_helper.c
20
+++ b/target/s390x/translate.c
18
+++ b/target/s390x/tcg/mem_helper.c
21
@@ -XXX,XX +XXX,XX @@ static void gen_op_calc_cc(DisasContext *s)
19
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
22
set_cc_static(s);
20
uint64_t a2, bool parallel)
23
}
21
{
24
22
uint32_t mem_idx = cpu_mmu_index(env, false);
25
-static bool use_exit_tb(DisasContext *s)
23
+ MemOpIdx oi16 = make_memop_idx(MO_TE | MO_128, mem_idx);
26
-{
24
+ MemOpIdx oi8 = make_memop_idx(MO_TE | MO_64, mem_idx);
27
- return s->base.singlestep_enabled ||
25
+ MemOpIdx oi4 = make_memop_idx(MO_TE | MO_32, mem_idx);
28
- (tb_cflags(s->base.tb) & CF_LAST_IO) ||
26
+ MemOpIdx oi2 = make_memop_idx(MO_TE | MO_16, mem_idx);
29
- (s->base.tb->flags & FLAG_MASK_PER);
27
+ MemOpIdx oi1 = make_memop_idx(MO_8, mem_idx);
30
-}
28
uintptr_t ra = GETPC();
29
uint32_t fc = extract32(env->regs[0], 0, 8);
30
uint32_t sc = extract32(env->regs[0], 8, 8);
31
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
32
}
33
}
34
35
- /* All loads happen before all stores. For simplicity, load the entire
36
- store value area from the parameter list. */
37
- svh = cpu_ldq_data_ra(env, pl + 16, ra);
38
- svl = cpu_ldq_data_ra(env, pl + 24, ra);
39
+ /*
40
+ * All loads happen before all stores. For simplicity, load the entire
41
+ * store value area from the parameter list.
42
+ */
43
+ svh = cpu_ldq_mmu(env, pl + 16, oi8, ra);
44
+ svl = cpu_ldq_mmu(env, pl + 24, oi8, ra);
45
46
switch (fc) {
47
case 0:
48
{
49
- uint32_t nv = cpu_ldl_data_ra(env, pl, ra);
50
+ uint32_t nv = cpu_ldl_mmu(env, pl, oi4, ra);
51
uint32_t cv = env->regs[r3];
52
uint32_t ov;
53
54
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
55
ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi, ra);
56
#endif
57
} else {
58
- ov = cpu_ldl_data_ra(env, a1, ra);
59
- cpu_stl_data_ra(env, a1, (ov == cv ? nv : ov), ra);
60
+ ov = cpu_ldl_mmu(env, a1, oi4, ra);
61
+ cpu_stl_mmu(env, a1, (ov == cv ? nv : ov), oi4, ra);
62
}
63
cc = (ov != cv);
64
env->regs[r3] = deposit64(env->regs[r3], 32, 32, ov);
65
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
66
67
case 1:
68
{
69
- uint64_t nv = cpu_ldq_data_ra(env, pl, ra);
70
+ uint64_t nv = cpu_ldq_mmu(env, pl, oi8, ra);
71
uint64_t cv = env->regs[r3];
72
uint64_t ov;
73
74
if (parallel) {
75
#ifdef CONFIG_ATOMIC64
76
- MemOpIdx oi = make_memop_idx(MO_TEUQ | MO_ALIGN, mem_idx);
77
- ov = cpu_atomic_cmpxchgq_be_mmu(env, a1, cv, nv, oi, ra);
78
+ ov = cpu_atomic_cmpxchgq_be_mmu(env, a1, cv, nv, oi8, ra);
79
#else
80
/* Note that we asserted !parallel above. */
81
g_assert_not_reached();
82
#endif
83
} else {
84
- ov = cpu_ldq_data_ra(env, a1, ra);
85
- cpu_stq_data_ra(env, a1, (ov == cv ? nv : ov), ra);
86
+ ov = cpu_ldq_mmu(env, a1, oi8, ra);
87
+ cpu_stq_mmu(env, a1, (ov == cv ? nv : ov), oi8, ra);
88
}
89
cc = (ov != cv);
90
env->regs[r3] = ov;
91
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
92
93
case 2:
94
{
95
- uint64_t nvh = cpu_ldq_data_ra(env, pl, ra);
96
- uint64_t nvl = cpu_ldq_data_ra(env, pl + 8, ra);
97
- Int128 nv = int128_make128(nvl, nvh);
98
+ Int128 nv = cpu_ld16_mmu(env, pl, oi16, ra);
99
Int128 cv = int128_make128(env->regs[r3 + 1], env->regs[r3]);
100
Int128 ov;
101
102
if (!parallel) {
103
- uint64_t oh = cpu_ldq_data_ra(env, a1 + 0, ra);
104
- uint64_t ol = cpu_ldq_data_ra(env, a1 + 8, ra);
31
-
105
-
32
static bool use_goto_tb(DisasContext *s, uint64_t dest)
106
- ov = int128_make128(ol, oh);
33
{
107
+ ov = cpu_ld16_mmu(env, a1, oi16, ra);
34
- if (unlikely(use_exit_tb(s))) {
108
cc = !int128_eq(ov, cv);
35
+ if (unlikely(s->base.tb->flags & FLAG_MASK_PER)) {
109
if (cc) {
36
return false;
110
nv = ov;
37
}
111
}
38
return translator_use_goto_tb(&s->base, dest);
112
-
39
@@ -XXX,XX +XXX,XX @@ static void s390x_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
113
- cpu_stq_data_ra(env, a1 + 0, int128_gethi(nv), ra);
40
/* Exit the TB, either by raising a debug exception or by return. */
114
- cpu_stq_data_ra(env, a1 + 8, int128_getlo(nv), ra);
41
if (dc->do_debug) {
115
+ cpu_st16_mmu(env, a1, nv, oi16, ra);
42
gen_exception(EXCP_DEBUG);
116
} else if (HAVE_CMPXCHG128) {
43
- } else if (use_exit_tb(dc) ||
117
- MemOpIdx oi = make_memop_idx(MO_TE | MO_128 | MO_ALIGN, mem_idx);
44
+ } else if ((dc->base.tb->flags & FLAG_MASK_PER) ||
118
- ov = cpu_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi, ra);
45
dc->base.is_jmp == DISAS_PC_STALE_NOCHAIN) {
119
+ ov = cpu_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi16, ra);
46
tcg_gen_exit_tb(NULL, 0);
120
cc = !int128_eq(ov, cv);
47
} else {
121
} else {
122
/* Note that we asserted !parallel above. */
123
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
124
if (cc == 0) {
125
switch (sc) {
126
case 0:
127
- cpu_stb_data_ra(env, a2, svh >> 56, ra);
128
+ cpu_stb_mmu(env, a2, svh >> 56, oi1, ra);
129
break;
130
case 1:
131
- cpu_stw_data_ra(env, a2, svh >> 48, ra);
132
+ cpu_stw_mmu(env, a2, svh >> 48, oi2, ra);
133
break;
134
case 2:
135
- cpu_stl_data_ra(env, a2, svh >> 32, ra);
136
+ cpu_stl_mmu(env, a2, svh >> 32, oi4, ra);
137
break;
138
case 3:
139
- cpu_stq_data_ra(env, a2, svh, ra);
140
+ cpu_stq_mmu(env, a2, svh, oi8, ra);
141
break;
142
case 4:
143
- if (!parallel) {
144
- cpu_stq_data_ra(env, a2 + 0, svh, ra);
145
- cpu_stq_data_ra(env, a2 + 8, svl, ra);
146
- } else if (HAVE_ATOMIC128) {
147
- MemOpIdx oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
148
- Int128 sv = int128_make128(svl, svh);
149
- cpu_atomic_sto_be_mmu(env, a2, sv, oi, ra);
150
- } else {
151
- /* Note that we asserted !parallel above. */
152
- g_assert_not_reached();
153
- }
154
+ cpu_st16_mmu(env, a2, int128_make128(svl, svh), oi16, ra);
155
break;
156
default:
157
g_assert_not_reached();
48
--
158
--
49
2.25.1
159
2.34.1
50
160
51
161
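
On the point above that alignment is checked once at the top of the function so MO_ALIGN*
can be dropped from the individual accesses, a tiny standalone sketch of that pattern
(hypothetical helper, not the QEMU code; assumes every later access uses the same
power-of-two granularity):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One up-front check; every access that follows may then assume alignment. */
    static bool aligned_to(uint64_t addr, unsigned size)
    {
        return (addr & (uint64_t)(size - 1)) == 0;   /* size must be a power of two */
    }

    int main(void)
    {
        uint64_t a1 = 0x1000, a2 = 0x1010;
        unsigned size = 16;

        if (!aligned_to(a1, size) || !aligned_to(a2, size)) {
            fprintf(stderr, "unaligned operand\n");  /* the guest would take an exception */
            return 1;
        }
        puts("operands aligned; later accesses need no per-access alignment flag");
        return 0;
    }
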
1
Eliminate the CONFIG_USER_ONLY specialization.
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
1
Reviewed-by: David Hildenbrand <david@redhat.com>
4
Reviewed-by: David Hildenbrand <david@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/s390x/translate.c | 7 +------
7
target/s390x/tcg/mem_helper.c | 8 +-------
5
1 file changed, 1 insertion(+), 6 deletions(-)
8
1 file changed, 1 insertion(+), 7 deletions(-)
6
9
7
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
10
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
8
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
9
--- a/target/s390x/translate.c
12
--- a/target/s390x/tcg/mem_helper.c
10
+++ b/target/s390x/translate.c
13
+++ b/target/s390x/tcg/mem_helper.c
11
@@ -XXX,XX +XXX,XX @@ static bool use_goto_tb(DisasContext *s, uint64_t dest)
14
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
12
if (unlikely(use_exit_tb(s))) {
15
uint32_t ov;
13
return false;
16
14
}
17
if (parallel) {
15
-#ifndef CONFIG_USER_ONLY
18
-#ifdef CONFIG_USER_ONLY
16
- return (dest & TARGET_PAGE_MASK) == (s->base.tb->pc & TARGET_PAGE_MASK) ||
19
- uint32_t *haddr = g2h(env_cpu(env), a1);
17
- (dest & TARGET_PAGE_MASK) == (s->base.pc_next & TARGET_PAGE_MASK);
20
- ov = qatomic_cmpxchg__nocheck(haddr, cv, nv);
18
-#else
21
-#else
19
- return true;
22
- MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mem_idx);
23
- ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi, ra);
20
-#endif
24
-#endif
21
+ return translator_use_goto_tb(&s->base, dest);
25
+ ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi4, ra);
22
}
26
} else {
23
27
ov = cpu_ldl_mmu(env, a1, oi4, ra);
24
static void account_noninline_branch(DisasContext *s, int cc_op)
28
cpu_stl_mmu(env, a1, (ov == cv ? nv : ov), oi4, ra);
25
--
29
--
26
2.25.1
30
2.34.1
27
31
28
32
1
Just use translator_use_goto_tb directly at the one call site,
1
Atomic load/store of 128-bit quantities is now handled
2
rather than maintaining a local wrapper.
2
by cpu_{ld,st}16_mmu.
3
3
4
Reviewed-by: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
target/tricore/translate.c | 17 ++---------------
7
accel/tcg/atomic_template.h | 61 +++--------------------------------
8
1 file changed, 2 insertions(+), 15 deletions(-)
8
include/exec/cpu_ldst.h | 9 ------
9
accel/tcg/atomic_common.c.inc | 14 --------
10
3 files changed, 4 insertions(+), 80 deletions(-)
9
11
10
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
12
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
11
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
12
--- a/target/tricore/translate.c
14
--- a/accel/tcg/atomic_template.h
13
+++ b/target/tricore/translate.c
15
+++ b/accel/tcg/atomic_template.h
14
@@ -XXX,XX +XXX,XX @@ static inline void gen_save_pc(target_ulong pc)
16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
15
tcg_gen_movi_tl(cpu_PC, pc);
17
return ret;
16
}
18
}
17
19
18
-static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
20
-#if DATA_SIZE >= 16
21
-#if HAVE_ATOMIC128
22
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
23
- MemOpIdx oi, uintptr_t retaddr)
19
-{
24
-{
20
- if (unlikely(ctx->base.singlestep_enabled)) {
25
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
21
- return false;
26
- PAGE_READ, retaddr);
22
- }
27
- DATA_TYPE val;
23
-
28
-
24
-#ifndef CONFIG_USER_ONLY
29
- val = atomic16_read(haddr);
25
- return (ctx->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
30
- ATOMIC_MMU_CLEANUP;
26
-#else
31
- atomic_trace_ld_post(env, addr, oi);
27
- return true;
32
- return val;
28
-#endif
29
-}
33
-}
30
-
34
-
31
static void generate_qemu_excp(DisasContext *ctx, int excp)
35
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
36
- MemOpIdx oi, uintptr_t retaddr)
37
-{
38
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
39
- PAGE_WRITE, retaddr);
40
-
41
- atomic16_set(haddr, val);
42
- ATOMIC_MMU_CLEANUP;
43
- atomic_trace_st_post(env, addr, oi);
44
-}
45
-#endif
46
-#else
47
+#if DATA_SIZE < 16
48
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
49
MemOpIdx oi, uintptr_t retaddr)
32
{
50
{
33
TCGv_i32 tmp = tcg_const_i32(excp);
51
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(smax_fetch, MAX, SDATA_TYPE, new)
34
@@ -XXX,XX +XXX,XX @@ static void generate_qemu_excp(DisasContext *ctx, int excp)
52
GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new)
35
tcg_temp_free(tmp);
53
54
#undef GEN_ATOMIC_HELPER_FN
55
-#endif /* DATA SIZE >= 16 */
56
+#endif /* DATA SIZE < 16 */
57
58
#undef END
59
60
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
61
return BSWAP(ret);
36
}
62
}
37
63
38
-static inline void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
64
-#if DATA_SIZE >= 16
39
+static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
65
-#if HAVE_ATOMIC128
66
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
67
- MemOpIdx oi, uintptr_t retaddr)
68
-{
69
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
70
- PAGE_READ, retaddr);
71
- DATA_TYPE val;
72
-
73
- val = atomic16_read(haddr);
74
- ATOMIC_MMU_CLEANUP;
75
- atomic_trace_ld_post(env, addr, oi);
76
- return BSWAP(val);
77
-}
78
-
79
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
80
- MemOpIdx oi, uintptr_t retaddr)
81
-{
82
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
83
- PAGE_WRITE, retaddr);
84
-
85
- val = BSWAP(val);
86
- atomic16_set(haddr, val);
87
- ATOMIC_MMU_CLEANUP;
88
- atomic_trace_st_post(env, addr, oi);
89
-}
90
-#endif
91
-#else
92
+#if DATA_SIZE < 16
93
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
94
MemOpIdx oi, uintptr_t retaddr)
40
{
95
{
41
- if (use_goto_tb(ctx, dest)) {
96
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
42
+ if (translator_use_goto_tb(&ctx->base, dest)) {
97
#undef ADD
43
tcg_gen_goto_tb(n);
98
44
gen_save_pc(dest);
99
#undef GEN_ATOMIC_HELPER_FN
45
tcg_gen_exit_tb(ctx->base.tb, n);
100
-#endif /* DATA_SIZE >= 16 */
101
+#endif /* DATA_SIZE < 16 */
102
103
#undef END
104
#endif /* DATA_SIZE > 1 */
105
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
106
index XXXXXXX..XXXXXXX 100644
107
--- a/include/exec/cpu_ldst.h
108
+++ b/include/exec/cpu_ldst.h
109
@@ -XXX,XX +XXX,XX @@ Int128 cpu_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr,
110
Int128 cmpv, Int128 newv,
111
MemOpIdx oi, uintptr_t retaddr);
112
113
-Int128 cpu_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr,
114
- MemOpIdx oi, uintptr_t retaddr);
115
-Int128 cpu_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr,
116
- MemOpIdx oi, uintptr_t retaddr);
117
-void cpu_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
118
- MemOpIdx oi, uintptr_t retaddr);
119
-void cpu_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
120
- MemOpIdx oi, uintptr_t retaddr);
121
-
122
#if defined(CONFIG_USER_ONLY)
123
124
extern __thread uintptr_t helper_retaddr;
125
diff --git a/accel/tcg/atomic_common.c.inc b/accel/tcg/atomic_common.c.inc
126
index XXXXXXX..XXXXXXX 100644
127
--- a/accel/tcg/atomic_common.c.inc
128
+++ b/accel/tcg/atomic_common.c.inc
129
@@ -XXX,XX +XXX,XX @@ static void atomic_trace_rmw_post(CPUArchState *env, uint64_t addr,
130
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_RW);
131
}
132
133
-#if HAVE_ATOMIC128
134
-static void atomic_trace_ld_post(CPUArchState *env, uint64_t addr,
135
- MemOpIdx oi)
136
-{
137
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
138
-}
139
-
140
-static void atomic_trace_st_post(CPUArchState *env, uint64_t addr,
141
- MemOpIdx oi)
142
-{
143
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
144
-}
145
-#endif
146
-
147
/*
148
* Atomic helpers callable from TCG.
149
* These have a common interface and all defer to cpu_atomic_*
46
--
150
--
47
2.25.1
151
2.34.1
48
152
49
153
1
From: Liren Wei <lrwei@bupt.edu.cn>
1
Now that load/store are gone, we're always passing
2
2
PAGE_READ | PAGE_WRITE for RMW atomic operations.
3
TranslationBlocks not inserted into the corresponding region
3
4
tree shall be regarded as partially initialized objects, and
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
need to be finalized first before being inserted into QHT.
6
7
Signed-off-by: Liren Wei <lrwei@bupt.edu.cn>
8
Message-Id: <f9fc263f71e11b6308d8c1fbc0dd366bf4aeb532.1625404483.git.lrwei@bupt.edu.cn>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
6
---
11
accel/tcg/translate-all.c | 9 ++++++++-
7
accel/tcg/atomic_template.h | 32 ++++++--------
12
1 file changed, 8 insertions(+), 1 deletion(-)
8
accel/tcg/cputlb.c | 85 ++++++++++++++-----------------------
13
9
accel/tcg/user-exec.c | 8 +---
14
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
10
3 files changed, 45 insertions(+), 80 deletions(-)
11
12
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
15
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
16
--- a/accel/tcg/translate-all.c
14
--- a/accel/tcg/atomic_template.h
17
+++ b/accel/tcg/translate-all.c
15
+++ b/accel/tcg/atomic_template.h
18
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
19
return tb;
17
ABI_TYPE cmpv, ABI_TYPE newv,
20
}
18
MemOpIdx oi, uintptr_t retaddr)
19
{
20
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
21
- PAGE_READ | PAGE_WRITE, retaddr);
22
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
23
DATA_TYPE ret;
24
25
#if DATA_SIZE == 16
26
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
27
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
28
MemOpIdx oi, uintptr_t retaddr)
29
{
30
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
31
- PAGE_READ | PAGE_WRITE, retaddr);
32
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
33
DATA_TYPE ret;
34
35
ret = qatomic_xchg__nocheck(haddr, val);
36
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
37
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
38
ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
39
{ \
40
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
41
- PAGE_READ | PAGE_WRITE, retaddr); \
42
- DATA_TYPE ret; \
43
+ DATA_TYPE *haddr, ret; \
44
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
45
ret = qatomic_##X(haddr, val); \
46
ATOMIC_MMU_CLEANUP; \
47
atomic_trace_rmw_post(env, addr, oi); \
48
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER(xor_fetch)
49
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
50
ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
51
{ \
52
- XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
53
- PAGE_READ | PAGE_WRITE, retaddr); \
54
- XDATA_TYPE cmp, old, new, val = xval; \
55
+ XDATA_TYPE *haddr, cmp, old, new, val = xval; \
56
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
57
smp_mb(); \
58
cmp = qatomic_read__nocheck(haddr); \
59
do { \
60
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
61
ABI_TYPE cmpv, ABI_TYPE newv,
62
MemOpIdx oi, uintptr_t retaddr)
63
{
64
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
65
- PAGE_READ | PAGE_WRITE, retaddr);
66
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
67
DATA_TYPE ret;
68
69
#if DATA_SIZE == 16
70
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
71
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
72
MemOpIdx oi, uintptr_t retaddr)
73
{
74
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
75
- PAGE_READ | PAGE_WRITE, retaddr);
76
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
77
ABI_TYPE ret;
78
79
ret = qatomic_xchg__nocheck(haddr, BSWAP(val));
80
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
81
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
82
ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
83
{ \
84
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
85
- PAGE_READ | PAGE_WRITE, retaddr); \
86
- DATA_TYPE ret; \
87
+ DATA_TYPE *haddr, ret; \
88
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
89
ret = qatomic_##X(haddr, BSWAP(val)); \
90
ATOMIC_MMU_CLEANUP; \
91
atomic_trace_rmw_post(env, addr, oi); \
92
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER(xor_fetch)
93
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
94
ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
95
{ \
96
- XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
97
- PAGE_READ | PAGE_WRITE, retaddr); \
98
- XDATA_TYPE ldo, ldn, old, new, val = xval; \
99
+ XDATA_TYPE *haddr, ldo, ldn, old, new, val = xval; \
100
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
101
smp_mb(); \
102
ldn = qatomic_read__nocheck(haddr); \
103
do { \
104
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
105
index XXXXXXX..XXXXXXX 100644
106
--- a/accel/tcg/cputlb.c
107
+++ b/accel/tcg/cputlb.c
108
@@ -XXX,XX +XXX,XX @@ static bool mmu_lookup(CPUArchState *env, target_ulong addr, MemOpIdx oi,
109
/*
110
* Probe for an atomic operation. Do not allow unaligned operations,
111
* or io operations to proceed. Return the host address.
112
- *
113
- * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
114
*/
115
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
116
- MemOpIdx oi, int size, int prot,
117
- uintptr_t retaddr)
118
+ MemOpIdx oi, int size, uintptr_t retaddr)
119
{
120
uintptr_t mmu_idx = get_mmuidx(oi);
121
MemOp mop = get_memop(oi);
122
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
123
tlbe = tlb_entry(env, mmu_idx, addr);
124
125
/* Check TLB entry and enforce page permissions. */
126
- if (prot & PAGE_WRITE) {
127
- tlb_addr = tlb_addr_write(tlbe);
128
- if (!tlb_hit(tlb_addr, addr)) {
129
- if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_STORE,
130
- addr & TARGET_PAGE_MASK)) {
131
- tlb_fill(env_cpu(env), addr, size,
132
- MMU_DATA_STORE, mmu_idx, retaddr);
133
- index = tlb_index(env, mmu_idx, addr);
134
- tlbe = tlb_entry(env, mmu_idx, addr);
135
- }
136
- tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
137
- }
138
-
139
- if (prot & PAGE_READ) {
140
- /*
141
- * Let the guest notice RMW on a write-only page.
142
- * We have just verified that the page is writable.
143
- * Subpage lookups may have left TLB_INVALID_MASK set,
144
- * but addr_read will only be -1 if PAGE_READ was unset.
145
- */
146
- if (unlikely(tlbe->addr_read == -1)) {
147
- tlb_fill(env_cpu(env), addr, size,
148
- MMU_DATA_LOAD, mmu_idx, retaddr);
149
- /*
150
- * Since we don't support reads and writes to different
151
- * addresses, and we do have the proper page loaded for
152
- * write, this shouldn't ever return. But just in case,
153
- * handle via stop-the-world.
154
- */
155
- goto stop_the_world;
156
- }
157
- /* Collect TLB_WATCHPOINT for read. */
158
- tlb_addr |= tlbe->addr_read;
159
- }
160
- } else /* if (prot & PAGE_READ) */ {
161
- tlb_addr = tlbe->addr_read;
162
- if (!tlb_hit(tlb_addr, addr)) {
163
- if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_LOAD,
164
- addr & TARGET_PAGE_MASK)) {
165
- tlb_fill(env_cpu(env), addr, size,
166
- MMU_DATA_LOAD, mmu_idx, retaddr);
167
- index = tlb_index(env, mmu_idx, addr);
168
- tlbe = tlb_entry(env, mmu_idx, addr);
169
- }
170
- tlb_addr = tlbe->addr_read & ~TLB_INVALID_MASK;
171
+ tlb_addr = tlb_addr_write(tlbe);
172
+ if (!tlb_hit(tlb_addr, addr)) {
173
+ if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_STORE,
174
+ addr & TARGET_PAGE_MASK)) {
175
+ tlb_fill(env_cpu(env), addr, size,
176
+ MMU_DATA_STORE, mmu_idx, retaddr);
177
+ index = tlb_index(env, mmu_idx, addr);
178
+ tlbe = tlb_entry(env, mmu_idx, addr);
179
}
180
+ tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
181
}
21
182
22
+ /*
183
+ /*
23
+ * Insert TB into the corresponding region tree before publishing it
184
+ * Let the guest notice RMW on a write-only page.
24
+ * through QHT. Otherwise rewinding happened in the TB might fail to
185
+ * We have just verified that the page is writable.
25
+ * lookup itself using host PC.
186
+ * Subpage lookups may have left TLB_INVALID_MASK set,
187
+ * but addr_read will only be -1 if PAGE_READ was unset.
26
+ */
188
+ */
27
+ tcg_tb_insert(tb);
189
+ if (unlikely(tlbe->addr_read == -1)) {
190
+ tlb_fill(env_cpu(env), addr, size, MMU_DATA_LOAD, mmu_idx, retaddr);
191
+ /*
192
+ * Since we don't support reads and writes to different
193
+ * addresses, and we do have the proper page loaded for
194
+ * write, this shouldn't ever return. But just in case,
195
+ * handle via stop-the-world.
196
+ */
197
+ goto stop_the_world;
198
+ }
199
+ /* Collect TLB_WATCHPOINT for read. */
200
+ tlb_addr |= tlbe->addr_read;
28
+
201
+
29
/* check next page if needed */
202
/* Notice an IO access or a needs-MMU-lookup access */
30
virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
203
if (unlikely(tlb_addr & (TLB_MMIO | TLB_DISCARD_WRITE))) {
31
phys_page2 = -1;
204
/* There's really nothing that can be done to
32
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
205
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
33
orig_aligned -= ROUND_UP(sizeof(*tb), qemu_icache_linesize);
206
}
34
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)orig_aligned);
207
35
tb_destroy(tb);
208
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
36
+ tcg_tb_remove(tb);
209
- QEMU_BUILD_BUG_ON(PAGE_READ != BP_MEM_READ);
37
return existing_tb;
210
- QEMU_BUILD_BUG_ON(PAGE_WRITE != BP_MEM_WRITE);
38
}
211
- /* therefore prot == watchpoint bits */
39
- tcg_tb_insert(tb);
212
- cpu_check_watchpoint(env_cpu(env), addr, size,
40
return tb;
213
- full->attrs, prot, retaddr);
41
}
214
+ cpu_check_watchpoint(env_cpu(env), addr, size, full->attrs,
42
215
+ BP_MEM_READ | BP_MEM_WRITE, retaddr);
216
}
217
218
return hostaddr;
219
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
220
index XXXXXXX..XXXXXXX 100644
221
--- a/accel/tcg/user-exec.c
222
+++ b/accel/tcg/user-exec.c
223
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
224
225
/*
226
* Do not allow unaligned operations to proceed. Return the host address.
227
- *
228
- * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
229
*/
230
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
231
- MemOpIdx oi, int size, int prot,
232
- uintptr_t retaddr)
233
+ MemOpIdx oi, int size, uintptr_t retaddr)
234
{
235
MemOp mop = get_memop(oi);
236
int a_bits = get_alignment_bits(mop);
237
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
238
239
/* Enforce guest required alignment. */
240
if (unlikely(addr & ((1 << a_bits) - 1))) {
241
- MMUAccessType t = prot == PAGE_READ ? MMU_DATA_LOAD : MMU_DATA_STORE;
242
- cpu_loop_exit_sigbus(env_cpu(env), addr, t, retaddr);
243
+ cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, retaddr);
244
}
245
246
/* Enforce qemu required alignment. */
43
--
247
--
44
2.25.1
248
2.34.1
45
249
46
250
1
Using gen_goto_tb directly misses the single-step check.
1
These symbols will shortly become dynamic runtime tests and
2
Let the branch or debug exception be emitted by arm_tr_tb_stop.
2
are therefore not appropriate for the preprocessor. Use the
3
matching CONFIG_* symbols for that purpose.
3
4
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
target/arm/translate.c | 4 ++--
8
host/include/aarch64/host/atomic128-cas.h | 2 ++
8
1 file changed, 2 insertions(+), 2 deletions(-)
9
host/include/generic/host/atomic128-ldst.h | 2 +-
10
accel/tcg/cputlb.c | 2 +-
11
accel/tcg/user-exec.c | 2 +-
12
4 files changed, 5 insertions(+), 3 deletions(-)
9
13
10
diff --git a/target/arm/translate.c b/target/arm/translate.c
14
diff --git a/host/include/aarch64/host/atomic128-cas.h b/host/include/aarch64/host/atomic128-cas.h
11
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
12
--- a/target/arm/translate.c
16
--- a/host/include/aarch64/host/atomic128-cas.h
13
+++ b/target/arm/translate.c
17
+++ b/host/include/aarch64/host/atomic128-cas.h
14
@@ -XXX,XX +XXX,XX @@ static bool trans_ISB(DisasContext *s, arg_ISB *a)
18
@@ -XXX,XX +XXX,XX @@ static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
15
* self-modifying code correctly and also to take
19
16
* any pending interrupts immediately.
20
return int128_make128(oldl, oldh);
17
*/
18
- gen_goto_tb(s, 0, s->base.pc_next);
19
+ s->base.is_jmp = DISAS_TOO_MANY;
20
return true;
21
}
21
}
22
22
+
23
@@ -XXX,XX +XXX,XX @@ static bool trans_SB(DisasContext *s, arg_SB *a)
23
+# define CONFIG_CMPXCHG128 1
24
* for TCG; MB and end the TB instead.
24
# define HAVE_CMPXCHG128 1
25
*/
25
#endif
26
tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
26
27
- gen_goto_tb(s, 0, s->base.pc_next);
27
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
28
+ s->base.is_jmp = DISAS_TOO_MANY;
28
index XXXXXXX..XXXXXXX 100644
29
return true;
29
--- a/host/include/generic/host/atomic128-ldst.h
30
+++ b/host/include/generic/host/atomic128-ldst.h
31
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
30
}
32
}
31
33
34
# define HAVE_ATOMIC128 1
35
-#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
36
+#elif defined(CONFIG_CMPXCHG128) && !defined(CONFIG_USER_ONLY)
37
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
38
atomic16_read(Int128 *ptr)
39
{
40
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
41
index XXXXXXX..XXXXXXX 100644
42
--- a/accel/tcg/cputlb.c
43
+++ b/accel/tcg/cputlb.c
44
@@ -XXX,XX +XXX,XX @@ void cpu_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
45
#include "atomic_template.h"
46
#endif
47
48
-#if HAVE_CMPXCHG128 || HAVE_ATOMIC128
49
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
50
#define DATA_SIZE 16
51
#include "atomic_template.h"
52
#endif
53
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
54
index XXXXXXX..XXXXXXX 100644
55
--- a/accel/tcg/user-exec.c
56
+++ b/accel/tcg/user-exec.c
57
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
58
#include "atomic_template.h"
59
#endif
60
61
-#if HAVE_ATOMIC128 || HAVE_CMPXCHG128
62
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
63
#define DATA_SIZE 16
64
#include "atomic_template.h"
65
#endif
32
--
66
--
33
2.25.1
67
2.34.1
34
68
35
69
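
The compile-time/runtime split above follows a common pattern: a CONFIG_-style macro
answers "can the compiler emit it" and stays usable in #if, while the HAVE_-style test
becomes an ordinary runtime value. A standalone sketch with made-up names
(CONFIG_TOY_INT128 and have_toy_feature are illustrative, not QEMU symbols):

    #include <stdbool.h>
    #include <stdio.h>

    /* Compile-time capability: fine for the preprocessor. */
    #if defined(__SIZEOF_INT128__)
    # define CONFIG_TOY_INT128 1
    #else
    # define CONFIG_TOY_INT128 0
    #endif

    /* Runtime capability: only known after probing the host CPU, so a variable. */
    static bool have_toy_feature = true;   /* a startup probe would set this */

    int main(void)
    {
        if (CONFIG_TOY_INT128 && have_toy_feature) {
            puts("fast path: compiled in and present on this CPU");
        } else {
            puts("generic fallback");
        }
        return 0;
    }
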
1
We have not needed to end a TB for I/O since ba3e7926691
1
Create both atomic16_read_ro and atomic16_read_rw.
2
("icount: clean up cpu_can_io at the entry to the block"),
2
Previously we pretended that we had atomic16_read in system mode,
3
and gdbstub singlestep is handled by the generic function.
3
because we "know" that all ram is always writable to the host.
4
Now, expose read-only and read-write versions all of the time.
4
5
5
Drop the unused 'n' argument to use_goto_tb.
6
For aarch64, do not fall back to __atomic_load_16 even if
7
supported by the compiler, to work around a clang bug.
6
8
7
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
9
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
11
---
10
target/arm/translate-a64.c | 25 +++++--------------------
12
host/include/aarch64/host/atomic128-ldst.h | 21 ++++++++-------
11
1 file changed, 5 insertions(+), 20 deletions(-)
13
host/include/generic/host/atomic128-ldst.h | 31 ++++++++++++++++------
14
target/s390x/tcg/mem_helper.c | 2 +-
15
3 files changed, 36 insertions(+), 18 deletions(-)
12
16
13
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
17
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
14
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
15
--- a/target/arm/translate-a64.c
19
--- a/host/include/aarch64/host/atomic128-ldst.h
16
+++ b/target/arm/translate-a64.c
20
+++ b/host/include/aarch64/host/atomic128-ldst.h
17
@@ -XXX,XX +XXX,XX @@ static void gen_step_complete_exception(DisasContext *s)
21
@@ -XXX,XX +XXX,XX @@
18
s->base.is_jmp = DISAS_NORETURN;
22
#ifndef AARCH64_ATOMIC128_LDST_H
23
#define AARCH64_ATOMIC128_LDST_H
24
25
-/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
26
-#if !defined(CONFIG_ATOMIC128) && !defined(CONFIG_USER_ONLY)
27
-/* We can do better than cmpxchg for AArch64. */
28
-static inline Int128 atomic16_read(Int128 *ptr)
29
+/*
30
+ * Through gcc 10, aarch64 has no support for 128-bit atomics.
31
+ * Through clang 16, without -march=armv8.4-a, __atomic_load_16
32
+ * is incorrectly expanded to a read-write operation.
33
+ */
34
+
35
+#define HAVE_ATOMIC128_RO 0
36
+#define HAVE_ATOMIC128_RW 1
37
+
38
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
39
+
40
+static inline Int128 atomic16_read_rw(Int128 *ptr)
41
{
42
uint64_t l, h;
43
uint32_t tmp;
44
@@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val)
45
: [l] "r"(l), [h] "r"(h));
19
}
46
}
20
47
21
-static inline bool use_goto_tb(DisasContext *s, int n, uint64_t dest)
48
-# define HAVE_ATOMIC128 1
22
+static inline bool use_goto_tb(DisasContext *s, uint64_t dest)
49
-#else
23
{
50
-#include "host/include/generic/host/atomic128-ldst.h"
24
- /* No direct tb linking with singlestep (either QEMU's or the ARM
25
- * debug architecture kind) or deterministic io
26
- */
27
- if (s->base.singlestep_enabled || s->ss_active ||
28
- (tb_cflags(s->base.tb) & CF_LAST_IO)) {
29
+ if (s->ss_active) {
30
return false;
31
}
32
-
33
-#ifndef CONFIG_USER_ONLY
34
- /* Only link tbs from inside the same guest page */
35
- if ((s->base.tb->pc & TARGET_PAGE_MASK) != (dest & TARGET_PAGE_MASK)) {
36
- return false;
37
- }
38
-#endif
51
-#endif
39
-
52
-
40
- return true;
53
#endif /* AARCH64_ATOMIC128_LDST_H */
41
+ return translator_use_goto_tb(&s->base, dest);
54
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
55
index XXXXXXX..XXXXXXX 100644
56
--- a/host/include/generic/host/atomic128-ldst.h
57
+++ b/host/include/generic/host/atomic128-ldst.h
58
@@ -XXX,XX +XXX,XX @@
59
#define HOST_ATOMIC128_LDST_H
60
61
#if defined(CONFIG_ATOMIC128)
62
+# define HAVE_ATOMIC128_RO 1
63
+# define HAVE_ATOMIC128_RW 1
64
+
65
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
66
-atomic16_read(Int128 *ptr)
67
+atomic16_read_ro(const Int128 *ptr)
68
{
69
- __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
70
+ const __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
71
Int128Alias r;
72
73
r.i = qatomic_read__nocheck(ptr_align);
74
return r.s;
42
}
75
}
43
76
44
static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
77
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
78
+atomic16_read_rw(Int128 *ptr)
79
+{
80
+ return atomic16_read_ro(ptr);
81
+}
82
+
83
static inline void ATTRIBUTE_ATOMIC128_OPT
84
atomic16_set(Int128 *ptr, Int128 val)
45
{
85
{
46
- const TranslationBlock *tb;
86
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
47
-
87
qatomic_set__nocheck(ptr_align, v.i);
48
- tb = s->base.tb;
88
}
49
- if (use_goto_tb(s, n, dest)) {
89
50
+ if (use_goto_tb(s, dest)) {
90
-# define HAVE_ATOMIC128 1
51
tcg_gen_goto_tb(n);
91
-#elif defined(CONFIG_CMPXCHG128) && !defined(CONFIG_USER_ONLY)
52
gen_a64_set_pc_im(dest);
92
+#elif defined(CONFIG_CMPXCHG128)
53
- tcg_gen_exit_tb(tb, n);
93
+# define HAVE_ATOMIC128_RO 0
54
+ tcg_gen_exit_tb(s->base.tb, n);
94
+# define HAVE_ATOMIC128_RW 1
55
s->base.is_jmp = DISAS_NORETURN;
95
+
56
} else {
96
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
57
gen_a64_set_pc_im(dest);
97
+
98
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
99
-atomic16_read(Int128 *ptr)
100
+atomic16_read_rw(Int128 *ptr)
101
{
102
/* Maybe replace 0 with 0, returning the old value. */
103
Int128 z = int128_make64(0);
104
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
105
} while (int128_ne(old, cmp));
106
}
107
108
-# define HAVE_ATOMIC128 1
109
#else
110
+# define HAVE_ATOMIC128_RO 0
111
+# define HAVE_ATOMIC128_RW 0
112
+
113
/* Fallback definitions that must be optimized away, or error. */
114
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
115
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
116
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_rw(Int128 *ptr);
117
void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
118
-# define HAVE_ATOMIC128 0
119
#endif
120
121
#endif /* HOST_ATOMIC128_LDST_H */
122
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
123
index XXXXXXX..XXXXXXX 100644
124
--- a/target/s390x/tcg/mem_helper.c
125
+++ b/target/s390x/tcg/mem_helper.c
126
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
127
max = 3;
128
#endif
129
if ((HAVE_CMPXCHG128 ? 0 : fc + 2 > max) ||
130
- (HAVE_ATOMIC128 ? 0 : sc > max)) {
131
+ (HAVE_ATOMIC128_RW ? 0 : sc > max)) {
132
cpu_loop_exit_atomic(env_cpu(env), ra);
133
}
134
}
58
--
135
--
59
2.25.1
136
2.34.1
60
137
61
138
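
The read-only/read-write split above exists because the cmpxchg fallback implements a
16-byte "load" by swapping the old value with itself, which is a store as far as page
permissions are concerned. A standalone sketch of that fallback using only GCC/Clang
builtins (not the QEMU code; x86-64 hosts need -mcx16):

    /* build: cc -O2 -mcx16 read16.c && ./a.out */
    #include <stdio.h>

    typedef unsigned __int128 u128;

    /* "Load" by compare-and-swap of 0 with 0: returns the old value, but the
       CPU still performs a write, so the page must be mapped writable. */
    static u128 read16_rw(u128 *p)
    {
        return __sync_val_compare_and_swap_16(p, 0, 0);
    }

    int main(void)
    {
        static u128 slot __attribute__((aligned(16)));
        slot = ((u128)1 << 64) | 2;
        u128 v = read16_rw(&slot);
        printf("%llu %llu\n", (unsigned long long)(v >> 64), (unsigned long long)v);
        return 0;
    }

A genuinely read-only variant has to come from a real 128-bit load instruction, which is
what the separate atomic16_read_ro entry point is reserved for.
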
1
From: Liren Wei <lrwei@bupt.edu.cn>
1
Remove the locally defined load_atomic16 and store_atomic16,
2
2
along with HAVE_al16 and HAVE_al16_fast in favor of the
3
The function is called only at tcg_gen_code() when duplicated TBs
3
routines defined in atomic128.h.
4
are translated by different threads, and when the tcg_region_tree
4
5
is reset. Bake it into the underlying GTree as its value destroy
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
function to unite these situations.
7
Also remove tcg_region_tree_traverse() which now becomes useless.
8
9
Signed-off-by: Liren Wei <lrwei@bupt.edu.cn>
10
Message-Id: <8dc352f08d038c4e7a1f5f56962398cdc700c3aa.1625404483.git.lrwei@bupt.edu.cn>
11
[rth: Name the new tb_tc_cmp parameter correctly.]
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
7
---
14
include/tcg/tcg.h | 1 -
8
accel/tcg/cputlb.c | 2 +-
15
accel/tcg/translate-all.c | 6 ------
9
accel/tcg/ldst_atomicity.c.inc | 118 +++++++--------------------------
16
tcg/region.c | 19 ++++++++-----------
10
2 files changed, 24 insertions(+), 96 deletions(-)
17
3 files changed, 8 insertions(+), 18 deletions(-)
11
18
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
19
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
20
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
21
--- a/include/tcg/tcg.h
14
--- a/accel/tcg/cputlb.c
22
+++ b/include/tcg/tcg.h
15
+++ b/accel/tcg/cputlb.c
23
@@ -XXX,XX +XXX,XX @@ void *tcg_malloc_internal(TCGContext *s, int size);
16
@@ -XXX,XX +XXX,XX @@ static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p,
24
void tcg_pool_reset(TCGContext *s);
17
25
TranslationBlock *tcg_tb_alloc(TCGContext *s);
18
case MO_ATOM_WITHIN16_PAIR:
26
19
/* Since size > 8, this is the half that must be atomic. */
27
-void tb_destroy(TranslationBlock *tb);
20
- if (!HAVE_al16) {
28
void tcg_region_reset_all(void);
21
+ if (!HAVE_ATOMIC128_RW) {
29
22
cpu_loop_exit_atomic(env_cpu(env), ra);
30
size_t tcg_code_size(void);
23
}
31
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
24
return store_whole_le16(p->haddr, p->size, val_le);
25
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
32
index XXXXXXX..XXXXXXX 100644
26
index XXXXXXX..XXXXXXX 100644
33
--- a/accel/tcg/translate-all.c
27
--- a/accel/tcg/ldst_atomicity.c.inc
34
+++ b/accel/tcg/translate-all.c
28
+++ b/accel/tcg/ldst_atomicity.c.inc
35
@@ -XXX,XX +XXX,XX @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
29
@@ -XXX,XX +XXX,XX @@
36
return 0;
30
#endif
31
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
32
33
-#if defined(CONFIG_ATOMIC128)
34
-# define HAVE_al16_fast true
35
-#else
36
-# define HAVE_al16_fast false
37
-#endif
38
-#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
39
-# define HAVE_al16 true
40
-#else
41
-# define HAVE_al16 false
42
-#endif
43
-
44
-
45
/**
46
* required_atomicity:
47
*
48
@@ -XXX,XX +XXX,XX @@ static inline uint64_t load_atomic8(void *pv)
49
return qatomic_read__nocheck(p);
37
}
50
}
38
51
39
-void tb_destroy(TranslationBlock *tb)
52
-/**
53
- * load_atomic16:
54
- * @pv: host address
55
- *
56
- * Atomically load 16 aligned bytes from @pv.
57
- */
58
-static inline Int128 ATTRIBUTE_ATOMIC128_OPT
59
-load_atomic16(void *pv)
40
-{
60
-{
41
- qemu_spin_destroy(&tb->jmp_lock);
61
-#ifdef CONFIG_ATOMIC128
62
- __uint128_t *p = __builtin_assume_aligned(pv, 16);
63
- Int128Alias r;
64
-
65
- r.u = qatomic_read__nocheck(p);
66
- return r.s;
67
-#else
68
- qemu_build_not_reached();
69
-#endif
42
-}
70
-}
43
-
71
-
44
bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
72
/**
73
* load_atomic8_or_exit:
74
* @env: cpu context
75
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
45
{
76
{
46
/*
77
Int128 *p = __builtin_assume_aligned(pv, 16);
47
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
78
48
79
- if (HAVE_al16_fast) {
49
orig_aligned -= ROUND_UP(sizeof(*tb), qemu_icache_linesize);
80
- return load_atomic16(p);
50
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)orig_aligned);
81
+ if (HAVE_ATOMIC128_RO) {
51
- tb_destroy(tb);
82
+ return atomic16_read_ro(p);
52
tcg_tb_remove(tb);
83
}
53
return existing_tb;
84
54
}
85
#ifdef CONFIG_USER_ONLY
55
diff --git a/tcg/region.c b/tcg/region.c
86
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
56
index XXXXXXX..XXXXXXX 100644
87
* In system mode all guest pages are writable, and for user-only
57
--- a/tcg/region.c
88
* we have just checked writability. Try cmpxchg.
58
+++ b/tcg/region.c
89
*/
59
@@ -XXX,XX +XXX,XX @@ static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
90
-#if defined(CONFIG_CMPXCHG128)
60
return 0;
91
- /* Swap 0 with 0, with the side-effect of returning the old value. */
92
- {
93
- Int128Alias r;
94
- r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
95
- return r.s;
96
+ if (HAVE_ATOMIC128_RW) {
97
+ return atomic16_read_rw(p);
98
}
99
-#endif
100
101
/* Ultimate fallback: re-execute in serial context. */
102
cpu_loop_exit_atomic(env_cpu(env), ra);
103
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
104
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
105
load_atom_extract_al16_or_al8(void *pv, int s)
106
{
107
-#if defined(CONFIG_ATOMIC128)
108
uintptr_t pi = (uintptr_t)pv;
109
int o = pi & 7;
110
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
111
- __uint128_t r;
112
+ Int128 r;
113
114
pv = (void *)(pi & ~7);
115
if (pi & 8) {
116
@@ -XXX,XX +XXX,XX @@ load_atom_extract_al16_or_al8(void *pv, int s)
117
uint64_t b = qatomic_read__nocheck(p8 + 1);
118
119
if (HOST_BIG_ENDIAN) {
120
- r = ((__uint128_t)a << 64) | b;
121
+ r = int128_make128(b, a);
122
} else {
123
- r = ((__uint128_t)b << 64) | a;
124
+ r = int128_make128(a, b);
125
}
126
} else {
127
- __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
128
- r = qatomic_read__nocheck(p16);
129
+ r = atomic16_read_ro(pv);
130
}
131
- return r >> shr;
132
-#else
133
- qemu_build_not_reached();
134
-#endif
135
+ return int128_getlo(int128_urshift(r, shr));
61
}
136
}
62
137
63
-static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
138
/**
64
+static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp, gpointer userdata)
139
@@ -XXX,XX +XXX,XX @@ static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
65
{
140
if (likely((pi & 1) == 0)) {
66
const struct tb_tc *a = ap;
141
return load_atomic2(pv);
67
const struct tb_tc *b = bp;
142
}
68
@@ -XXX,XX +XXX,XX @@ static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
143
- if (HAVE_al16_fast) {
69
return ptr_cmp_tb_tc(b->ptr, a);
144
+ if (HAVE_ATOMIC128_RO) {
145
return load_atom_extract_al16_or_al8(pv, 2);
146
}
147
148
@@ -XXX,XX +XXX,XX @@ static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
149
if (likely((pi & 3) == 0)) {
150
return load_atomic4(pv);
151
}
152
- if (HAVE_al16_fast) {
153
+ if (HAVE_ATOMIC128_RO) {
154
return load_atom_extract_al16_or_al8(pv, 4);
155
}
156
157
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
158
if (HAVE_al8 && likely((pi & 7) == 0)) {
159
return load_atomic8(pv);
160
}
161
- if (HAVE_al16_fast) {
162
+ if (HAVE_ATOMIC128_RO) {
163
return load_atom_extract_al16_or_al8(pv, 8);
164
}
165
166
@@ -XXX,XX +XXX,XX @@ static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
167
* If the host does not support 16-byte atomics, wait until we have
168
* examined the atomicity parameters below.
169
*/
170
- if (HAVE_al16_fast && likely((pi & 15) == 0)) {
171
- return load_atomic16(pv);
172
+ if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
173
+ return atomic16_read_ro(pv);
174
}
175
176
atmax = required_atomicity(env, pi, memop);
177
@@ -XXX,XX +XXX,XX @@ static inline void store_atomic8(void *pv, uint64_t val)
178
qatomic_set__nocheck(p, val);
70
}
179
}
71
180
72
+static void tb_destroy(gpointer value)
181
-/**
73
+{
182
- * store_atomic16:
74
+ TranslationBlock *tb = value;
183
- * @pv: host address
75
+ qemu_spin_destroy(&tb->jmp_lock);
184
- * @val: value to store
76
+}
185
- *
77
+
186
- * Atomically store 16 aligned bytes to @pv.
78
static void tcg_region_trees_init(void)
187
- */
79
{
188
-static inline void ATTRIBUTE_ATOMIC128_OPT
80
size_t i;
189
-store_atomic16(void *pv, Int128Alias val)
81
@@ -XXX,XX +XXX,XX @@ static void tcg_region_trees_init(void)
82
struct tcg_region_tree *rt = region_trees + i * tree_size;
83
84
qemu_mutex_init(&rt->lock);
85
- rt->tree = g_tree_new(tb_tc_cmp);
86
+ rt->tree = g_tree_new_full(tb_tc_cmp, NULL, NULL, tb_destroy);
87
}
88
}
89
90
@@ -XXX,XX +XXX,XX @@ size_t tcg_nb_tbs(void)
91
return nb_tbs;
92
}
93
94
-static gboolean tcg_region_tree_traverse(gpointer k, gpointer v, gpointer data)
95
-{
190
-{
96
- TranslationBlock *tb = v;
191
-#if defined(CONFIG_ATOMIC128)
97
-
192
- __uint128_t *pu = __builtin_assume_aligned(pv, 16);
98
- tb_destroy(tb);
193
- qatomic_set__nocheck(pu, val.u);
99
- return FALSE;
194
-#elif defined(CONFIG_CMPXCHG128)
195
- __uint128_t *pu = __builtin_assume_aligned(pv, 16);
196
- __uint128_t o;
197
-
198
- /*
199
- * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
200
- * defer to libatomic, so we must use __sync_*_compare_and_swap_16
201
- * and accept the sequential consistency that comes with it.
202
- */
203
- do {
204
- o = *pu;
205
- } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
206
-#else
207
- qemu_build_not_reached();
208
-#endif
100
-}
209
-}
101
-
210
-
102
static void tcg_region_tree_reset_all(void)
211
/**
103
{
212
* store_atom_4x2
104
size_t i;
213
*/
105
@@ -XXX,XX +XXX,XX @@ static void tcg_region_tree_reset_all(void)
214
@@ -XXX,XX +XXX,XX @@ static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
106
for (i = 0; i < region.n; i++) {
215
int sh = o * 8;
107
struct tcg_region_tree *rt = region_trees + i * tree_size;
216
Int128 m, v;
108
217
109
- g_tree_foreach(rt->tree, tcg_region_tree_traverse, NULL);
218
- qemu_build_assert(HAVE_al16);
110
/* Increment the refcount first so that destroy acts as a reset */
219
+ qemu_build_assert(HAVE_ATOMIC128_RW);
111
g_tree_ref(rt->tree);
220
112
g_tree_destroy(rt->tree);
221
/* Like MAKE_64BIT_MASK(0, sz), but larger. */
222
if (sz <= 64) {
223
@@ -XXX,XX +XXX,XX @@ static void store_atom_2(CPUArchState *env, uintptr_t ra,
224
return;
225
}
226
} else if ((pi & 15) == 7) {
227
- if (HAVE_al16) {
228
+ if (HAVE_ATOMIC128_RW) {
229
Int128 v = int128_lshift(int128_make64(val), 56);
230
Int128 m = int128_lshift(int128_make64(0xffff), 56);
231
store_atom_insert_al16(pv - 7, v, m);
232
@@ -XXX,XX +XXX,XX @@ static void store_atom_4(CPUArchState *env, uintptr_t ra,
233
return;
234
}
235
} else {
236
- if (HAVE_al16) {
237
+ if (HAVE_ATOMIC128_RW) {
238
store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
239
return;
240
}
241
@@ -XXX,XX +XXX,XX @@ static void store_atom_8(CPUArchState *env, uintptr_t ra,
242
}
243
break;
244
case MO_64:
245
- if (HAVE_al16) {
246
+ if (HAVE_ATOMIC128_RW) {
247
store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
248
return;
249
}
250
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
251
uint64_t a, b;
252
int atmax;
253
254
- if (HAVE_al16_fast && likely((pi & 15) == 0)) {
255
- store_atomic16(pv, val);
256
+ if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
257
+ atomic16_set(pv, val);
258
return;
259
}
260
261
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
262
}
263
break;
264
case -MO_64:
265
- if (HAVE_al16) {
266
+ if (HAVE_ATOMIC128_RW) {
267
uint64_t val_le;
268
int s2 = pi & 15;
269
int s1 = 16 - s2;
270
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
271
}
272
break;
273
case MO_128:
274
- if (HAVE_al16) {
275
- store_atomic16(pv, val);
276
+ if (HAVE_ATOMIC128_RW) {
277
+ atomic16_set(pv, val);
278
return;
279
}
280
break;
113
--
281
--
114
2.25.1
282
2.34.1
115
283
116
284
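
The region-tree change above leans on GLib's value-destroy hook: once the destructor is
registered with g_tree_new_full(), it runs automatically whenever an entry is removed or
the tree is destroyed, so no separate traverse-and-free pass is needed. A self-contained
GLib example of the same mechanism (the names here are illustrative, not QEMU's):

    /* build: cc gtree_destroy.c $(pkg-config --cflags --libs glib-2.0) */
    #include <glib.h>
    #include <stdio.h>

    static gint cmp_int(gconstpointer a, gconstpointer b, gpointer data)
    {
        return GPOINTER_TO_INT(a) - GPOINTER_TO_INT(b);
    }

    /* Stand-in for tb_destroy(): fires for every value that leaves the tree. */
    static void value_destroy(gpointer value)
    {
        printf("destroying %s\n", (char *)value);
    }

    int main(void)
    {
        GTree *t = g_tree_new_full(cmp_int, NULL, NULL, value_destroy);

        g_tree_insert(t, GINT_TO_POINTER(1), "one");
        g_tree_insert(t, GINT_TO_POINTER(2), "two");

        g_tree_remove(t, GINT_TO_POINTER(1));   /* value_destroy("one") runs here */
        g_tree_destroy(t);                      /* ... and for the rest on teardown */
        return 0;
    }
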
1
We can call do_tb_phys_invalidate from an I/O context, which has
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
no per-thread tcg_ctx. Move this to tb_ctx, which is global.
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
The actual update still takes place with a lock held, so only
4
an atomic set is required, not an atomic increment.
5
6
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/457
7
Tested-by: Viktor Ashirov <vashirov@redhat.com>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
4
---
10
accel/tcg/tb-context.h | 1 +
5
include/tcg/debug-assert.h | 17 +++++++++++++++++
11
include/tcg/tcg.h | 3 ---
6
include/tcg/tcg.h | 9 +--------
12
accel/tcg/translate-all.c | 8 ++++----
7
MAINTAINERS | 1 +
13
tcg/region.c | 14 --------------
8
3 files changed, 19 insertions(+), 8 deletions(-)
14
4 files changed, 5 insertions(+), 21 deletions(-)
9
create mode 100644 include/tcg/debug-assert.h
15
10
16
diff --git a/accel/tcg/tb-context.h b/accel/tcg/tb-context.h
11
diff --git a/include/tcg/debug-assert.h b/include/tcg/debug-assert.h
17
index XXXXXXX..XXXXXXX 100644
12
new file mode 100644
18
--- a/accel/tcg/tb-context.h
13
index XXXXXXX..XXXXXXX
19
+++ b/accel/tcg/tb-context.h
14
--- /dev/null
20
@@ -XXX,XX +XXX,XX @@ struct TBContext {
15
+++ b/include/tcg/debug-assert.h
21
16
@@ -XXX,XX +XXX,XX @@
22
/* statistics */
17
+/* SPDX-License-Identifier: MIT */
23
unsigned tb_flush_count;
18
+/*
24
+ unsigned tb_phys_invalidate_count;
19
+ * Define tcg_debug_assert
25
};
20
+ * Copyright (c) 2008 Fabrice Bellard
26
21
+ */
27
extern TBContext tb_ctx;
22
+
23
+#ifndef TCG_DEBUG_ASSERT_H
24
+#define TCG_DEBUG_ASSERT_H
25
+
26
+#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
27
+# define tcg_debug_assert(X) do { assert(X); } while (0)
28
+#else
29
+# define tcg_debug_assert(X) \
30
+ do { if (!(X)) { __builtin_unreachable(); } } while (0)
31
+#endif
32
+
33
+#endif
28
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
34
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
29
index XXXXXXX..XXXXXXX 100644
35
index XXXXXXX..XXXXXXX 100644
30
--- a/include/tcg/tcg.h
36
--- a/include/tcg/tcg.h
31
+++ b/include/tcg/tcg.h
37
+++ b/include/tcg/tcg.h
32
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
38
@@ -XXX,XX +XXX,XX @@
33
/* Threshold to flush the translated code buffer. */
39
#include "tcg/tcg-mo.h"
34
void *code_gen_highwater;
40
#include "tcg-target.h"
35
41
#include "tcg/tcg-cond.h"
36
- size_t tb_phys_invalidate_count;
42
+#include "tcg/debug-assert.h"
43
44
/* XXX: make safe guess about sizes */
45
#define MAX_OP_PER_INSTR 266
46
@@ -XXX,XX +XXX,XX @@ typedef uint64_t tcg_insn_unit;
47
/* The port better have done this. */
48
#endif
49
37
-
50
-
38
/* Track which vCPU triggers events */
51
-#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
39
CPUState *cpu; /* *_trans */
52
-# define tcg_debug_assert(X) do { assert(X); } while (0)
40
53
-#else
41
@@ -XXX,XX +XXX,XX @@ size_t tcg_code_capacity(void);
54
-# define tcg_debug_assert(X) \
42
55
- do { if (!(X)) { __builtin_unreachable(); } } while (0)
43
void tcg_tb_insert(TranslationBlock *tb);
56
-#endif
44
void tcg_tb_remove(TranslationBlock *tb);
57
-
45
-size_t tcg_tb_phys_invalidate_count(void);
58
typedef struct TCGRelocation TCGRelocation;
46
TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr);
59
struct TCGRelocation {
47
void tcg_tb_foreach(GTraverseFunc func, gpointer user_data);
60
QSIMPLEQ_ENTRY(TCGRelocation) next;
48
size_t tcg_nb_tbs(void);
61
diff --git a/MAINTAINERS b/MAINTAINERS
49
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
50
index XXXXXXX..XXXXXXX 100644
62
index XXXXXXX..XXXXXXX 100644
51
--- a/accel/tcg/translate-all.c
63
--- a/MAINTAINERS
52
+++ b/accel/tcg/translate-all.c
64
+++ b/MAINTAINERS
53
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
65
@@ -XXX,XX +XXX,XX @@ F: include/sysemu/tcg.h
54
/* suppress any remaining jumps to this TB */
66
F: include/hw/core/tcg-cpu-ops.h
55
tb_jmp_unlink(tb);
67
F: host/include/*/host/cpuinfo.h
56
68
F: util/cpuinfo-*.c
57
- qatomic_set(&tcg_ctx->tb_phys_invalidate_count,
69
+F: include/tcg/
58
- tcg_ctx->tb_phys_invalidate_count + 1);
70
59
+ qatomic_set(&tb_ctx.tb_phys_invalidate_count,
71
FPU emulation
60
+ tb_ctx.tb_phys_invalidate_count + 1);
72
M: Aurelien Jarno <aurelien@aurel32.net>
61
}
62
63
static void tb_phys_invalidate__locked(TranslationBlock *tb)
64
@@ -XXX,XX +XXX,XX @@ void dump_exec_info(void)
65
qemu_printf("\nStatistics:\n");
66
qemu_printf("TB flush count %u\n",
67
qatomic_read(&tb_ctx.tb_flush_count));
68
- qemu_printf("TB invalidate count %zu\n",
69
- tcg_tb_phys_invalidate_count());
70
+ qemu_printf("TB invalidate count %u\n",
71
+ qatomic_read(&tb_ctx.tb_phys_invalidate_count));
72
73
tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
74
qemu_printf("TLB full flushes %zu\n", flush_full);
75
diff --git a/tcg/region.c b/tcg/region.c
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/region.c
78
+++ b/tcg/region.c
79
@@ -XXX,XX +XXX,XX @@ size_t tcg_code_capacity(void)
80
81
return capacity;
82
}
83
-
84
-size_t tcg_tb_phys_invalidate_count(void)
85
-{
86
- unsigned int n_ctxs = qatomic_read(&tcg_cur_ctxs);
87
- unsigned int i;
88
- size_t total = 0;
89
-
90
- for (i = 0; i < n_ctxs; i++) {
91
- const TCGContext *s = qatomic_read(&tcg_ctxs[i]);
92
-
93
- total += qatomic_read(&s->tb_phys_invalidate_count);
94
- }
95
- return total;
96
-}
97
--
73
--
98
2.25.1
74
2.34.1
99
75
100
76
diff view generated by jsdifflib
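For context on the tcg_debug_assert() definition being split out above: in debug builds the condition is checked with assert(), while in release builds __builtin_unreachable() turns the macro into an optimization hint, letting the compiler assume the condition holds. A minimal standalone sketch of the same assert-or-assume pattern, assuming a GCC/Clang-compatible compiler; the MY_DEBUG switch and clamp_index() helper are made up for illustration and are not part of the patch:

#include <assert.h>

#ifdef MY_DEBUG
# define my_debug_assert(X) do { assert(X); } while (0)
#else
# define my_debug_assert(X) \
    do { if (!(X)) { __builtin_unreachable(); } } while (0)
#endif

/* In a release build (MY_DEBUG unset) the compiler is allowed to assume
 * i < n here, under which i % n is simply i. */
static unsigned clamp_index(unsigned i, unsigned n)
{
    my_debug_assert(i < n);
    return i % n;
}
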
1
The non-single-step case of gen_goto_tb may use
1
Use __sync_bool_compare_and_swap_16 to control the loop,
2
tcg_gen_lookup_and_goto_ptr to indirectly chain.
2
rather than a separate comparison.
3
3
4
Reviewed-by: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
target/tricore/translate.c | 3 ++-
7
host/include/generic/host/atomic128-ldst.h | 11 +++++++----
8
1 file changed, 2 insertions(+), 1 deletion(-)
8
1 file changed, 7 insertions(+), 4 deletions(-)
9
9
10
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
10
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/tricore/translate.c
12
--- a/host/include/generic/host/atomic128-ldst.h
13
+++ b/target/tricore/translate.c
13
+++ b/host/include/generic/host/atomic128-ldst.h
14
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
14
@@ -XXX,XX +XXX,XX @@ atomic16_read_rw(Int128 *ptr)
15
gen_save_pc(dest);
15
static inline void ATTRIBUTE_ATOMIC128_OPT
16
if (ctx->base.singlestep_enabled) {
16
atomic16_set(Int128 *ptr, Int128 val)
17
generate_qemu_excp(ctx, EXCP_DEBUG);
17
{
18
+ } else {
18
- Int128 old = *ptr, cmp;
19
+ tcg_gen_lookup_and_goto_ptr();
19
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
20
}
20
+ __int128_t old;
21
- tcg_gen_exit_tb(NULL, 0);
21
+ Int128Alias new;
22
}
22
+
23
+ new.s = val;
24
do {
25
- cmp = old;
26
- old = atomic16_cmpxchg(ptr, cmp, val);
27
- } while (int128_ne(old, cmp));
28
+ old = *ptr_align;
29
+ } while (!__sync_bool_compare_and_swap_16(ptr_align, old, new.i));
23
}
30
}
24
31
32
#else
25
--
33
--
26
2.25.1
34
2.34.1
27
35
28
36
diff view generated by jsdifflib
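The new atomic16_set() loop above can be read in isolation as follows: instead of comparing the value returned by the cmpxchg against the expected one, the boolean result of __sync_bool_compare_and_swap_16 both validates the racy read and terminates the loop. A minimal sketch of the same idea; store16_via_cas() is a made-up name, and this assumes a host compiler and target that provide the 16-byte __sync builtin (e.g. x86-64 built with -mcx16):

/* Atomically store a 16-byte value: re-read and retry until the CAS
 * succeeds against the value last observed. */
static inline void store16_via_cas(__int128 *ptr, __int128 val)
{
    __int128 *p = __builtin_assume_aligned(ptr, 16);
    __int128 old;

    do {
        old = *p;   /* racy read; validated by the CAS below */
    } while (!__sync_bool_compare_and_swap_16(p, old, val));
}
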
1
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
1
With FEAT_LSE2, load and store of int128 is directly supported.
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
5
---
4
target/sh4/translate.c | 11 +++--------
6
host/include/aarch64/host/atomic128-ldst.h | 53 ++++++++++++++++------
5
1 file changed, 3 insertions(+), 8 deletions(-)
7
1 file changed, 40 insertions(+), 13 deletions(-)
6
8
7
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
9
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
8
index XXXXXXX..XXXXXXX 100644
10
index XXXXXXX..XXXXXXX 100644
9
--- a/target/sh4/translate.c
11
--- a/host/include/aarch64/host/atomic128-ldst.h
10
+++ b/target/sh4/translate.c
12
+++ b/host/include/aarch64/host/atomic128-ldst.h
11
@@ -XXX,XX +XXX,XX @@ static inline bool use_exit_tb(DisasContext *ctx)
13
@@ -XXX,XX +XXX,XX @@
12
return (ctx->tbflags & GUSA_EXCLUSIVE) != 0;
14
#ifndef AARCH64_ATOMIC128_LDST_H
15
#define AARCH64_ATOMIC128_LDST_H
16
17
+#include "host/cpuinfo.h"
18
+#include "tcg/debug-assert.h"
19
+
20
/*
21
* Through gcc 10, aarch64 has no support for 128-bit atomics.
22
* Through clang 16, without -march=armv8.4-a, __atomic_load_16
23
* is incorrectly expanded to a read-write operation.
24
+ *
25
+ * Anyway, this method allows runtime detection of FEAT_LSE2.
26
*/
27
28
-#define HAVE_ATOMIC128_RO 0
29
+#define HAVE_ATOMIC128_RO (cpuinfo & CPUINFO_LSE2)
30
#define HAVE_ATOMIC128_RW 1
31
32
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
33
+static inline Int128 atomic16_read_ro(const Int128 *ptr)
34
+{
35
+ uint64_t l, h;
36
+
37
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
38
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
39
+ asm("ldp %[l], %[h], %[mem]"
40
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
41
+
42
+ return int128_make128(l, h);
43
+}
44
45
static inline Int128 atomic16_read_rw(Int128 *ptr)
46
{
47
uint64_t l, h;
48
uint32_t tmp;
49
50
- /* The load must be paired with the store to guarantee not tearing. */
51
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
52
- "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
53
- "cbnz %w[tmp], 0b"
54
- : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
55
+ if (cpuinfo & CPUINFO_LSE2) {
56
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
57
+ asm("ldp %[l], %[h], %[mem]"
58
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
59
+ } else {
60
+ /* The load must be paired with the store to guarantee not tearing. */
61
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
62
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
63
+ "cbnz %w[tmp], 0b"
64
+ : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
65
+ }
66
67
return int128_make128(l, h);
13
}
68
}
14
69
@@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val)
15
-static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
70
uint64_t l = int128_getlo(val), h = int128_gethi(val);
16
+static bool use_goto_tb(DisasContext *ctx, target_ulong dest)
71
uint64_t t1, t2;
17
{
72
18
- /* Use a direct jump if in same page and singlestep not enabled */
73
- /* Load into temporaries to acquire the exclusive access lock. */
19
- if (unlikely(ctx->base.singlestep_enabled || use_exit_tb(ctx))) {
74
- asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
20
+ if (use_exit_tb(ctx)) {
75
- "stxp %w[t1], %[l], %[h], %[mem]\n\t"
21
return false;
76
- "cbnz %w[t1], 0b"
22
}
77
- : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
23
-#ifndef CONFIG_USER_ONLY
78
- : [l] "r"(l), [h] "r"(h));
24
- return (ctx->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
79
+ if (cpuinfo & CPUINFO_LSE2) {
25
-#else
80
+ /* With FEAT_LSE2, 16-byte aligned STP is atomic. */
26
- return true;
81
+ asm("stp %[l], %[h], %[mem]"
27
-#endif
82
+ : [mem] "=m"(*ptr) : [l] "r"(l), [h] "r"(h));
28
+ return translator_use_goto_tb(&ctx->base, dest);
83
+ } else {
84
+ /* Load into temporaries to acquire the exclusive access lock. */
85
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
86
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
87
+ "cbnz %w[t1], 0b"
88
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
89
+ : [l] "r"(l), [h] "r"(h));
90
+ }
29
}
91
}
30
92
31
static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
93
#endif /* AARCH64_ATOMIC128_LDST_H */
32
--
94
--
33
2.25.1
95
2.34.1
34
96
35
97
diff view generated by jsdifflib
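The CPUINFO_LSE2 test above is a runtime check; on Linux hosts the kernel advertises FEAT_LSE2 (single-copy atomicity for aligned 16-byte LDP/STP) through an AArch64 hwcap bit. A self-contained sketch of such a probe, roughly what a cpuinfo initializer can do; detect_lse2() is a made-up name, and HWCAP_USCAT is the Linux hwcap believed to correspond to FEAT_LSE2, with a fallback define in case the libc headers predate it:

#include <stdbool.h>
#include <sys/auxv.h>

#ifndef HWCAP_USCAT
#define HWCAP_USCAT (1 << 25)   /* value from the Linux uapi headers */
#endif

static bool detect_lse2(void)
{
    /* FEAT_LSE2 guarantees 16-byte aligned LDP/STP are single-copy atomic. */
    return (getauxval(AT_HWCAP) & HWCAP_USCAT) != 0;
}
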
1
From: Philippe Mathieu-Daudé <f4bug@amsat.org>
1
This had been set since the beginning, is never undefined,
2
and undefining it would seem to be harmful to debugging.
2
3
3
The root trace-events only declares a single TCG event:
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
5
$ git grep -w tcg trace-events
6
trace-events:115:# tcg/tcg-op.c
7
trace-events:137:vcpu tcg guest_mem_before(TCGv vaddr, uint16_t info) "info=%d", "vaddr=0x%016"PRIx64" info=%d"
8
9
and only tcg/tcg-op.c uses it:
10
11
$ git grep -l trace_guest_mem_before_tcg
12
tcg/tcg-op.c
13
14
therefore it is pointless to include "trace-tcg.h" in each target
15
(because it is not used). Remove it.
16
17
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
18
Message-Id: <20210629050935.2570721-1-f4bug@amsat.org>
19
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
20
---
6
---
21
target/alpha/translate.c | 1 -
7
include/exec/exec-all.h | 3 ---
22
target/arm/translate-a64.c | 1 -
8
accel/tcg/cpu-exec.c | 2 --
23
target/arm/translate-sve.c | 1 -
9
accel/tcg/translate-all.c | 2 --
24
target/arm/translate.c | 1 -
10
accel/tcg/translator.c | 2 --
25
target/cris/translate.c | 1 -
11
target/sh4/translate.c | 2 --
26
target/hppa/translate.c | 1 -
12
target/sparc/translate.c | 2 --
27
target/i386/tcg/translate.c | 1 -
13
tcg/tcg.c | 9 +--------
28
target/m68k/translate.c | 1 -
14
7 files changed, 1 insertion(+), 21 deletions(-)
29
target/microblaze/translate.c | 1 -
30
target/mips/tcg/translate.c | 1 -
31
target/openrisc/translate.c | 1 -
32
target/ppc/translate.c | 1 -
33
target/rx/translate.c | 1 -
34
target/s390x/translate.c | 1 -
35
target/sh4/translate.c | 1 -
36
target/sparc/translate.c | 1 -
37
target/xtensa/translate.c | 1 -
38
17 files changed, 17 deletions(-)
39
15
40
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
16
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
41
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
42
--- a/target/alpha/translate.c
18
--- a/include/exec/exec-all.h
43
+++ b/target/alpha/translate.c
19
+++ b/include/exec/exec-all.h
44
@@ -XXX,XX +XXX,XX @@
20
@@ -XXX,XX +XXX,XX @@
45
#include "exec/cpu_ldst.h"
21
#include "qemu/interval-tree.h"
46
#include "exec/helper-proto.h"
22
#include "qemu/clang-tsa.h"
47
#include "exec/helper-gen.h"
23
48
-#include "trace-tcg.h"
24
-/* allow to see translation results - the slowdown should be negligible, so we leave it */
49
#include "exec/translator.h"
25
-#define DEBUG_DISAS
50
#include "exec/log.h"
26
-
51
27
/* Page tracking code uses ram addresses in system mode, and virtual
52
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
28
addresses in userspace mode. Define tb_page_addr_t to be an appropriate
29
type. */
30
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
53
index XXXXXXX..XXXXXXX 100644
31
index XXXXXXX..XXXXXXX 100644
54
--- a/target/arm/translate-a64.c
32
--- a/accel/tcg/cpu-exec.c
55
+++ b/target/arm/translate-a64.c
33
+++ b/accel/tcg/cpu-exec.c
56
@@ -XXX,XX +XXX,XX @@
34
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
57
#include "exec/helper-gen.h"
35
cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
58
#include "exec/log.h"
36
tb->flags, tb->cflags, lookup_symbol(pc));
59
37
60
-#include "trace-tcg.h"
38
-#if defined(DEBUG_DISAS)
61
#include "translate-a64.h"
39
if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
62
#include "qemu/atomic128.h"
40
FILE *logfile = qemu_log_trylock();
63
41
if (logfile) {
64
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
42
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
43
qemu_log_unlock(logfile);
44
}
45
}
46
-#endif /* DEBUG_DISAS */
47
}
48
}
49
50
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
65
index XXXXXXX..XXXXXXX 100644
51
index XXXXXXX..XXXXXXX 100644
66
--- a/target/arm/translate-sve.c
52
--- a/accel/tcg/translate-all.c
67
+++ b/target/arm/translate-sve.c
53
+++ b/accel/tcg/translate-all.c
68
@@ -XXX,XX +XXX,XX @@
54
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
69
#include "exec/helper-proto.h"
55
qatomic_set(&prof->search_out_len, prof->search_out_len + search_size);
70
#include "exec/helper-gen.h"
56
#endif
71
#include "exec/log.h"
57
72
-#include "trace-tcg.h"
58
-#ifdef DEBUG_DISAS
73
#include "translate-a64.h"
59
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
74
#include "fpu/softfloat.h"
60
qemu_log_in_addr_range(pc)) {
75
61
FILE *logfile = qemu_log_trylock();
76
diff --git a/target/arm/translate.c b/target/arm/translate.c
62
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
63
qemu_log_unlock(logfile);
64
}
65
}
66
-#endif
67
68
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
69
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
70
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
77
index XXXXXXX..XXXXXXX 100644
71
index XXXXXXX..XXXXXXX 100644
78
--- a/target/arm/translate.c
72
--- a/accel/tcg/translator.c
79
+++ b/target/arm/translate.c
73
+++ b/accel/tcg/translator.c
80
@@ -XXX,XX +XXX,XX @@
74
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
81
#include "exec/helper-proto.h"
75
tb->size = db->pc_next - db->pc_first;
82
#include "exec/helper-gen.h"
76
tb->icount = db->num_insns;
83
77
84
-#include "trace-tcg.h"
78
-#ifdef DEBUG_DISAS
85
#include "exec/log.h"
79
if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
86
80
&& qemu_log_in_addr_range(db->pc_first)) {
87
81
FILE *logfile = qemu_log_trylock();
88
diff --git a/target/cris/translate.c b/target/cris/translate.c
82
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
89
index XXXXXXX..XXXXXXX 100644
83
qemu_log_unlock(logfile);
90
--- a/target/cris/translate.c
84
}
91
+++ b/target/cris/translate.c
85
}
92
@@ -XXX,XX +XXX,XX @@
86
-#endif
93
87
}
94
#include "exec/helper-gen.h"
88
95
89
static void *translator_access(CPUArchState *env, DisasContextBase *db,
96
-#include "trace-tcg.h"
97
#include "exec/log.h"
98
99
100
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
101
index XXXXXXX..XXXXXXX 100644
102
--- a/target/hppa/translate.c
103
+++ b/target/hppa/translate.c
104
@@ -XXX,XX +XXX,XX @@
105
#include "exec/helper-proto.h"
106
#include "exec/helper-gen.h"
107
#include "exec/translator.h"
108
-#include "trace-tcg.h"
109
#include "exec/log.h"
110
111
/* Since we have a distinction between register size and address size,
112
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
113
index XXXXXXX..XXXXXXX 100644
114
--- a/target/i386/tcg/translate.c
115
+++ b/target/i386/tcg/translate.c
116
@@ -XXX,XX +XXX,XX @@
117
#include "exec/helper-gen.h"
118
#include "helper-tcg.h"
119
120
-#include "trace-tcg.h"
121
#include "exec/log.h"
122
123
#define PREFIX_REPZ 0x01
124
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
125
index XXXXXXX..XXXXXXX 100644
126
--- a/target/m68k/translate.c
127
+++ b/target/m68k/translate.c
128
@@ -XXX,XX +XXX,XX @@
129
#include "exec/helper-proto.h"
130
#include "exec/helper-gen.h"
131
132
-#include "trace-tcg.h"
133
#include "exec/log.h"
134
#include "fpu/softfloat.h"
135
136
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
137
index XXXXXXX..XXXXXXX 100644
138
--- a/target/microblaze/translate.c
139
+++ b/target/microblaze/translate.c
140
@@ -XXX,XX +XXX,XX @@
141
#include "exec/translator.h"
142
#include "qemu/qemu-print.h"
143
144
-#include "trace-tcg.h"
145
#include "exec/log.h"
146
147
#define EXTRACT_FIELD(src, start, end) \
148
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
149
index XXXXXXX..XXXXXXX 100644
150
--- a/target/mips/tcg/translate.c
151
+++ b/target/mips/tcg/translate.c
152
@@ -XXX,XX +XXX,XX @@
153
#include "semihosting/semihost.h"
154
155
#include "trace.h"
156
-#include "trace-tcg.h"
157
#include "exec/translator.h"
158
#include "exec/log.h"
159
#include "qemu/qemu-print.h"
160
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
161
index XXXXXXX..XXXXXXX 100644
162
--- a/target/openrisc/translate.c
163
+++ b/target/openrisc/translate.c
164
@@ -XXX,XX +XXX,XX @@
165
#include "exec/helper-gen.h"
166
#include "exec/gen-icount.h"
167
168
-#include "trace-tcg.h"
169
#include "exec/log.h"
170
171
/* is_jmp field values */
172
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
173
index XXXXXXX..XXXXXXX 100644
174
--- a/target/ppc/translate.c
175
+++ b/target/ppc/translate.c
176
@@ -XXX,XX +XXX,XX @@
177
#include "exec/helper-proto.h"
178
#include "exec/helper-gen.h"
179
180
-#include "trace-tcg.h"
181
#include "exec/translator.h"
182
#include "exec/log.h"
183
#include "qemu/atomic128.h"
184
diff --git a/target/rx/translate.c b/target/rx/translate.c
185
index XXXXXXX..XXXXXXX 100644
186
--- a/target/rx/translate.c
187
+++ b/target/rx/translate.c
188
@@ -XXX,XX +XXX,XX @@
189
#include "exec/helper-proto.h"
190
#include "exec/helper-gen.h"
191
#include "exec/translator.h"
192
-#include "trace-tcg.h"
193
#include "exec/log.h"
194
195
typedef struct DisasContext {
196
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
197
index XXXXXXX..XXXXXXX 100644
198
--- a/target/s390x/translate.c
199
+++ b/target/s390x/translate.c
200
@@ -XXX,XX +XXX,XX @@
201
#include "exec/helper-proto.h"
202
#include "exec/helper-gen.h"
203
204
-#include "trace-tcg.h"
205
#include "exec/translator.h"
206
#include "exec/log.h"
207
#include "qemu/atomic128.h"
208
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
90
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
209
index XXXXXXX..XXXXXXX 100644
91
index XXXXXXX..XXXXXXX 100644
210
--- a/target/sh4/translate.c
92
--- a/target/sh4/translate.c
211
+++ b/target/sh4/translate.c
93
+++ b/target/sh4/translate.c
212
@@ -XXX,XX +XXX,XX @@
94
@@ -XXX,XX +XXX,XX @@
213
#include "exec/helper-proto.h"
95
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
214
#include "exec/helper-gen.h"
96
*/
215
#include "exec/translator.h"
97
216
-#include "trace-tcg.h"
98
-#define DEBUG_DISAS
217
#include "exec/log.h"
99
-
218
#include "qemu/qemu-print.h"
100
#include "qemu/osdep.h"
219
101
#include "cpu.h"
102
#include "disas/disas.h"
220
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
103
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
221
index XXXXXXX..XXXXXXX 100644
104
index XXXXXXX..XXXXXXX 100644
222
--- a/target/sparc/translate.c
105
--- a/target/sparc/translate.c
223
+++ b/target/sparc/translate.c
106
+++ b/target/sparc/translate.c
224
@@ -XXX,XX +XXX,XX @@
107
@@ -XXX,XX +XXX,XX @@
225
226
#include "exec/helper-gen.h"
227
228
-#include "trace-tcg.h"
229
#include "exec/translator.h"
230
#include "exec/log.h"
231
#include "asi.h"
108
#include "asi.h"
232
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
109
110
111
-#define DEBUG_DISAS
112
-
113
#define DYNAMIC_PC 1 /* dynamic pc value */
114
#define JUMP_PC 2 /* dynamic pc value which takes only two values
115
according to jump_pc[T2] */
116
diff --git a/tcg/tcg.c b/tcg/tcg.c
233
index XXXXXXX..XXXXXXX 100644
117
index XXXXXXX..XXXXXXX 100644
234
--- a/target/xtensa/translate.c
118
--- a/tcg/tcg.c
235
+++ b/target/xtensa/translate.c
119
+++ b/tcg/tcg.c
236
@@ -XXX,XX +XXX,XX @@
120
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
237
#include "exec/helper-proto.h"
121
(uintptr_t)s->code_buf, prologue_size);
238
#include "exec/helper-gen.h"
122
#endif
239
123
240
-#include "trace-tcg.h"
124
-#ifdef DEBUG_DISAS
241
#include "exec/log.h"
125
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
242
126
FILE *logfile = qemu_log_trylock();
243
127
if (logfile) {
128
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
129
qemu_log_unlock(logfile);
130
}
131
}
132
-#endif
133
134
#ifndef CONFIG_TCG_INTERPRETER
135
/*
136
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
137
}
138
#endif
139
140
-#ifdef DEBUG_DISAS
141
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
142
&& qemu_log_in_addr_range(pc_start))) {
143
FILE *logfile = qemu_log_trylock();
144
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
145
qemu_log_unlock(logfile);
146
}
147
}
148
-#endif
149
150
#ifdef CONFIG_DEBUG_TCG
151
/* Ensure all labels referenced have been emitted. */
152
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
153
liveness_pass_1(s);
154
155
if (s->nb_indirects > 0) {
156
-#ifdef DEBUG_DISAS
157
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
158
&& qemu_log_in_addr_range(pc_start))) {
159
FILE *logfile = qemu_log_trylock();
160
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
161
qemu_log_unlock(logfile);
162
}
163
}
164
-#endif
165
+
166
/* Replace indirect temps with direct temps. */
167
if (liveness_pass_2(s)) {
168
/* If changes were made, re-run liveness. */
169
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
170
qatomic_set(&prof->la_time, prof->la_time + profile_getclock());
171
#endif
172
173
-#ifdef DEBUG_DISAS
174
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
175
&& qemu_log_in_addr_range(pc_start))) {
176
FILE *logfile = qemu_log_trylock();
177
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
178
qemu_log_unlock(logfile);
179
}
180
}
181
-#endif
182
183
/* Initialize goto_tb jump offsets. */
184
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
244
--
185
--
245
2.25.1
186
2.34.1
246
187
247
188
diff view generated by jsdifflib
Deleted patch
1
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/alpha/translate.c | 7 +------
5
1 file changed, 1 insertion(+), 6 deletions(-)
6
1
7
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
8
index XXXXXXX..XXXXXXX 100644
9
--- a/target/alpha/translate.c
10
+++ b/target/alpha/translate.c
11
@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_store_conditional(DisasContext *ctx, int ra, int rb,
12
13
static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
14
{
15
-#ifndef CONFIG_USER_ONLY
16
- /* Check for the dest on the same page as the start of the TB. */
17
- return ((ctx->base.tb->pc ^ dest) & TARGET_PAGE_MASK) == 0;
18
-#else
19
- return true;
20
-#endif
21
+ return translator_use_goto_tb(&ctx->base, dest);
22
}
23
24
static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
25
--
26
2.25.1
27
28
diff view generated by jsdifflib
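The wrapper removed above (and the similar ones removed from other targets below) is folded into the common translator_use_goto_tb() helper. Roughly, as suggested by the per-target code being deleted in this series, the common test refuses direct chaining while single-stepping and otherwise only allows it when the destination stays on the same guest page as the start of the TB. A sketch of that shape, assuming QEMU's translator headers; this is not the exact QEMU implementation:

static bool use_goto_tb_sketch(const DisasContextBase *db, target_ulong dest)
{
    /* No direct chaining while single-stepping... */
    if (db->singlestep_enabled) {
        return false;
    }
    /* ...and only within the guest page the TB started on. */
    return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
}
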
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/arm/translate.c | 12 +-----------
8
1 file changed, 1 insertion(+), 11 deletions(-)
9
10
diff --git a/target/arm/translate.c b/target/arm/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/arm/translate.c
13
+++ b/target/arm/translate.c
14
@@ -XXX,XX +XXX,XX @@ static int disas_dsp_insn(DisasContext *s, uint32_t insn)
15
return 1;
16
}
17
18
-static inline bool use_goto_tb(DisasContext *s, target_ulong dest)
19
-{
20
-#ifndef CONFIG_USER_ONLY
21
- return (s->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK) ||
22
- ((s->base.pc_next - 1) & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
23
-#else
24
- return true;
25
-#endif
26
-}
27
-
28
static void gen_goto_ptr(void)
29
{
30
tcg_gen_lookup_and_goto_ptr();
31
@@ -XXX,XX +XXX,XX @@ static void gen_goto_ptr(void)
32
*/
33
static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
34
{
35
- if (use_goto_tb(s, dest)) {
36
+ if (translator_use_goto_tb(&s->base, dest)) {
37
tcg_gen_goto_tb(n);
38
gen_set_pc_im(s, dest);
39
tcg_gen_exit_tb(s->base.tb, n);
40
--
41
2.25.1
42
43
diff view generated by jsdifflib
Deleted patch
1
Single stepping is not the only reason not to use goto_tb.
2
If goto_tb is disallowed, and single-stepping is not enabled,
3
then use tcg_gen_lookup_and_goto_ptr to indirectly chain.
4
1
5
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
target/avr/translate.c | 9 ++++++---
9
1 file changed, 6 insertions(+), 3 deletions(-)
10
11
diff --git a/target/avr/translate.c b/target/avr/translate.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/target/avr/translate.c
14
+++ b/target/avr/translate.c
15
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
16
{
17
const TranslationBlock *tb = ctx->base.tb;
18
19
- if (!ctx->base.singlestep_enabled) {
20
+ if (translator_use_goto_tb(&ctx->base, dest)) {
21
tcg_gen_goto_tb(n);
22
tcg_gen_movi_i32(cpu_pc, dest);
23
tcg_gen_exit_tb(tb, n);
24
} else {
25
tcg_gen_movi_i32(cpu_pc, dest);
26
- gen_helper_debug(cpu_env);
27
- tcg_gen_exit_tb(NULL, 0);
28
+ if (ctx->base.singlestep_enabled) {
29
+ gen_helper_debug(cpu_env);
30
+ } else {
31
+ tcg_gen_lookup_and_goto_ptr();
32
+ }
33
}
34
ctx->base.is_jmp = DISAS_NORETURN;
35
}
36
--
37
2.25.1
38
39
diff view generated by jsdifflib
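Across the per-target patches in this group, gen_goto_tb() converges on the same shape: try direct chaining via translator_use_goto_tb(), otherwise write back the PC and either raise the debug exception (when single-stepping) or chain indirectly. A generic sketch of that shape; gen_set_pc() and gen_debug_exception() are hypothetical stand-ins for whatever per-target helpers do those jobs, and the exact ordering of the PC write-back differs per target:

static void gen_goto_tb_sketch(DisasContext *ctx, int n, target_ulong dest)
{
    if (translator_use_goto_tb(&ctx->base, dest)) {
        /* Direct chaining: patchable jump to the next TB. */
        tcg_gen_goto_tb(n);
        gen_set_pc(ctx, dest);              /* hypothetical per-target helper */
        tcg_gen_exit_tb(ctx->base.tb, n);
    } else {
        gen_set_pc(ctx, dest);
        if (ctx->base.singlestep_enabled) {
            gen_debug_exception(ctx);       /* hypothetical per-target helper */
        } else {
            /* Indirect chaining through the TB lookup helper. */
            tcg_gen_lookup_and_goto_ptr();
        }
    }
}
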
Deleted patch
1
The test for singlestepping is done in translator_use_goto_tb,
2
so we may elide it from cris_tr_tb_stop.
3
1
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/cris/translate.c | 5 ++---
8
1 file changed, 2 insertions(+), 3 deletions(-)
9
10
diff --git a/target/cris/translate.c b/target/cris/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/cris/translate.c
13
+++ b/target/cris/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void t_gen_swapr(TCGv d, TCGv s)
15
16
static bool use_goto_tb(DisasContext *dc, target_ulong dest)
17
{
18
- return ((dest ^ dc->base.pc_first) & TARGET_PAGE_MASK) == 0;
19
+ return translator_use_goto_tb(&dc->base, dest);
20
}
21
22
static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
23
@@ -XXX,XX +XXX,XX @@ static void cris_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
24
* Use a conditional branch if either taken or not-taken path
25
* can use goto_tb. If neither can, then treat it as indirect.
26
*/
27
- if (likely(!dc->base.singlestep_enabled)
28
- && likely(!dc->cpustate_changed)
29
+ if (likely(!dc->cpustate_changed)
30
&& (use_goto_tb(dc, dc->jmp_pc) || use_goto_tb(dc, npc))) {
31
TCGLabel *not_taken = gen_new_label();
32
33
--
34
2.25.1
35
36
diff view generated by jsdifflib
Deleted patch
1
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/hppa/translate.c | 5 +----
5
1 file changed, 1 insertion(+), 4 deletions(-)
6
1
7
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
8
index XXXXXXX..XXXXXXX 100644
9
--- a/target/hppa/translate.c
10
+++ b/target/hppa/translate.c
11
@@ -XXX,XX +XXX,XX @@ static bool gen_illegal(DisasContext *ctx)
12
13
static bool use_goto_tb(DisasContext *ctx, target_ureg dest)
14
{
15
- /* Suppress goto_tb for page crossing, IO, or single-steping. */
16
- return !(((ctx->base.pc_first ^ dest) & TARGET_PAGE_MASK)
17
- || (tb_cflags(ctx->base.tb) & CF_LAST_IO)
18
- || ctx->base.singlestep_enabled);
19
+ return translator_use_goto_tb(&ctx->base, dest);
20
}
21
22
/* If the next insn is to be nullified, and it's on the same page,
23
--
24
2.25.1
25
26
diff view generated by jsdifflib
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/i386/tcg/translate.c | 14 ++------------
8
1 file changed, 2 insertions(+), 12 deletions(-)
9
10
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/i386/tcg/translate.c
13
+++ b/target/i386/tcg/translate.c
14
@@ -XXX,XX +XXX,XX @@ static inline int insn_const_size(MemOp ot)
15
}
16
}
17
18
-static inline bool use_goto_tb(DisasContext *s, target_ulong pc)
19
-{
20
-#ifndef CONFIG_USER_ONLY
21
- return (pc & TARGET_PAGE_MASK) == (s->base.tb->pc & TARGET_PAGE_MASK) ||
22
- (pc & TARGET_PAGE_MASK) == (s->pc_start & TARGET_PAGE_MASK);
23
-#else
24
- return true;
25
-#endif
26
-}
27
-
28
-static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
29
+static void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
30
{
31
target_ulong pc = s->cs_base + eip;
32
33
- if (use_goto_tb(s, pc)) {
34
+ if (translator_use_goto_tb(&s->base, pc)) {
35
/* jump to same page: we can use a direct jump */
36
tcg_gen_goto_tb(tb_num);
37
gen_jmp_im(s, eip);
38
--
39
2.25.1
40
41
diff view generated by jsdifflib
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Acked-by: Laurent Vivier <laurent@vivier.eu>
5
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
target/m68k/translate.c | 12 +-----------
9
1 file changed, 1 insertion(+), 11 deletions(-)
10
11
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/target/m68k/translate.c
14
+++ b/target/m68k/translate.c
15
@@ -XXX,XX +XXX,XX @@ static void gen_exit_tb(DisasContext *s)
16
} \
17
} while (0)
18
19
-static inline bool use_goto_tb(DisasContext *s, uint32_t dest)
20
-{
21
-#ifndef CONFIG_USER_ONLY
22
- return (s->base.pc_first & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)
23
- || (s->base.pc_next & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
24
-#else
25
- return true;
26
-#endif
27
-}
28
-
29
/* Generate a jump to an immediate address. */
30
static void gen_jmp_tb(DisasContext *s, int n, uint32_t dest)
31
{
32
@@ -XXX,XX +XXX,XX @@ static void gen_jmp_tb(DisasContext *s, int n, uint32_t dest)
33
update_cc_op(s);
34
tcg_gen_movi_i32(QREG_PC, dest);
35
gen_singlestep_exception(s);
36
- } else if (use_goto_tb(s, dest)) {
37
+ } else if (translator_use_goto_tb(&s->base, dest)) {
38
tcg_gen_goto_tb(n);
39
tcg_gen_movi_i32(QREG_PC, dest);
40
tcg_gen_exit_tb(s->base.tb, n);
41
--
42
2.25.1
43
44
diff view generated by jsdifflib
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/microblaze/translate.c | 11 +----------
8
1 file changed, 1 insertion(+), 10 deletions(-)
9
10
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/microblaze/translate.c
13
+++ b/target/microblaze/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void gen_raise_hw_excp(DisasContext *dc, uint32_t esr_ec)
15
gen_raise_exception_sync(dc, EXCP_HW_EXCP);
16
}
17
18
-static inline bool use_goto_tb(DisasContext *dc, target_ulong dest)
19
-{
20
-#ifndef CONFIG_USER_ONLY
21
- return (dc->base.pc_first & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
22
-#else
23
- return true;
24
-#endif
25
-}
26
-
27
static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
28
{
29
if (dc->base.singlestep_enabled) {
30
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
31
tcg_gen_movi_i32(cpu_pc, dest);
32
gen_helper_raise_exception(cpu_env, tmp);
33
tcg_temp_free_i32(tmp);
34
- } else if (use_goto_tb(dc, dest)) {
35
+ } else if (translator_use_goto_tb(&dc->base, dest)) {
36
tcg_gen_goto_tb(n);
37
tcg_gen_movi_i32(cpu_pc, dest);
38
tcg_gen_exit_tb(dc->base.tb, n);
39
--
40
2.25.1
41
42
diff view generated by jsdifflib
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/mips/tcg/translate.c | 17 ++---------------
8
1 file changed, 2 insertions(+), 15 deletions(-)
9
10
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/mips/tcg/translate.c
13
+++ b/target/mips/tcg/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void gen_trap(DisasContext *ctx, uint32_t opc,
15
tcg_temp_free(t1);
16
}
17
18
-static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
19
+static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
20
{
21
- if (unlikely(ctx->base.singlestep_enabled)) {
22
- return false;
23
- }
24
-
25
-#ifndef CONFIG_USER_ONLY
26
- return (ctx->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
27
-#else
28
- return true;
29
-#endif
30
-}
31
-
32
-static inline void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
33
-{
34
- if (use_goto_tb(ctx, dest)) {
35
+ if (translator_use_goto_tb(&ctx->base, dest)) {
36
tcg_gen_goto_tb(n);
37
gen_save_pc(dest);
38
tcg_gen_exit_tb(ctx->base.tb, n);
39
--
40
2.25.1
41
42
diff view generated by jsdifflib
Deleted patch
1
Do not emit dead code for the singlestep_enabled case,
2
after having exited the TB with a debug exception.
3
1
4
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/mips/tcg/translate.c | 3 ++-
8
1 file changed, 2 insertions(+), 1 deletion(-)
9
10
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/mips/tcg/translate.c
13
+++ b/target/mips/tcg/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
15
if (ctx->base.singlestep_enabled) {
16
save_cpu_state(ctx, 0);
17
gen_helper_raise_exception_debug(cpu_env);
18
+ } else {
19
+ tcg_gen_lookup_and_goto_ptr();
20
}
21
- tcg_gen_lookup_and_goto_ptr();
22
}
23
}
24
25
--
26
2.25.1
27
28
diff view generated by jsdifflib
Deleted patch
1
Reorder the control statements to allow using the page boundary
2
check from translator_use_goto_tb().
3
1
4
Reviewed-by: Stafford Horne <shorne@gmail.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/openrisc/translate.c | 15 ++++++++-------
8
1 file changed, 8 insertions(+), 7 deletions(-)
9
10
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/openrisc/translate.c
13
+++ b/target/openrisc/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void openrisc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
15
/* fallthru */
16
17
case DISAS_TOO_MANY:
18
- if (unlikely(dc->base.singlestep_enabled)) {
19
- tcg_gen_movi_tl(cpu_pc, jmp_dest);
20
- gen_exception(dc, EXCP_DEBUG);
21
- } else if ((dc->base.pc_first ^ jmp_dest) & TARGET_PAGE_MASK) {
22
- tcg_gen_movi_tl(cpu_pc, jmp_dest);
23
- tcg_gen_lookup_and_goto_ptr();
24
- } else {
25
+ if (translator_use_goto_tb(&dc->base, jmp_dest)) {
26
tcg_gen_goto_tb(0);
27
tcg_gen_movi_tl(cpu_pc, jmp_dest);
28
tcg_gen_exit_tb(dc->base.tb, 0);
29
+ break;
30
+ }
31
+ tcg_gen_movi_tl(cpu_pc, jmp_dest);
32
+ if (unlikely(dc->base.singlestep_enabled)) {
33
+ gen_exception(dc, EXCP_DEBUG);
34
+ } else {
35
+ tcg_gen_lookup_and_goto_ptr();
36
}
37
break;
38
39
--
40
2.25.1
41
42
diff view generated by jsdifflib
Deleted patch
1
Just use translator_use_goto_tb directly at the one call site,
2
rather than maintaining a local wrapper.
3
1
4
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/riscv/translate.c | 20 +-------------------
8
1 file changed, 1 insertion(+), 19 deletions(-)
9
10
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/riscv/translate.c
13
+++ b/target/riscv/translate.c
14
@@ -XXX,XX +XXX,XX @@ static void gen_exception_inst_addr_mis(DisasContext *ctx)
15
generate_exception_mtval(ctx, RISCV_EXCP_INST_ADDR_MIS);
16
}
17
18
-static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
19
-{
20
- if (unlikely(ctx->base.singlestep_enabled)) {
21
- return false;
22
- }
23
-
24
-#ifndef CONFIG_USER_ONLY
25
- return (ctx->base.tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK);
26
-#else
27
- return true;
28
-#endif
29
-}
30
-
31
static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
32
{
33
- if (use_goto_tb(ctx, dest)) {
34
- /* chaining is only allowed when the jump is to the same page */
35
+ if (translator_use_goto_tb(&ctx->base, dest)) {
36
tcg_gen_goto_tb(n);
37
tcg_gen_movi_tl(cpu_pc, dest);
38
-
39
- /* No need to check for single stepping here as use_goto_tb() will
40
- * return false in case of single stepping.
41
- */
42
tcg_gen_exit_tb(ctx->base.tb, n);
43
} else {
44
tcg_gen_movi_tl(cpu_pc, dest);
45
--
46
2.25.1
47
48
diff view generated by jsdifflib
1
In tcg_region_prologue_set, we reset TCGContext.code_gen_ptr.
1
This is always defined, and the optimization pass is
2
So do that after we've used it to dump the prologue contents.
2
essential to producing reasonable code.
3
3
4
Fixes: b0a0794a0f16
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
tcg/tcg.c | 4 ++--
7
tcg/tcg.c | 5 -----
8
1 file changed, 2 insertions(+), 2 deletions(-)
8
1 file changed, 5 deletions(-)
9
9
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/tcg.c
12
--- a/tcg/tcg.c
13
+++ b/tcg/tcg.c
13
+++ b/tcg/tcg.c
14
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
14
@@ -XXX,XX +XXX,XX @@
15
(uintptr_t)s->code_buf, prologue_size);
15
* THE SOFTWARE.
16
*/
17
18
-/* define it to use liveness analysis (better code) */
19
-#define USE_TCG_OPTIMIZATIONS
20
-
21
#include "qemu/osdep.h"
22
23
/* Define to jump the ELF file used to communicate with GDB. */
24
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
25
qatomic_set(&prof->opt_time, prof->opt_time - profile_getclock());
16
#endif
26
#endif
17
27
18
- tcg_region_prologue_set(s);
28
-#ifdef USE_TCG_OPTIMIZATIONS
19
-
29
tcg_optimize(s);
20
#ifdef DEBUG_DISAS
30
-#endif
21
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
31
22
FILE *logfile = qemu_log_lock();
32
#ifdef CONFIG_PROFILER
23
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
33
qatomic_set(&prof->opt_time, prof->opt_time + profile_getclock());
24
tcg_debug_assert(tcg_code_gen_epilogue != NULL);
25
}
26
#endif
27
+
28
+ tcg_region_prologue_set(s);
29
}
30
31
void tcg_func_start(TCGContext *s)
32
--
34
--
33
2.25.1
35
2.34.1
34
36
35
37
diff view generated by jsdifflib
Deleted patch
1
The loop is performing a simple boolean test for the existence
2
of a BP_CPU breakpoint at EIP. Plus it gets the iteration wrong,
3
if we happen to have a BP_GDB breakpoint at the same address.
4
1
5
We have a function for this: cpu_breakpoint_test.
6
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
9
Message-Id: <20210620062317.1399034-1-richard.henderson@linaro.org>
10
---
11
target/i386/tcg/sysemu/bpt_helper.c | 12 +++---------
12
1 file changed, 3 insertions(+), 9 deletions(-)
13
14
diff --git a/target/i386/tcg/sysemu/bpt_helper.c b/target/i386/tcg/sysemu/bpt_helper.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/target/i386/tcg/sysemu/bpt_helper.c
17
+++ b/target/i386/tcg/sysemu/bpt_helper.c
18
@@ -XXX,XX +XXX,XX @@ void breakpoint_handler(CPUState *cs)
19
{
20
X86CPU *cpu = X86_CPU(cs);
21
CPUX86State *env = &cpu->env;
22
- CPUBreakpoint *bp;
23
24
if (cs->watchpoint_hit) {
25
if (cs->watchpoint_hit->flags & BP_CPU) {
26
@@ -XXX,XX +XXX,XX @@ void breakpoint_handler(CPUState *cs)
27
}
28
}
29
} else {
30
- QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
31
- if (bp->pc == env->eip) {
32
- if (bp->flags & BP_CPU) {
33
- check_hw_breakpoints(env, true);
34
- raise_exception(env, EXCP01_DB);
35
- }
36
- break;
37
- }
38
+ if (cpu_breakpoint_test(cs, env->eip, BP_CPU)) {
39
+ check_hw_breakpoints(env, true);
40
+ raise_exception(env, EXCP01_DB);
41
}
42
}
43
}
44
--
45
2.25.1
46
47
diff view generated by jsdifflib
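For reference, cpu_breakpoint_test() encapsulates essentially the scan the removed loop performed, minus the early break that caused the BP_GDB problem described above. A rough sketch of its behaviour, assuming QEMU's CPUState and CPUBreakpoint definitions; this is a sketch, not copied from the QEMU source:

static bool breakpoint_test_sketch(CPUState *cs, vaddr pc, int mask)
{
    CPUBreakpoint *bp;

    QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
        if (bp->pc == pc && (bp->flags & mask)) {
            return true;
        }
    }
    return false;
}
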