The following changes since commit e750a7ace492f0b450653d4ad368a77d6f660fb8:

  Merge tag 'pull-9p-20221024' of https://github.com/cschoenebeck/qemu into staging (2022-10-24 14:27:12 -0400)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20221026

for you to fetch changes up to 04f105758b0089f73ee47260671580cde35f96cc:

  accel/tcg: Remove restore_state_to_opc function (2022-10-26 11:11:28 +1000)

----------------------------------------------------------------
Revert incorrect cflags initialization.
Add direct jumps for tcg/loongarch64.
Speed up breakpoint check.
Improve assertions for atomic.h.
Move restore_state_to_opc to TCGCPUOps.
Cleanups to TranslationBlock maintenance.

----------------------------------------------------------------
Leandro Lupori (1):
      accel/tcg: Add a quicker check for breakpoints

Peter Maydell (1):
      Revert "accel/tcg: Init TCG cflags in vCPU thread handler"

Qi Hu (2):
      tcg/loongarch64: Add direct jump support
      tcg/aarch64: Remove unused code in tcg_out_op

Richard Henderson (43):
      include/qemu/osdep: Add qemu_build_assert
      include/qemu/atomic: Use qemu_build_assert
      include/qemu/thread: Use qatomic_* functions
      accel/tcg: Make page_alloc_target_data allocation constant
      accel/tcg: Remove disabled debug in translate-all.c
      accel/tcg: Split out PageDesc to internal.h
      accel/tcg: Split out tb-maint.c
      accel/tcg: Move assert_no_pages_locked to internal.h
      accel/tcg: Drop cpu_get_tb_cpu_state from TARGET_HAS_PRECISE_SMC
      accel/tcg: Remove duplicate store to tb->page_addr[]
      accel/tcg: Introduce tb_{set_}page_addr{0,1}
      accel/tcg: Rename tb_invalidate_phys_page
      accel/tcg: Rename tb_invalidate_phys_page_range and drop end parameter
      accel/tcg: Unify declarations of tb_invalidate_phys_range
      accel/tcg: Use tb_invalidate_phys_page in page_set_flags
      accel/tcg: Call tb_invalidate_phys_page for PAGE_RESET
      accel/tcg: Use page_reset_target_data in page_set_flags
      accel/tcg: Use tb_invalidate_phys_range in page_set_flags
      accel/tcg: Move TARGET_PAGE_DATA_SIZE impl to user-exec.c
      accel/tcg: Simplify page_get/alloc_target_data
      accel/tcg: Add restore_state_to_opc to TCGCPUOps
      target/alpha: Convert to tcg_ops restore_state_to_opc
      target/arm: Convert to tcg_ops restore_state_to_opc
      target/avr: Convert to tcg_ops restore_state_to_opc
      target/cris: Convert to tcg_ops restore_state_to_opc
      target/hexagon: Convert to tcg_ops restore_state_to_opc
      target/hppa: Convert to tcg_ops restore_state_to_opc
      target/i386: Convert to tcg_ops restore_state_to_opc
      target/loongarch: Convert to tcg_ops restore_state_to_opc
      target/m68k: Convert to tcg_ops restore_state_to_opc
      target/microblaze: Convert to tcg_ops restore_state_to_opc
      target/mips: Convert to tcg_ops restore_state_to_opc
      target/nios2: Convert to tcg_ops restore_state_to_opc
      target/openrisc: Convert to tcg_ops restore_state_to_opc
      target/ppc: Convert to tcg_ops restore_state_to_opc
      target/riscv: Convert to tcg_ops restore_state_to_opc
      target/rx: Convert to tcg_ops restore_state_to_opc
      target/s390x: Convert to tcg_ops restore_state_to_opc
      target/sh4: Convert to tcg_ops restore_state_to_opc
      target/sparc: Convert to tcg_ops restore_state_to_opc
      target/tricore: Convert to tcg_ops restore_state_to_opc
      target/xtensa: Convert to tcg_ops restore_state_to_opc
      accel/tcg: Remove restore_state_to_opc function

 accel/tcg/internal.h | 91 ++++
 include/exec/cpu-all.h | 22 +-
 include/exec/exec-all.h | 35 +-
 include/exec/ram_addr.h | 2 -
 include/exec/translate-all.h | 2 +-
 include/hw/core/tcg-cpu-ops.h | 11 +
 include/qemu/atomic.h | 16 +-
 include/qemu/osdep.h | 8 +
 include/qemu/thread.h | 8 +-
 target/arm/cpu.h | 8 +
 target/arm/internals.h | 4 -
 target/mips/tcg/tcg-internal.h | 3 +
 target/s390x/s390x-internal.h | 4 +-
 target/sparc/cpu.h | 3 +
 tcg/loongarch64/tcg-target.h | 9 +-
 accel/tcg/cpu-exec.c | 24 +-
 accel/tcg/tb-maint.c | 704 ++++++++++++++++++++++++++
 accel/tcg/tcg-accel-ops-mttcg.c | 5 +-
 accel/tcg/tcg-accel-ops-rr.c | 7 +-
 accel/tcg/translate-all.c | 1017 ++------------------------------------
 accel/tcg/translator.c | 9 +-
 accel/tcg/user-exec.c | 42 ++
 bsd-user/mmap.c | 2 -
 cpu.c | 4 +-
 linux-user/mmap.c | 4 -
 target/alpha/cpu.c | 9 +
 target/alpha/translate.c | 6 -
 target/arm/cpu.c | 26 +
 target/arm/mte_helper.c | 5 -
 target/arm/translate.c | 22 -
 target/avr/cpu.c | 11 +
 target/avr/translate.c | 6 -
 target/cris/cpu.c | 11 +
 target/cris/translate.c | 6 -
 target/hexagon/cpu.c | 9 +-
 target/hppa/cpu.c | 19 +
 target/hppa/translate.c | 13 -
 target/i386/tcg/tcg-cpu.c | 19 +
 target/i386/tcg/translate.c | 15 -
 target/loongarch/cpu.c | 11 +
 target/loongarch/translate.c | 6 -
 target/m68k/cpu.c | 14 +
 target/m68k/translate.c | 10 -
 target/microblaze/cpu.c | 11 +
 target/microblaze/translate.c | 7 -
 target/mips/cpu.c | 1 +
 target/mips/tcg/translate.c | 8 +-
 target/nios2/cpu.c | 11 +
 target/nios2/translate.c | 6 -
 target/openrisc/cpu.c | 13 +
 target/openrisc/translate.c | 10 -
 target/ppc/cpu_init.c | 10 +
 target/ppc/translate.c | 6 -
 target/riscv/cpu.c | 9 +-
 target/rx/cpu.c | 10 +
 target/rx/translate.c | 6 -
 target/s390x/cpu.c | 1 +
 target/s390x/tcg/translate.c | 7 +-
 target/sh4/cpu.c | 16 +
 target/sh4/translate.c | 10 -
 target/sparc/cpu.c | 1 +
 target/sparc/translate.c | 7 +-
 target/tricore/cpu.c | 11 +
 target/tricore/translate.c | 6 -
 target/xtensa/cpu.c | 10 +
 target/xtensa/translate.c | 6 -
 tcg/aarch64/tcg-target.c.inc | 31 +-
 tcg/loongarch64/tcg-target.c.inc | 48 +-
 accel/tcg/meson.build | 1 +
 69 files changed, 1304 insertions(+), 1221 deletions(-)
 create mode 100644 accel/tcg/tb-maint.c

The following changes since commit aa33508196f4e2da04625bee36e1f7be5b9267e7:

  Merge tag 'mem-2023-05-23' of https://github.com/davidhildenbrand/qemu into staging (2023-05-23 10:57:25 -0700)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230523

for you to fetch changes up to 30d56836f98c7ed2d309bff1dde8854f3d0b5634:

  tcg: Remove USE_TCG_OPTIMIZATIONS (2023-05-23 16:52:39 -0700)

----------------------------------------------------------------
util: Host cpu detection for x86 and aa64
util: Use cpu detection for bufferiszero
migration: Use cpu detection for xbzrle
tcg: Replace and remove cpu_atomic_{ld,st}o*
host/include: Split qemu/atomic128.h
tcg: Remove DEBUG_DISAS
tcg: Remove USE_TCG_OPTIMIZATIONS

----------------------------------------------------------------
Richard Henderson (28):
      util: Introduce host-specific cpuinfo.h
      util: Add cpuinfo-i386.c
      util: Add i386 CPUINFO_ATOMIC_VMOVDQU
      tcg/i386: Use host/cpuinfo.h
      util/bufferiszero: Use i386 host/cpuinfo.h
      migration/xbzrle: Shuffle function order
      migration/xbzrle: Use i386 host/cpuinfo.h
      migration: Build migration_files once
      util: Add cpuinfo-aarch64.c
      include/host: Split out atomic128-cas.h
      include/host: Split out atomic128-ldst.h
      meson: Fix detect atomic128 support with optimization
      include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h
      target/ppc: Use tcg_gen_qemu_{ld,st}_i128 for LQARX, LQ, STQ
      target/s390x: Use tcg_gen_qemu_{ld,st}_i128 for LPQ, STPQ
      accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu
      target/s390x: Use cpu_{ld,st}*_mmu in do_csst
      target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu in do_csst
      accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu
      accel/tcg: Remove prot argument to atomic_mmu_lookup
      accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128
      qemu/atomic128: Split atomic16_read
      accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc
      tcg: Split out tcg/debug-assert.h
      qemu/atomic128: Improve cmpxchg fallback for atomic16_set
      qemu/atomic128: Add runtime test for FEAT_LSE2
      tcg: Remove DEBUG_DISAS
      tcg: Remove USE_TCG_OPTIMIZATIONS

 accel/tcg/atomic_template.h | 93 +-----
 host/include/aarch64/host/atomic128-cas.h | 45 +++
 host/include/aarch64/host/atomic128-ldst.h | 79 +++++
 host/include/aarch64/host/cpuinfo.h | 22 ++
 host/include/generic/host/atomic128-cas.h | 47 +++
 host/include/generic/host/atomic128-ldst.h | 81 +++++
 host/include/generic/host/cpuinfo.h | 4 +
 host/include/i386/host/cpuinfo.h | 39 +++
 host/include/x86_64/host/cpuinfo.h | 1 +
 include/exec/cpu_ldst.h | 67 +----
 include/exec/exec-all.h | 3 -
 include/qemu/atomic128.h | 146 ++-------
 include/tcg/debug-assert.h | 17 ++
 include/tcg/tcg.h | 9 +-
 migration/xbzrle.h | 5 +-
 target/ppc/cpu.h | 1 -
 target/ppc/helper.h | 9 -
 target/s390x/cpu.h | 3 -
 target/s390x/helper.h | 4 -
 tcg/aarch64/tcg-target.h | 6 +-
 tcg/i386/tcg-target.h | 28 +-
 accel/tcg/cpu-exec.c | 2 -
 accel/tcg/cputlb.c | 211 ++++---------
 accel/tcg/translate-all.c | 2 -
 accel/tcg/translator.c | 2 -
 accel/tcg/user-exec.c | 332 ++++++--------
 migration/ram.c | 34 +--
 migration/xbzrle.c | 268 +++++++++--------
 target/arm/tcg/m_helper.c | 4 +-
 target/ppc/mem_helper.c | 48 ---
 target/ppc/translate.c | 34 +--
 target/s390x/tcg/mem_helper.c | 137 ++-------
 target/s390x/tcg/translate.c | 30 +-
 target/sh4/translate.c | 2 -
 target/sparc/ldst_helper.c | 18 +-
 target/sparc/translate.c | 2 -
 tcg/tcg.c | 14 +-
 tests/bench/xbzrle-bench.c | 469 -----------------------
 tests/unit/test-xbzrle.c | 49 +--
 util/bufferiszero.c | 127 +++-----
 util/cpuinfo-aarch64.c | 67 +++++
 util/cpuinfo-i386.c | 99 ++++++
 MAINTAINERS | 3 +
 accel/tcg/atomic_common.c.inc | 14 -
 accel/tcg/ldst_atomicity.c.inc | 135 ++-------
 accel/tcg/ldst_common.c.inc | 24 +-
 meson.build | 12 +-
 migration/meson.build | 1 -
 target/ppc/translate/fixedpoint-impl.c.inc | 51 +---
 target/s390x/tcg/insn-data.h.inc | 2 +-
 tcg/aarch64/tcg-target.c.inc | 40 ---
 tcg/i386/tcg-target.c.inc | 123 +-------
 tests/bench/meson.build | 6 -
 util/meson.build | 6 +
 54 files changed, 1035 insertions(+), 2042 deletions(-)
 create mode 100644 host/include/aarch64/host/atomic128-cas.h
 create mode 100644 host/include/aarch64/host/atomic128-ldst.h
 create mode 100644 host/include/aarch64/host/cpuinfo.h
 create mode 100644 host/include/generic/host/atomic128-cas.h
 create mode 100644 host/include/generic/host/atomic128-ldst.h
 create mode 100644 host/include/generic/host/cpuinfo.h
 create mode 100644 host/include/i386/host/cpuinfo.h
 create mode 100644 host/include/x86_64/host/cpuinfo.h
 create mode 100644 include/tcg/debug-assert.h
 delete mode 100644 tests/bench/xbzrle-bench.c
 create mode 100644 util/cpuinfo-aarch64.c
 create mode 100644 util/cpuinfo-i386.c
Deleted patch

From: Peter Maydell <peter.maydell@linaro.org>

Commit a82fd5a4ec24d was intended to be a code cleanup, but
unfortunately it has a bug. It moves the initialization of the
TCG cflags from the "start a new vcpu" function to the
thread handler; this is fine when each vcpu has its own thread,
but when we are doing round-robin of vcpus on a single thread
we end up only initializing the cflags for CPU 0, not for any
of the others.

The most obvious effect of this bug is that running in icount
mode with more than one CPU is broken; typically the guest
hangs shortly after it brings up the secondary CPUs.

This reverts commit a82fd5a4ec24d923ff1e6da128c0fd4a74079d99.

Cc: qemu-stable@nongnu.org
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-Id: <20221021163409.3674911-1-peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-accel-ops-mttcg.c | 5 +++--
 accel/tcg/tcg-accel-ops-rr.c | 7 ++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@ static void *mttcg_cpu_thread_fn(void *arg)
assert(tcg_enabled());
g_assert(!icount_enabled());

- tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);
-
rcu_register_thread();
force_rcu.notifier.notify = mttcg_force_rcu;
force_rcu.cpu = cpu;
@@ -XXX,XX +XXX,XX @@ void mttcg_start_vcpu_thread(CPUState *cpu)
{
char thread_name[VCPU_THREAD_NAME_SIZE];

+ g_assert(tcg_enabled());
+ tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);
+
cpu->thread = g_new0(QemuThread, 1);
cpu->halt_cond = g_malloc0(sizeof(QemuCond));
qemu_cond_init(cpu->halt_cond);
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@ static void *rr_cpu_thread_fn(void *arg)
Notifier force_rcu;
CPUState *cpu = arg;

- g_assert(tcg_enabled());
- tcg_cpu_init_cflags(cpu, false);
-
+ assert(tcg_enabled());
rcu_register_thread();
force_rcu.notify = rr_force_rcu;
rcu_add_force_rcu_notifier(&force_rcu);
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
static QemuCond *single_tcg_halt_cond;
static QemuThread *single_tcg_cpu_thread;

+ g_assert(tcg_enabled());
+ tcg_cpu_init_cflags(cpu, false);
+
if (!single_tcg_cpu_thread) {
cpu->thread = g_new0(QemuThread, 1);
cpu->halt_cond = g_new0(QemuCond, 1);
--
2.34.1
Deleted patch

From: Qi Hu <huqi@loongson.cn>

Similar to the ARM64, LoongArch has PC-relative instructions such as
PCADDU18I. These instructions can be used to support direct jump for
LoongArch. Additionally, if instruction "B offset" can cover the target
address (target is within ±128MB range), a single "B offset" plus a nop
will be used by "tb_target_set_jump_target".

Signed-off-by: Qi Hu <huqi@loongson.cn>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Message-Id: <20221015092754.91971-1-huqi@loongson.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.h | 9 ++++--
 tcg/loongarch64/tcg-target.c.inc | 48 +++++++++++++++++++++++++++++---
 2 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@

#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_NB_REGS 32
-#define MAX_CODE_GEN_BUFFER_SIZE SIZE_MAX
+/*
+ * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits
+ * signed offset, which is +/- 128 GiB.
+ */
+#define MAX_CODE_GEN_BUFFER_SIZE (128 * GiB)

typedef enum {
TCG_REG_ZERO,
@@ -XXX,XX +XXX,XX @@ typedef enum {
#define TCG_TARGET_HAS_clz_i32 1
#define TCG_TARGET_HAS_ctz_i32 1
#define TCG_TARGET_HAS_ctpop_i32 0
-#define TCG_TARGET_HAS_direct_jump 0
+#define TCG_TARGET_HAS_direct_jump 1
#define TCG_TARGET_HAS_brcond2 0
#define TCG_TARGET_HAS_setcond2 0
#define TCG_TARGET_HAS_qemu_st8_i32 0
@@ -XXX,XX +XXX,XX @@ typedef enum {
#define TCG_TARGET_HAS_muluh_i64 1
#define TCG_TARGET_HAS_mulsh_i64 1

-/* not defined -- call should be eliminated at compile time */
void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);

#define TCG_TARGET_DEFAULT_MO (0)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
#endif
}

+/* LoongArch uses `andi zero, zero, 0` as NOP. */
+#define NOP OPC_ANDI
+static void tcg_out_nop(TCGContext *s)
+{
+ tcg_out32(s, NOP);
+}
+
+void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+ uintptr_t jmp_rw, uintptr_t addr)
+{
+ tcg_insn_unit i1, i2;
+ ptrdiff_t upper, lower;
+ ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
+
+ if (offset == sextreg(offset, 0, 26)) {
+ i1 = encode_sd10k16_insn(OPC_B, offset);
+ i2 = NOP;
+ } else {
+ tcg_debug_assert(offset == sextreg(offset, 0, 36));
+ lower = (int16_t)offset;
+ upper = (offset - lower) >> 16;
+
+ i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper);
+ i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower);
+ }
+ uint64_t pair = ((uint64_t)i2 << 32) | i1;
+ qatomic_set((uint64_t *)jmp_rw, pair);
+ flush_idcache_range(jmp_rx, jmp_rw, 8);
+}
+
/*
 * Entry-points
 */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;

case INDEX_op_goto_tb:
- assert(s->tb_jmp_insn_offset == 0);
- /* indirect jump method */
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_ZERO,
- (uintptr_t)(s->tb_jmp_target_addr + a0));
+ tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+ /*
+ * Ensure that patch area is 8-byte aligned so that an
+ * atomic write can be used to patch the target address.
+ */
+ if ((uintptr_t)s->code_ptr & 7) {
+ tcg_out_nop(s);
+ }
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
+ /*
+ * actual branch destination will be patched by
+ * tb_target_set_jmp_target later
+ */
+ tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
set_jmp_reset_offset(s, a0);
break;
--
2.34.1
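As an aside (not part of the patch), the offset arithmetic used by tb_target_set_jmp_target() above can be checked with a small standalone sketch. The addresses below are made up and sextreg() is approximated by a plain sign-extension helper; the point is that the 26-bit instruction-unit immediate of "B" covers roughly ±128 MiB of code, while the PCADDU18I + JIRL pair reassembles a 36-bit instruction-unit offset (38 bits of byte displacement, about ±128 GiB) from a 20-bit upper part plus a sign-extended 16-bit lower part:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Sign-extend the low 'bits' bits of 'val', approximating tcg's sextreg(). */
static int64_t sext(int64_t val, unsigned bits)
{
    return (int64_t)((uint64_t)val << (64 - bits)) >> (64 - bits);
}

int main(void)
{
    int64_t addr   = 0x7f1234567890;   /* hypothetical jump target */
    int64_t jmp_rx = 0x7f0000000000;   /* hypothetical patch site */
    /* Byte displacement converted to 4-byte instruction units. */
    int64_t offset = (addr - jmp_rx) >> 2;

    if (offset == sext(offset, 26)) {
        printf("single B: fits the 26-bit signed immediate (+/-128 MiB)\n");
    } else {
        assert(offset == sext(offset, 36));
        /* Same split as the patch: JIRL adds the sign-extended low half. */
        int64_t lower = (int16_t)offset;
        int64_t upper = (offset - lower) >> 16;

        /* The pair reassembles the original offset exactly. */
        assert((upper << 16) + lower == offset);
        printf("PCADDU18I %+lld, JIRL %+lld\n",
               (long long)upper, (long long)lower);
    }
    return 0;
}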
Deleted patch

From: Qi Hu <huqi@loongson.cn>

AArch64 defines the TCG_TARGET_HAS_direct_jump. So the "else" block is
useless in the case of "INDEX_op_goto_tb" in function "tcg_out_op". Add
an assertion and delete these codes for clarity.

Suggested-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Qi Hu <huqi@loongson.cn>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20221017020826.990729-1-huqi@loongson.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;

case INDEX_op_goto_tb:
- if (s->tb_jmp_insn_offset != NULL) {
- /* TCG_TARGET_HAS_direct_jump */
- /* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
- write can be used to patch the target address. */
- if ((uintptr_t)s->code_ptr & 7) {
- tcg_out32(s, NOP);
- }
- s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
- /* actual branch destination will be patched by
- tb_target_set_jmp_target later. */
- tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
- tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
- } else {
- /* !TCG_TARGET_HAS_direct_jump */
- tcg_debug_assert(s->tb_jmp_target_addr != NULL);
- intptr_t offset = tcg_pcrel_diff(s, (s->tb_jmp_target_addr + a0)) >> 2;
- tcg_out_insn(s, 3305, LDR, offset, TCG_REG_TMP);
+ tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+ /*
+ * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
+ * write can be used to patch the target address.
+ */
+ if ((uintptr_t)s->code_ptr & 7) {
+ tcg_out32(s, NOP);
}
+ s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
+ /*
+ * actual branch destination will be patched by
+ * tb_target_set_jmp_target later
+ */
+ tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
+ tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
set_jmp_reset_offset(s, a0);
break;
--
2.34.1
Deleted patch

From: Leandro Lupori <leandro.lupori@eldorado.org.br>

Profiling QEMU during Fedora 35 for PPC64 boot revealed that a
considerable amount of time was being spent in
check_for_breakpoints() (0.61% of total time on PPC64 and 2.19% on
amd64), even though it was just checking that its queue was empty
and returning, when no breakpoints were set. It turns out this
function is not inlined by the compiler and it's always called by
helper_lookup_tb_ptr(), one of the most called functions.

By leaving only the check for empty queue in
check_for_breakpoints() and moving the remaining code to
check_for_breakpoints_slow(), called only when the queue is not
empty, it's possible to avoid the call overhead. An improvement of
about 3% in total time was measured on POWER9.

Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20221025202424.195984-2-leandro.lupori@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cpu-exec.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
}
}

-static bool check_for_breakpoints(CPUState *cpu, target_ulong pc,
- uint32_t *cflags)
+static bool check_for_breakpoints_slow(CPUState *cpu, target_ulong pc,
+ uint32_t *cflags)
{
CPUBreakpoint *bp;
bool match_page = false;

- if (likely(QTAILQ_EMPTY(&cpu->breakpoints))) {
- return false;
- }
-
/*
 * Singlestep overrides breakpoints.
 * This requirement is visible in the record-replay tests, where
@@ -XXX,XX +XXX,XX @@ static bool check_for_breakpoints(CPUState *cpu, target_ulong pc,
return false;
}

+static inline bool check_for_breakpoints(CPUState *cpu, target_ulong pc,
+ uint32_t *cflags)
+{
+ return unlikely(!QTAILQ_EMPTY(&cpu->breakpoints)) &&
+ check_for_breakpoints_slow(cpu, pc, cflags);
+}
+
/**
 * helper_lookup_tb_ptr: quick check for next tb
 * @env: current cpu state
--
2.34.1
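As an aside, the shape of this optimization, an inlineable guard for the empty case plus an out-of-line slow path, can be sketched in isolation. The code below is illustrative only and not QEMU code; the names are invented for the example, and QEMU's real guard is the QTAILQ_EMPTY(&cpu->breakpoints) test shown in the diff above:

#include <stdbool.h>
#include <stddef.h>

struct vcpu {
    struct breakpoint *bp_head;   /* NULL when no breakpoints are set */
};

/* Out of line: only reached when the list is non-empty. */
static bool __attribute__((noinline))
check_breakpoints_slow(struct vcpu *cpu, unsigned long pc)
{
    /* walk cpu->bp_head, compare breakpoint addresses against pc, etc. */
    (void)cpu;
    (void)pc;
    return false;
}

/* Inline fast path: one predictable branch when no breakpoints exist. */
static inline bool check_breakpoints(struct vcpu *cpu, unsigned long pc)
{
    return __builtin_expect(cpu->bp_head != NULL, 0) &&
           check_breakpoints_slow(cpu, pc);
}

int main(void)
{
    struct vcpu cpu = { .bp_head = NULL };
    return check_breakpoints(&cpu, 0x1000) ? 1 : 0;
}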
target/xtensa: Convert to tcg_ops restore_state_to_opc

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/xtensa/cpu.c | 10 ++++++++++
 target/xtensa/translate.c | 6 ------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static vaddr xtensa_cpu_get_pc(CPUState *cs)
return cpu->env.pc;
}

+static void xtensa_restore_state_to_opc(CPUState *cs,
+ const TranslationBlock *tb,
+ const uint64_t *data)
+{
+ XtensaCPU *cpu = XTENSA_CPU(cs);
+
+ cpu->env.pc = data[0];
+}
+
static bool xtensa_cpu_has_work(CPUState *cs)
{
#ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps xtensa_sysemu_ops = {
static const struct TCGCPUOps xtensa_tcg_ops = {
.initialize = xtensa_translate_init,
.debug_excp_handler = xtensa_breakpoint_handler,
+ .restore_state_to_opc = xtensa_restore_state_to_opc,

#ifndef CONFIG_USER_ONLY
.tlb_fill = xtensa_cpu_tlb_fill,
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ void xtensa_cpu_dump_state(CPUState *cs, FILE *f, int flags)
}
}

-void restore_state_to_opc(CPUXtensaState *env, TranslationBlock *tb,
- target_ulong *data)
-{
- env->pc = data[0];
-}
-
static void translate_abs(DisasContext *dc, const OpcodeArg arg[],
const uint32_t par[])
{
--
2.34.1

util: Introduce host-specific cpuinfo.h

The entire contents of the header is host-specific, but the
existence of such a header is not, which could prevent some
host specific ifdefs at the top of the file for the include.

Add host/include/{arch,generic} to the project arguments.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/generic/host/cpuinfo.h | 4 ++++
 meson.build | 10 ++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 host/include/generic/host/cpuinfo.h

diff --git a/host/include/generic/host/cpuinfo.h b/host/include/generic/host/cpuinfo.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/host/include/generic/host/cpuinfo.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * No host specific cpu indentification.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ add_project_arguments('-iquote', '.',
'-iquote', meson.current_source_dir() / 'include',
language: all_languages)

+# If a host-specific include directory exists, list that first...
+host_include = meson.current_source_dir() / 'host/include/'
+if fs.is_dir(host_include / host_arch)
+ add_project_arguments('-iquote', host_include / host_arch,
+ language: all_languages)
+endif
+# ... followed by the generic fallback.
+add_project_arguments('-iquote', host_include / 'generic',
+ language: all_languages)
+
sparse = find_program('cgcc', required: get_option('sparse'))
if sparse.found()
run_target('sparse',
--
2.34.1
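As an aside, a hedged usage sketch of the include layout introduced above (not code from this series): with host/include/<host_arch> and host/include/generic both on the -iquote path, a single include of "host/cpuinfo.h" resolves to the host-specific header when one exists and to the empty generic fallback otherwise. The CPUINFO_AVX2 bit and cpuinfo_init() referenced below come from the x86 header added later in this series; other hosts simply compile the #else branch.

#include "qemu/osdep.h"
#include "host/cpuinfo.h"

static bool can_use_avx2_fast_path(void)
{
#ifdef CPUINFO_AVX2
    /*
     * Constructor ordering is not guaranteed, so other constructors
     * call cpuinfo_init() instead of reading 'cpuinfo' directly.
     */
    return cpuinfo_init() & CPUINFO_AVX2;
#else
    return false;   /* generic fallback header: no detection available */
#endif
}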
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Add cpuinfo.h for i386 and x86_64, and the initialization
2
for that in util/. Populate that with a slightly altered
3
copy of the tcg host probing code. Other uses of cpuid.h
4
will be adjusted one patch at a time.
5
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
9
---
4
target/tricore/cpu.c | 11 +++++++++++
10
host/include/i386/host/cpuinfo.h | 38 ++++++++++++
5
target/tricore/translate.c | 6 ------
11
host/include/x86_64/host/cpuinfo.h | 1 +
6
2 files changed, 11 insertions(+), 6 deletions(-)
12
util/cpuinfo-i386.c | 97 ++++++++++++++++++++++++++++++
7
13
MAINTAINERS | 2 +
8
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
14
util/meson.build | 4 ++
15
5 files changed, 142 insertions(+)
16
create mode 100644 host/include/i386/host/cpuinfo.h
17
create mode 100644 host/include/x86_64/host/cpuinfo.h
18
create mode 100644 util/cpuinfo-i386.c
19
20
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
21
new file mode 100644
22
index XXXXXXX..XXXXXXX
23
--- /dev/null
24
+++ b/host/include/i386/host/cpuinfo.h
25
@@ -XXX,XX +XXX,XX @@
26
+/*
27
+ * SPDX-License-Identifier: GPL-2.0-or-later
28
+ * Host specific cpu indentification for x86.
29
+ */
30
+
31
+#ifndef HOST_CPUINFO_H
32
+#define HOST_CPUINFO_H
33
+
34
+/* Digested version of <cpuid.h> */
35
+
36
+#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
37
+#define CPUINFO_CMOV (1u << 1)
38
+#define CPUINFO_MOVBE (1u << 2)
39
+#define CPUINFO_LZCNT (1u << 3)
40
+#define CPUINFO_POPCNT (1u << 4)
41
+#define CPUINFO_BMI1 (1u << 5)
42
+#define CPUINFO_BMI2 (1u << 6)
43
+#define CPUINFO_SSE2 (1u << 7)
44
+#define CPUINFO_SSE4 (1u << 8)
45
+#define CPUINFO_AVX1 (1u << 9)
46
+#define CPUINFO_AVX2 (1u << 10)
47
+#define CPUINFO_AVX512F (1u << 11)
48
+#define CPUINFO_AVX512VL (1u << 12)
49
+#define CPUINFO_AVX512BW (1u << 13)
50
+#define CPUINFO_AVX512DQ (1u << 14)
51
+#define CPUINFO_AVX512VBMI2 (1u << 15)
52
+#define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
53
+
54
+/* Initialized with a constructor. */
55
+extern unsigned cpuinfo;
56
+
57
+/*
58
+ * We cannot rely on constructor ordering, so other constructors must
59
+ * use the function interface rather than the variable above.
60
+ */
61
+unsigned cpuinfo_init(void);
62
+
63
+#endif /* HOST_CPUINFO_H */
64
diff --git a/host/include/x86_64/host/cpuinfo.h b/host/include/x86_64/host/cpuinfo.h
65
new file mode 100644
66
index XXXXXXX..XXXXXXX
67
--- /dev/null
68
+++ b/host/include/x86_64/host/cpuinfo.h
69
@@ -0,0 +1 @@
70
+#include "host/include/i386/host/cpuinfo.h"
71
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
72
new file mode 100644
73
index XXXXXXX..XXXXXXX
74
--- /dev/null
75
+++ b/util/cpuinfo-i386.c
76
@@ -XXX,XX +XXX,XX @@
77
+/*
78
+ * SPDX-License-Identifier: GPL-2.0-or-later
79
+ * Host specific cpu indentification for x86.
80
+ */
81
+
82
+#include "qemu/osdep.h"
83
+#include "host/cpuinfo.h"
84
+#ifdef CONFIG_CPUID_H
85
+# include "qemu/cpuid.h"
86
+#endif
87
+
88
+unsigned cpuinfo;
89
+
90
+/* Called both as constructor and (possibly) via other constructors. */
91
+unsigned __attribute__((constructor)) cpuinfo_init(void)
92
+{
93
+ unsigned info = cpuinfo;
94
+
95
+ if (info) {
96
+ return info;
97
+ }
98
+
99
+#ifdef CONFIG_CPUID_H
100
+ unsigned max, a, b, c, d, b7 = 0, c7 = 0;
101
+
102
+ max = __get_cpuid_max(0, 0);
103
+
104
+ if (max >= 7) {
105
+ __cpuid_count(7, 0, a, b7, c7, d);
106
+ info |= (b7 & bit_BMI ? CPUINFO_BMI1 : 0);
107
+ info |= (b7 & bit_BMI2 ? CPUINFO_BMI2 : 0);
108
+ }
109
+
110
+ if (max >= 1) {
111
+ __cpuid(1, a, b, c, d);
112
+
113
+ info |= (d & bit_CMOV ? CPUINFO_CMOV : 0);
114
+ info |= (d & bit_SSE2 ? CPUINFO_SSE2 : 0);
115
+ info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
116
+ info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
117
+ info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
118
+
119
+ /* For AVX features, we must check available and usable. */
120
+ if ((c & bit_AVX) && (c & bit_OSXSAVE)) {
121
+ unsigned bv = xgetbv_low(0);
122
+
123
+ if ((bv & 6) == 6) {
124
+ info |= CPUINFO_AVX1;
125
+ info |= (b7 & bit_AVX2 ? CPUINFO_AVX2 : 0);
126
+
127
+ if ((bv & 0xe0) == 0xe0) {
128
+ info |= (b7 & bit_AVX512F ? CPUINFO_AVX512F : 0);
129
+ info |= (b7 & bit_AVX512VL ? CPUINFO_AVX512VL : 0);
130
+ info |= (b7 & bit_AVX512BW ? CPUINFO_AVX512BW : 0);
131
+ info |= (b7 & bit_AVX512DQ ? CPUINFO_AVX512DQ : 0);
132
+ info |= (c7 & bit_AVX512VBMI2 ? CPUINFO_AVX512VBMI2 : 0);
133
+ }
134
+
135
+ /*
136
+ * The Intel SDM has added:
137
+ * Processors that enumerate support for Intel® AVX
138
+ * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
139
+ * guarantee that the 16-byte memory operations performed
140
+ * by the following instructions will always be carried
141
+ * out atomically:
142
+ * - MOVAPD, MOVAPS, and MOVDQA.
143
+ * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
144
+ * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
145
+ * with EVEX.128 and k0 (masking disabled).
146
+ * Note that these instructions require the linear addresses
147
+ * of their memory operands to be 16-byte aligned.
148
+ *
149
+ * AMD has provided an even stronger guarantee that processors
150
+ * with AVX provide 16-byte atomicity for all cachable,
151
+ * naturally aligned single loads and stores, e.g. MOVDQU.
152
+ *
153
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
154
+ */
155
+ __cpuid(0, a, b, c, d);
156
+ if (c == signature_INTEL_ecx || c == signature_AMD_ecx) {
157
+ info |= CPUINFO_ATOMIC_VMOVDQA;
158
+ }
159
+ }
160
+ }
161
+ }
162
+
163
+ max = __get_cpuid_max(0x8000000, 0);
164
+ if (max >= 1) {
165
+ __cpuid(0x80000001, a, b, c, d);
166
+ info |= (c & bit_LZCNT ? CPUINFO_LZCNT : 0);
167
+ }
168
+#endif
169
+
170
+ info |= CPUINFO_ALWAYS;
171
+ cpuinfo = info;
172
+ return info;
173
+}
174
diff --git a/MAINTAINERS b/MAINTAINERS
9
index XXXXXXX..XXXXXXX 100644
175
index XXXXXXX..XXXXXXX 100644
10
--- a/target/tricore/cpu.c
176
--- a/MAINTAINERS
11
+++ b/target/tricore/cpu.c
177
+++ b/MAINTAINERS
12
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_synchronize_from_tb(CPUState *cs,
178
@@ -XXX,XX +XXX,XX @@ F: include/exec/helper*.h
13
env->PC = tb_pc(tb);
179
F: include/sysemu/cpus.h
14
}
180
F: include/sysemu/tcg.h
15
181
F: include/hw/core/tcg-cpu-ops.h
16
+static void tricore_restore_state_to_opc(CPUState *cs,
182
+F: host/include/*/host/cpuinfo.h
17
+ const TranslationBlock *tb,
183
+F: util/cpuinfo-*.c
18
+ const uint64_t *data)
184
19
+{
185
FPU emulation
20
+ TriCoreCPU *cpu = TRICORE_CPU(cs);
186
M: Aurelien Jarno <aurelien@aurel32.net>
21
+ CPUTriCoreState *env = &cpu->env;
187
diff --git a/util/meson.build b/util/meson.build
22
+
23
+ env->PC = data[0];
24
+}
25
+
26
static void tricore_cpu_reset(DeviceState *dev)
27
{
28
CPUState *s = CPU(dev);
29
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps tricore_sysemu_ops = {
30
static const struct TCGCPUOps tricore_tcg_ops = {
31
.initialize = tricore_tcg_init,
32
.synchronize_from_tb = tricore_cpu_synchronize_from_tb,
33
+ .restore_state_to_opc = tricore_restore_state_to_opc,
34
.tlb_fill = tricore_cpu_tlb_fill,
35
};
36
37
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
38
index XXXXXXX..XXXXXXX 100644
188
index XXXXXXX..XXXXXXX 100644
39
--- a/target/tricore/translate.c
189
--- a/util/meson.build
40
+++ b/target/tricore/translate.c
190
+++ b/util/meson.build
41
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
191
@@ -XXX,XX +XXX,XX @@ if have_block
42
&tricore_tr_ops, &ctx.base);
192
endif
43
}
193
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
44
194
endif
45
-void
195
+
46
-restore_state_to_opc(CPUTriCoreState *env, TranslationBlock *tb,
196
+if cpu in ['x86', 'x86_64']
47
- target_ulong *data)
197
+ util_ss.add(files('cpuinfo-i386.c'))
48
-{
198
+endif
49
- env->PC = data[0];
50
-}
51
/*
52
*
53
* Initialization
54
--
199
--
55
2.34.1
200
2.34.1
56
201
57
202
diff view generated by jsdifflib
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Add a bit to indicate when VMOVDQU is also atomic if aligned.
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/sparc/cpu.h | 3 +++
7
host/include/i386/host/cpuinfo.h | 1 +
5
target/sparc/cpu.c | 1 +
8
util/cpuinfo-i386.c | 4 +++-
6
target/sparc/translate.c | 7 +++++--
9
2 files changed, 4 insertions(+), 1 deletion(-)
7
3 files changed, 9 insertions(+), 2 deletions(-)
8
10
9
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
11
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
10
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
11
--- a/target/sparc/cpu.h
13
--- a/host/include/i386/host/cpuinfo.h
12
+++ b/target/sparc/cpu.h
14
+++ b/host/include/i386/host/cpuinfo.h
13
@@ -XXX,XX +XXX,XX @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
15
@@ -XXX,XX +XXX,XX @@
14
16
#define CPUINFO_AVX512DQ (1u << 14)
15
/* translate.c */
17
#define CPUINFO_AVX512VBMI2 (1u << 15)
16
void sparc_tcg_init(void);
18
#define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
17
+void sparc_restore_state_to_opc(CPUState *cs,
19
+#define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
18
+ const TranslationBlock *tb,
20
19
+ const uint64_t *data);
21
/* Initialized with a constructor. */
20
22
extern unsigned cpuinfo;
21
/* cpu-exec.c */
23
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
22
23
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
24
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
25
--- a/target/sparc/cpu.c
25
--- a/util/cpuinfo-i386.c
26
+++ b/target/sparc/cpu.c
26
+++ b/util/cpuinfo-i386.c
27
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sparc_sysemu_ops = {
27
@@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
28
static const struct TCGCPUOps sparc_tcg_ops = {
28
* See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
29
.initialize = sparc_tcg_init,
29
*/
30
.synchronize_from_tb = sparc_cpu_synchronize_from_tb,
30
__cpuid(0, a, b, c, d);
31
+ .restore_state_to_opc = sparc_restore_state_to_opc,
31
- if (c == signature_INTEL_ecx || c == signature_AMD_ecx) {
32
32
+ if (c == signature_INTEL_ecx) {
33
#ifndef CONFIG_USER_ONLY
33
info |= CPUINFO_ATOMIC_VMOVDQA;
34
.tlb_fill = sparc_cpu_tlb_fill,
34
+ } else if (c == signature_AMD_ecx) {
35
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
35
+ info |= CPUINFO_ATOMIC_VMOVDQA | CPUINFO_ATOMIC_VMOVDQU;
36
index XXXXXXX..XXXXXXX 100644
36
}
37
--- a/target/sparc/translate.c
37
}
38
+++ b/target/sparc/translate.c
38
}
39
@@ -XXX,XX +XXX,XX @@ void sparc_tcg_init(void)
40
}
41
}
42
43
-void restore_state_to_opc(CPUSPARCState *env, TranslationBlock *tb,
44
- target_ulong *data)
45
+void sparc_restore_state_to_opc(CPUState *cs,
46
+ const TranslationBlock *tb,
47
+ const uint64_t *data)
48
{
49
+ SPARCCPU *cpu = SPARC_CPU(cs);
50
+ CPUSPARCState *env = &cpu->env;
51
target_ulong pc = data[0];
52
target_ulong npc = data[1];
53
54
--
39
--
55
2.34.1
40
2.34.1
56
41
57
42
diff view generated by jsdifflib
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Use the CPUINFO_* bits instead of the individual boolean
2
variables that we had been using. Remove all of the init
3
code that was moved over to cpuinfo-i386.c.
4
5
Note that have_avx512* check both AVX512{F,VL}, as we had
6
previously done during tcg_target_init.
7
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
11
---
4
target/sh4/cpu.c | 16 ++++++++++++++++
12
tcg/i386/tcg-target.h | 28 +++++----
5
target/sh4/translate.c | 10 ----------
13
tcg/i386/tcg-target.c.inc | 123 ++------------------------------------
6
2 files changed, 16 insertions(+), 10 deletions(-)
14
2 files changed, 22 insertions(+), 129 deletions(-)
7
15
8
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
16
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
9
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
10
--- a/target/sh4/cpu.c
18
--- a/tcg/i386/tcg-target.h
11
+++ b/target/sh4/cpu.c
19
+++ b/tcg/i386/tcg-target.h
12
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_synchronize_from_tb(CPUState *cs,
20
@@ -XXX,XX +XXX,XX @@
13
cpu->env.flags = tb->flags;
21
#ifndef I386_TCG_TARGET_H
14
}
22
#define I386_TCG_TARGET_H
15
23
16
+static void superh_restore_state_to_opc(CPUState *cs,
24
+#include "host/cpuinfo.h"
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ SuperHCPU *cpu = SUPERH_CPU(cs);
21
+
25
+
22
+ cpu->env.pc = data[0];
26
#define TCG_TARGET_INSN_UNIT_SIZE 1
23
+ cpu->env.flags = data[1];
27
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
24
+ /*
28
25
+ * Theoretically delayed_pc should also be restored. In practice the
29
@@ -XXX,XX +XXX,XX @@ typedef enum {
26
+ * branch instruction is re-executed after exception, so the delayed
30
# define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_BY_REF
27
+ * branch target will be recomputed.
31
#endif
28
+ */
32
29
+}
33
-extern bool have_bmi1;
34
-extern bool have_popcnt;
35
-extern bool have_avx1;
36
-extern bool have_avx2;
37
-extern bool have_avx512bw;
38
-extern bool have_avx512dq;
39
-extern bool have_avx512vbmi2;
40
-extern bool have_avx512vl;
41
-extern bool have_movbe;
42
-extern bool have_atomic16;
43
+#define have_bmi1 (cpuinfo & CPUINFO_BMI1)
44
+#define have_popcnt (cpuinfo & CPUINFO_POPCNT)
45
+#define have_avx1 (cpuinfo & CPUINFO_AVX1)
46
+#define have_avx2 (cpuinfo & CPUINFO_AVX2)
47
+#define have_movbe (cpuinfo & CPUINFO_MOVBE)
48
+#define have_atomic16 (cpuinfo & CPUINFO_ATOMIC_VMOVDQA)
30
+
49
+
31
#ifndef CONFIG_USER_ONLY
50
+/*
32
static bool superh_io_recompile_replay_branch(CPUState *cs,
51
+ * There are interesting instructions in AVX512, so long as we have AVX512VL,
33
const TranslationBlock *tb)
52
+ * which indicates support for EVEX on sizes smaller than 512 bits.
34
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sh4_sysemu_ops = {
53
+ */
35
static const struct TCGCPUOps superh_tcg_ops = {
54
+#define have_avx512vl ((cpuinfo & CPUINFO_AVX512VL) && \
36
.initialize = sh4_translate_init,
55
+ (cpuinfo & CPUINFO_AVX512F))
37
.synchronize_from_tb = superh_cpu_synchronize_from_tb,
56
+#define have_avx512bw ((cpuinfo & CPUINFO_AVX512BW) && have_avx512vl)
38
+ .restore_state_to_opc = superh_restore_state_to_opc,
57
+#define have_avx512dq ((cpuinfo & CPUINFO_AVX512DQ) && have_avx512vl)
39
58
+#define have_avx512vbmi2 ((cpuinfo & CPUINFO_AVX512VBMI2) && have_avx512vl)
40
#ifndef CONFIG_USER_ONLY
59
41
.tlb_fill = superh_cpu_tlb_fill,
60
/* optional instructions */
42
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
61
#define TCG_TARGET_HAS_div2_i32 1
62
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
43
index XXXXXXX..XXXXXXX 100644
63
index XXXXXXX..XXXXXXX 100644
44
--- a/target/sh4/translate.c
64
--- a/tcg/i386/tcg-target.c.inc
45
+++ b/target/sh4/translate.c
65
+++ b/tcg/i386/tcg-target.c.inc
46
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
66
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
47
67
# define SOFTMMU_RESERVE_REGS 0
48
translator_loop(cs, tb, max_insns, pc, host_pc, &sh4_tr_ops, &ctx.base);
68
#endif
49
}
69
50
-
70
-/* The host compiler should supply <cpuid.h> to enable runtime features
51
-void restore_state_to_opc(CPUSH4State *env, TranslationBlock *tb,
71
- detection, as we're not going to go so far as our own inline assembly.
52
- target_ulong *data)
72
- If not available, default values will be assumed. */
53
-{
73
-#if defined(CONFIG_CPUID_H)
54
- env->pc = data[0];
74
-#include "qemu/cpuid.h"
55
- env->flags = data[1];
75
-#endif
56
- /* Theoretically delayed_pc should also be restored. In practice the
76
-
57
- branch instruction is re-executed after exception, so the delayed
77
/* For 64-bit, we always know that CMOV is available. */
58
- branch target will be recomputed. */
78
#if TCG_TARGET_REG_BITS == 64
59
-}
79
-# define have_cmov 1
80
-#elif defined(CONFIG_CPUID_H)
81
-static bool have_cmov;
82
+# define have_cmov true
83
#else
84
-# define have_cmov 0
85
-#endif
86
-
87
-/* We need these symbols in tcg-target.h, and we can't properly conditionalize
88
- it there. Therefore we always define the variable. */
89
-bool have_bmi1;
90
-bool have_popcnt;
91
-bool have_avx1;
92
-bool have_avx2;
93
-bool have_avx512bw;
94
-bool have_avx512dq;
95
-bool have_avx512vbmi2;
96
-bool have_avx512vl;
97
-bool have_movbe;
98
-bool have_atomic16;
99
-
100
-#ifdef CONFIG_CPUID_H
101
-static bool have_bmi2;
102
-static bool have_lzcnt;
103
-#else
104
-# define have_bmi2 0
105
-# define have_lzcnt 0
106
+# define have_cmov (cpuinfo & CPUINFO_CMOV)
107
#endif
108
+#define have_bmi2 (cpuinfo & CPUINFO_BMI2)
109
+#define have_lzcnt (cpuinfo & CPUINFO_LZCNT)
110
111
static const tcg_insn_unit *tb_ret_addr;
112
113
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
114
115
static void tcg_target_init(TCGContext *s)
116
{
117
-#ifdef CONFIG_CPUID_H
118
- unsigned a, b, c, d, b7 = 0, c7 = 0;
119
- unsigned max = __get_cpuid_max(0, 0);
120
-
121
- if (max >= 7) {
122
- /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
123
- __cpuid_count(7, 0, a, b7, c7, d);
124
- have_bmi1 = (b7 & bit_BMI) != 0;
125
- have_bmi2 = (b7 & bit_BMI2) != 0;
126
- }
127
-
128
- if (max >= 1) {
129
- __cpuid(1, a, b, c, d);
130
-#ifndef have_cmov
131
- /* For 32-bit, 99% certainty that we're running on hardware that
132
- supports cmov, but we still need to check. In case cmov is not
133
- available, we'll use a small forward branch. */
134
- have_cmov = (d & bit_CMOV) != 0;
135
-#endif
136
-
137
- /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
138
- need to probe for it. */
139
- have_movbe = (c & bit_MOVBE) != 0;
140
- have_popcnt = (c & bit_POPCNT) != 0;
141
-
142
- /* There are a number of things we must check before we can be
143
- sure of not hitting invalid opcode. */
144
- if (c & bit_OSXSAVE) {
145
- unsigned bv = xgetbv_low(0);
146
-
147
- if ((bv & 6) == 6) {
148
- have_avx1 = (c & bit_AVX) != 0;
149
- have_avx2 = (b7 & bit_AVX2) != 0;
150
-
151
- /*
152
- * There are interesting instructions in AVX512, so long
153
- * as we have AVX512VL, which indicates support for EVEX
154
- * on sizes smaller than 512 bits. We are required to
155
- * check that OPMASK and all extended ZMM state are enabled
156
- * even if we're not using them -- the insns will fault.
157
- */
158
- if ((bv & 0xe0) == 0xe0
159
- && (b7 & bit_AVX512F)
160
- && (b7 & bit_AVX512VL)) {
161
- have_avx512vl = true;
162
- have_avx512bw = (b7 & bit_AVX512BW) != 0;
163
- have_avx512dq = (b7 & bit_AVX512DQ) != 0;
164
- have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
165
- }
166
-
167
- /*
168
- * The Intel SDM has added:
169
- * Processors that enumerate support for Intel® AVX
170
- * (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
171
- * guarantee that the 16-byte memory operations performed
172
- * by the following instructions will always be carried
173
- * out atomically:
174
- * - MOVAPD, MOVAPS, and MOVDQA.
175
- * - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
176
- * - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
177
- * with EVEX.128 and k0 (masking disabled).
178
- * Note that these instructions require the linear addresses
179
- * of their memory operands to be 16-byte aligned.
180
- *
181
- * AMD has provided an even stronger guarantee that processors
182
- * with AVX provide 16-byte atomicity for all cachable,
183
- * naturally aligned single loads and stores, e.g. MOVDQU.
184
- *
185
- * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
186
- */
187
- if (have_avx1) {
188
- __cpuid(0, a, b, c, d);
189
- have_atomic16 = (c == signature_INTEL_ecx ||
190
- c == signature_AMD_ecx);
191
- }
192
- }
193
- }
194
- }
195
-
196
- max = __get_cpuid_max(0x8000000, 0);
197
- if (max >= 1) {
198
- __cpuid(0x80000001, a, b, c, d);
199
- /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
200
- have_lzcnt = (c & bit_LZCNT) != 0;
201
- }
202
-#endif /* CONFIG_CPUID_H */
203
-
204
tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
205
if (TCG_TARGET_REG_BITS == 64) {
206
tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
60
--
207
--
61
2.34.1
208
2.34.1
62
209
63
210
diff view generated by jsdifflib
1
We missed this function when we introduced tb_page_addr_t.
1
Use cpuinfo_init() during init_accel(), and the variable cpuinfo
2
during test_buffer_is_zero_next_accel(). Adjust the logic that
3
cycles through the set of accelerators for testing.
2
4
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
7
---
6
include/exec/exec-all.h | 2 +-
8
util/bufferiszero.c | 127 ++++++++++++++++----------------------------
7
include/exec/ram_addr.h | 2 --
9
1 file changed, 46 insertions(+), 81 deletions(-)
8
accel/tcg/tb-maint.c | 13 ++-----------
9
3 files changed, 3 insertions(+), 14 deletions(-)
10
10
11
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
11
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
12
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
13
--- a/include/exec/exec-all.h
13
--- a/util/bufferiszero.c
14
+++ b/include/exec/exec-all.h
14
+++ b/util/bufferiszero.c
15
@@ -XXX,XX +XXX,XX @@ uint32_t curr_cflags(CPUState *cpu);
16
/* TranslationBlock invalidate API */
17
#if defined(CONFIG_USER_ONLY)
18
void tb_invalidate_phys_addr(target_ulong addr);
19
-void tb_invalidate_phys_range(target_ulong start, target_ulong end);
20
#else
21
void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs);
22
#endif
23
void tb_flush(CPUState *cpu);
24
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
25
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end);
26
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
27
28
/* GETPC is the true target of the return instruction that we'll execute. */
29
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
30
index XXXXXXX..XXXXXXX 100644
31
--- a/include/exec/ram_addr.h
32
+++ b/include/exec/ram_addr.h
33
@@ -XXX,XX +XXX,XX @@ static inline void qemu_ram_block_writeback(RAMBlock *block)
34
#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1)
35
#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE))
36
37
-void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end);
38
-
39
static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
40
ram_addr_t length,
41
unsigned client)
42
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
43
index XXXXXXX..XXXXXXX 100644
44
--- a/accel/tcg/tb-maint.c
45
+++ b/accel/tcg/tb-maint.c
46
@@ -XXX,XX +XXX,XX @@
15
@@ -XXX,XX +XXX,XX @@
47
#include "qemu/osdep.h"
16
#include "qemu/osdep.h"
48
#include "exec/cputlb.h"
17
#include "qemu/cutils.h"
49
#include "exec/log.h"
18
#include "qemu/bswap.h"
50
+#include "exec/exec-all.h"
19
+#include "host/cpuinfo.h"
51
#include "exec/translate-all.h"
20
52
#include "sysemu/tcg.h"
21
static bool
53
#include "tcg/tcg.h"
22
buffer_zero_int(const void *buf, size_t len)
54
@@ -XXX,XX +XXX,XX @@
23
@@ -XXX,XX +XXX,XX @@ buffer_zero_avx512(const void *buf, size_t len)
55
#include "tb-context.h"
24
}
56
#include "internal.h"
25
#endif /* CONFIG_AVX512F_OPT */
57
26
58
-/* FIXME: tb_invalidate_phys_range is declared in different places. */
27
-
59
-#ifdef CONFIG_USER_ONLY
28
-/* Note that for test_buffer_is_zero_next_accel, the most preferred
60
-#include "exec/exec-all.h"
29
- * ISA must have the least significant bit.
61
-#else
30
- */
62
-#include "exec/ram_addr.h"
31
-#define CACHE_AVX512F 1
32
-#define CACHE_AVX2 2
33
-#define CACHE_SSE4 4
34
-#define CACHE_SSE2 8
35
-
36
-/* Make sure that these variables are appropriately initialized when
37
+/*
38
+ * Make sure that these variables are appropriately initialized when
39
* SSE2 is enabled on the compiler command-line, but the compiler is
40
* too old to support CONFIG_AVX2_OPT.
41
*/
42
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
43
-# define INIT_CACHE 0
44
-# define INIT_ACCEL buffer_zero_int
45
+# define INIT_USED 0
46
+# define INIT_LENGTH 0
47
+# define INIT_ACCEL buffer_zero_int
48
#else
49
# ifndef __SSE2__
50
# error "ISA selection confusion"
51
# endif
52
-# define INIT_CACHE CACHE_SSE2
53
-# define INIT_ACCEL buffer_zero_sse2
54
+# define INIT_USED CPUINFO_SSE2
55
+# define INIT_LENGTH 64
56
+# define INIT_ACCEL buffer_zero_sse2
57
#endif
58
59
-static unsigned cpuid_cache = INIT_CACHE;
60
+static unsigned used_accel = INIT_USED;
61
+static unsigned length_to_accel = INIT_LENGTH;
62
static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
63
-static int length_to_accel = 64;
64
65
-static void init_accel(unsigned cache)
66
+static unsigned __attribute__((noinline))
67
+select_accel_cpuinfo(unsigned info)
68
{
69
- bool (*fn)(const void *, size_t) = buffer_zero_int;
70
- if (cache & CACHE_SSE2) {
71
- fn = buffer_zero_sse2;
72
- length_to_accel = 64;
73
- }
74
-#ifdef CONFIG_AVX2_OPT
75
- if (cache & CACHE_SSE4) {
76
- fn = buffer_zero_sse4;
77
- length_to_accel = 64;
78
- }
79
- if (cache & CACHE_AVX2) {
80
- fn = buffer_zero_avx2;
81
- length_to_accel = 128;
82
- }
63
-#endif
83
-#endif
64
84
+ /* Array is sorted in order of algorithm preference. */
65
static bool tb_cmp(const void *ap, const void *bp)
85
+ static const struct {
86
+ unsigned bit;
87
+ unsigned len;
88
+ bool (*fn)(const void *, size_t);
89
+ } all[] = {
90
#ifdef CONFIG_AVX512F_OPT
91
- if (cache & CACHE_AVX512F) {
92
- fn = buffer_zero_avx512;
93
- length_to_accel = 256;
94
- }
95
+ { CPUINFO_AVX512F, 256, buffer_zero_avx512 },
96
#endif
97
- buffer_accel = fn;
98
+#ifdef CONFIG_AVX2_OPT
99
+ { CPUINFO_AVX2, 128, buffer_zero_avx2 },
100
+ { CPUINFO_SSE4, 64, buffer_zero_sse4 },
101
+#endif
102
+ { CPUINFO_SSE2, 64, buffer_zero_sse2 },
103
+ { CPUINFO_ALWAYS, 0, buffer_zero_int },
104
+ };
105
+
106
+ for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
107
+ if (info & all[i].bit) {
108
+ length_to_accel = all[i].len;
109
+ buffer_accel = all[i].fn;
110
+ return all[i].bit;
111
+ }
112
+ }
113
+ return 0;
114
}
115
116
#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
117
-#include "qemu/cpuid.h"
118
-
119
-static void __attribute__((constructor)) init_cpuid_cache(void)
120
+static void __attribute__((constructor)) init_accel(void)
66
{
121
{
67
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
122
- unsigned max = __get_cpuid_max(0, NULL);
68
*
123
- int a, b, c, d;
69
* Called with mmap_lock held for user-mode emulation.
124
- unsigned cache = 0;
70
*/
125
-
71
-#ifdef CONFIG_SOFTMMU
126
- if (max >= 1) {
72
-void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end)
127
- __cpuid(1, a, b, c, d);
73
-#else
128
- if (d & bit_SSE2) {
74
-void tb_invalidate_phys_range(target_ulong start, target_ulong end)
129
- cache |= CACHE_SSE2;
75
-#endif
130
- }
76
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
131
- if (c & bit_SSE4_1) {
132
- cache |= CACHE_SSE4;
133
- }
134
-
135
- /* We must check that AVX is not just available, but usable. */
136
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
137
- unsigned bv = xgetbv_low(0);
138
- __cpuid_count(7, 0, a, b, c, d);
139
- if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
140
- cache |= CACHE_AVX2;
141
- }
142
- /* 0xe6:
143
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
144
- * and ZMM16-ZMM31 state are enabled by OS)
145
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
146
- */
147
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
148
- cache |= CACHE_AVX512F;
149
- }
150
- }
151
- }
152
- cpuid_cache = cache;
153
- init_accel(cache);
154
+ used_accel = select_accel_cpuinfo(cpuinfo_init());
155
}
156
#endif /* CONFIG_AVX2_OPT */
157
158
bool test_buffer_is_zero_next_accel(void)
77
{
159
{
78
struct page_collection *pages;
160
- /* If no bits set, we just tested buffer_zero_int, and there
79
tb_page_addr_t next;
161
- are no more acceleration options to test. */
162
- if (cpuid_cache == 0) {
163
- return false;
164
- }
165
- /* Disable the accelerator we used before and select a new one. */
166
- cpuid_cache &= cpuid_cache - 1;
167
- init_accel(cpuid_cache);
168
- return true;
169
+ /*
170
+ * Accumulate the accelerators that we've already tested, and
171
+ * remove them from the set to test this round. We'll get back
172
+ * a zero from select_accel_cpuinfo when there are no more.
173
+ */
174
+ unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
175
+ used_accel |= used;
176
+ return used;
177
}
178
179
static bool select_accel_fn(const void *buf, size_t len)
80
--
180
--
81
2.34.1
181
2.34.1
82
182
83
183
diff view generated by jsdifflib
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Place the CONFIG_AVX512BW_OPT block at the top,
2
which will aid function selection in the next patch.
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
7
---
4
target/s390x/s390x-internal.h | 4 +++-
8
migration/xbzrle.c | 244 ++++++++++++++++++++++-----------------------
5
target/s390x/cpu.c | 1 +
9
1 file changed, 122 insertions(+), 122 deletions(-)
6
target/s390x/tcg/translate.c | 7 +++++--
10
7
3 files changed, 9 insertions(+), 3 deletions(-)
11
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
8
9
diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h
10
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
11
--- a/target/s390x/s390x-internal.h
13
--- a/migration/xbzrle.c
12
+++ b/target/s390x/s390x-internal.h
14
+++ b/migration/xbzrle.c
13
@@ -XXX,XX +XXX,XX @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3,
15
@@ -XXX,XX +XXX,XX @@
14
16
#include "qemu/host-utils.h"
15
/* translate.c */
17
#include "xbzrle.h"
16
void s390x_translate_init(void);
18
17
-
19
+#if defined(CONFIG_AVX512BW_OPT)
18
+void s390x_restore_state_to_opc(CPUState *cs,
20
+#include <immintrin.h>
19
+ const TranslationBlock *tb,
21
+
20
+ const uint64_t *data);
22
+int __attribute__((target("avx512bw")))
21
23
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
22
/* sigp.c */
24
+ uint8_t *dst, int dlen)
23
int handle_sigp(CPUS390XState *env, uint8_t order, uint64_t r1, uint64_t r3);
25
+{
24
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
26
+ uint32_t zrun_len = 0, nzrun_len = 0;
25
index XXXXXXX..XXXXXXX 100644
27
+ int d = 0, i = 0, num = 0;
26
--- a/target/s390x/cpu.c
28
+ uint8_t *nzrun_start = NULL;
27
+++ b/target/s390x/cpu.c
29
+ /* add 1 to include residual part in main loop */
28
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_reset_full(DeviceState *dev)
30
+ uint32_t count512s = (slen >> 6) + 1;
29
31
+ /* countResidual is tail of data, i.e., countResidual = slen % 64 */
30
static const struct TCGCPUOps s390_tcg_ops = {
32
+ uint32_t count_residual = slen & 0b111111;
31
.initialize = s390x_translate_init,
33
+ bool never_same = true;
32
+ .restore_state_to_opc = s390x_restore_state_to_opc,
34
+ uint64_t mask_residual = 1;
33
35
+ mask_residual <<= count_residual;
34
#ifdef CONFIG_USER_ONLY
36
+ mask_residual -= 1;
35
.record_sigsegv = s390_cpu_record_sigsegv,
37
+ __m512i r = _mm512_set1_epi32(0);
36
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
38
+
37
index XXXXXXX..XXXXXXX 100644
39
+ while (count512s) {
38
--- a/target/s390x/tcg/translate.c
40
+ int bytes_to_check = 64;
39
+++ b/target/s390x/tcg/translate.c
41
+ uint64_t mask = 0xffffffffffffffff;
40
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
42
+ if (count512s == 1) {
41
translator_loop(cs, tb, max_insns, pc, host_pc, &s390x_tr_ops, &dc.base);
43
+ bytes_to_check = count_residual;
44
+ mask = mask_residual;
45
+ }
46
+ __m512i old_data = _mm512_mask_loadu_epi8(r,
47
+ mask, old_buf + i);
48
+ __m512i new_data = _mm512_mask_loadu_epi8(r,
49
+ mask, new_buf + i);
50
+ uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
51
+ count512s--;
52
+
53
+ bool is_same = (comp & 0x1);
54
+ while (bytes_to_check) {
55
+ if (d + 2 > dlen) {
56
+ return -1;
57
+ }
58
+ if (is_same) {
59
+ if (nzrun_len) {
60
+ d += uleb128_encode_small(dst + d, nzrun_len);
61
+ if (d + nzrun_len > dlen) {
62
+ return -1;
63
+ }
64
+ nzrun_start = new_buf + i - nzrun_len;
65
+ memcpy(dst + d, nzrun_start, nzrun_len);
66
+ d += nzrun_len;
67
+ nzrun_len = 0;
68
+ }
69
+ /* 64 data at a time for speed */
70
+ if (count512s && (comp == 0xffffffffffffffff)) {
71
+ i += 64;
72
+ zrun_len += 64;
73
+ break;
74
+ }
75
+ never_same = false;
76
+ num = ctz64(~comp);
77
+ num = (num < bytes_to_check) ? num : bytes_to_check;
78
+ zrun_len += num;
79
+ bytes_to_check -= num;
80
+ comp >>= num;
81
+ i += num;
82
+ if (bytes_to_check) {
83
+ /* still has different data after same data */
84
+ d += uleb128_encode_small(dst + d, zrun_len);
85
+ zrun_len = 0;
86
+ } else {
87
+ break;
88
+ }
89
+ }
90
+ if (never_same || zrun_len) {
91
+ /*
92
+ * never_same only acts if
93
+ * data begins with diff in first count512s
94
+ */
95
+ d += uleb128_encode_small(dst + d, zrun_len);
96
+ zrun_len = 0;
97
+ never_same = false;
98
+ }
99
+ /* has diff, 64 data at a time for speed */
100
+ if ((bytes_to_check == 64) && (comp == 0x0)) {
101
+ i += 64;
102
+ nzrun_len += 64;
103
+ break;
104
+ }
105
+ num = ctz64(comp);
106
+ num = (num < bytes_to_check) ? num : bytes_to_check;
107
+ nzrun_len += num;
108
+ bytes_to_check -= num;
109
+ comp >>= num;
110
+ i += num;
111
+ if (bytes_to_check) {
112
+ /* mask like 111000 */
113
+ d += uleb128_encode_small(dst + d, nzrun_len);
114
+ /* overflow */
115
+ if (d + nzrun_len > dlen) {
116
+ return -1;
117
+ }
118
+ nzrun_start = new_buf + i - nzrun_len;
119
+ memcpy(dst + d, nzrun_start, nzrun_len);
120
+ d += nzrun_len;
121
+ nzrun_len = 0;
122
+ is_same = true;
123
+ }
124
+ }
125
+ }
126
+
127
+ if (nzrun_len != 0) {
128
+ d += uleb128_encode_small(dst + d, nzrun_len);
129
+ /* overflow */
130
+ if (d + nzrun_len > dlen) {
131
+ return -1;
132
+ }
133
+ nzrun_start = new_buf + i - nzrun_len;
134
+ memcpy(dst + d, nzrun_start, nzrun_len);
135
+ d += nzrun_len;
136
+ }
137
+ return d;
138
+}
139
+#endif
140
+
141
/*
142
page = zrun nzrun
143
| zrun nzrun page
144
@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
145
146
return d;
42
}
147
}
43
148
-
44
-void restore_state_to_opc(CPUS390XState *env, TranslationBlock *tb,
149
-#if defined(CONFIG_AVX512BW_OPT)
45
- target_ulong *data)
150
-#include <immintrin.h>
46
+void s390x_restore_state_to_opc(CPUState *cs,
151
-
47
+ const TranslationBlock *tb,
152
-int __attribute__((target("avx512bw")))
48
+ const uint64_t *data)
153
-xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
49
{
154
- uint8_t *dst, int dlen)
50
+ S390CPU *cpu = S390_CPU(cs);
155
-{
51
+ CPUS390XState *env = &cpu->env;
156
- uint32_t zrun_len = 0, nzrun_len = 0;
52
int cc_op = data[1];
157
- int d = 0, i = 0, num = 0;
53
158
- uint8_t *nzrun_start = NULL;
54
env->psw.addr = data[0];
159
- /* add 1 to include residual part in main loop */
160
- uint32_t count512s = (slen >> 6) + 1;
161
- /* countResidual is tail of data, i.e., countResidual = slen % 64 */
162
- uint32_t count_residual = slen & 0b111111;
163
- bool never_same = true;
164
- uint64_t mask_residual = 1;
165
- mask_residual <<= count_residual;
166
- mask_residual -= 1;
167
- __m512i r = _mm512_set1_epi32(0);
168
-
169
- while (count512s) {
170
- int bytes_to_check = 64;
171
- uint64_t mask = 0xffffffffffffffff;
172
- if (count512s == 1) {
173
- bytes_to_check = count_residual;
174
- mask = mask_residual;
175
- }
176
- __m512i old_data = _mm512_mask_loadu_epi8(r,
177
- mask, old_buf + i);
178
- __m512i new_data = _mm512_mask_loadu_epi8(r,
179
- mask, new_buf + i);
180
- uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
181
- count512s--;
182
-
183
- bool is_same = (comp & 0x1);
184
- while (bytes_to_check) {
185
- if (d + 2 > dlen) {
186
- return -1;
187
- }
188
- if (is_same) {
189
- if (nzrun_len) {
190
- d += uleb128_encode_small(dst + d, nzrun_len);
191
- if (d + nzrun_len > dlen) {
192
- return -1;
193
- }
194
- nzrun_start = new_buf + i - nzrun_len;
195
- memcpy(dst + d, nzrun_start, nzrun_len);
196
- d += nzrun_len;
197
- nzrun_len = 0;
198
- }
199
- /* 64 data at a time for speed */
200
- if (count512s && (comp == 0xffffffffffffffff)) {
201
- i += 64;
202
- zrun_len += 64;
203
- break;
204
- }
205
- never_same = false;
206
- num = ctz64(~comp);
207
- num = (num < bytes_to_check) ? num : bytes_to_check;
208
- zrun_len += num;
209
- bytes_to_check -= num;
210
- comp >>= num;
211
- i += num;
212
- if (bytes_to_check) {
213
- /* still has different data after same data */
214
- d += uleb128_encode_small(dst + d, zrun_len);
215
- zrun_len = 0;
216
- } else {
217
- break;
218
- }
219
- }
220
- if (never_same || zrun_len) {
221
- /*
222
- * never_same only acts if
223
- * data begins with diff in first count512s
224
- */
225
- d += uleb128_encode_small(dst + d, zrun_len);
226
- zrun_len = 0;
227
- never_same = false;
228
- }
229
- /* has diff, 64 data at a time for speed */
230
- if ((bytes_to_check == 64) && (comp == 0x0)) {
231
- i += 64;
232
- nzrun_len += 64;
233
- break;
234
- }
235
- num = ctz64(comp);
236
- num = (num < bytes_to_check) ? num : bytes_to_check;
237
- nzrun_len += num;
238
- bytes_to_check -= num;
239
- comp >>= num;
240
- i += num;
241
- if (bytes_to_check) {
242
- /* mask like 111000 */
243
- d += uleb128_encode_small(dst + d, nzrun_len);
244
- /* overflow */
245
- if (d + nzrun_len > dlen) {
246
- return -1;
247
- }
248
- nzrun_start = new_buf + i - nzrun_len;
249
- memcpy(dst + d, nzrun_start, nzrun_len);
250
- d += nzrun_len;
251
- nzrun_len = 0;
252
- is_same = true;
253
- }
254
- }
255
- }
256
-
257
- if (nzrun_len != 0) {
258
- d += uleb128_encode_small(dst + d, nzrun_len);
259
- /* overflow */
260
- if (d + nzrun_len > dlen) {
261
- return -1;
262
- }
263
- nzrun_start = new_buf + i - nzrun_len;
264
- memcpy(dst + d, nzrun_start, nzrun_len);
265
- d += nzrun_len;
266
- }
267
- return d;
268
-}
269
-#endif
55
--
2.34.1
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Perform the function selection once, and only if CONFIG_AVX512BW_OPT
2
is enabled. Centralize the selection to xbzrle.c, instead of
3
spreading the init across 3 files.
4
5
Remove xbzrle-bench.c. The benefit of being able to benchmark
6
the different implementations is less important than not peeking
7
into the internals of the implementation.
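For context, the shape of the centralized selection is the usual
constructor-plus-function-pointer dispatch; a rough, self-contained sketch
with hypothetical names (the real code below keys the choice off
CPUINFO_AVX512BW from host/cpuinfo.h, added earlier in this series):

    #include <stdint.h>
    #include "host/cpuinfo.h"   /* CPUINFO_AVX512BW, cpuinfo_init() */

    /* Placeholder bodies for the sketch; the real ones live in xbzrle.c. */
    static int encode_int(uint8_t *o, uint8_t *n, int s, uint8_t *d, int dl)
    { return 0; }
    static int encode_avx512(uint8_t *o, uint8_t *n, int s, uint8_t *d, int dl)
    { return 0; }

    /* Bound once at startup; hot-path callers just call encode_fn(). */
    static int (*encode_fn)(uint8_t *, uint8_t *, int, uint8_t *, int)
        = encode_int;

    static void __attribute__((constructor)) init_encode(void)
    {
        if (cpuinfo_init() & CPUINFO_AVX512BW) {
            encode_fn = encode_avx512;
        }
    }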
8
9
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
10
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
12
---
4
target/m68k/cpu.c | 14 ++++++++++++++
13
migration/xbzrle.h | 5 +-
5
target/m68k/translate.c | 10 ----------
14
migration/ram.c | 34 +--
6
2 files changed, 14 insertions(+), 10 deletions(-)
15
migration/xbzrle.c | 26 +-
16
tests/bench/xbzrle-bench.c | 469 -------------------------------------
17
tests/unit/test-xbzrle.c | 49 +---
18
tests/bench/meson.build | 6 -
19
6 files changed, 39 insertions(+), 550 deletions(-)
20
delete mode 100644 tests/bench/xbzrle-bench.c
7
21
8
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
22
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
9
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
10
--- a/target/m68k/cpu.c
24
--- a/migration/xbzrle.h
11
+++ b/target/m68k/cpu.c
25
+++ b/migration/xbzrle.h
12
@@ -XXX,XX +XXX,XX @@ static vaddr m68k_cpu_get_pc(CPUState *cs)
26
@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
13
return cpu->env.pc;
27
uint8_t *dst, int dlen);
28
29
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
30
-#if defined(CONFIG_AVX512BW_OPT)
31
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
32
- uint8_t *dst, int dlen);
33
-#endif
34
+
35
#endif
36
diff --git a/migration/ram.c b/migration/ram.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/migration/ram.c
39
+++ b/migration/ram.c
40
@@ -XXX,XX +XXX,XX @@
41
#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
42
/* We can't use any flag that is bigger than 0x200 */
43
44
-int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
45
- uint8_t *, int) = xbzrle_encode_buffer;
46
-#if defined(CONFIG_AVX512BW_OPT)
47
-#include "qemu/cpuid.h"
48
-static void __attribute__((constructor)) init_cpu_flag(void)
49
-{
50
- unsigned max = __get_cpuid_max(0, NULL);
51
- int a, b, c, d;
52
- if (max >= 1) {
53
- __cpuid(1, a, b, c, d);
54
- /* We must check that AVX is not just available, but usable. */
55
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
56
- int bv;
57
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
58
- __cpuid_count(7, 0, a, b, c, d);
59
- /* 0xe6:
60
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
61
- * and ZMM16-ZMM31 state are enabled by OS)
62
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
63
- */
64
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
65
- xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
66
- }
67
- }
68
- }
69
-}
70
-#endif
71
-
72
XBZRLECacheStats xbzrle_counters;
73
74
/* used by the search for pages to send */
75
@@ -XXX,XX +XXX,XX @@ static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
76
memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
77
78
/* XBZRLE encoding (if there is no overflow) */
79
- encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
80
- TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
81
- TARGET_PAGE_SIZE);
82
+ encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
83
+ TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
84
+ TARGET_PAGE_SIZE);
85
86
/*
87
* Update the cache contents, so that it corresponds to the data
88
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
89
index XXXXXXX..XXXXXXX 100644
90
--- a/migration/xbzrle.c
91
+++ b/migration/xbzrle.c
92
@@ -XXX,XX +XXX,XX @@
93
94
#if defined(CONFIG_AVX512BW_OPT)
95
#include <immintrin.h>
96
+#include "host/cpuinfo.h"
97
98
-int __attribute__((target("avx512bw")))
99
+static int __attribute__((target("avx512bw")))
100
xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
101
uint8_t *dst, int dlen)
102
{
103
@@ -XXX,XX +XXX,XX @@ xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
104
}
105
return d;
14
}
106
}
15
107
+
16
+static void m68k_restore_state_to_opc(CPUState *cs,
108
+static int xbzrle_encode_buffer_int(uint8_t *old_buf, uint8_t *new_buf,
17
+ const TranslationBlock *tb,
109
+ int slen, uint8_t *dst, int dlen);
18
+ const uint64_t *data)
110
+
111
+static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);
112
+
113
+static void __attribute__((constructor)) init_accel(void)
19
+{
114
+{
20
+ M68kCPU *cpu = M68K_CPU(cs);
115
+ unsigned info = cpuinfo_init();
21
+ int cc_op = data[1];
116
+ if (info & CPUINFO_AVX512BW) {
22
+
117
+ accel_func = xbzrle_encode_buffer_avx512;
23
+ cpu->env.pc = data[0];
118
+ } else {
24
+ if (cc_op != CC_OP_DYNAMIC) {
119
+ accel_func = xbzrle_encode_buffer_int;
25
+ cpu->env.cc_op = cc_op;
26
+ }
120
+ }
27
+}
121
+}
28
+
122
+
29
static bool m68k_cpu_has_work(CPUState *cs)
123
+int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
124
+ uint8_t *dst, int dlen)
125
+{
126
+ return accel_func(old_buf, new_buf, slen, dst, dlen);
127
+}
128
+
129
+#define xbzrle_encode_buffer xbzrle_encode_buffer_int
130
#endif
131
132
/*
133
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
134
deleted file mode 100644
135
index XXXXXXX..XXXXXXX
136
--- a/tests/bench/xbzrle-bench.c
137
+++ /dev/null
138
@@ -XXX,XX +XXX,XX @@
139
-/*
140
- * Xor Based Zero Run Length Encoding unit tests.
141
- *
142
- * Copyright 2013 Red Hat, Inc. and/or its affiliates
143
- *
144
- * Authors:
145
- * Orit Wasserman <owasserm@redhat.com>
146
- *
147
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
148
- * See the COPYING file in the top-level directory.
149
- *
150
- */
151
-#include "qemu/osdep.h"
152
-#include "qemu/cutils.h"
153
-#include "../migration/xbzrle.h"
154
-
155
-#if defined(CONFIG_AVX512BW_OPT)
156
-#define XBZRLE_PAGE_SIZE 4096
157
-static bool is_cpu_support_avx512bw;
158
-#include "qemu/cpuid.h"
159
-static void __attribute__((constructor)) init_cpu_flag(void)
160
-{
161
- unsigned max = __get_cpuid_max(0, NULL);
162
- int a, b, c, d;
163
- is_cpu_support_avx512bw = false;
164
- if (max >= 1) {
165
- __cpuid(1, a, b, c, d);
166
- /* We must check that AVX is not just available, but usable. */
167
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
168
- int bv;
169
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
170
- __cpuid_count(7, 0, a, b, c, d);
171
- /* 0xe6:
172
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
173
- * and ZMM16-ZMM31 state are enabled by OS)
174
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
175
- */
176
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
177
- is_cpu_support_avx512bw = true;
178
- }
179
- }
180
- }
181
- return ;
182
-}
183
-
184
-struct ResTime {
185
- float t_raw;
186
- float t_512;
187
-};
188
-
189
-
190
-/* Function prototypes
191
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
192
- uint8_t *dst, int dlen);
193
-*/
194
-static void encode_decode_zero(struct ResTime *res)
195
-{
196
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
197
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
198
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
199
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
200
- int i = 0;
201
- int dlen = 0, dlen512 = 0;
202
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
203
-
204
- for (i = diff_len; i > 0; i--) {
205
- buffer[1000 + i] = i;
206
- buffer512[1000 + i] = i;
207
- }
208
-
209
- buffer[1000 + diff_len + 3] = 103;
210
- buffer[1000 + diff_len + 5] = 105;
211
-
212
- buffer512[1000 + diff_len + 3] = 103;
213
- buffer512[1000 + diff_len + 5] = 105;
214
-
215
- /* encode zero page */
216
- time_t t_start, t_end, t_start512, t_end512;
217
- t_start = clock();
218
- dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
219
- XBZRLE_PAGE_SIZE);
220
- t_end = clock();
221
- float time_val = difftime(t_end, t_start);
222
- g_assert(dlen == 0);
223
-
224
- t_start512 = clock();
225
- dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
226
- compressed512, XBZRLE_PAGE_SIZE);
227
- t_end512 = clock();
228
- float time_val512 = difftime(t_end512, t_start512);
229
- g_assert(dlen512 == 0);
230
-
231
- res->t_raw = time_val;
232
- res->t_512 = time_val512;
233
-
234
- g_free(buffer);
235
- g_free(compressed);
236
- g_free(buffer512);
237
- g_free(compressed512);
238
-
239
-}
240
-
241
-static void test_encode_decode_zero_avx512(void)
242
-{
243
- int i;
244
- float time_raw = 0.0, time_512 = 0.0;
245
- struct ResTime res;
246
- for (i = 0; i < 10000; i++) {
247
- encode_decode_zero(&res);
248
- time_raw += res.t_raw;
249
- time_512 += res.t_512;
250
- }
251
- printf("Zero test:\n");
252
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
253
- printf("512 xbzrle_encode time is %f ms\n", time_512);
254
-}
255
-
256
-static void encode_decode_unchanged(struct ResTime *res)
257
-{
258
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
259
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
260
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
261
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
262
- int i = 0;
263
- int dlen = 0, dlen512 = 0;
264
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
265
-
266
- for (i = diff_len; i > 0; i--) {
267
- test[1000 + i] = i + 4;
268
- test512[1000 + i] = i + 4;
269
- }
270
-
271
- test[1000 + diff_len + 3] = 107;
272
- test[1000 + diff_len + 5] = 109;
273
-
274
- test512[1000 + diff_len + 3] = 107;
275
- test512[1000 + diff_len + 5] = 109;
276
-
277
- /* test unchanged buffer */
278
- time_t t_start, t_end, t_start512, t_end512;
279
- t_start = clock();
280
- dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
281
- XBZRLE_PAGE_SIZE);
282
- t_end = clock();
283
- float time_val = difftime(t_end, t_start);
284
- g_assert(dlen == 0);
285
-
286
- t_start512 = clock();
287
- dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE,
288
- compressed512, XBZRLE_PAGE_SIZE);
289
- t_end512 = clock();
290
- float time_val512 = difftime(t_end512, t_start512);
291
- g_assert(dlen512 == 0);
292
-
293
- res->t_raw = time_val;
294
- res->t_512 = time_val512;
295
-
296
- g_free(test);
297
- g_free(compressed);
298
- g_free(test512);
299
- g_free(compressed512);
300
-
301
-}
302
-
303
-static void test_encode_decode_unchanged_avx512(void)
304
-{
305
- int i;
306
- float time_raw = 0.0, time_512 = 0.0;
307
- struct ResTime res;
308
- for (i = 0; i < 10000; i++) {
309
- encode_decode_unchanged(&res);
310
- time_raw += res.t_raw;
311
- time_512 += res.t_512;
312
- }
313
- printf("Unchanged test:\n");
314
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
315
- printf("512 xbzrle_encode time is %f ms\n", time_512);
316
-}
317
-
318
-static void encode_decode_1_byte(struct ResTime *res)
319
-{
320
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
321
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
322
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
323
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
324
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
325
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
326
- int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0;
327
- uint8_t buf[2];
328
- uint8_t buf512[2];
329
-
330
- test[XBZRLE_PAGE_SIZE - 1] = 1;
331
- test512[XBZRLE_PAGE_SIZE - 1] = 1;
332
-
333
- time_t t_start, t_end, t_start512, t_end512;
334
- t_start = clock();
335
- dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
336
- XBZRLE_PAGE_SIZE);
337
- t_end = clock();
338
- float time_val = difftime(t_end, t_start);
339
- g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
340
-
341
- rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
342
- g_assert(rc == XBZRLE_PAGE_SIZE);
343
- g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
344
-
345
- t_start512 = clock();
346
- dlen512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
347
- compressed512, XBZRLE_PAGE_SIZE);
348
- t_end512 = clock();
349
- float time_val512 = difftime(t_end512, t_start512);
350
- g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2));
351
-
352
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512,
353
- XBZRLE_PAGE_SIZE);
354
- g_assert(rc512 == XBZRLE_PAGE_SIZE);
355
- g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
356
-
357
- res->t_raw = time_val;
358
- res->t_512 = time_val512;
359
-
360
- g_free(buffer);
361
- g_free(compressed);
362
- g_free(test);
363
- g_free(buffer512);
364
- g_free(compressed512);
365
- g_free(test512);
366
-
367
-}
368
-
369
-static void test_encode_decode_1_byte_avx512(void)
370
-{
371
- int i;
372
- float time_raw = 0.0, time_512 = 0.0;
373
- struct ResTime res;
374
- for (i = 0; i < 10000; i++) {
375
- encode_decode_1_byte(&res);
376
- time_raw += res.t_raw;
377
- time_512 += res.t_512;
378
- }
379
- printf("1 byte test:\n");
380
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
381
- printf("512 xbzrle_encode time is %f ms\n", time_512);
382
-}
383
-
384
-static void encode_decode_overflow(struct ResTime *res)
385
-{
386
- uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
387
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
388
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
389
- uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
390
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
391
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
392
- int i = 0, rc = 0, rc512 = 0;
393
-
394
- for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) {
395
- test[i * 2] = 1;
396
- test512[i * 2] = 1;
397
- }
398
-
399
- /* encode overflow */
400
- time_t t_start, t_end, t_start512, t_end512;
401
- t_start = clock();
402
- rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
403
- XBZRLE_PAGE_SIZE);
404
- t_end = clock();
405
- float time_val = difftime(t_end, t_start);
406
- g_assert(rc == -1);
407
-
408
- t_start512 = clock();
409
- rc512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
410
- compressed512, XBZRLE_PAGE_SIZE);
411
- t_end512 = clock();
412
- float time_val512 = difftime(t_end512, t_start512);
413
- g_assert(rc512 == -1);
414
-
415
- res->t_raw = time_val;
416
- res->t_512 = time_val512;
417
-
418
- g_free(buffer);
419
- g_free(compressed);
420
- g_free(test);
421
- g_free(buffer512);
422
- g_free(compressed512);
423
- g_free(test512);
424
-
425
-}
426
-
427
-static void test_encode_decode_overflow_avx512(void)
428
-{
429
- int i;
430
- float time_raw = 0.0, time_512 = 0.0;
431
- struct ResTime res;
432
- for (i = 0; i < 10000; i++) {
433
- encode_decode_overflow(&res);
434
- time_raw += res.t_raw;
435
- time_512 += res.t_512;
436
- }
437
- printf("Overflow test:\n");
438
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
439
- printf("512 xbzrle_encode time is %f ms\n", time_512);
440
-}
441
-
442
-static void encode_decode_range_avx512(struct ResTime *res)
443
-{
444
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
445
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
446
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
447
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
448
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
449
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
450
- int i = 0, rc = 0, rc512 = 0;
451
- int dlen = 0, dlen512 = 0;
452
-
453
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
454
-
455
- for (i = diff_len; i > 0; i--) {
456
- buffer[1000 + i] = i;
457
- test[1000 + i] = i + 4;
458
- buffer512[1000 + i] = i;
459
- test512[1000 + i] = i + 4;
460
- }
461
-
462
- buffer[1000 + diff_len + 3] = 103;
463
- test[1000 + diff_len + 3] = 107;
464
-
465
- buffer[1000 + diff_len + 5] = 105;
466
- test[1000 + diff_len + 5] = 109;
467
-
468
- buffer512[1000 + diff_len + 3] = 103;
469
- test512[1000 + diff_len + 3] = 107;
470
-
471
- buffer512[1000 + diff_len + 5] = 105;
472
- test512[1000 + diff_len + 5] = 109;
473
-
474
- /* test encode/decode */
475
- time_t t_start, t_end, t_start512, t_end512;
476
- t_start = clock();
477
- dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
478
- XBZRLE_PAGE_SIZE);
479
- t_end = clock();
480
- float time_val = difftime(t_end, t_start);
481
- rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
482
- g_assert(rc < XBZRLE_PAGE_SIZE);
483
- g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
484
-
485
- t_start512 = clock();
486
- dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
487
- compressed512, XBZRLE_PAGE_SIZE);
488
- t_end512 = clock();
489
- float time_val512 = difftime(t_end512, t_start512);
490
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
491
- g_assert(rc512 < XBZRLE_PAGE_SIZE);
492
- g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
493
-
494
- res->t_raw = time_val;
495
- res->t_512 = time_val512;
496
-
497
- g_free(buffer);
498
- g_free(compressed);
499
- g_free(test);
500
- g_free(buffer512);
501
- g_free(compressed512);
502
- g_free(test512);
503
-
504
-}
505
-
506
-static void test_encode_decode_avx512(void)
507
-{
508
- int i;
509
- float time_raw = 0.0, time_512 = 0.0;
510
- struct ResTime res;
511
- for (i = 0; i < 10000; i++) {
512
- encode_decode_range_avx512(&res);
513
- time_raw += res.t_raw;
514
- time_512 += res.t_512;
515
- }
516
- printf("Encode decode test:\n");
517
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
518
- printf("512 xbzrle_encode time is %f ms\n", time_512);
519
-}
520
-
521
-static void encode_decode_random(struct ResTime *res)
522
-{
523
- uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
524
- uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
525
- uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
526
- uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
527
- uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
528
- uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
529
- int i = 0, rc = 0, rc512 = 0;
530
- int dlen = 0, dlen512 = 0;
531
-
532
- int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
533
- /* store the index of diff */
534
- int dirty_index[diff_len];
535
- for (int j = 0; j < diff_len; j++) {
536
- dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
537
- }
538
- for (i = diff_len - 1; i >= 0; i--) {
539
- buffer[dirty_index[i]] = i;
540
- test[dirty_index[i]] = i + 4;
541
- buffer512[dirty_index[i]] = i;
542
- test512[dirty_index[i]] = i + 4;
543
- }
544
-
545
- time_t t_start, t_end, t_start512, t_end512;
546
- t_start = clock();
547
- dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
548
- XBZRLE_PAGE_SIZE);
549
- t_end = clock();
550
- float time_val = difftime(t_end, t_start);
551
- rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
552
- g_assert(rc < XBZRLE_PAGE_SIZE);
553
-
554
- t_start512 = clock();
555
- dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
556
- compressed512, XBZRLE_PAGE_SIZE);
557
- t_end512 = clock();
558
- float time_val512 = difftime(t_end512, t_start512);
559
- rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
560
- g_assert(rc512 < XBZRLE_PAGE_SIZE);
561
-
562
- res->t_raw = time_val;
563
- res->t_512 = time_val512;
564
-
565
- g_free(buffer);
566
- g_free(compressed);
567
- g_free(test);
568
- g_free(buffer512);
569
- g_free(compressed512);
570
- g_free(test512);
571
-
572
-}
573
-
574
-static void test_encode_decode_random_avx512(void)
575
-{
576
- int i;
577
- float time_raw = 0.0, time_512 = 0.0;
578
- struct ResTime res;
579
- for (i = 0; i < 10000; i++) {
580
- encode_decode_random(&res);
581
- time_raw += res.t_raw;
582
- time_512 += res.t_512;
583
- }
584
- printf("Random test:\n");
585
- printf("Raw xbzrle_encode time is %f ms\n", time_raw);
586
- printf("512 xbzrle_encode time is %f ms\n", time_512);
587
-}
588
-#endif
589
-
590
-int main(int argc, char **argv)
591
-{
592
- g_test_init(&argc, &argv, NULL);
593
- g_test_rand_int();
594
- #if defined(CONFIG_AVX512BW_OPT)
595
- if (likely(is_cpu_support_avx512bw)) {
596
- g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_avx512);
597
- g_test_add_func("/xbzrle/encode_decode_unchanged",
598
- test_encode_decode_unchanged_avx512);
599
- g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_avx512);
600
- g_test_add_func("/xbzrle/encode_decode_overflow",
601
- test_encode_decode_overflow_avx512);
602
- g_test_add_func("/xbzrle/encode_decode", test_encode_decode_avx512);
603
- g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random_avx512);
604
- }
605
- #endif
606
- return g_test_run();
607
-}
608
diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
609
index XXXXXXX..XXXXXXX 100644
610
--- a/tests/unit/test-xbzrle.c
611
+++ b/tests/unit/test-xbzrle.c
612
@@ -XXX,XX +XXX,XX @@
613
614
#define XBZRLE_PAGE_SIZE 4096
615
616
-int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
617
- uint8_t *, int) = xbzrle_encode_buffer;
618
-#if defined(CONFIG_AVX512BW_OPT)
619
-#include "qemu/cpuid.h"
620
-static void __attribute__((constructor)) init_cpu_flag(void)
621
-{
622
- unsigned max = __get_cpuid_max(0, NULL);
623
- int a, b, c, d;
624
- if (max >= 1) {
625
- __cpuid(1, a, b, c, d);
626
- /* We must check that AVX is not just available, but usable. */
627
- if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
628
- int bv;
629
- __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
630
- __cpuid_count(7, 0, a, b, c, d);
631
- /* 0xe6:
632
- * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
633
- * and ZMM16-ZMM31 state are enabled by OS)
634
- * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
635
- */
636
- if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
637
- xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
638
- }
639
- }
640
- }
641
- return ;
642
-}
643
-#endif
644
-
645
static void test_uleb(void)
30
{
646
{
31
return cs->interrupt_request & CPU_INTERRUPT_HARD;
647
uint32_t i, val;
32
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps m68k_sysemu_ops = {
648
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_zero(void)
33
649
buffer[1000 + diff_len + 5] = 105;
34
static const struct TCGCPUOps m68k_tcg_ops = {
650
35
.initialize = m68k_tcg_init,
651
/* encode zero page */
36
+ .restore_state_to_opc = m68k_restore_state_to_opc,
652
- dlen = xbzrle_encode_buffer_func(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
37
653
- XBZRLE_PAGE_SIZE);
38
#ifndef CONFIG_USER_ONLY
654
+ dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE,
39
.tlb_fill = m68k_cpu_tlb_fill,
655
+ compressed, XBZRLE_PAGE_SIZE);
40
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
656
g_assert(dlen == 0);
657
658
g_free(buffer);
659
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_unchanged(void)
660
test[1000 + diff_len + 5] = 109;
661
662
/* test unchanged buffer */
663
- dlen = xbzrle_encode_buffer_func(test, test, XBZRLE_PAGE_SIZE, compressed,
664
- XBZRLE_PAGE_SIZE);
665
+ dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE,
666
+ compressed, XBZRLE_PAGE_SIZE);
667
g_assert(dlen == 0);
668
669
g_free(test);
670
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_1_byte(void)
671
672
test[XBZRLE_PAGE_SIZE - 1] = 1;
673
674
- dlen = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
675
- XBZRLE_PAGE_SIZE);
676
+ dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE,
677
+ compressed, XBZRLE_PAGE_SIZE);
678
g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
679
680
rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
681
@@ -XXX,XX +XXX,XX @@ static void test_encode_decode_overflow(void)
682
}
683
684
/* encode overflow */
685
- rc = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
686
- XBZRLE_PAGE_SIZE);
687
+ rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE,
688
+ compressed, XBZRLE_PAGE_SIZE);
689
g_assert(rc == -1);
690
691
g_free(buffer);
692
@@ -XXX,XX +XXX,XX @@ static void encode_decode_range(void)
693
test[1000 + diff_len + 5] = 109;
694
695
/* test encode/decode */
696
- dlen = xbzrle_encode_buffer_func(test, buffer, XBZRLE_PAGE_SIZE, compressed,
697
- XBZRLE_PAGE_SIZE);
698
+ dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE,
699
+ compressed, XBZRLE_PAGE_SIZE);
700
701
rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
702
g_assert(rc < XBZRLE_PAGE_SIZE);
703
diff --git a/tests/bench/meson.build b/tests/bench/meson.build
41
index XXXXXXX..XXXXXXX 100644
704
index XXXXXXX..XXXXXXX 100644
42
--- a/target/m68k/translate.c
705
--- a/tests/bench/meson.build
43
+++ b/target/m68k/translate.c
706
+++ b/tests/bench/meson.build
44
@@ -XXX,XX +XXX,XX @@ void m68k_cpu_dump_state(CPUState *cs, FILE *f, int flags)
707
@@ -XXX,XX +XXX,XX @@ qht_bench = executable('qht-bench',
45
env->mmu.mmusr, env->mmu.ar);
708
sources: 'qht-bench.c',
46
#endif
709
dependencies: [qemuutil])
47
}
710
48
-
711
-if have_system
49
-void restore_state_to_opc(CPUM68KState *env, TranslationBlock *tb,
712
-xbzrle_bench = executable('xbzrle-bench',
50
- target_ulong *data)
713
- sources: 'xbzrle-bench.c',
51
-{
714
- dependencies: [qemuutil,migration])
52
- int cc_op = data[1];
715
-endif
53
- env->pc = data[0];
716
-
54
- if (cc_op != CC_OP_DYNAMIC) {
717
qtree_bench = executable('qtree-bench',
55
- env->cc_op = cc_op;
718
sources: 'qtree-bench.c',
56
- }
719
dependencies: [qemuutil])
57
-}
58
--
2.34.1
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
The items in migration_files are built for libmigration and included
2
into softmmu_ss from there; no need to also include them directly.
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Juan Quintela <quintela@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
7
---
4
target/rx/cpu.c | 10 ++++++++++
8
migration/meson.build | 1 -
5
target/rx/translate.c | 6 ------
9
1 file changed, 1 deletion(-)
6
2 files changed, 10 insertions(+), 6 deletions(-)
7
10
8
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
11
diff --git a/migration/meson.build b/migration/meson.build
9
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
10
--- a/target/rx/cpu.c
13
--- a/migration/meson.build
11
+++ b/target/rx/cpu.c
14
+++ b/migration/meson.build
12
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_synchronize_from_tb(CPUState *cs,
15
@@ -XXX,XX +XXX,XX @@ migration_files = files(
13
cpu->env.pc = tb_pc(tb);
16
'qemu-file.c',
14
}
17
'yank_functions.c',
15
18
)
16
+static void rx_restore_state_to_opc(CPUState *cs,
19
-softmmu_ss.add(migration_files)
17
+ const TranslationBlock *tb,
20
18
+ const uint64_t *data)
21
softmmu_ss.add(files(
19
+{
22
'block-dirty-bitmap.c',
20
+ RXCPU *cpu = RX_CPU(cs);
21
+
22
+ cpu->env.pc = data[0];
23
+}
24
+
25
static bool rx_cpu_has_work(CPUState *cs)
26
{
27
return cs->interrupt_request &
28
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps rx_sysemu_ops = {
29
static const struct TCGCPUOps rx_tcg_ops = {
30
.initialize = rx_translate_init,
31
.synchronize_from_tb = rx_cpu_synchronize_from_tb,
32
+ .restore_state_to_opc = rx_restore_state_to_opc,
33
.tlb_fill = rx_cpu_tlb_fill,
34
35
#ifndef CONFIG_USER_ONLY
36
diff --git a/target/rx/translate.c b/target/rx/translate.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/target/rx/translate.c
39
+++ b/target/rx/translate.c
40
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
41
translator_loop(cs, tb, max_insns, pc, host_pc, &rx_tr_ops, &dc.base);
42
}
43
44
-void restore_state_to_opc(CPURXState *env, TranslationBlock *tb,
45
- target_ulong *data)
46
-{
47
- env->pc = data[0];
48
-}
49
-
50
#define ALLOC_REGISTER(sym, name) \
51
cpu_##sym = tcg_global_mem_new_i32(cpu_env, \
52
offsetof(CPURXState, sym), name)
53
--
2.34.1
1
Move all of the TranslationBlock flushing and page linking
1
Move the code from tcg/. The only use of these bits so far
2
code from translate-all.c to tb-maint.c.
2
is with respect to the atomicity of tcg operations.
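For the cpuinfo change, the detection itself is small. A rough sketch of
what the Linux path of cpuinfo_init() boils down to (the hwcap names are
assumptions on my part; the Darwin path uses sysctlbyname() instead, as in
the diff below):

    #include <sys/auxv.h>
    #include <asm/hwcap.h>
    #include "host/cpuinfo.h"   /* the new aarch64 header added below */

    static unsigned probe_atomics(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);
        unsigned info = CPUINFO_ALWAYS;

        info |= (hwcap & HWCAP_ATOMICS) ? CPUINFO_LSE : 0;   /* FEAT_LSE */
        info |= (hwcap & HWCAP_USCAT) ? CPUINFO_LSE2 : 0;    /* FEAT_LSE2 */
        return info;
    }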
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
accel/tcg/internal.h | 55 +++
8
host/include/aarch64/host/cpuinfo.h | 22 ++++++++++
8
accel/tcg/tb-maint.c | 735 ++++++++++++++++++++++++++++++++++++
9
tcg/aarch64/tcg-target.h | 6 ++-
9
accel/tcg/translate-all.c | 766 +-------------------------------------
10
util/cpuinfo-aarch64.c | 67 +++++++++++++++++++++++++++++
10
accel/tcg/meson.build | 1 +
11
tcg/aarch64/tcg-target.c.inc | 40 -----------------
11
4 files changed, 802 insertions(+), 755 deletions(-)
12
util/meson.build | 4 +-
12
create mode 100644 accel/tcg/tb-maint.c
13
5 files changed, 96 insertions(+), 43 deletions(-)
13
14
create mode 100644 host/include/aarch64/host/cpuinfo.h
14
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
15
create mode 100644 util/cpuinfo-aarch64.c
15
index XXXXXXX..XXXXXXX 100644
16
16
--- a/accel/tcg/internal.h
17
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
17
+++ b/accel/tcg/internal.h
18
@@ -XXX,XX +XXX,XX @@ typedef struct PageDesc {
19
#endif
20
} PageDesc;
21
22
+/* Size of the L2 (and L3, etc) page tables. */
23
+#define V_L2_BITS 10
24
+#define V_L2_SIZE (1 << V_L2_BITS)
25
+
26
+/*
27
+ * L1 Mapping properties
28
+ */
29
+extern int v_l1_size;
30
+extern int v_l1_shift;
31
+extern int v_l2_levels;
32
+
33
+/*
34
+ * The bottom level has pointers to PageDesc, and is indexed by
35
+ * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
36
+ */
37
+#define V_L1_MIN_BITS 4
38
+#define V_L1_MAX_BITS (V_L2_BITS + 3)
39
+#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
40
+
41
+extern void *l1_map[V_L1_MAX_SIZE];
42
+
43
PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc);
44
45
static inline PageDesc *page_find(tb_page_addr_t index)
46
@@ -XXX,XX +XXX,XX @@ static inline PageDesc *page_find(tb_page_addr_t index)
47
return page_find_alloc(index, false);
48
}
49
50
+/* list iterators for lists of tagged pointers in TranslationBlock */
51
+#define TB_FOR_EACH_TAGGED(head, tb, n, field) \
52
+ for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \
53
+ tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
54
+ tb = (TranslationBlock *)((uintptr_t)tb & ~1))
55
+
56
+#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \
57
+ TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
58
+
59
+#define TB_FOR_EACH_JMP(head_tb, tb, n) \
60
+ TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
61
+
62
+/* In user-mode page locks aren't used; mmap_lock is enough */
63
+#ifdef CONFIG_USER_ONLY
64
+#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
65
+static inline void page_lock(PageDesc *pd) { }
66
+static inline void page_unlock(PageDesc *pd) { }
67
+#else
68
+#ifdef CONFIG_DEBUG_TCG
69
+void do_assert_page_locked(const PageDesc *pd, const char *file, int line);
70
+#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
71
+#else
72
+#define assert_page_locked(pd)
73
+#endif
74
+void page_lock(PageDesc *pd);
75
+void page_unlock(PageDesc *pd);
76
+#endif
77
+
78
TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
79
target_ulong cs_base, uint32_t flags,
80
int cflags);
81
G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
82
void page_init(void);
83
void tb_htable_init(void);
84
+void tb_reset_jump(TranslationBlock *tb, int n);
85
+TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
86
+ tb_page_addr_t phys_page2);
87
+bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc);
88
+int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
89
+ uintptr_t searched_pc, bool reset_icount);
90
91
/* Return the current PC from CPU, which may be cached in TB. */
92
static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
93
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
94
new file mode 100644
18
new file mode 100644
95
index XXXXXXX..XXXXXXX
19
index XXXXXXX..XXXXXXX
96
--- /dev/null
20
--- /dev/null
97
+++ b/accel/tcg/tb-maint.c
21
+++ b/host/include/aarch64/host/cpuinfo.h
98
@@ -XXX,XX +XXX,XX @@
22
@@ -XXX,XX +XXX,XX @@
99
+/*
23
+/*
100
+ * Translation Block Maintaince
24
+ * SPDX-License-Identifier: GPL-2.0-or-later
101
+ *
25
+ * Host specific cpu indentification for AArch64.
102
+ * Copyright (c) 2003 Fabrice Bellard
103
+ *
104
+ * This library is free software; you can redistribute it and/or
105
+ * modify it under the terms of the GNU Lesser General Public
106
+ * License as published by the Free Software Foundation; either
107
+ * version 2.1 of the License, or (at your option) any later version.
108
+ *
109
+ * This library is distributed in the hope that it will be useful,
110
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
111
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
112
+ * Lesser General Public License for more details.
113
+ *
114
+ * You should have received a copy of the GNU Lesser General Public
115
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
116
+ */
26
+ */
117
+
27
+
28
+#ifndef HOST_CPUINFO_H
29
+#define HOST_CPUINFO_H
30
+
31
+#define CPUINFO_ALWAYS (1u << 0) /* so cpuinfo is nonzero */
32
+#define CPUINFO_LSE (1u << 1)
33
+#define CPUINFO_LSE2 (1u << 2)
34
+
35
+/* Initialized with a constructor. */
36
+extern unsigned cpuinfo;
37
+
38
+/*
39
+ * We cannot rely on constructor ordering, so other constructors must
40
+ * use the function interface rather than the variable above.
41
+ */
42
+unsigned cpuinfo_init(void);
43
+
44
+#endif /* HOST_CPUINFO_H */
45
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/tcg/aarch64/tcg-target.h
48
+++ b/tcg/aarch64/tcg-target.h
49
@@ -XXX,XX +XXX,XX @@
50
#ifndef AARCH64_TCG_TARGET_H
51
#define AARCH64_TCG_TARGET_H
52
53
+#include "host/cpuinfo.h"
54
+
55
#define TCG_TARGET_INSN_UNIT_SIZE 4
56
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
57
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
58
@@ -XXX,XX +XXX,XX @@ typedef enum {
59
#define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN
60
#define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
61
62
-extern bool have_lse;
63
-extern bool have_lse2;
64
+#define have_lse (cpuinfo & CPUINFO_LSE)
65
+#define have_lse2 (cpuinfo & CPUINFO_LSE2)
66
67
/* optional instructions */
68
#define TCG_TARGET_HAS_div_i32 1
69
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
70
new file mode 100644
71
index XXXXXXX..XXXXXXX
72
--- /dev/null
73
+++ b/util/cpuinfo-aarch64.c
74
@@ -XXX,XX +XXX,XX @@
75
+/*
76
+ * SPDX-License-Identifier: GPL-2.0-or-later
77
+ * Host specific cpu indentification for AArch64.
78
+ */
79
+
118
+#include "qemu/osdep.h"
80
+#include "qemu/osdep.h"
119
+#include "exec/cputlb.h"
81
+#include "host/cpuinfo.h"
120
+#include "exec/log.h"
82
+
121
+#include "exec/translate-all.h"
83
+#ifdef CONFIG_LINUX
122
+#include "sysemu/tcg.h"
84
+# ifdef CONFIG_GETAUXVAL
123
+#include "tcg/tcg.h"
85
+# include <sys/auxv.h>
124
+#include "tb-hash.h"
86
+# else
125
+#include "tb-context.h"
87
+# include <asm/hwcap.h>
126
+#include "internal.h"
88
+# include "elf.h"
127
+
89
+# endif
128
+/* FIXME: tb_invalidate_phys_range is declared in different places. */
90
+#endif
129
+#ifdef CONFIG_USER_ONLY
91
+#ifdef CONFIG_DARWIN
130
+#include "exec/exec-all.h"
92
+# include <sys/sysctl.h>
131
+#else
93
+#endif
132
+#include "exec/ram_addr.h"
94
+
133
+#endif
95
+unsigned cpuinfo;
134
+
96
+
135
+static bool tb_cmp(const void *ap, const void *bp)
97
+#ifdef CONFIG_DARWIN
98
+static bool sysctl_for_bool(const char *name)
136
+{
99
+{
137
+ const TranslationBlock *a = ap;
100
+ int val = 0;
138
+ const TranslationBlock *b = bp;
101
+ size_t len = sizeof(val);
139
+
102
+
140
+ return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) &&
103
+ if (sysctlbyname(name, &val, &len, NULL, 0) == 0) {
141
+ a->cs_base == b->cs_base &&
104
+ return val != 0;
142
+ a->flags == b->flags &&
143
+ (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
144
+ a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
145
+ a->page_addr[0] == b->page_addr[0] &&
146
+ a->page_addr[1] == b->page_addr[1]);
147
+}
148
+
149
+void tb_htable_init(void)
150
+{
151
+ unsigned int mode = QHT_MODE_AUTO_RESIZE;
152
+
153
+ qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
154
+}
155
+
156
+/* Set to NULL all the 'first_tb' fields in all PageDescs. */
157
+static void page_flush_tb_1(int level, void **lp)
158
+{
159
+ int i;
160
+
161
+ if (*lp == NULL) {
162
+ return;
163
+ }
105
+ }
164
+ if (level == 0) {
106
+
165
+ PageDesc *pd = *lp;
166
+
167
+ for (i = 0; i < V_L2_SIZE; ++i) {
168
+ page_lock(&pd[i]);
169
+ pd[i].first_tb = (uintptr_t)NULL;
170
+ page_unlock(&pd[i]);
171
+ }
172
+ } else {
173
+ void **pp = *lp;
174
+
175
+ for (i = 0; i < V_L2_SIZE; ++i) {
176
+ page_flush_tb_1(level - 1, pp + i);
177
+ }
178
+ }
179
+}
180
+
181
+static void page_flush_tb(void)
182
+{
183
+ int i, l1_sz = v_l1_size;
184
+
185
+ for (i = 0; i < l1_sz; i++) {
186
+ page_flush_tb_1(v_l2_levels, l1_map + i);
187
+ }
188
+}
189
+
190
+/* flush all the translation blocks */
191
+static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
192
+{
193
+ bool did_flush = false;
194
+
195
+ mmap_lock();
196
+ /* If it is already been done on request of another CPU, just retry. */
197
+ if (tb_ctx.tb_flush_count != tb_flush_count.host_int) {
198
+ goto done;
199
+ }
200
+ did_flush = true;
201
+
202
+ CPU_FOREACH(cpu) {
203
+ tcg_flush_jmp_cache(cpu);
204
+ }
205
+
206
+ qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
207
+ page_flush_tb();
208
+
209
+ tcg_region_reset_all();
210
+ /* XXX: flush processor icache at this point if cache flush is expensive */
211
+ qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
212
+
213
+done:
214
+ mmap_unlock();
215
+ if (did_flush) {
216
+ qemu_plugin_flush_cb();
217
+ }
218
+}
219
+
220
+void tb_flush(CPUState *cpu)
221
+{
222
+ if (tcg_enabled()) {
223
+ unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);
224
+
225
+ if (cpu_in_exclusive_context(cpu)) {
226
+ do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
227
+ } else {
228
+ async_safe_run_on_cpu(cpu, do_tb_flush,
229
+ RUN_ON_CPU_HOST_INT(tb_flush_count));
230
+ }
231
+ }
232
+}
233
+
234
+/*
235
+ * user-mode: call with mmap_lock held
236
+ * !user-mode: call with @pd->lock held
237
+ */
238
+static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
239
+{
240
+ TranslationBlock *tb1;
241
+ uintptr_t *pprev;
242
+ unsigned int n1;
243
+
244
+ assert_page_locked(pd);
245
+ pprev = &pd->first_tb;
246
+ PAGE_FOR_EACH_TB(pd, tb1, n1) {
247
+ if (tb1 == tb) {
248
+ *pprev = tb1->page_next[n1];
249
+ return;
250
+ }
251
+ pprev = &tb1->page_next[n1];
252
+ }
253
+ g_assert_not_reached();
254
+}
255
+
256
+/* remove @orig from its @n_orig-th jump list */
257
+static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
258
+{
259
+ uintptr_t ptr, ptr_locked;
260
+ TranslationBlock *dest;
261
+ TranslationBlock *tb;
262
+ uintptr_t *pprev;
263
+ int n;
264
+
265
+ /* mark the LSB of jmp_dest[] so that no further jumps can be inserted */
266
+ ptr = qatomic_or_fetch(&orig->jmp_dest[n_orig], 1);
267
+ dest = (TranslationBlock *)(ptr & ~1);
268
+ if (dest == NULL) {
269
+ return;
270
+ }
271
+
272
+ qemu_spin_lock(&dest->jmp_lock);
273
+ /*
107
+ /*
274
+ * While acquiring the lock, the jump might have been removed if the
108
+ * We might in the future ask for properties not present in older kernels,
275
+ * destination TB was invalidated; check again.
109
+ * but we're only asking about static properties, all of which should be
110
+ * 'int'. So we shouldn't see ENOMEM (val too small), or any of the other
111
+ * more exotic errors.
276
+ */
112
+ */
277
+ ptr_locked = qatomic_read(&orig->jmp_dest[n_orig]);
113
+ assert(errno == ENOENT);
278
+ if (ptr_locked != ptr) {
279
+ qemu_spin_unlock(&dest->jmp_lock);
280
+ /*
281
+ * The only possibility is that the jump was unlinked via
282
+ * tb_jump_unlink(dest). Seeing here another destination would be a bug,
283
+ * because we set the LSB above.
284
+ */
285
+ g_assert(ptr_locked == 1 && dest->cflags & CF_INVALID);
286
+ return;
287
+ }
288
+ /*
289
+ * We first acquired the lock, and since the destination pointer matches,
290
+ * we know for sure that @orig is in the jmp list.
291
+ */
292
+ pprev = &dest->jmp_list_head;
293
+ TB_FOR_EACH_JMP(dest, tb, n) {
294
+ if (tb == orig && n == n_orig) {
295
+ *pprev = tb->jmp_list_next[n];
296
+ /* no need to set orig->jmp_dest[n]; setting the LSB was enough */
297
+ qemu_spin_unlock(&dest->jmp_lock);
298
+ return;
299
+ }
300
+ pprev = &tb->jmp_list_next[n];
301
+ }
302
+ g_assert_not_reached();
303
+}
304
+
305
+/*
306
+ * Reset the jump entry 'n' of a TB so that it is not chained to another TB.
307
+ */
308
+void tb_reset_jump(TranslationBlock *tb, int n)
309
+{
310
+ uintptr_t addr = (uintptr_t)(tb->tc.ptr + tb->jmp_reset_offset[n]);
311
+ tb_set_jmp_target(tb, n, addr);
312
+}
313
+
314
+/* remove any jumps to the TB */
315
+static inline void tb_jmp_unlink(TranslationBlock *dest)
316
+{
317
+ TranslationBlock *tb;
318
+ int n;
319
+
320
+ qemu_spin_lock(&dest->jmp_lock);
321
+
322
+ TB_FOR_EACH_JMP(dest, tb, n) {
323
+ tb_reset_jump(tb, n);
324
+ qatomic_and(&tb->jmp_dest[n], (uintptr_t)NULL | 1);
325
+ /* No need to clear the list entry; setting the dest ptr is enough */
326
+ }
327
+ dest->jmp_list_head = (uintptr_t)NULL;
328
+
329
+ qemu_spin_unlock(&dest->jmp_lock);
330
+}
331
+
332
+static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
333
+{
334
+ CPUState *cpu;
335
+
336
+ if (TARGET_TB_PCREL) {
337
+ /* A TB may be at any virtual address */
338
+ CPU_FOREACH(cpu) {
339
+ tcg_flush_jmp_cache(cpu);
340
+ }
341
+ } else {
342
+ uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb));
343
+
344
+ CPU_FOREACH(cpu) {
345
+ CPUJumpCache *jc = cpu->tb_jmp_cache;
346
+
347
+ if (qatomic_read(&jc->array[h].tb) == tb) {
348
+ qatomic_set(&jc->array[h].tb, NULL);
349
+ }
350
+ }
351
+ }
352
+}
353
+
354
+/*
355
+ * In user-mode, call with mmap_lock held.
356
+ * In !user-mode, if @rm_from_page_list is set, call with the TB's pages'
357
+ * locks held.
358
+ */
359
+static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
360
+{
361
+ PageDesc *p;
362
+ uint32_t h;
363
+ tb_page_addr_t phys_pc;
364
+ uint32_t orig_cflags = tb_cflags(tb);
365
+
366
+ assert_memory_lock();
367
+
368
+ /* make sure no further incoming jumps will be chained to this TB */
369
+ qemu_spin_lock(&tb->jmp_lock);
370
+ qatomic_set(&tb->cflags, tb->cflags | CF_INVALID);
371
+ qemu_spin_unlock(&tb->jmp_lock);
372
+
373
+ /* remove the TB from the hash list */
374
+ phys_pc = tb->page_addr[0];
375
+ h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
376
+ tb->flags, orig_cflags, tb->trace_vcpu_dstate);
377
+ if (!qht_remove(&tb_ctx.htable, tb, h)) {
378
+ return;
379
+ }
380
+
381
+ /* remove the TB from the page list */
382
+ if (rm_from_page_list) {
383
+ p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
384
+ tb_page_remove(p, tb);
385
+ if (tb->page_addr[1] != -1) {
386
+ p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
387
+ tb_page_remove(p, tb);
388
+ }
389
+ }
390
+
391
+ /* remove the TB from the hash list */
392
+ tb_jmp_cache_inval_tb(tb);
393
+
394
+ /* suppress this TB from the two jump lists */
395
+ tb_remove_from_jmp_list(tb, 0);
396
+ tb_remove_from_jmp_list(tb, 1);
397
+
398
+ /* suppress any remaining jumps to this TB */
399
+ tb_jmp_unlink(tb);
400
+
401
+ qatomic_set(&tb_ctx.tb_phys_invalidate_count,
402
+ tb_ctx.tb_phys_invalidate_count + 1);
403
+}
404
+
405
+static void tb_phys_invalidate__locked(TranslationBlock *tb)
406
+{
407
+ qemu_thread_jit_write();
408
+ do_tb_phys_invalidate(tb, true);
409
+ qemu_thread_jit_execute();
410
+}
411
+
412
+static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
413
+ PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
414
+{
415
+ PageDesc *p1, *p2;
416
+ tb_page_addr_t page1;
417
+ tb_page_addr_t page2;
418
+
419
+ assert_memory_lock();
420
+ g_assert(phys1 != -1);
421
+
422
+ page1 = phys1 >> TARGET_PAGE_BITS;
423
+ page2 = phys2 >> TARGET_PAGE_BITS;
424
+
425
+ p1 = page_find_alloc(page1, alloc);
426
+ if (ret_p1) {
427
+ *ret_p1 = p1;
428
+ }
429
+ if (likely(phys2 == -1)) {
430
+ page_lock(p1);
431
+ return;
432
+ } else if (page1 == page2) {
433
+ page_lock(p1);
434
+ if (ret_p2) {
435
+ *ret_p2 = p1;
436
+ }
437
+ return;
438
+ }
439
+ p2 = page_find_alloc(page2, alloc);
440
+ if (ret_p2) {
441
+ *ret_p2 = p2;
442
+ }
443
+ if (page1 < page2) {
444
+ page_lock(p1);
445
+ page_lock(p2);
446
+ } else {
447
+ page_lock(p2);
448
+ page_lock(p1);
449
+ }
450
+}
451
+
452
+#ifdef CONFIG_USER_ONLY
453
+static inline void page_lock_tb(const TranslationBlock *tb) { }
454
+static inline void page_unlock_tb(const TranslationBlock *tb) { }
455
+#else
456
+/* lock the page(s) of a TB in the correct acquisition order */
457
+static void page_lock_tb(const TranslationBlock *tb)
458
+{
459
+ page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false);
460
+}
461
+
462
+static void page_unlock_tb(const TranslationBlock *tb)
463
+{
464
+ PageDesc *p1 = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
465
+
466
+ page_unlock(p1);
467
+ if (unlikely(tb->page_addr[1] != -1)) {
468
+ PageDesc *p2 = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
469
+
470
+ if (p2 != p1) {
471
+ page_unlock(p2);
472
+ }
473
+ }
474
+}
475
+#endif
476
+
477
+/*
478
+ * Invalidate one TB.
479
+ * Called with mmap_lock held in user-mode.
480
+ */
481
+void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
482
+{
483
+ if (page_addr == -1 && tb->page_addr[0] != -1) {
484
+ page_lock_tb(tb);
485
+ do_tb_phys_invalidate(tb, true);
486
+ page_unlock_tb(tb);
487
+ } else {
488
+ do_tb_phys_invalidate(tb, false);
489
+ }
490
+}
491
+
492
+/*
493
+ * Add the tb in the target page and protect it if necessary.
494
+ * Called with mmap_lock held for user-mode emulation.
495
+ * Called with @p->lock held in !user-mode.
496
+ */
497
+static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
498
+ unsigned int n, tb_page_addr_t page_addr)
499
+{
500
+#ifndef CONFIG_USER_ONLY
501
+ bool page_already_protected;
502
+#endif
503
+
504
+ assert_page_locked(p);
505
+
506
+ tb->page_addr[n] = page_addr;
507
+ tb->page_next[n] = p->first_tb;
508
+#ifndef CONFIG_USER_ONLY
509
+ page_already_protected = p->first_tb != (uintptr_t)NULL;
510
+#endif
511
+ p->first_tb = (uintptr_t)tb | n;
512
+
513
+#if defined(CONFIG_USER_ONLY)
514
+ /* translator_loop() must have made all TB pages non-writable */
515
+ assert(!(p->flags & PAGE_WRITE));
516
+#else
517
+ /*
518
+ * If some code is already present, then the pages are already
519
+ * protected. So we handle the case where only the first TB is
520
+ * allocated in a physical page.
521
+ */
522
+ if (!page_already_protected) {
523
+ tlb_protect_code(page_addr);
524
+ }
525
+#endif
526
+}
527
+
528
+/*
529
+ * Add a new TB and link it to the physical page tables. phys_page2 is
530
+ * (-1) to indicate that only one page contains the TB.
531
+ *
532
+ * Called with mmap_lock held for user-mode emulation.
533
+ *
534
+ * Returns a pointer @tb, or a pointer to an existing TB that matches @tb.
535
+ * Note that in !user-mode, another thread might have already added a TB
536
+ * for the same block of guest code that @tb corresponds to. In that case,
537
+ * the caller should discard the original @tb, and use instead the returned TB.
538
+ */
539
+TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
540
+ tb_page_addr_t phys_page2)
541
+{
542
+ PageDesc *p;
543
+ PageDesc *p2 = NULL;
544
+ void *existing_tb = NULL;
545
+ uint32_t h;
546
+
547
+ assert_memory_lock();
548
+ tcg_debug_assert(!(tb->cflags & CF_INVALID));
549
+
550
+ /*
551
+ * Add the TB to the page list, first acquiring the pages' locks.
552
+ * We keep the locks held until after inserting the TB in the hash table,
553
+ * so that if the insertion fails we know for sure that the TBs are still
554
+ * in the page descriptors.
555
+ * Note that inserting into the hash table first isn't an option, since
556
+ * we can only insert TBs that are fully initialized.
557
+ */
558
+ page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
559
+ tb_page_add(p, tb, 0, phys_pc);
560
+ if (p2) {
561
+ tb_page_add(p2, tb, 1, phys_page2);
562
+ } else {
563
+ tb->page_addr[1] = -1;
564
+ }
565
+
566
+ /* add in the hash table */
567
+ h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
568
+ tb->flags, tb->cflags, tb->trace_vcpu_dstate);
569
+ qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
570
+
571
+ /* remove TB from the page(s) if we couldn't insert it */
572
+ if (unlikely(existing_tb)) {
573
+ tb_page_remove(p, tb);
574
+ if (p2) {
575
+ tb_page_remove(p2, tb);
576
+ }
577
+ tb = existing_tb;
578
+ }
579
+
580
+ if (p2 && p2 != p) {
581
+ page_unlock(p2);
582
+ }
583
+ page_unlock(p);
584
+ return tb;
585
+}
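Per the contract documented above, a caller must be prepared for tb_link_page() to return a different, pre-existing TB when another thread raced ahead. A hedged sketch of how a caller such as tb_gen_code() is expected to honour that contract (the code-buffer rewind is elided):

    /* Sketch of the caller-side contract, not part of this patch. */
    TranslationBlock *existing;

    existing = tb_link_page(tb, phys_pc, phys_page2);
    if (existing != tb) {
        /*
         * Another vCPU already published an equivalent TB while we were
         * generating ours: discard the new one (the real caller also
         * rewinds the code generation buffer) and execute the winner.
         */
        tb = existing;
    }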
586
+
587
+/*
588
+ * @p must be non-NULL.
589
+ * user-mode: call with mmap_lock held.
590
+ * !user-mode: call with all @pages locked.
591
+ */
592
+static void
593
+tb_invalidate_phys_page_range__locked(struct page_collection *pages,
594
+ PageDesc *p, tb_page_addr_t start,
595
+ tb_page_addr_t end,
596
+ uintptr_t retaddr)
597
+{
598
+ TranslationBlock *tb;
599
+ tb_page_addr_t tb_start, tb_end;
600
+ int n;
601
+#ifdef TARGET_HAS_PRECISE_SMC
602
+ CPUState *cpu = current_cpu;
603
+ CPUArchState *env = NULL;
604
+ bool current_tb_not_found = retaddr != 0;
605
+ bool current_tb_modified = false;
606
+ TranslationBlock *current_tb = NULL;
607
+ target_ulong current_pc = 0;
608
+ target_ulong current_cs_base = 0;
609
+ uint32_t current_flags = 0;
610
+#endif /* TARGET_HAS_PRECISE_SMC */
611
+
612
+ assert_page_locked(p);
613
+
614
+#if defined(TARGET_HAS_PRECISE_SMC)
615
+ if (cpu != NULL) {
616
+ env = cpu->env_ptr;
617
+ }
618
+#endif
619
+
620
+ /*
621
+ * We remove all the TBs in the range [start, end[.
622
+ * XXX: see if in some cases it could be faster to invalidate all the code
623
+ */
624
+ PAGE_FOR_EACH_TB(p, tb, n) {
625
+ assert_page_locked(p);
626
+ /* NOTE: this is subtle as a TB may span two physical pages */
627
+ if (n == 0) {
628
+ /* NOTE: tb_end may be after the end of the page, but
629
+ it is not a problem */
630
+ tb_start = tb->page_addr[0];
631
+ tb_end = tb_start + tb->size;
632
+ } else {
633
+ tb_start = tb->page_addr[1];
634
+ tb_end = tb_start + ((tb->page_addr[0] + tb->size)
635
+ & ~TARGET_PAGE_MASK);
636
+ }
637
+ if (!(tb_end <= start || tb_start >= end)) {
638
+#ifdef TARGET_HAS_PRECISE_SMC
639
+ if (current_tb_not_found) {
640
+ current_tb_not_found = false;
641
+ /* now we have a real cpu fault */
642
+ current_tb = tcg_tb_lookup(retaddr);
643
+ }
644
+ if (current_tb == tb &&
645
+ (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
646
+ /*
647
+ * If we are modifying the current TB, we must stop
648
+ * its execution. We could be more precise by checking
649
+ * that the modification is after the current PC, but it
650
+ * would require a specialized function to partially
651
+ * restore the CPU state.
652
+ */
653
+ current_tb_modified = true;
654
+ cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
655
+ cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
656
+ &current_flags);
657
+ }
658
+#endif /* TARGET_HAS_PRECISE_SMC */
659
+ tb_phys_invalidate__locked(tb);
660
+ }
661
+ }
662
+#if !defined(CONFIG_USER_ONLY)
663
+ /* if no code remaining, no need to continue to use slow writes */
664
+ if (!p->first_tb) {
665
+ tlb_unprotect_code(start);
666
+ }
667
+#endif
668
+#ifdef TARGET_HAS_PRECISE_SMC
669
+ if (current_tb_modified) {
670
+ page_collection_unlock(pages);
671
+ /* Force execution of one insn next time. */
672
+ cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
673
+ mmap_unlock();
674
+ cpu_loop_exit_noexc(cpu);
675
+ }
676
+#endif
677
+}
678
+
679
+/*
680
+ * Invalidate all TBs which intersect with the target physical address range
681
+ * [start;end[. NOTE: start and end must refer to the *same* physical page.
682
+ * 'is_cpu_write_access' should be true if called from a real cpu write
683
+ * access: the virtual CPU will exit the current TB if code is modified inside
684
+ * this TB.
685
+ *
686
+ * Called with mmap_lock held for user-mode emulation
687
+ */
688
+void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
689
+{
690
+ struct page_collection *pages;
691
+ PageDesc *p;
692
+
693
+ assert_memory_lock();
694
+
695
+ p = page_find(start >> TARGET_PAGE_BITS);
696
+ if (p == NULL) {
697
+ return;
698
+ }
699
+ pages = page_collection_lock(start, end);
700
+ tb_invalidate_phys_page_range__locked(pages, p, start, end, 0);
701
+ page_collection_unlock(pages);
702
+}
703
+
704
+/*
705
+ * Invalidate all TBs which intersect with the target physical address range
706
+ * [start;end[. NOTE: start and end may refer to *different* physical pages.
707
+ * 'is_cpu_write_access' should be true if called from a real cpu write
708
+ * access: the virtual CPU will exit the current TB if code is modified inside
709
+ * this TB.
710
+ *
711
+ * Called with mmap_lock held for user-mode emulation.
712
+ */
713
+#ifdef CONFIG_SOFTMMU
714
+void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end)
715
+#else
716
+void tb_invalidate_phys_range(target_ulong start, target_ulong end)
717
+#endif
718
+{
719
+ struct page_collection *pages;
720
+ tb_page_addr_t next;
721
+
722
+ assert_memory_lock();
723
+
724
+ pages = page_collection_lock(start, end);
725
+ for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
726
+ start < end;
727
+ start = next, next += TARGET_PAGE_SIZE) {
728
+ PageDesc *pd = page_find(start >> TARGET_PAGE_BITS);
729
+ tb_page_addr_t bound = MIN(next, end);
730
+
731
+ if (pd == NULL) {
732
+ continue;
733
+ }
734
+ tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0);
735
+ }
736
+ page_collection_unlock(pages);
737
+}
738
+
739
+#ifdef CONFIG_SOFTMMU
740
+/*
741
+ * len must be <= 8 and start must be a multiple of len.
742
+ * Called via softmmu_template.h when code areas are written to with
743
+ * iothread mutex not held.
744
+ *
745
+ * Call with all @pages in the range [@start, @start + len[ locked.
746
+ */
747
+void tb_invalidate_phys_page_fast(struct page_collection *pages,
748
+ tb_page_addr_t start, int len,
749
+ uintptr_t retaddr)
750
+{
751
+ PageDesc *p;
752
+
753
+ assert_memory_lock();
754
+
755
+ p = page_find(start >> TARGET_PAGE_BITS);
756
+ if (!p) {
757
+ return;
758
+ }
759
+
760
+ assert_page_locked(p);
761
+ tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
762
+ retaddr);
763
+}
764
+#else
765
+/*
766
+ * Called with mmap_lock held. If pc is not 0 then it indicates the
767
+ * host PC of the faulting store instruction that caused this invalidate.
768
+ * Returns true if the caller needs to abort execution of the current
769
+ * TB (because it was modified by this store and the guest CPU has
770
+ * precise-SMC semantics).
771
+ */
772
+bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
773
+{
774
+ TranslationBlock *tb;
775
+ PageDesc *p;
776
+ int n;
777
+#ifdef TARGET_HAS_PRECISE_SMC
778
+ TranslationBlock *current_tb = NULL;
779
+ CPUState *cpu = current_cpu;
780
+ CPUArchState *env = NULL;
781
+ int current_tb_modified = 0;
782
+ target_ulong current_pc = 0;
783
+ target_ulong current_cs_base = 0;
784
+ uint32_t current_flags = 0;
785
+#endif
786
+
787
+ assert_memory_lock();
788
+
789
+ addr &= TARGET_PAGE_MASK;
790
+ p = page_find(addr >> TARGET_PAGE_BITS);
791
+ if (!p) {
792
+ return false;
793
+ }
794
+
795
+#ifdef TARGET_HAS_PRECISE_SMC
796
+ if (p->first_tb && pc != 0) {
797
+ current_tb = tcg_tb_lookup(pc);
798
+ }
799
+ if (cpu != NULL) {
800
+ env = cpu->env_ptr;
801
+ }
802
+#endif
803
+ assert_page_locked(p);
804
+ PAGE_FOR_EACH_TB(p, tb, n) {
805
+#ifdef TARGET_HAS_PRECISE_SMC
806
+ if (current_tb == tb &&
807
+ (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
808
+ /*
809
+ * If we are modifying the current TB, we must stop its execution.
810
+ * We could be more precise by checking that the modification is
811
+ * after the current PC, but it would require a specialized
812
+ * function to partially restore the CPU state.
813
+ */
814
+ current_tb_modified = 1;
815
+ cpu_restore_state_from_tb(cpu, current_tb, pc, true);
816
+ cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
817
+ &current_flags);
818
+ }
819
+#endif /* TARGET_HAS_PRECISE_SMC */
820
+ tb_phys_invalidate(tb, addr);
821
+ }
822
+ p->first_tb = (uintptr_t)NULL;
823
+#ifdef TARGET_HAS_PRECISE_SMC
824
+ if (current_tb_modified) {
825
+ /* Force execution of one insn next time. */
826
+ cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
827
+ return true;
828
+ }
829
+#endif
830
+
831
+ return false;
114
+ return false;
832
+}
115
+}
833
+#endif
116
+#endif
834
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
117
+
118
+/* Called both as constructor and (possibly) via other constructors. */
119
+unsigned __attribute__((constructor)) cpuinfo_init(void)
120
+{
121
+ unsigned info = cpuinfo;
122
+
123
+ if (info) {
124
+ return info;
125
+ }
126
+
127
+ info = CPUINFO_ALWAYS;
128
+
129
+#ifdef CONFIG_LINUX
130
+ unsigned long hwcap = qemu_getauxval(AT_HWCAP);
131
+ info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
132
+ info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
133
+#endif
134
+#ifdef CONFIG_DARWIN
135
+ info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
136
+ info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
137
+#endif
138
+
139
+ cpuinfo = info;
140
+ return info;
141
+}
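The constructor above probes the host once and caches the feature bits in the global 'cpuinfo' word; since it returns the cached value when already initialized, early callers may also invoke it directly. A small usage sketch (the include path and the CPUINFO_LSE2 test are assumptions based on the names used above):

    #include "host/cpuinfo.h"   /* assumed location of cpuinfo / CPUINFO_* */

    static bool host_has_lse2(void)
    {
        /* cpuinfo may still be 0 if no constructor ran; fall back to init. */
        unsigned info = cpuinfo ? cpuinfo : cpuinfo_init();
        return (info & CPUINFO_LSE2) != 0;
    }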
142
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
835
index XXXXXXX..XXXXXXX 100644
143
index XXXXXXX..XXXXXXX 100644
836
--- a/accel/tcg/translate-all.c
144
--- a/tcg/aarch64/tcg-target.c.inc
837
+++ b/accel/tcg/translate-all.c
145
+++ b/tcg/aarch64/tcg-target.c.inc
838
@@ -XXX,XX +XXX,XX @@ struct page_collection {
146
@@ -XXX,XX +XXX,XX @@
839
struct page_entry *max;
147
#include "../tcg-ldst.c.inc"
840
};
148
#include "../tcg-pool.c.inc"
841
149
#include "qemu/bitops.h"
842
-/* list iterators for lists of tagged pointers in TranslationBlock */
150
-#ifdef __linux__
843
-#define TB_FOR_EACH_TAGGED(head, tb, n, field) \
151
-#include <asm/hwcap.h>
844
- for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \
152
-#endif
845
- tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
153
-#ifdef CONFIG_DARWIN
846
- tb = (TranslationBlock *)((uintptr_t)tb & ~1))
154
-#include <sys/sysctl.h>
847
-
155
-#endif
848
-#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \
156
849
- TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
157
/* We're going to re-use TCGType in setting of the SF bit, which controls
850
-
158
the size of the operation performed. If we know the values match, it
851
-#define TB_FOR_EACH_JMP(head_tb, tb, n) \
159
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
852
- TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
160
return TCG_REG_X0 + slot;
853
-
854
/*
855
* In system mode we want L1_MAP to be based on ram offsets,
856
* while in user mode we want it to be based on virtual addresses.
857
@@ -XXX,XX +XXX,XX @@ struct page_collection {
858
# define L1_MAP_ADDR_SPACE_BITS MIN(HOST_LONG_BITS, TARGET_ABI_BITS)
859
#endif
860
861
-/* Size of the L2 (and L3, etc) page tables. */
862
-#define V_L2_BITS 10
863
-#define V_L2_SIZE (1 << V_L2_BITS)
864
-
865
/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
866
QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
867
sizeof_field(TranslationBlock, trace_vcpu_dstate)
868
@@ -XXX,XX +XXX,XX @@ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
869
/*
870
* L1 Mapping properties
871
*/
872
-static int v_l1_size;
873
-static int v_l1_shift;
874
-static int v_l2_levels;
875
+int v_l1_size;
876
+int v_l1_shift;
877
+int v_l2_levels;
878
879
-/* The bottom level has pointers to PageDesc, and is indexed by
880
- * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
881
- */
882
-#define V_L1_MIN_BITS 4
883
-#define V_L1_MAX_BITS (V_L2_BITS + 3)
884
-#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
885
-
886
-static void *l1_map[V_L1_MAX_SIZE];
887
+void *l1_map[V_L1_MAX_SIZE];
888
889
TBContext tb_ctx;
890
891
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
892
* When reset_icount is true, current TB will be interrupted and
893
* icount should be recalculated.
894
*/
895
-static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
896
- uintptr_t searched_pc, bool reset_icount)
897
+int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
898
+ uintptr_t searched_pc, bool reset_icount)
899
{
900
target_ulong data[TARGET_INSN_START_WORDS];
901
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
902
@@ -XXX,XX +XXX,XX @@ PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
903
return pd + (index & (V_L2_SIZE - 1));
904
}
161
}
905
162
906
-static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
163
-bool have_lse;
907
- PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc);
164
-bool have_lse2;
908
-
165
-
909
/* In user-mode page locks aren't used; mmap_lock is enough */
166
#define TCG_REG_TMP TCG_REG_X30
910
#ifdef CONFIG_USER_ONLY
167
#define TCG_VEC_TMP TCG_REG_V31
911
-
168
912
-#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
169
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
913
-
914
-static inline void page_lock(PageDesc *pd)
915
-{ }
916
-
917
-static inline void page_unlock(PageDesc *pd)
918
-{ }
919
-
920
-static inline void page_lock_tb(const TranslationBlock *tb)
921
-{ }
922
-
923
-static inline void page_unlock_tb(const TranslationBlock *tb)
924
-{ }
925
-
926
struct page_collection *
927
page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
928
{
929
@@ -XXX,XX +XXX,XX @@ static void page_unlock__debug(const PageDesc *pd)
930
g_assert(removed);
931
}
932
933
-static void
934
-do_assert_page_locked(const PageDesc *pd, const char *file, int line)
935
+void do_assert_page_locked(const PageDesc *pd, const char *file, int line)
936
{
937
if (unlikely(!page_is_locked(pd))) {
938
error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
939
@@ -XXX,XX +XXX,XX @@ do_assert_page_locked(const PageDesc *pd, const char *file, int line)
940
}
170
}
941
}
171
}
942
172
943
-#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
173
-#ifdef CONFIG_DARWIN
944
-
174
-static bool sysctl_for_bool(const char *name)
945
void assert_no_pages_locked(void)
946
{
947
ht_pages_locked_debug_init();
948
@@ -XXX,XX +XXX,XX @@ void assert_no_pages_locked(void)
949
950
#else /* !CONFIG_DEBUG_TCG */
951
952
-#define assert_page_locked(pd)
953
-
954
-static inline void page_lock__debug(const PageDesc *pd)
955
-{
175
-{
956
-}
176
- int val = 0;
957
-
177
- size_t len = sizeof(val);
958
-static inline void page_unlock__debug(const PageDesc *pd)
178
-
959
-{
179
- if (sysctlbyname(name, &val, &len, NULL, 0) == 0) {
960
-}
180
- return val != 0;
961
+static inline void page_lock__debug(const PageDesc *pd) { }
962
+static inline void page_unlock__debug(const PageDesc *pd) { }
963
964
#endif /* CONFIG_DEBUG_TCG */
965
966
-static inline void page_lock(PageDesc *pd)
967
+void page_lock(PageDesc *pd)
968
{
969
page_lock__debug(pd);
970
qemu_spin_lock(&pd->lock);
971
}
972
973
-static inline void page_unlock(PageDesc *pd)
974
+void page_unlock(PageDesc *pd)
975
{
976
qemu_spin_unlock(&pd->lock);
977
page_unlock__debug(pd);
978
}
979
980
-/* lock the page(s) of a TB in the correct acquisition order */
981
-static inline void page_lock_tb(const TranslationBlock *tb)
982
-{
983
- page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false);
984
-}
985
-
986
-static inline void page_unlock_tb(const TranslationBlock *tb)
987
-{
988
- PageDesc *p1 = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
989
-
990
- page_unlock(p1);
991
- if (unlikely(tb->page_addr[1] != -1)) {
992
- PageDesc *p2 = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
993
-
994
- if (p2 != p1) {
995
- page_unlock(p2);
996
- }
997
- }
181
- }
998
-}
182
-
999
-
183
- /*
1000
static inline struct page_entry *
184
- * We might in the future ask for properties not present in older kernels,
1001
page_entry_new(PageDesc *pd, tb_page_addr_t index)
185
- * but we're only asking about static properties, all of which should be
1002
{
186
- * 'int'. So we shouln't see ENOMEM (val too small), or any of the other
1003
@@ -XXX,XX +XXX,XX @@ void page_collection_unlock(struct page_collection *set)
187
- * more exotic errors.
1004
1005
#endif /* !CONFIG_USER_ONLY */
1006
1007
-static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
1008
- PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
1009
-{
1010
- PageDesc *p1, *p2;
1011
- tb_page_addr_t page1;
1012
- tb_page_addr_t page2;
1013
-
1014
- assert_memory_lock();
1015
- g_assert(phys1 != -1);
1016
-
1017
- page1 = phys1 >> TARGET_PAGE_BITS;
1018
- page2 = phys2 >> TARGET_PAGE_BITS;
1019
-
1020
- p1 = page_find_alloc(page1, alloc);
1021
- if (ret_p1) {
1022
- *ret_p1 = p1;
1023
- }
1024
- if (likely(phys2 == -1)) {
1025
- page_lock(p1);
1026
- return;
1027
- } else if (page1 == page2) {
1028
- page_lock(p1);
1029
- if (ret_p2) {
1030
- *ret_p2 = p1;
1031
- }
1032
- return;
1033
- }
1034
- p2 = page_find_alloc(page2, alloc);
1035
- if (ret_p2) {
1036
- *ret_p2 = p2;
1037
- }
1038
- if (page1 < page2) {
1039
- page_lock(p1);
1040
- page_lock(p2);
1041
- } else {
1042
- page_lock(p2);
1043
- page_lock(p1);
1044
- }
1045
-}
1046
-
1047
-static bool tb_cmp(const void *ap, const void *bp)
1048
-{
1049
- const TranslationBlock *a = ap;
1050
- const TranslationBlock *b = bp;
1051
-
1052
- return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) &&
1053
- a->cs_base == b->cs_base &&
1054
- a->flags == b->flags &&
1055
- (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
1056
- a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
1057
- a->page_addr[0] == b->page_addr[0] &&
1058
- a->page_addr[1] == b->page_addr[1]);
1059
-}
1060
-
1061
-void tb_htable_init(void)
1062
-{
1063
- unsigned int mode = QHT_MODE_AUTO_RESIZE;
1064
-
1065
- qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
1066
-}
1067
-
1068
-/* Set to NULL all the 'first_tb' fields in all PageDescs. */
1069
-static void page_flush_tb_1(int level, void **lp)
1070
-{
1071
- int i;
1072
-
1073
- if (*lp == NULL) {
1074
- return;
1075
- }
1076
- if (level == 0) {
1077
- PageDesc *pd = *lp;
1078
-
1079
- for (i = 0; i < V_L2_SIZE; ++i) {
1080
- page_lock(&pd[i]);
1081
- pd[i].first_tb = (uintptr_t)NULL;
1082
- page_unlock(&pd[i]);
1083
- }
1084
- } else {
1085
- void **pp = *lp;
1086
-
1087
- for (i = 0; i < V_L2_SIZE; ++i) {
1088
- page_flush_tb_1(level - 1, pp + i);
1089
- }
1090
- }
1091
-}
1092
-
1093
-static void page_flush_tb(void)
1094
-{
1095
- int i, l1_sz = v_l1_size;
1096
-
1097
- for (i = 0; i < l1_sz; i++) {
1098
- page_flush_tb_1(v_l2_levels, l1_map + i);
1099
- }
1100
-}
1101
-
1102
-/* flush all the translation blocks */
1103
-static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
1104
-{
1105
- bool did_flush = false;
1106
-
1107
- mmap_lock();
1108
- /* If it is already been done on request of another CPU,
1109
- * just retry.
1110
- */
188
- */
1111
- if (tb_ctx.tb_flush_count != tb_flush_count.host_int) {
189
- assert(errno == ENOENT);
1112
- goto done;
1113
- }
1114
- did_flush = true;
1115
-
1116
- CPU_FOREACH(cpu) {
1117
- tcg_flush_jmp_cache(cpu);
1118
- }
1119
-
1120
- qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
1121
- page_flush_tb();
1122
-
1123
- tcg_region_reset_all();
1124
- /* XXX: flush processor icache at this point if cache flush is
1125
- expensive */
1126
- qatomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
1127
-
1128
-done:
1129
- mmap_unlock();
1130
- if (did_flush) {
1131
- qemu_plugin_flush_cb();
1132
- }
1133
-}
1134
-
1135
-void tb_flush(CPUState *cpu)
1136
-{
1137
- if (tcg_enabled()) {
1138
- unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);
1139
-
1140
- if (cpu_in_exclusive_context(cpu)) {
1141
- do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
1142
- } else {
1143
- async_safe_run_on_cpu(cpu, do_tb_flush,
1144
- RUN_ON_CPU_HOST_INT(tb_flush_count));
1145
- }
1146
- }
1147
-}
1148
-
1149
-/*
1150
- * user-mode: call with mmap_lock held
1151
- * !user-mode: call with @pd->lock held
1152
- */
1153
-static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
1154
-{
1155
- TranslationBlock *tb1;
1156
- uintptr_t *pprev;
1157
- unsigned int n1;
1158
-
1159
- assert_page_locked(pd);
1160
- pprev = &pd->first_tb;
1161
- PAGE_FOR_EACH_TB(pd, tb1, n1) {
1162
- if (tb1 == tb) {
1163
- *pprev = tb1->page_next[n1];
1164
- return;
1165
- }
1166
- pprev = &tb1->page_next[n1];
1167
- }
1168
- g_assert_not_reached();
1169
-}
1170
-
1171
-/* remove @orig from its @n_orig-th jump list */
1172
-static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
1173
-{
1174
- uintptr_t ptr, ptr_locked;
1175
- TranslationBlock *dest;
1176
- TranslationBlock *tb;
1177
- uintptr_t *pprev;
1178
- int n;
1179
-
1180
- /* mark the LSB of jmp_dest[] so that no further jumps can be inserted */
1181
- ptr = qatomic_or_fetch(&orig->jmp_dest[n_orig], 1);
1182
- dest = (TranslationBlock *)(ptr & ~1);
1183
- if (dest == NULL) {
1184
- return;
1185
- }
1186
-
1187
- qemu_spin_lock(&dest->jmp_lock);
1188
- /*
1189
- * While acquiring the lock, the jump might have been removed if the
1190
- * destination TB was invalidated; check again.
1191
- */
1192
- ptr_locked = qatomic_read(&orig->jmp_dest[n_orig]);
1193
- if (ptr_locked != ptr) {
1194
- qemu_spin_unlock(&dest->jmp_lock);
1195
- /*
1196
- * The only possibility is that the jump was unlinked via
1197
- * tb_jump_unlink(dest). Seeing here another destination would be a bug,
1198
- * because we set the LSB above.
1199
- */
1200
- g_assert(ptr_locked == 1 && dest->cflags & CF_INVALID);
1201
- return;
1202
- }
1203
- /*
1204
- * We first acquired the lock, and since the destination pointer matches,
1205
- * we know for sure that @orig is in the jmp list.
1206
- */
1207
- pprev = &dest->jmp_list_head;
1208
- TB_FOR_EACH_JMP(dest, tb, n) {
1209
- if (tb == orig && n == n_orig) {
1210
- *pprev = tb->jmp_list_next[n];
1211
- /* no need to set orig->jmp_dest[n]; setting the LSB was enough */
1212
- qemu_spin_unlock(&dest->jmp_lock);
1213
- return;
1214
- }
1215
- pprev = &tb->jmp_list_next[n];
1216
- }
1217
- g_assert_not_reached();
1218
-}
1219
-
1220
-/* reset the jump entry 'n' of a TB so that it is not chained to
1221
- another TB */
1222
-static inline void tb_reset_jump(TranslationBlock *tb, int n)
1223
-{
1224
- uintptr_t addr = (uintptr_t)(tb->tc.ptr + tb->jmp_reset_offset[n]);
1225
- tb_set_jmp_target(tb, n, addr);
1226
-}
1227
-
1228
-/* remove any jumps to the TB */
1229
-static inline void tb_jmp_unlink(TranslationBlock *dest)
1230
-{
1231
- TranslationBlock *tb;
1232
- int n;
1233
-
1234
- qemu_spin_lock(&dest->jmp_lock);
1235
-
1236
- TB_FOR_EACH_JMP(dest, tb, n) {
1237
- tb_reset_jump(tb, n);
1238
- qatomic_and(&tb->jmp_dest[n], (uintptr_t)NULL | 1);
1239
- /* No need to clear the list entry; setting the dest ptr is enough */
1240
- }
1241
- dest->jmp_list_head = (uintptr_t)NULL;
1242
-
1243
- qemu_spin_unlock(&dest->jmp_lock);
1244
-}
1245
-
1246
-static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
1247
-{
1248
- CPUState *cpu;
1249
-
1250
- if (TARGET_TB_PCREL) {
1251
- /* A TB may be at any virtual address */
1252
- CPU_FOREACH(cpu) {
1253
- tcg_flush_jmp_cache(cpu);
1254
- }
1255
- } else {
1256
- uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb));
1257
-
1258
- CPU_FOREACH(cpu) {
1259
- CPUJumpCache *jc = cpu->tb_jmp_cache;
1260
-
1261
- if (qatomic_read(&jc->array[h].tb) == tb) {
1262
- qatomic_set(&jc->array[h].tb, NULL);
1263
- }
1264
- }
1265
- }
1266
-}
1267
-
1268
-/*
1269
- * In user-mode, call with mmap_lock held.
1270
- * In !user-mode, if @rm_from_page_list is set, call with the TB's pages'
1271
- * locks held.
1272
- */
1273
-static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
1274
-{
1275
- PageDesc *p;
1276
- uint32_t h;
1277
- tb_page_addr_t phys_pc;
1278
- uint32_t orig_cflags = tb_cflags(tb);
1279
-
1280
- assert_memory_lock();
1281
-
1282
- /* make sure no further incoming jumps will be chained to this TB */
1283
- qemu_spin_lock(&tb->jmp_lock);
1284
- qatomic_set(&tb->cflags, tb->cflags | CF_INVALID);
1285
- qemu_spin_unlock(&tb->jmp_lock);
1286
-
1287
- /* remove the TB from the hash list */
1288
- phys_pc = tb->page_addr[0];
1289
- h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
1290
- tb->flags, orig_cflags, tb->trace_vcpu_dstate);
1291
- if (!qht_remove(&tb_ctx.htable, tb, h)) {
1292
- return;
1293
- }
1294
-
1295
- /* remove the TB from the page list */
1296
- if (rm_from_page_list) {
1297
- p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
1298
- tb_page_remove(p, tb);
1299
- if (tb->page_addr[1] != -1) {
1300
- p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
1301
- tb_page_remove(p, tb);
1302
- }
1303
- }
1304
-
1305
- /* remove the TB from the hash list */
1306
- tb_jmp_cache_inval_tb(tb);
1307
-
1308
- /* suppress this TB from the two jump lists */
1309
- tb_remove_from_jmp_list(tb, 0);
1310
- tb_remove_from_jmp_list(tb, 1);
1311
-
1312
- /* suppress any remaining jumps to this TB */
1313
- tb_jmp_unlink(tb);
1314
-
1315
- qatomic_set(&tb_ctx.tb_phys_invalidate_count,
1316
- tb_ctx.tb_phys_invalidate_count + 1);
1317
-}
1318
-
1319
-static void tb_phys_invalidate__locked(TranslationBlock *tb)
1320
-{
1321
- qemu_thread_jit_write();
1322
- do_tb_phys_invalidate(tb, true);
1323
- qemu_thread_jit_execute();
1324
-}
1325
-
1326
-/* invalidate one TB
1327
- *
1328
- * Called with mmap_lock held in user-mode.
1329
- */
1330
-void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
1331
-{
1332
- if (page_addr == -1 && tb->page_addr[0] != -1) {
1333
- page_lock_tb(tb);
1334
- do_tb_phys_invalidate(tb, true);
1335
- page_unlock_tb(tb);
1336
- } else {
1337
- do_tb_phys_invalidate(tb, false);
1338
- }
1339
-}
1340
-
1341
-/* add the tb in the target page and protect it if necessary
1342
- *
1343
- * Called with mmap_lock held for user-mode emulation.
1344
- * Called with @p->lock held in !user-mode.
1345
- */
1346
-static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
1347
- unsigned int n, tb_page_addr_t page_addr)
1348
-{
1349
-#ifndef CONFIG_USER_ONLY
1350
- bool page_already_protected;
1351
-#endif
1352
-
1353
- assert_page_locked(p);
1354
-
1355
- tb->page_addr[n] = page_addr;
1356
- tb->page_next[n] = p->first_tb;
1357
-#ifndef CONFIG_USER_ONLY
1358
- page_already_protected = p->first_tb != (uintptr_t)NULL;
1359
-#endif
1360
- p->first_tb = (uintptr_t)tb | n;
1361
-
1362
-#if defined(CONFIG_USER_ONLY)
1363
- /* translator_loop() must have made all TB pages non-writable */
1364
- assert(!(p->flags & PAGE_WRITE));
1365
-#else
1366
- /* if some code is already present, then the pages are already
1367
- protected. So we handle the case where only the first TB is
1368
- allocated in a physical page */
1369
- if (!page_already_protected) {
1370
- tlb_protect_code(page_addr);
1371
- }
1372
-#endif
1373
-}
1374
-
1375
-/*
1376
- * Add a new TB and link it to the physical page tables. phys_page2 is
1377
- * (-1) to indicate that only one page contains the TB.
1378
- *
1379
- * Called with mmap_lock held for user-mode emulation.
1380
- *
1381
- * Returns a pointer @tb, or a pointer to an existing TB that matches @tb.
1382
- * Note that in !user-mode, another thread might have already added a TB
1383
- * for the same block of guest code that @tb corresponds to. In that case,
1384
- * the caller should discard the original @tb, and use instead the returned TB.
1385
- */
1386
-static TranslationBlock *
1387
-tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
1388
- tb_page_addr_t phys_page2)
1389
-{
1390
- PageDesc *p;
1391
- PageDesc *p2 = NULL;
1392
- void *existing_tb = NULL;
1393
- uint32_t h;
1394
-
1395
- assert_memory_lock();
1396
- tcg_debug_assert(!(tb->cflags & CF_INVALID));
1397
-
1398
- /*
1399
- * Add the TB to the page list, acquiring first the pages's locks.
1400
- * We keep the locks held until after inserting the TB in the hash table,
1401
- * so that if the insertion fails we know for sure that the TBs are still
1402
- * in the page descriptors.
1403
- * Note that inserting into the hash table first isn't an option, since
1404
- * we can only insert TBs that are fully initialized.
1405
- */
1406
- page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
1407
- tb_page_add(p, tb, 0, phys_pc);
1408
- if (p2) {
1409
- tb_page_add(p2, tb, 1, phys_page2);
1410
- } else {
1411
- tb->page_addr[1] = -1;
1412
- }
1413
-
1414
- /* add in the hash table */
1415
- h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
1416
- tb->flags, tb->cflags, tb->trace_vcpu_dstate);
1417
- qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
1418
-
1419
- /* remove TB from the page(s) if we couldn't insert it */
1420
- if (unlikely(existing_tb)) {
1421
- tb_page_remove(p, tb);
1422
- if (p2) {
1423
- tb_page_remove(p2, tb);
1424
- }
1425
- tb = existing_tb;
1426
- }
1427
-
1428
- if (p2 && p2 != p) {
1429
- page_unlock(p2);
1430
- }
1431
- page_unlock(p);
1432
- return tb;
1433
-}
1434
-
1435
/* Called with mmap_lock held for user mode emulation. */
1436
TranslationBlock *tb_gen_code(CPUState *cpu,
1437
target_ulong pc, target_ulong cs_base,
1438
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
1439
return tb;
1440
}
1441
1442
-/*
1443
- * @p must be non-NULL.
1444
- * user-mode: call with mmap_lock held.
1445
- * !user-mode: call with all @pages locked.
1446
- */
1447
-static void
1448
-tb_invalidate_phys_page_range__locked(struct page_collection *pages,
1449
- PageDesc *p, tb_page_addr_t start,
1450
- tb_page_addr_t end,
1451
- uintptr_t retaddr)
1452
-{
1453
- TranslationBlock *tb;
1454
- tb_page_addr_t tb_start, tb_end;
1455
- int n;
1456
-#ifdef TARGET_HAS_PRECISE_SMC
1457
- CPUState *cpu = current_cpu;
1458
- CPUArchState *env = NULL;
1459
- bool current_tb_not_found = retaddr != 0;
1460
- bool current_tb_modified = false;
1461
- TranslationBlock *current_tb = NULL;
1462
- target_ulong current_pc = 0;
1463
- target_ulong current_cs_base = 0;
1464
- uint32_t current_flags = 0;
1465
-#endif /* TARGET_HAS_PRECISE_SMC */
1466
-
1467
- assert_page_locked(p);
1468
-
1469
-#if defined(TARGET_HAS_PRECISE_SMC)
1470
- if (cpu != NULL) {
1471
- env = cpu->env_ptr;
1472
- }
1473
-#endif
1474
-
1475
- /* we remove all the TBs in the range [start, end[ */
1476
- /* XXX: see if in some cases it could be faster to invalidate all
1477
- the code */
1478
- PAGE_FOR_EACH_TB(p, tb, n) {
1479
- assert_page_locked(p);
1480
- /* NOTE: this is subtle as a TB may span two physical pages */
1481
- if (n == 0) {
1482
- /* NOTE: tb_end may be after the end of the page, but
1483
- it is not a problem */
1484
- tb_start = tb->page_addr[0];
1485
- tb_end = tb_start + tb->size;
1486
- } else {
1487
- tb_start = tb->page_addr[1];
1488
- tb_end = tb_start + ((tb->page_addr[0] + tb->size)
1489
- & ~TARGET_PAGE_MASK);
1490
- }
1491
- if (!(tb_end <= start || tb_start >= end)) {
1492
-#ifdef TARGET_HAS_PRECISE_SMC
1493
- if (current_tb_not_found) {
1494
- current_tb_not_found = false;
1495
- /* now we have a real cpu fault */
1496
- current_tb = tcg_tb_lookup(retaddr);
1497
- }
1498
- if (current_tb == tb &&
1499
- (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
1500
- /*
1501
- * If we are modifying the current TB, we must stop
1502
- * its execution. We could be more precise by checking
1503
- * that the modification is after the current PC, but it
1504
- * would require a specialized function to partially
1505
- * restore the CPU state.
1506
- */
1507
- current_tb_modified = true;
1508
- cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
1509
- cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
1510
- &current_flags);
1511
- }
1512
-#endif /* TARGET_HAS_PRECISE_SMC */
1513
- tb_phys_invalidate__locked(tb);
1514
- }
1515
- }
1516
-#if !defined(CONFIG_USER_ONLY)
1517
- /* if no code remaining, no need to continue to use slow writes */
1518
- if (!p->first_tb) {
1519
- tlb_unprotect_code(start);
1520
- }
1521
-#endif
1522
-#ifdef TARGET_HAS_PRECISE_SMC
1523
- if (current_tb_modified) {
1524
- page_collection_unlock(pages);
1525
- /* Force execution of one insn next time. */
1526
- cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
1527
- mmap_unlock();
1528
- cpu_loop_exit_noexc(cpu);
1529
- }
1530
-#endif
1531
-}
1532
-
1533
-/*
1534
- * Invalidate all TBs which intersect with the target physical address range
1535
- * [start;end[. NOTE: start and end must refer to the *same* physical page.
1536
- * 'is_cpu_write_access' should be true if called from a real cpu write
1537
- * access: the virtual CPU will exit the current TB if code is modified inside
1538
- * this TB.
1539
- *
1540
- * Called with mmap_lock held for user-mode emulation
1541
- */
1542
-void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
1543
-{
1544
- struct page_collection *pages;
1545
- PageDesc *p;
1546
-
1547
- assert_memory_lock();
1548
-
1549
- p = page_find(start >> TARGET_PAGE_BITS);
1550
- if (p == NULL) {
1551
- return;
1552
- }
1553
- pages = page_collection_lock(start, end);
1554
- tb_invalidate_phys_page_range__locked(pages, p, start, end, 0);
1555
- page_collection_unlock(pages);
1556
-}
1557
-
1558
-/*
1559
- * Invalidate all TBs which intersect with the target physical address range
1560
- * [start;end[. NOTE: start and end may refer to *different* physical pages.
1561
- * 'is_cpu_write_access' should be true if called from a real cpu write
1562
- * access: the virtual CPU will exit the current TB if code is modified inside
1563
- * this TB.
1564
- *
1565
- * Called with mmap_lock held for user-mode emulation.
1566
- */
1567
-#ifdef CONFIG_SOFTMMU
1568
-void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end)
1569
-#else
1570
-void tb_invalidate_phys_range(target_ulong start, target_ulong end)
1571
-#endif
1572
-{
1573
- struct page_collection *pages;
1574
- tb_page_addr_t next;
1575
-
1576
- assert_memory_lock();
1577
-
1578
- pages = page_collection_lock(start, end);
1579
- for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
1580
- start < end;
1581
- start = next, next += TARGET_PAGE_SIZE) {
1582
- PageDesc *pd = page_find(start >> TARGET_PAGE_BITS);
1583
- tb_page_addr_t bound = MIN(next, end);
1584
-
1585
- if (pd == NULL) {
1586
- continue;
1587
- }
1588
- tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0);
1589
- }
1590
- page_collection_unlock(pages);
1591
-}
1592
-
1593
-#ifdef CONFIG_SOFTMMU
1594
-/* len must be <= 8 and start must be a multiple of len.
1595
- * Called via softmmu_template.h when code areas are written to with
1596
- * iothread mutex not held.
1597
- *
1598
- * Call with all @pages in the range [@start, @start + len[ locked.
1599
- */
1600
-void tb_invalidate_phys_page_fast(struct page_collection *pages,
1601
- tb_page_addr_t start, int len,
1602
- uintptr_t retaddr)
1603
-{
1604
- PageDesc *p;
1605
-
1606
- assert_memory_lock();
1607
-
1608
- p = page_find(start >> TARGET_PAGE_BITS);
1609
- if (!p) {
1610
- return;
1611
- }
1612
-
1613
- assert_page_locked(p);
1614
- tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
1615
- retaddr);
1616
-}
1617
-#else
1618
-/* Called with mmap_lock held. If pc is not 0 then it indicates the
1619
- * host PC of the faulting store instruction that caused this invalidate.
1620
- * Returns true if the caller needs to abort execution of the current
1621
- * TB (because it was modified by this store and the guest CPU has
1622
- * precise-SMC semantics).
1623
- */
1624
-static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
1625
-{
1626
- TranslationBlock *tb;
1627
- PageDesc *p;
1628
- int n;
1629
-#ifdef TARGET_HAS_PRECISE_SMC
1630
- TranslationBlock *current_tb = NULL;
1631
- CPUState *cpu = current_cpu;
1632
- CPUArchState *env = NULL;
1633
- int current_tb_modified = 0;
1634
- target_ulong current_pc = 0;
1635
- target_ulong current_cs_base = 0;
1636
- uint32_t current_flags = 0;
1637
-#endif
1638
-
1639
- assert_memory_lock();
1640
-
1641
- addr &= TARGET_PAGE_MASK;
1642
- p = page_find(addr >> TARGET_PAGE_BITS);
1643
- if (!p) {
1644
- return false;
1645
- }
1646
-
1647
-#ifdef TARGET_HAS_PRECISE_SMC
1648
- if (p->first_tb && pc != 0) {
1649
- current_tb = tcg_tb_lookup(pc);
1650
- }
1651
- if (cpu != NULL) {
1652
- env = cpu->env_ptr;
1653
- }
1654
-#endif
1655
- assert_page_locked(p);
1656
- PAGE_FOR_EACH_TB(p, tb, n) {
1657
-#ifdef TARGET_HAS_PRECISE_SMC
1658
- if (current_tb == tb &&
1659
- (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
1660
- /* If we are modifying the current TB, we must stop
1661
- its execution. We could be more precise by checking
1662
- that the modification is after the current PC, but it
1663
- would require a specialized function to partially
1664
- restore the CPU state */
1665
-
1666
- current_tb_modified = 1;
1667
- cpu_restore_state_from_tb(cpu, current_tb, pc, true);
1668
- cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
1669
- &current_flags);
1670
- }
1671
-#endif /* TARGET_HAS_PRECISE_SMC */
1672
- tb_phys_invalidate(tb, addr);
1673
- }
1674
- p->first_tb = (uintptr_t)NULL;
1675
-#ifdef TARGET_HAS_PRECISE_SMC
1676
- if (current_tb_modified) {
1677
- /* Force execution of one insn next time. */
1678
- cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
1679
- return true;
1680
- }
1681
-#endif
1682
-
1683
- return false;
190
- return false;
1684
-}
191
-}
1685
-#endif
192
-#endif
1686
-
193
-
1687
/* user-mode: call with mmap_lock held */
194
static void tcg_target_init(TCGContext *s)
1688
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
1689
{
195
{
1690
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
196
-#ifdef __linux__
197
- unsigned long hwcap = qemu_getauxval(AT_HWCAP);
198
- have_lse = hwcap & HWCAP_ATOMICS;
199
- have_lse2 = hwcap & HWCAP_USCAT;
200
-#endif
201
-#ifdef CONFIG_DARWIN
202
- have_lse = sysctl_for_bool("hw.optional.arm.FEAT_LSE");
203
- have_lse2 = sysctl_for_bool("hw.optional.arm.FEAT_LSE2");
204
-#endif
205
-
206
tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
207
tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
208
tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
209
diff --git a/util/meson.build b/util/meson.build
1691
index XXXXXXX..XXXXXXX 100644
210
index XXXXXXX..XXXXXXX 100644
1692
--- a/accel/tcg/meson.build
211
--- a/util/meson.build
1693
+++ b/accel/tcg/meson.build
212
+++ b/util/meson.build
1694
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(files(
213
@@ -XXX,XX +XXX,XX @@ if have_block
1695
'tcg-all.c',
214
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
1696
'cpu-exec-common.c',
215
endif
1697
'cpu-exec.c',
216
1698
+ 'tb-maint.c',
217
-if cpu in ['x86', 'x86_64']
1699
'tcg-runtime-gvec.c',
218
+if cpu == 'aarch64'
1700
'tcg-runtime.c',
219
+ util_ss.add(files('cpuinfo-aarch64.c'))
1701
'translate-all.c',
220
+elif cpu in ['x86', 'x86_64']
221
util_ss.add(files('cpuinfo-i386.c'))
222
endif
1702
--
2.34.1
Use a constant target data allocation size for all pages.
This will be necessary to reduce overhead of page tracking.
Since TARGET_PAGE_DATA_SIZE is now required, we can use this
to omit data tracking for targets that don't require it.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h    | 9 ++++-----
 target/arm/cpu.h          | 8 ++++++++
 target/arm/internals.h    | 4 ----
 accel/tcg/translate-all.c | 8 ++++++--
 target/arm/mte_helper.c   | 3 +--
 5 files changed, 19 insertions(+), 13 deletions(-)

Separates the aarch64-specific portion into its own file.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/aarch64/host/atomic128-cas.h | 43 ++++++++++++++++++
 host/include/generic/host/atomic128-cas.h | 43 ++++++++++++++++++
 include/qemu/atomic128.h                  | 55 +----------------------
 3 files changed, 87 insertions(+), 54 deletions(-)
 create mode 100644 host/include/aarch64/host/atomic128-cas.h
 create mode 100644 host/include/generic/host/atomic128-cas.h
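For the first patch above, the new constant is easy to sanity-check: with LOG2_TAG_GRANULE = 4 there is one 4-bit allocation tag per 16-byte granule, so for a hypothetical 4 KiB target page TARGET_PAGE_DATA_SIZE works out to 4096 >> (4 + 1) = 128 bytes of out-of-band storage per page, i.e. two tags packed per byte. A compile-time check in the same spirit:

    /* Sketch: verify the tag-storage arithmetic for an assumed 4 KiB page. */
    #define LOG2_TAG_GRANULE_EXAMPLE 4
    #define PAGE_SIZE_EXAMPLE        4096
    _Static_assert(PAGE_SIZE_EXAMPLE >> (LOG2_TAG_GRANULE_EXAMPLE + 1) == 128,
                   "one 4-bit tag per 16-byte granule, two tags per byte");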
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
13
diff --git a/host/include/aarch64/host/atomic128-cas.h b/host/include/aarch64/host/atomic128-cas.h
17
index XXXXXXX..XXXXXXX 100644
14
new file mode 100644
18
--- a/include/exec/cpu-all.h
15
index XXXXXXX..XXXXXXX
19
+++ b/include/exec/cpu-all.h
16
--- /dev/null
20
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end);
17
+++ b/host/include/aarch64/host/atomic128-cas.h
21
int page_check_range(target_ulong start, target_ulong len, int flags);
18
@@ -XXX,XX +XXX,XX @@
22
19
+/*
23
/**
20
+ * SPDX-License-Identifier: GPL-2.0-or-later
24
- * page_alloc_target_data(address, size)
21
+ * Compare-and-swap for 128-bit atomic operations, AArch64 version.
25
+ * page_alloc_target_data(address)
22
+ *
26
* @address: guest virtual address
23
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
27
- * @size: size of data to allocate
24
+ *
28
*
25
+ * See docs/devel/atomics.rst for discussion about the guarantees each
29
- * Allocate @size bytes of out-of-band data to associate with the
26
+ * atomic primitive is meant to provide.
30
- * guest page at @address. If the page is not mapped, NULL will
27
+ */
31
+ * Allocate TARGET_PAGE_DATA_SIZE bytes of out-of-band data to associate
32
+ * with the guest page at @address. If the page is not mapped, NULL will
33
* be returned. If there is existing data associated with @address,
34
* no new memory will be allocated.
35
*
36
* The memory will be freed when the guest page is deallocated,
37
* e.g. with the munmap system call.
38
*/
39
-void *page_alloc_target_data(target_ulong address, size_t size);
40
+void *page_alloc_target_data(target_ulong address);
41
42
/**
43
* page_get_target_data(address)
44
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
45
index XXXXXXX..XXXXXXX 100644
46
--- a/target/arm/cpu.h
47
+++ b/target/arm/cpu.h
48
@@ -XXX,XX +XXX,XX @@ extern const uint64_t pred_esz_masks[5];
49
#define PAGE_MTE PAGE_TARGET_2
50
#define PAGE_TARGET_STICKY PAGE_MTE
51
52
+/* We associate one allocation tag per 16 bytes, the minimum. */
53
+#define LOG2_TAG_GRANULE 4
54
+#define TAG_GRANULE (1 << LOG2_TAG_GRANULE)
55
+
28
+
56
+#ifdef CONFIG_USER_ONLY
29
+#ifndef AARCH64_ATOMIC128_CAS_H
57
+#define TARGET_PAGE_DATA_SIZE (TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1))
30
+#define AARCH64_ATOMIC128_CAS_H
31
+
32
+/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
33
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
34
+#include "host/include/generic/host/atomic128-cas.h"
35
+#else
36
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
37
+{
38
+ uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
39
+ uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
40
+ uint64_t oldl, oldh;
41
+ uint32_t tmp;
42
+
43
+ asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
44
+ "cmp %[oldl], %[cmpl]\n\t"
45
+ "ccmp %[oldh], %[cmph], #0, eq\n\t"
46
+ "b.ne 1f\n\t"
47
+ "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
48
+ "cbnz %w[tmp], 0b\n"
49
+ "1:"
50
+ : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
51
+ [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
52
+ : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
53
+ [newl] "r"(newl), [newh] "r"(newh)
54
+ : "memory", "cc");
55
+
56
+ return int128_make128(oldl, oldh);
57
+}
58
+# define HAVE_CMPXCHG128 1
58
+#endif
59
+#endif
59
+
60
+
60
#ifdef TARGET_TAGGED_ADDRESSES
61
+#endif /* AARCH64_ATOMIC128_CAS_H */
61
/**
62
diff --git a/host/include/generic/host/atomic128-cas.h b/host/include/generic/host/atomic128-cas.h
62
* cpu_untagged_addr:
63
new file mode 100644
63
diff --git a/target/arm/internals.h b/target/arm/internals.h
64
index XXXXXXX..XXXXXXX
65
--- /dev/null
66
+++ b/host/include/generic/host/atomic128-cas.h
67
@@ -XXX,XX +XXX,XX @@
68
+/*
69
+ * SPDX-License-Identifier: GPL-2.0-or-later
70
+ * Compare-and-swap for 128-bit atomic operations, generic version.
71
+ *
72
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
73
+ *
74
+ * See docs/devel/atomics.rst for discussion about the guarantees each
75
+ * atomic primitive is meant to provide.
76
+ */
77
+
78
+#ifndef HOST_ATOMIC128_CAS_H
79
+#define HOST_ATOMIC128_CAS_H
80
+
81
+#if defined(CONFIG_ATOMIC128)
82
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
83
+{
84
+ Int128Alias r, c, n;
85
+
86
+ c.s = cmp;
87
+ n.s = new;
88
+ r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
89
+ return r.s;
90
+}
91
+# define HAVE_CMPXCHG128 1
92
+#elif defined(CONFIG_CMPXCHG128)
93
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
94
+{
95
+ Int128Alias r, c, n;
96
+
97
+ c.s = cmp;
98
+ n.s = new;
99
+ r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
100
+ return r.s;
101
+}
102
+# define HAVE_CMPXCHG128 1
103
+#else
104
+/* Fallback definition that must be optimized away, or error. */
105
+Int128 QEMU_ERROR("unsupported atomic")
106
+ atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new);
107
+# define HAVE_CMPXCHG128 0
108
+#endif
109
+
110
+#endif /* HOST_ATOMIC128_CAS_H */
111
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
64
index XXXXXXX..XXXXXXX 100644
112
index XXXXXXX..XXXXXXX 100644
65
--- a/target/arm/internals.h
113
--- a/include/qemu/atomic128.h
66
+++ b/target/arm/internals.h
114
+++ b/include/qemu/atomic128.h
67
@@ -XXX,XX +XXX,XX @@ void arm_log_exception(CPUState *cs);
115
@@ -XXX,XX +XXX,XX @@
116
* Therefore, special case each platform.
68
*/
117
*/
69
#define GMID_EL1_BS 6
118
70
119
-#if defined(CONFIG_ATOMIC128)
71
-/* We associate one allocation tag per 16 bytes, the minimum. */
120
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
72
-#define LOG2_TAG_GRANULE 4
121
-{
73
-#define TAG_GRANULE (1 << LOG2_TAG_GRANULE)
122
- Int128Alias r, c, n;
74
-
123
-
75
/*
124
- c.s = cmp;
76
* SVE predicates are 1/8 the size of SVE vectors, and cannot use
125
- n.s = new;
77
* the same simd_desc() encoding due to restrictions on size.
126
- r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
78
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
127
- return r.s;
79
index XXXXXXX..XXXXXXX 100644
128
-}
80
--- a/accel/tcg/translate-all.c
129
-# define HAVE_CMPXCHG128 1
81
+++ b/accel/tcg/translate-all.c
130
-#elif defined(CONFIG_CMPXCHG128)
82
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
131
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
83
132
-{
84
void page_reset_target_data(target_ulong start, target_ulong end)
133
- Int128Alias r, c, n;
85
{
134
-
86
+#ifdef TARGET_PAGE_DATA_SIZE
135
- c.s = cmp;
87
target_ulong addr, len;
136
- n.s = new;
88
137
- r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
89
/*
138
- return r.s;
90
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end)
139
-}
91
g_free(p->target_data);
140
-# define HAVE_CMPXCHG128 1
92
p->target_data = NULL;
141
-#elif defined(__aarch64__)
93
}
142
-/* Through gcc 8, aarch64 has no support for 128-bit at all. */
94
+#endif
143
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
95
}
144
-{
96
145
- uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
97
+#ifdef TARGET_PAGE_DATA_SIZE
146
- uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
98
void *page_get_target_data(target_ulong address)
147
- uint64_t oldl, oldh;
99
{
148
- uint32_t tmp;
100
PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
149
-
101
return p ? p->target_data : NULL;
150
- asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
102
}
151
- "cmp %[oldl], %[cmpl]\n\t"
103
152
- "ccmp %[oldh], %[cmph], #0, eq\n\t"
104
-void *page_alloc_target_data(target_ulong address, size_t size)
153
- "b.ne 1f\n\t"
105
+void *page_alloc_target_data(target_ulong address)
154
- "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
106
{
155
- "cbnz %w[tmp], 0b\n"
107
PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
156
- "1:"
108
void *ret = NULL;
157
- : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
109
@@ -XXX,XX +XXX,XX @@ void *page_alloc_target_data(target_ulong address, size_t size)
158
- [oldl] "=&r"(oldl), [oldh] "=&r"(oldh)
110
if (p->flags & PAGE_VALID) {
159
- : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
111
ret = p->target_data;
160
- [newl] "r"(newl), [newh] "r"(newh)
112
if (!ret) {
161
- : "memory", "cc");
113
- p->target_data = ret = g_malloc0(size);
162
-
114
+ p->target_data = ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
163
- return int128_make128(oldl, oldh);
115
}
164
-}
116
}
165
-# define HAVE_CMPXCHG128 1
117
return ret;
166
-#else
118
}
167
-/* Fallback definition that must be optimized away, or error. */
119
+#endif /* TARGET_PAGE_DATA_SIZE */
168
-Int128 QEMU_ERROR("unsupported atomic")
120
169
- atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new);
121
int page_check_range(target_ulong start, target_ulong len, int flags)
170
-# define HAVE_CMPXCHG128 0
122
{
171
-#endif /* Some definition for HAVE_CMPXCHG128 */
123
diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
172
-
124
index XXXXXXX..XXXXXXX 100644
173
+#include "host/atomic128-cas.h"
125
--- a/target/arm/mte_helper.c
174
126
+++ b/target/arm/mte_helper.c
175
#if defined(CONFIG_ATOMIC128)
127
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
176
static inline Int128 atomic16_read(Int128 *ptr)
128
129
tags = page_get_target_data(clean_ptr);
130
if (tags == NULL) {
131
- size_t alloc_size = TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1);
132
- tags = page_alloc_target_data(clean_ptr, alloc_size);
133
+ tags = page_alloc_target_data(clean_ptr);
134
assert(tags != NULL);
135
}
136
137
--
2.34.1
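As a usage note for the atomic128-cas patch above: atomic16_cmpxchg() returns the value that was actually in memory, so a caller detects success by comparing that against the expected value. A minimal sketch, assuming the Int128 helpers from qemu/int128.h:

    /* Sketch only: replace a 16-byte slot iff it still holds 'expected'. */
    static bool try_update16(Int128 *slot, Int128 expected, Int128 desired)
    {
        Int128 old = atomic16_cmpxchg(slot, expected, desired);
        return int128_eq(old, expected);
    }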
Rename to tb_invalidate_phys_page_unwind to emphasize that
we also detect invalidating the current TB, and also to free
up that name for other usage.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/internal.h      | 2 +-
 accel/tcg/tb-maint.c      | 2 +-
 accel/tcg/translate-all.c | 5 +++--
 3 files changed, 5 insertions(+), 4 deletions(-)

Separates the aarch64-specific portion into its own file.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/aarch64/host/atomic128-ldst.h | 49 ++++++++++++++
 host/include/generic/host/atomic128-ldst.h | 57 +++++++++++++++++
 include/qemu/atomic128.h                   | 74 +---------------------
 3 files changed, 107 insertions(+), 73 deletions(-)
 create mode 100644 host/include/aarch64/host/atomic128-ldst.h
 create mode 100644 host/include/generic/host/atomic128-ldst.h

diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
14
new file mode 100644
15
index XXXXXXX..XXXXXXX
16
--- /dev/null
17
+++ b/host/include/aarch64/host/atomic128-ldst.h
18
@@ -XXX,XX +XXX,XX @@
19
+/*
20
+ * SPDX-License-Identifier: GPL-2.0-or-later
21
+ * Load/store for 128-bit atomic operations, AArch64 version.
22
+ *
23
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
24
+ *
25
+ * See docs/devel/atomics.rst for discussion about the guarantees each
26
+ * atomic primitive is meant to provide.
27
+ */
28
+
29
+#ifndef AARCH64_ATOMIC128_LDST_H
30
+#define AARCH64_ATOMIC128_LDST_H
31
+
32
+/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
33
+#if !defined(CONFIG_ATOMIC128) && !defined(CONFIG_USER_ONLY)
34
+/* We can do better than cmpxchg for AArch64. */
35
+static inline Int128 atomic16_read(Int128 *ptr)
36
+{
37
+ uint64_t l, h;
38
+ uint32_t tmp;
39
+
40
+ /* The load must be paired with the store to guarantee not tearing. */
41
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
42
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
43
+ "cbnz %w[tmp], 0b"
44
+ : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
45
+
46
+ return int128_make128(l, h);
47
+}
48
+
49
+static inline void atomic16_set(Int128 *ptr, Int128 val)
50
+{
51
+ uint64_t l = int128_getlo(val), h = int128_gethi(val);
52
+ uint64_t t1, t2;
53
+
54
+ /* Load into temporaries to acquire the exclusive access lock. */
55
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
56
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
57
+ "cbnz %w[t1], 0b"
58
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
59
+ : [l] "r"(l), [h] "r"(h));
60
+}
61
+
62
+# define HAVE_ATOMIC128 1
63
+#else
64
+#include "host/include/generic/host/atomic128-ldst.h"
65
+#endif
66
+
67
+#endif /* AARCH64_ATOMIC128_LDST_H */
68
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
69
new file mode 100644
70
index XXXXXXX..XXXXXXX
71
--- /dev/null
72
+++ b/host/include/generic/host/atomic128-ldst.h
73
@@ -XXX,XX +XXX,XX @@
74
+/*
75
+ * SPDX-License-Identifier: GPL-2.0-or-later
76
+ * Load/store for 128-bit atomic operations, generic version.
77
+ *
78
+ * Copyright (C) 2018, 2023 Linaro, Ltd.
79
+ *
80
+ * See docs/devel/atomics.rst for discussion about the guarantees each
81
+ * atomic primitive is meant to provide.
82
+ */
83
+
84
+#ifndef HOST_ATOMIC128_LDST_H
85
+#define HOST_ATOMIC128_LDST_H
86
+
87
+#if defined(CONFIG_ATOMIC128)
88
+static inline Int128 atomic16_read(Int128 *ptr)
89
+{
90
+ Int128Alias r;
91
+
92
+ r.i = qatomic_read__nocheck((__int128_t *)ptr);
93
+ return r.s;
94
+}
95
+
96
+static inline void atomic16_set(Int128 *ptr, Int128 val)
97
+{
98
+ Int128Alias v;
99
+
100
+ v.s = val;
101
+ qatomic_set__nocheck((__int128_t *)ptr, v.i);
102
+}
103
+
104
+# define HAVE_ATOMIC128 1
105
+#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
106
+static inline Int128 atomic16_read(Int128 *ptr)
107
+{
108
+ /* Maybe replace 0 with 0, returning the old value. */
109
+ Int128 z = int128_make64(0);
110
+ return atomic16_cmpxchg(ptr, z, z);
111
+}
112
+
113
+static inline void atomic16_set(Int128 *ptr, Int128 val)
114
+{
115
+ Int128 old = *ptr, cmp;
116
+ do {
117
+ cmp = old;
118
+ old = atomic16_cmpxchg(ptr, cmp, val);
119
+ } while (int128_ne(old, cmp));
120
+}
121
+
122
+# define HAVE_ATOMIC128 1
123
+#else
124
+/* Fallback definitions that must be optimized away, or error. */
125
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
126
+void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
127
+# define HAVE_ATOMIC128 0
128
+#endif
129
+
130
+#endif /* HOST_ATOMIC128_LDST_H */
131
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
14
index XXXXXXX..XXXXXXX 100644
132
index XXXXXXX..XXXXXXX 100644
15
--- a/accel/tcg/internal.h
133
--- a/include/qemu/atomic128.h
16
+++ b/accel/tcg/internal.h
134
+++ b/include/qemu/atomic128.h
17
@@ -XXX,XX +XXX,XX @@ void tb_htable_init(void);
135
@@ -XXX,XX +XXX,XX @@
18
void tb_reset_jump(TranslationBlock *tb, int n);
19
TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
20
tb_page_addr_t phys_page2);
21
-bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc);
22
+bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc);
23
int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
24
uintptr_t searched_pc, bool reset_icount);
25
26
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
27
index XXXXXXX..XXXXXXX 100644
28
--- a/accel/tcg/tb-maint.c
29
+++ b/accel/tcg/tb-maint.c
30
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
31
* TB (because it was modified by this store and the guest CPU has
32
* precise-SMC semantics).
33
*/
136
*/
34
-bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
137
35
+bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
138
#include "host/atomic128-cas.h"
36
{
139
-
37
TranslationBlock *tb;
140
-#if defined(CONFIG_ATOMIC128)
38
PageDesc *p;
141
-static inline Int128 atomic16_read(Int128 *ptr)
39
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
142
-{
40
index XXXXXXX..XXXXXXX 100644
143
- Int128Alias r;
41
--- a/accel/tcg/translate-all.c
144
-
42
+++ b/accel/tcg/translate-all.c
145
- r.i = qatomic_read__nocheck((__int128_t *)ptr);
43
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
146
- return r.s;
44
if (!(p->flags & PAGE_WRITE) &&
147
-}
45
(flags & PAGE_WRITE) &&
148
-
46
p->first_tb) {
149
-static inline void atomic16_set(Int128 *ptr, Int128 val)
47
- tb_invalidate_phys_page(addr, 0);
150
-{
48
+ tb_invalidate_phys_page_unwind(addr, 0);
151
- Int128Alias v;
49
}
152
-
50
if (reset_target_data) {
153
- v.s = val;
51
g_free(p->target_data);
154
- qatomic_set__nocheck((__int128_t *)ptr, v.i);
52
@@ -XXX,XX +XXX,XX @@ int page_unprotect(target_ulong address, uintptr_t pc)
155
-}
53
156
-
54
/* and since the content will be modified, we must invalidate
157
-# define HAVE_ATOMIC128 1
55
the corresponding translated code. */
158
-#elif !defined(CONFIG_USER_ONLY) && defined(__aarch64__)
56
- current_tb_invalidated |= tb_invalidate_phys_page(addr, pc);
159
-/* We can do better than cmpxchg for AArch64. */
57
+ current_tb_invalidated |=
160
-static inline Int128 atomic16_read(Int128 *ptr)
58
+ tb_invalidate_phys_page_unwind(addr, pc);
161
-{
59
}
162
- uint64_t l, h;
60
mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
163
- uint32_t tmp;
61
prot & PAGE_BITS);
164
-
165
- /* The load must be paired with the store to guarantee not tearing. */
166
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
167
- "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
168
- "cbnz %w[tmp], 0b"
169
- : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
170
-
171
- return int128_make128(l, h);
172
-}
173
-
174
-static inline void atomic16_set(Int128 *ptr, Int128 val)
175
-{
176
- uint64_t l = int128_getlo(val), h = int128_gethi(val);
177
- uint64_t t1, t2;
178
-
179
- /* Load into temporaries to acquire the exclusive access lock. */
180
- asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
181
- "stxp %w[t1], %[l], %[h], %[mem]\n\t"
182
- "cbnz %w[t1], 0b"
183
- : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
184
- : [l] "r"(l), [h] "r"(h));
185
-}
186
-
187
-# define HAVE_ATOMIC128 1
188
-#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
189
-static inline Int128 atomic16_read(Int128 *ptr)
190
-{
191
- /* Maybe replace 0 with 0, returning the old value. */
192
- Int128 z = int128_make64(0);
193
- return atomic16_cmpxchg(ptr, z, z);
194
-}
195
-
196
-static inline void atomic16_set(Int128 *ptr, Int128 val)
197
-{
198
- Int128 old = *ptr, cmp;
199
- do {
200
- cmp = old;
201
- old = atomic16_cmpxchg(ptr, cmp, val);
202
- } while (int128_ne(old, cmp));
203
-}
204
-
205
-# define HAVE_ATOMIC128 1
206
-#else
207
-/* Fallback definitions that must be optimized away, or error. */
208
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
209
-void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
210
-# define HAVE_ATOMIC128 0
211
-#endif /* Some definition for HAVE_ATOMIC128 */
212
+#include "host/atomic128-ldst.h"
213
214
#endif /* QEMU_ATOMIC128_H */
62
--
215
--
63
2.34.1
216
2.34.1
64
217
65
218
1
We do not require detection of overlapping TBs here,
1
Silly typo: sizeof(16) != 16.
2
so use the more appropriate function.
3
2
3
Fixes: e61f1efeb730 ("meson: Detect atomic128 support with optimization")
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
accel/tcg/translate-all.c | 2 +-
8
meson.build | 2 +-
8
1 file changed, 1 insertion(+), 1 deletion(-)
9
1 file changed, 1 insertion(+), 1 deletion(-)
9
10
10
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
11
diff --git a/meson.build b/meson.build
11
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/translate-all.c
13
--- a/meson.build
13
+++ b/accel/tcg/translate-all.c
14
+++ b/meson.build
14
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
15
@@ -XXX,XX +XXX,XX @@ if has_int128
15
if (!(p->flags & PAGE_WRITE) &&
16
# __alignof(unsigned __int128) for the host.
16
(flags & PAGE_WRITE) &&
17
atomic_test_128 = '''
17
p->first_tb) {
18
int main(int ac, char **av) {
18
- tb_invalidate_phys_page_unwind(addr, 0);
19
- unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], sizeof(16));
19
+ tb_invalidate_phys_page(addr);
20
+ unsigned __int128 *p = __builtin_assume_aligned(av[ac - 1], 16);
20
}
21
p[1] = __atomic_load_n(&p[0], __ATOMIC_RELAXED);
21
if (reset_target_data) {
22
__atomic_store_n(&p[2], p[3], __ATOMIC_RELAXED);
22
g_free(p->target_data);
23
__atomic_compare_exchange_n(&p[4], &p[5], p[6], 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
23
--
24
--
24
2.34.1
25
2.34.1
25
26
26
27
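As an aside on the sizeof(16) fix above, here is a minimal standalone C snippet (not part of either series) showing what the typo actually evaluated to. sizeof applies to the type of the expression, not its value, so the alignment hint handed to __builtin_assume_aligned was sizeof(int), typically 4, rather than the intended 16.

    #include <stdio.h>

    int main(void)
    {
        /* sizeof(16) is the size of the literal's type (int), not 16. */
        printf("sizeof(16) = %zu\n", sizeof(16));   /* typically prints 4 */
        printf("intended   = %d\n", 16);            /* the alignment the test meant */
        return 0;
    }
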
1
Not only do the routines in ldst_atomicity.c.inc need markup,
2
but also the ones in the headers.
3
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
accel/tcg/internal.h | 31 +++++++++++++++++++++++++++++++
7
host/include/generic/host/atomic128-cas.h | 12 ++++++++----
5
accel/tcg/translate-all.c | 31 +------------------------------
8
host/include/generic/host/atomic128-ldst.h | 18 ++++++++++++------
6
2 files changed, 32 insertions(+), 30 deletions(-)
9
include/qemu/atomic128.h | 17 +++++++++++++++++
10
accel/tcg/ldst_atomicity.c.inc | 17 -----------------
11
4 files changed, 37 insertions(+), 27 deletions(-)
7
12
8
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
13
diff --git a/host/include/generic/host/atomic128-cas.h b/host/include/generic/host/atomic128-cas.h
9
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
10
--- a/accel/tcg/internal.h
15
--- a/host/include/generic/host/atomic128-cas.h
11
+++ b/accel/tcg/internal.h
16
+++ b/host/include/generic/host/atomic128-cas.h
12
@@ -XXX,XX +XXX,XX @@
17
@@ -XXX,XX +XXX,XX @@
13
18
#define HOST_ATOMIC128_CAS_H
14
#include "exec/exec-all.h"
19
20
#if defined(CONFIG_ATOMIC128)
21
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
22
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
23
+atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
24
{
25
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
26
Int128Alias r, c, n;
27
28
c.s = cmp;
29
n.s = new;
30
- r.i = qatomic_cmpxchg__nocheck((__int128_t *)ptr, c.i, n.i);
31
+ r.i = qatomic_cmpxchg__nocheck(ptr_align, c.i, n.i);
32
return r.s;
33
}
34
# define HAVE_CMPXCHG128 1
35
#elif defined(CONFIG_CMPXCHG128)
36
-static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
37
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
38
+atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
39
{
40
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
41
Int128Alias r, c, n;
42
43
c.s = cmp;
44
n.s = new;
45
- r.i = __sync_val_compare_and_swap_16((__int128_t *)ptr, c.i, n.i);
46
+ r.i = __sync_val_compare_and_swap_16(ptr_align, c.i, n.i);
47
return r.s;
48
}
49
# define HAVE_CMPXCHG128 1
50
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
51
index XXXXXXX..XXXXXXX 100644
52
--- a/host/include/generic/host/atomic128-ldst.h
53
+++ b/host/include/generic/host/atomic128-ldst.h
54
@@ -XXX,XX +XXX,XX @@
55
#define HOST_ATOMIC128_LDST_H
56
57
#if defined(CONFIG_ATOMIC128)
58
-static inline Int128 atomic16_read(Int128 *ptr)
59
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
60
+atomic16_read(Int128 *ptr)
61
{
62
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
63
Int128Alias r;
64
65
- r.i = qatomic_read__nocheck((__int128_t *)ptr);
66
+ r.i = qatomic_read__nocheck(ptr_align);
67
return r.s;
68
}
69
70
-static inline void atomic16_set(Int128 *ptr, Int128 val)
71
+static inline void ATTRIBUTE_ATOMIC128_OPT
72
+atomic16_set(Int128 *ptr, Int128 val)
73
{
74
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
75
Int128Alias v;
76
77
v.s = val;
78
- qatomic_set__nocheck((__int128_t *)ptr, v.i);
79
+ qatomic_set__nocheck(ptr_align, v.i);
80
}
81
82
# define HAVE_ATOMIC128 1
83
#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
84
-static inline Int128 atomic16_read(Int128 *ptr)
85
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
86
+atomic16_read(Int128 *ptr)
87
{
88
/* Maybe replace 0 with 0, returning the old value. */
89
Int128 z = int128_make64(0);
90
return atomic16_cmpxchg(ptr, z, z);
91
}
92
93
-static inline void atomic16_set(Int128 *ptr, Int128 val)
94
+static inline void ATTRIBUTE_ATOMIC128_OPT
95
+atomic16_set(Int128 *ptr, Int128 val)
96
{
97
Int128 old = *ptr, cmp;
98
do {
99
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/include/qemu/atomic128.h
102
+++ b/include/qemu/atomic128.h
103
@@ -XXX,XX +XXX,XX @@
104
105
#include "qemu/int128.h"
15
106
16
+/*
107
+/*
17
+ * Access to the various translations structures need to be serialised
108
+ * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
18
+ * via locks for consistency. In user-mode emulation access to the
109
+ * that are supported by the host, e.g. s390x. We can force the pointer to
19
+ * memory related structures are protected with mmap_lock.
110
+ * have our known alignment with __builtin_assume_aligned, however prior to
20
+ * In !user-mode we use per-page locks.
111
+ * GCC 13 that was only reliable with optimization enabled. See
112
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
21
+ */
113
+ */
22
+#ifdef CONFIG_SOFTMMU
114
+#if defined(CONFIG_ATOMIC128_OPT)
23
+#define assert_memory_lock()
115
+# if !defined(__OPTIMIZE__)
24
+#else
116
+# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1")))
25
+#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
117
+# endif
118
+# define CONFIG_ATOMIC128
119
+#endif
120
+#ifndef ATTRIBUTE_ATOMIC128_OPT
121
+# define ATTRIBUTE_ATOMIC128_OPT
26
+#endif
122
+#endif
27
+
123
+
28
+typedef struct PageDesc {
124
/*
29
+ /* list of TBs intersecting this ram page */
125
* GCC is a house divided about supporting large atomic operations.
30
+ uintptr_t first_tb;
126
*
31
+#ifdef CONFIG_USER_ONLY
127
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
32
+ unsigned long flags;
33
+ void *target_data;
34
+#endif
35
+#ifdef CONFIG_SOFTMMU
36
+ QemuSpin lock;
37
+#endif
38
+} PageDesc;
39
+
40
+PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc);
41
+
42
+static inline PageDesc *page_find(tb_page_addr_t index)
43
+{
44
+ return page_find_alloc(index, false);
45
+}
46
+
47
TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
48
target_ulong cs_base, uint32_t flags,
49
int cflags);
50
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
51
index XXXXXXX..XXXXXXX 100644
128
index XXXXXXX..XXXXXXX 100644
52
--- a/accel/tcg/translate-all.c
129
--- a/accel/tcg/ldst_atomicity.c.inc
53
+++ b/accel/tcg/translate-all.c
130
+++ b/accel/tcg/ldst_atomicity.c.inc
54
@@ -XXX,XX +XXX,XX @@
131
@@ -XXX,XX +XXX,XX @@
55
132
#endif
56
/* make various TB consistency checks */
133
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
57
134
58
-/* Access to the various translations structures need to be serialised via locks
135
-/*
59
- * for consistency.
136
- * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
60
- * In user-mode emulation access to the memory related structures are protected
137
- * that are supported by the host, e.g. s390x. We can force the pointer to
61
- * with mmap_lock.
138
- * have our known alignment with __builtin_assume_aligned, however prior to
62
- * In !user-mode we use per-page locks.
139
- * GCC 13 that was only reliable with optimization enabled. See
140
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
63
- */
141
- */
64
-#ifdef CONFIG_SOFTMMU
142
-#if defined(CONFIG_ATOMIC128_OPT)
65
-#define assert_memory_lock()
143
-# if !defined(__OPTIMIZE__)
66
-#else
144
-# define ATTRIBUTE_ATOMIC128_OPT __attribute__((optimize("O1")))
67
-#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
145
-# endif
146
-# define CONFIG_ATOMIC128
147
-#endif
148
-#ifndef ATTRIBUTE_ATOMIC128_OPT
149
-# define ATTRIBUTE_ATOMIC128_OPT
68
-#endif
150
-#endif
69
-
151
-
70
-typedef struct PageDesc {
152
#if defined(CONFIG_ATOMIC128)
71
- /* list of TBs intersecting this ram page */
153
# define HAVE_al16_fast true
72
- uintptr_t first_tb;
154
#else
73
-#ifdef CONFIG_USER_ONLY
74
- unsigned long flags;
75
- void *target_data;
76
-#endif
77
-#ifdef CONFIG_SOFTMMU
78
- QemuSpin lock;
79
-#endif
80
-} PageDesc;
81
-
82
/**
83
* struct page_entry - page descriptor entry
84
* @pd: pointer to the &struct PageDesc of the page this entry represents
85
@@ -XXX,XX +XXX,XX @@ void page_init(void)
86
#endif
87
}
88
89
-static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
90
+PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
91
{
92
PageDesc *pd;
93
void **lp;
94
@@ -XXX,XX +XXX,XX @@ static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
95
return pd + (index & (V_L2_SIZE - 1));
96
}
97
98
-static inline PageDesc *page_find(tb_page_addr_t index)
99
-{
100
- return page_find_alloc(index, false);
101
-}
102
-
103
static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
104
PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc);
105
106
--
155
--
107
2.34.1
156
2.34.1
108
157
109
158
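As an aside (not from either series), a minimal sketch of the pattern the markup patch above applies, assuming a host such as s390x where __alignof(unsigned __int128) is less than 16. QEMU wraps the same idea in ATTRIBUTE_ATOMIC128_OPT and the qatomic_*__nocheck helpers; the function name below is purely illustrative, and depending on the host the 16-byte atomic may need -mcx16 or -latomic to build.

    typedef unsigned __int128 u128;

    /*
     * Promise the compiler that ptr really is 16-byte aligned so that it can
     * inline the 16-byte atomic.  Prior to GCC 13 that promise was only
     * honored reliably with optimization enabled, hence forcing at least -O1
     * on this one function (gcc bug 107389).
     */
    u128 __attribute__((optimize("O1")))
    load16_aligned(u128 *ptr)
    {
        u128 *p16 = __builtin_assume_aligned(ptr, 16);
        return __atomic_load_n(p16, __ATOMIC_RELAXED);
    }
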
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
No need to roll our own, as this is now provided by tcg.
2
This was the last use of retxl, so remove that too.
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/ppc/cpu_init.c | 10 ++++++++++
7
target/ppc/cpu.h | 1 -
5
target/ppc/translate.c | 6 ------
8
target/ppc/helper.h | 9 ----
6
2 files changed, 10 insertions(+), 6 deletions(-)
9
target/ppc/mem_helper.c | 48 --------------------
7
10
target/ppc/translate.c | 34 ++-------------
8
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
11
target/ppc/translate/fixedpoint-impl.c.inc | 51 +++-------------------
9
index XXXXXXX..XXXXXXX 100644
12
5 files changed, 11 insertions(+), 132 deletions(-)
10
--- a/target/ppc/cpu_init.c
13
11
+++ b/target/ppc/cpu_init.c
14
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
12
@@ -XXX,XX +XXX,XX @@ static vaddr ppc_cpu_get_pc(CPUState *cs)
15
index XXXXXXX..XXXXXXX 100644
13
return cpu->env.nip;
16
--- a/target/ppc/cpu.h
17
+++ b/target/ppc/cpu.h
18
@@ -XXX,XX +XXX,XX @@ struct CPUArchState {
19
/* used to speed-up TLB assist handlers */
20
21
target_ulong nip; /* next instruction pointer */
22
- uint64_t retxh; /* high part of 128-bit helper return */
23
24
/* when a memory exception occurs, the access type is stored here */
25
int access_type;
26
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
27
index XXXXXXX..XXXXXXX 100644
28
--- a/target/ppc/helper.h
29
+++ b/target/ppc/helper.h
30
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_4(DSCLIQ, void, env, fprp, fprp, i32)
31
32
DEF_HELPER_1(tbegin, void, env)
33
DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
34
-
35
-#ifdef TARGET_PPC64
36
-DEF_HELPER_FLAGS_3(lq_le_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
37
-DEF_HELPER_FLAGS_3(lq_be_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
38
-DEF_HELPER_FLAGS_5(stq_le_parallel, TCG_CALL_NO_WG,
39
- void, env, tl, i64, i64, i32)
40
-DEF_HELPER_FLAGS_5(stq_be_parallel, TCG_CALL_NO_WG,
41
- void, env, tl, i64, i64, i32)
42
-#endif
43
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
44
index XXXXXXX..XXXXXXX 100644
45
--- a/target/ppc/mem_helper.c
46
+++ b/target/ppc/mem_helper.c
47
@@ -XXX,XX +XXX,XX @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
48
return i;
14
}
49
}
15
50
16
+static void ppc_restore_state_to_opc(CPUState *cs,
51
-#ifdef TARGET_PPC64
17
+ const TranslationBlock *tb,
52
-uint64_t helper_lq_le_parallel(CPUPPCState *env, target_ulong addr,
18
+ const uint64_t *data)
53
- uint32_t opidx)
19
+{
54
-{
20
+ PowerPCCPU *cpu = POWERPC_CPU(cs);
55
- Int128 ret;
21
+
56
-
22
+ cpu->env.nip = data[0];
57
- /* We will have raised EXCP_ATOMIC from the translator. */
23
+}
58
- assert(HAVE_ATOMIC128);
24
+
59
- ret = cpu_atomic_ldo_le_mmu(env, addr, opidx, GETPC());
25
static bool ppc_cpu_has_work(CPUState *cs)
60
- env->retxh = int128_gethi(ret);
26
{
61
- return int128_getlo(ret);
27
PowerPCCPU *cpu = POWERPC_CPU(cs);
62
-}
28
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps ppc_sysemu_ops = {
63
-
29
64
-uint64_t helper_lq_be_parallel(CPUPPCState *env, target_ulong addr,
30
static const struct TCGCPUOps ppc_tcg_ops = {
65
- uint32_t opidx)
31
.initialize = ppc_translate_init,
66
-{
32
+ .restore_state_to_opc = ppc_restore_state_to_opc,
67
- Int128 ret;
33
68
-
34
#ifdef CONFIG_USER_ONLY
69
- /* We will have raised EXCP_ATOMIC from the translator. */
35
.record_sigsegv = ppc_cpu_record_sigsegv,
70
- assert(HAVE_ATOMIC128);
71
- ret = cpu_atomic_ldo_be_mmu(env, addr, opidx, GETPC());
72
- env->retxh = int128_gethi(ret);
73
- return int128_getlo(ret);
74
-}
75
-
76
-void helper_stq_le_parallel(CPUPPCState *env, target_ulong addr,
77
- uint64_t lo, uint64_t hi, uint32_t opidx)
78
-{
79
- Int128 val;
80
-
81
- /* We will have raised EXCP_ATOMIC from the translator. */
82
- assert(HAVE_ATOMIC128);
83
- val = int128_make128(lo, hi);
84
- cpu_atomic_sto_le_mmu(env, addr, val, opidx, GETPC());
85
-}
86
-
87
-void helper_stq_be_parallel(CPUPPCState *env, target_ulong addr,
88
- uint64_t lo, uint64_t hi, uint32_t opidx)
89
-{
90
- Int128 val;
91
-
92
- /* We will have raised EXCP_ATOMIC from the translator. */
93
- assert(HAVE_ATOMIC128);
94
- val = int128_make128(lo, hi);
95
- cpu_atomic_sto_be_mmu(env, addr, val, opidx, GETPC());
96
-}
97
-#endif
98
-
99
/*****************************************************************************/
100
/* Altivec extension helpers */
101
#if HOST_BIG_ENDIAN
36
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
102
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
37
index XXXXXXX..XXXXXXX 100644
103
index XXXXXXX..XXXXXXX 100644
38
--- a/target/ppc/translate.c
104
--- a/target/ppc/translate.c
39
+++ b/target/ppc/translate.c
105
+++ b/target/ppc/translate.c
40
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
106
@@ -XXX,XX +XXX,XX @@ static void gen_lqarx(DisasContext *ctx)
41
107
{
42
translator_loop(cs, tb, max_insns, pc, host_pc, &ppc_tr_ops, &ctx.base);
108
int rd = rD(ctx->opcode);
43
}
109
TCGv EA, hi, lo;
44
-
110
+ TCGv_i128 t16;
45
-void restore_state_to_opc(CPUPPCState *env, TranslationBlock *tb,
111
46
- target_ulong *data)
112
if (unlikely((rd & 1) || (rd == rA(ctx->opcode)) ||
47
-{
113
(rd == rB(ctx->opcode)))) {
48
- env->nip = data[0];
114
@@ -XXX,XX +XXX,XX @@ static void gen_lqarx(DisasContext *ctx)
49
-}
115
lo = cpu_gpr[rd + 1];
116
hi = cpu_gpr[rd];
117
118
- if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
119
- if (HAVE_ATOMIC128) {
120
- TCGv_i32 oi = tcg_temp_new_i32();
121
- if (ctx->le_mode) {
122
- tcg_gen_movi_i32(oi, make_memop_idx(MO_LE | MO_128 | MO_ALIGN,
123
- ctx->mem_idx));
124
- gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
125
- } else {
126
- tcg_gen_movi_i32(oi, make_memop_idx(MO_BE | MO_128 | MO_ALIGN,
127
- ctx->mem_idx));
128
- gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
129
- }
130
- tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
131
- } else {
132
- /* Restart with exclusive lock. */
133
- gen_helper_exit_atomic(cpu_env);
134
- ctx->base.is_jmp = DISAS_NORETURN;
135
- return;
136
- }
137
- } else if (ctx->le_mode) {
138
- tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEUQ | MO_ALIGN_16);
139
- tcg_gen_mov_tl(cpu_reserve, EA);
140
- gen_addr_add(ctx, EA, EA, 8);
141
- tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_LEUQ);
142
- } else {
143
- tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_BEUQ | MO_ALIGN_16);
144
- tcg_gen_mov_tl(cpu_reserve, EA);
145
- gen_addr_add(ctx, EA, EA, 8);
146
- tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_BEUQ);
147
- }
148
+ t16 = tcg_temp_new_i128();
149
+ tcg_gen_qemu_ld_i128(t16, EA, ctx->mem_idx, DEF_MEMOP(MO_128 | MO_ALIGN));
150
+ tcg_gen_extr_i128_i64(lo, hi, t16);
151
152
tcg_gen_st_tl(hi, cpu_env, offsetof(CPUPPCState, reserve_val));
153
tcg_gen_st_tl(lo, cpu_env, offsetof(CPUPPCState, reserve_val2));
154
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc b/target/ppc/translate/fixedpoint-impl.c.inc
155
index XXXXXXX..XXXXXXX 100644
156
--- a/target/ppc/translate/fixedpoint-impl.c.inc
157
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
158
@@ -XXX,XX +XXX,XX @@ static bool do_ldst_quad(DisasContext *ctx, arg_D *a, bool store, bool prefixed)
159
#if defined(TARGET_PPC64)
160
TCGv ea;
161
TCGv_i64 low_addr_gpr, high_addr_gpr;
162
- MemOp mop;
163
+ TCGv_i128 t16;
164
165
REQUIRE_INSNS_FLAGS(ctx, 64BX);
166
167
@@ -XXX,XX +XXX,XX @@ static bool do_ldst_quad(DisasContext *ctx, arg_D *a, bool store, bool prefixed)
168
low_addr_gpr = cpu_gpr[a->rt + 1];
169
high_addr_gpr = cpu_gpr[a->rt];
170
}
171
+ t16 = tcg_temp_new_i128();
172
173
- if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
174
- if (HAVE_ATOMIC128) {
175
- mop = DEF_MEMOP(MO_128);
176
- TCGv_i32 oi = tcg_constant_i32(make_memop_idx(mop, ctx->mem_idx));
177
- if (store) {
178
- if (ctx->le_mode) {
179
- gen_helper_stq_le_parallel(cpu_env, ea, low_addr_gpr,
180
- high_addr_gpr, oi);
181
- } else {
182
- gen_helper_stq_be_parallel(cpu_env, ea, high_addr_gpr,
183
- low_addr_gpr, oi);
184
-
185
- }
186
- } else {
187
- if (ctx->le_mode) {
188
- gen_helper_lq_le_parallel(low_addr_gpr, cpu_env, ea, oi);
189
- tcg_gen_ld_i64(high_addr_gpr, cpu_env,
190
- offsetof(CPUPPCState, retxh));
191
- } else {
192
- gen_helper_lq_be_parallel(high_addr_gpr, cpu_env, ea, oi);
193
- tcg_gen_ld_i64(low_addr_gpr, cpu_env,
194
- offsetof(CPUPPCState, retxh));
195
- }
196
- }
197
- } else {
198
- /* Restart with exclusive lock. */
199
- gen_helper_exit_atomic(cpu_env);
200
- ctx->base.is_jmp = DISAS_NORETURN;
201
- }
202
+ if (store) {
203
+ tcg_gen_concat_i64_i128(t16, low_addr_gpr, high_addr_gpr);
204
+ tcg_gen_qemu_st_i128(t16, ea, ctx->mem_idx, DEF_MEMOP(MO_128));
205
} else {
206
- mop = DEF_MEMOP(MO_UQ);
207
- if (store) {
208
- tcg_gen_qemu_st_i64(low_addr_gpr, ea, ctx->mem_idx, mop);
209
- } else {
210
- tcg_gen_qemu_ld_i64(low_addr_gpr, ea, ctx->mem_idx, mop);
211
- }
212
-
213
- gen_addr_add(ctx, ea, ea, 8);
214
-
215
- if (store) {
216
- tcg_gen_qemu_st_i64(high_addr_gpr, ea, ctx->mem_idx, mop);
217
- } else {
218
- tcg_gen_qemu_ld_i64(high_addr_gpr, ea, ctx->mem_idx, mop);
219
- }
220
+ tcg_gen_qemu_ld_i128(t16, ea, ctx->mem_idx, DEF_MEMOP(MO_128));
221
+ tcg_gen_extr_i128_i64(low_addr_gpr, high_addr_gpr, t16);
222
}
223
#else
224
qemu_build_not_reached();
50
--
225
--
51
2.34.1
226
2.34.1
52
227
53
228
1
These items print via printf, and could be replaced with proper
1
No need to roll our own, as this is now provided by tcg.
2
tracepoints if we really cared.
2
This was the last use of retxl, so remove that too.
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: David Hildenbrand <david@redhat.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
accel/tcg/translate-all.c | 109 --------------------------------------
8
target/s390x/cpu.h | 3 --
8
1 file changed, 109 deletions(-)
9
target/s390x/helper.h | 4 ---
10
target/s390x/tcg/mem_helper.c | 61 --------------------------------
11
target/s390x/tcg/translate.c | 30 +++++-----------
12
target/s390x/tcg/insn-data.h.inc | 2 +-
13
5 files changed, 9 insertions(+), 91 deletions(-)
9
14
10
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
15
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
11
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/translate-all.c
17
--- a/target/s390x/cpu.h
13
+++ b/accel/tcg/translate-all.c
18
+++ b/target/s390x/cpu.h
14
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@ struct CPUArchState {
15
#include "tb-context.h"
20
16
#include "internal.h"
21
float_status fpu_status; /* passed to softfloat lib */
17
22
18
-/* #define DEBUG_TB_INVALIDATE */
23
- /* The low part of a 128-bit return, or remainder of a divide. */
19
-/* #define DEBUG_TB_FLUSH */
24
- uint64_t retxl;
20
/* make various TB consistency checks */
21
-/* #define DEBUG_TB_CHECK */
22
-
25
-
23
-#ifdef DEBUG_TB_INVALIDATE
26
PSW psw;
24
-#define DEBUG_TB_INVALIDATE_GATE 1
27
25
-#else
28
S390CrashReason crash_reason;
26
-#define DEBUG_TB_INVALIDATE_GATE 0
29
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
27
-#endif
30
index XXXXXXX..XXXXXXX 100644
31
--- a/target/s390x/helper.h
32
+++ b/target/s390x/helper.h
33
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_2(sfas, TCG_CALL_NO_WG, void, env, i64)
34
DEF_HELPER_FLAGS_2(srnm, TCG_CALL_NO_WG, void, env, i64)
35
DEF_HELPER_FLAGS_1(popcnt, TCG_CALL_NO_RWG_SE, i64, i64)
36
DEF_HELPER_2(stfle, i32, env, i64)
37
-DEF_HELPER_FLAGS_2(lpq, TCG_CALL_NO_WG, i64, env, i64)
38
-DEF_HELPER_FLAGS_2(lpq_parallel, TCG_CALL_NO_WG, i64, env, i64)
39
-DEF_HELPER_FLAGS_4(stpq, TCG_CALL_NO_WG, void, env, i64, i64, i64)
40
-DEF_HELPER_FLAGS_4(stpq_parallel, TCG_CALL_NO_WG, void, env, i64, i64, i64)
41
DEF_HELPER_4(mvcos, i32, env, i64, i64, i64)
42
DEF_HELPER_4(cu12, i32, env, i32, i32, i32)
43
DEF_HELPER_4(cu14, i32, env, i32, i32, i32)
44
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
45
index XXXXXXX..XXXXXXX 100644
46
--- a/target/s390x/tcg/mem_helper.c
47
+++ b/target/s390x/tcg/mem_helper.c
48
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(lra)(CPUS390XState *env, uint64_t addr)
49
}
50
#endif
51
52
-/* load pair from quadword */
53
-uint64_t HELPER(lpq)(CPUS390XState *env, uint64_t addr)
54
-{
55
- uintptr_t ra = GETPC();
56
- uint64_t hi, lo;
28
-
57
-
29
-#ifdef DEBUG_TB_FLUSH
58
- check_alignment(env, addr, 16, ra);
30
-#define DEBUG_TB_FLUSH_GATE 1
59
- hi = cpu_ldq_data_ra(env, addr + 0, ra);
31
-#else
60
- lo = cpu_ldq_data_ra(env, addr + 8, ra);
32
-#define DEBUG_TB_FLUSH_GATE 0
33
-#endif
34
-
61
-
35
-#if !defined(CONFIG_USER_ONLY)
62
- env->retxl = lo;
36
-/* TB consistency checks only implemented for usermode emulation. */
63
- return hi;
37
-#undef DEBUG_TB_CHECK
38
-#endif
39
-
40
-#ifdef DEBUG_TB_CHECK
41
-#define DEBUG_TB_CHECK_GATE 1
42
-#else
43
-#define DEBUG_TB_CHECK_GATE 0
44
-#endif
45
46
/* Access to the various translations structures need to be serialised via locks
47
* for consistency.
48
@@ -XXX,XX +XXX,XX @@ static void page_flush_tb(void)
49
}
50
}
51
52
-static gboolean tb_host_size_iter(gpointer key, gpointer value, gpointer data)
53
-{
54
- const TranslationBlock *tb = value;
55
- size_t *size = data;
56
-
57
- *size += tb->tc.size;
58
- return false;
59
-}
64
-}
60
-
65
-
61
/* flush all the translation blocks */
66
-uint64_t HELPER(lpq_parallel)(CPUS390XState *env, uint64_t addr)
62
static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
67
-{
63
{
68
- uintptr_t ra = GETPC();
64
@@ -XXX,XX +XXX,XX @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
69
- uint64_t hi, lo;
65
}
70
- int mem_idx;
66
did_flush = true;
71
- MemOpIdx oi;
67
72
- Int128 v;
68
- if (DEBUG_TB_FLUSH_GATE) {
69
- size_t nb_tbs = tcg_nb_tbs();
70
- size_t host_size = 0;
71
-
73
-
72
- tcg_tb_foreach(tb_host_size_iter, &host_size);
74
- assert(HAVE_ATOMIC128);
73
- printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
74
- tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
75
- }
76
-
75
-
77
CPU_FOREACH(cpu) {
76
- mem_idx = cpu_mmu_index(env, false);
78
tcg_flush_jmp_cache(cpu);
77
- oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
79
}
78
- v = cpu_atomic_ldo_be_mmu(env, addr, oi, ra);
80
@@ -XXX,XX +XXX,XX @@ void tb_flush(CPUState *cpu)
79
- hi = int128_gethi(v);
81
}
80
- lo = int128_getlo(v);
82
}
83
84
-/*
85
- * Formerly ifdef DEBUG_TB_CHECK. These debug functions are user-mode-only,
86
- * so in order to prevent bit rot we compile them unconditionally in user-mode,
87
- * and let the optimizer get rid of them by wrapping their user-only callers
88
- * with if (DEBUG_TB_CHECK_GATE).
89
- */
90
-#ifdef CONFIG_USER_ONLY
91
-
81
-
92
-static void do_tb_invalidate_check(void *p, uint32_t hash, void *userp)
82
- env->retxl = lo;
93
-{
83
- return hi;
94
- TranslationBlock *tb = p;
95
- target_ulong addr = *(target_ulong *)userp;
96
-
97
- if (!(addr + TARGET_PAGE_SIZE <= tb_pc(tb) ||
98
- addr >= tb_pc(tb) + tb->size)) {
99
- printf("ERROR invalidate: address=" TARGET_FMT_lx
100
- " PC=%08lx size=%04x\n", addr, (long)tb_pc(tb), tb->size);
101
- }
102
-}
84
-}
103
-
85
-
104
-/* verify that all the pages have correct rights for code
86
-/* store pair to quadword */
105
- *
87
-void HELPER(stpq)(CPUS390XState *env, uint64_t addr,
106
- * Called with mmap_lock held.
88
- uint64_t low, uint64_t high)
107
- */
108
-static void tb_invalidate_check(target_ulong address)
109
-{
89
-{
110
- address &= TARGET_PAGE_MASK;
90
- uintptr_t ra = GETPC();
111
- qht_iter(&tb_ctx.htable, do_tb_invalidate_check, &address);
91
-
92
- check_alignment(env, addr, 16, ra);
93
- cpu_stq_data_ra(env, addr + 0, high, ra);
94
- cpu_stq_data_ra(env, addr + 8, low, ra);
112
-}
95
-}
113
-
96
-
114
-static void do_tb_page_check(void *p, uint32_t hash, void *userp)
97
-void HELPER(stpq_parallel)(CPUS390XState *env, uint64_t addr,
98
- uint64_t low, uint64_t high)
115
-{
99
-{
116
- TranslationBlock *tb = p;
100
- uintptr_t ra = GETPC();
117
- int flags1, flags2;
101
- int mem_idx;
102
- MemOpIdx oi;
103
- Int128 v;
118
-
104
-
119
- flags1 = page_get_flags(tb_pc(tb));
105
- assert(HAVE_ATOMIC128);
120
- flags2 = page_get_flags(tb_pc(tb) + tb->size - 1);
106
-
121
- if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) {
107
- mem_idx = cpu_mmu_index(env, false);
122
- printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n",
108
- oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
123
- (long)tb_pc(tb), tb->size, flags1, flags2);
109
- v = int128_make128(low, high);
124
- }
110
- cpu_atomic_sto_be_mmu(env, addr, v, oi, ra);
125
-}
111
-}
126
-
112
-
127
-/* verify that all the pages have correct rights for code */
113
/* Execute instruction. This instruction executes an insn modified with
128
-static void tb_page_check(void)
114
the contents of r1. It does not change the executed instruction in memory;
115
it does not change the program counter.
116
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/target/s390x/tcg/translate.c
119
+++ b/target/s390x/tcg/translate.c
120
@@ -XXX,XX +XXX,XX @@ static void store_freg32_i64(int reg, TCGv_i64 v)
121
tcg_gen_st32_i64(v, cpu_env, freg32_offset(reg));
122
}
123
124
-static void return_low128(TCGv_i64 dest)
129
-{
125
-{
130
- qht_iter(&tb_ctx.htable, do_tb_page_check, NULL);
126
- tcg_gen_ld_i64(dest, cpu_env, offsetof(CPUS390XState, retxl));
131
-}
127
-}
132
-
128
-
133
-#endif /* CONFIG_USER_ONLY */
129
static void update_psw_addr(DisasContext *s)
134
-
130
{
135
/*
131
/* psw.addr */
136
* user-mode: call with mmap_lock held
132
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_lpd(DisasContext *s, DisasOps *o)
137
* !user-mode: call with @pd->lock held
133
138
@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
134
static DisasJumpType op_lpq(DisasContext *s, DisasOps *o)
139
page_unlock(p2);
135
{
140
}
136
- if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
141
page_unlock(p);
137
- gen_helper_lpq(o->out, cpu_env, o->in2);
142
-
138
- } else if (HAVE_ATOMIC128) {
143
-#ifdef CONFIG_USER_ONLY
139
- gen_helper_lpq_parallel(o->out, cpu_env, o->in2);
144
- if (DEBUG_TB_CHECK_GATE) {
140
- } else {
145
- tb_page_check();
141
- gen_helper_exit_atomic(cpu_env);
142
- return DISAS_NORETURN;
146
- }
143
- }
147
-#endif
144
- return_low128(o->out2);
148
return tb;
145
+ o->out_128 = tcg_temp_new_i128();
146
+ tcg_gen_qemu_ld_i128(o->out_128, o->in2, get_mem_index(s),
147
+ MO_TE | MO_128 | MO_ALIGN);
148
return DISAS_NEXT;
149
}
149
}
150
150
151
@@ -XXX,XX +XXX,XX @@ void page_protect(tb_page_addr_t page_addr)
151
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_stmh(DisasContext *s, DisasOps *o)
152
}
152
153
mprotect(g2h_untagged(page_addr), qemu_host_page_size,
153
static DisasJumpType op_stpq(DisasContext *s, DisasOps *o)
154
(prot & PAGE_BITS) & ~PAGE_WRITE);
154
{
155
- if (DEBUG_TB_INVALIDATE_GATE) {
155
- if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
156
- printf("protecting code page: 0x" TB_PAGE_ADDR_FMT "\n", page_addr);
156
- gen_helper_stpq(cpu_env, o->in2, o->out2, o->out);
157
- }
157
- } else if (HAVE_ATOMIC128) {
158
}
158
- gen_helper_stpq_parallel(cpu_env, o->in2, o->out2, o->out);
159
- } else {
160
- gen_helper_exit_atomic(cpu_env);
161
- return DISAS_NORETURN;
162
- }
163
+ TCGv_i128 t16 = tcg_temp_new_i128();
164
+
165
+ tcg_gen_concat_i64_i128(t16, o->out2, o->out);
166
+ tcg_gen_qemu_st_i128(t16, o->in2, get_mem_index(s),
167
+ MO_TE | MO_128 | MO_ALIGN);
168
return DISAS_NEXT;
159
}
169
}
160
170
161
@@ -XXX,XX +XXX,XX @@ int page_unprotect(target_ulong address, uintptr_t pc)
171
diff --git a/target/s390x/tcg/insn-data.h.inc b/target/s390x/tcg/insn-data.h.inc
162
/* and since the content will be modified, we must invalidate
172
index XXXXXXX..XXXXXXX 100644
163
the corresponding translated code. */
173
--- a/target/s390x/tcg/insn-data.h.inc
164
current_tb_invalidated |= tb_invalidate_phys_page(addr, pc);
174
+++ b/target/s390x/tcg/insn-data.h.inc
165
-#ifdef CONFIG_USER_ONLY
175
@@ -XXX,XX +XXX,XX @@
166
- if (DEBUG_TB_CHECK_GATE) {
176
D(0xc804, LPD, SSF, ILA, 0, 0, new_P, r3_P32, lpd, 0, MO_TEUL)
167
- tb_invalidate_check(addr);
177
D(0xc805, LPDG, SSF, ILA, 0, 0, new_P, r3_P64, lpd, 0, MO_TEUQ)
168
- }
178
/* LOAD PAIR FROM QUADWORD */
169
-#endif
179
- C(0xe38f, LPQ, RXY_a, Z, 0, a2, r1_P, 0, lpq, 0)
170
}
180
+ C(0xe38f, LPQ, RXY_a, Z, 0, a2, 0, r1_D64, lpq, 0)
171
mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
181
/* LOAD POSITIVE */
172
prot & PAGE_BITS);
182
C(0x1000, LPR, RR_a, Z, 0, r2_32s, new, r1_32, abs, abs32)
183
C(0xb900, LPGR, RRE, Z, 0, r2, r1, 0, abs, abs64)
173
--
184
--
174
2.34.1
185
2.34.1
175
186
176
187
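As an aside (not part of either series), a rough sketch of the tracepoint alternative mentioned in the commit message above that removes the DEBUG_TB_* printf debugging. The event name and its trace-events entry are hypothetical; tcg_code_size(), tcg_nb_tbs(), tcg_tb_foreach() and tb_host_size_iter() are the same helpers the removed printf relied on.

    /*
     * Hypothetical entry in accel/tcg/trace-events (one line in that file):
     *   tb_flush_stats(size_t code_size, size_t nb_tbs, size_t avg_tb_size) "code_size=%zu nb_tbs=%zu avg_tb_size=%zu"
     *
     * The generated trace_tb_flush_stats() could then replace the printf in
     * do_tb_flush(), making the statistics available on demand instead of
     * printing unconditionally:
     */
    size_t nb_tbs = tcg_nb_tbs();
    size_t host_size = 0;

    tcg_tb_foreach(tb_host_size_iter, &host_size);
    trace_tb_flush_stats(tcg_code_size(), nb_tbs, nb_tbs ? host_size / nb_tbs : 0);
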
1
With the current structure of cputlb.c, there is no difference
2
between the little-endian and big-endian entry points, aside
3
from the assert. Unify the pairs of functions.
4
5
The only use of the functions with explicit endianness was in
6
target/sparc64, and that was only to satisfy the assert: the
7
correct endianness is already built into memop.
8
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
9
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
12
---
4
target/nios2/cpu.c | 11 +++++++++++
13
include/exec/cpu_ldst.h | 58 ++-----
5
target/nios2/translate.c | 6 ------
14
accel/tcg/cputlb.c | 122 +++-----------
6
2 files changed, 11 insertions(+), 6 deletions(-)
15
accel/tcg/user-exec.c | 322 ++++++++++--------------------------
16
target/arm/tcg/m_helper.c | 4 +-
17
target/sparc/ldst_helper.c | 18 +-
18
accel/tcg/ldst_common.c.inc | 24 +--
19
6 files changed, 137 insertions(+), 411 deletions(-)
7
20
8
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
21
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
9
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
10
--- a/target/nios2/cpu.c
23
--- a/include/exec/cpu_ldst.h
11
+++ b/target/nios2/cpu.c
24
+++ b/include/exec/cpu_ldst.h
12
@@ -XXX,XX +XXX,XX @@ static vaddr nios2_cpu_get_pc(CPUState *cs)
25
@@ -XXX,XX +XXX,XX @@ void cpu_stq_le_mmuidx_ra(CPUArchState *env, abi_ptr ptr, uint64_t val,
13
return env->pc;
26
int mmu_idx, uintptr_t ra);
14
}
27
15
28
uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
16
+static void nios2_restore_state_to_opc(CPUState *cs,
29
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr ptr,
17
+ const TranslationBlock *tb,
30
- MemOpIdx oi, uintptr_t ra);
18
+ const uint64_t *data)
31
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr ptr,
32
- MemOpIdx oi, uintptr_t ra);
33
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr ptr,
34
- MemOpIdx oi, uintptr_t ra);
35
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr ptr,
36
- MemOpIdx oi, uintptr_t ra);
37
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr,
38
- MemOpIdx oi, uintptr_t ra);
39
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr,
40
- MemOpIdx oi, uintptr_t ra);
41
-
42
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
43
- MemOpIdx oi, uintptr_t ra);
44
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
45
- MemOpIdx oi, uintptr_t ra);
46
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
47
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
48
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra);
49
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra);
50
51
void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val,
52
MemOpIdx oi, uintptr_t ra);
53
-void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
54
- MemOpIdx oi, uintptr_t ra);
55
-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
56
- MemOpIdx oi, uintptr_t ra);
57
-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
58
- MemOpIdx oi, uintptr_t ra);
59
-void cpu_stw_le_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
60
- MemOpIdx oi, uintptr_t ra);
61
-void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
62
- MemOpIdx oi, uintptr_t ra);
63
-void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
64
- MemOpIdx oi, uintptr_t ra);
65
-
66
-void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
67
- MemOpIdx oi, uintptr_t ra);
68
-void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
69
- MemOpIdx oi, uintptr_t ra);
70
+void cpu_stw_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
71
+ MemOpIdx oi, uintptr_t ra);
72
+void cpu_stl_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
73
+ MemOpIdx oi, uintptr_t ra);
74
+void cpu_stq_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
75
+ MemOpIdx oi, uintptr_t ra);
76
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
77
+ MemOpIdx oi, uintptr_t ra);
78
79
uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
80
uint32_t cmpv, uint32_t newv,
81
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
82
# define cpu_ldsw_mmuidx_ra cpu_ldsw_be_mmuidx_ra
83
# define cpu_ldl_mmuidx_ra cpu_ldl_be_mmuidx_ra
84
# define cpu_ldq_mmuidx_ra cpu_ldq_be_mmuidx_ra
85
-# define cpu_ldw_mmu cpu_ldw_be_mmu
86
-# define cpu_ldl_mmu cpu_ldl_be_mmu
87
-# define cpu_ldq_mmu cpu_ldq_be_mmu
88
# define cpu_stw_data cpu_stw_be_data
89
# define cpu_stl_data cpu_stl_be_data
90
# define cpu_stq_data cpu_stq_be_data
91
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
92
# define cpu_stw_mmuidx_ra cpu_stw_be_mmuidx_ra
93
# define cpu_stl_mmuidx_ra cpu_stl_be_mmuidx_ra
94
# define cpu_stq_mmuidx_ra cpu_stq_be_mmuidx_ra
95
-# define cpu_stw_mmu cpu_stw_be_mmu
96
-# define cpu_stl_mmu cpu_stl_be_mmu
97
-# define cpu_stq_mmu cpu_stq_be_mmu
98
#else
99
# define cpu_lduw_data cpu_lduw_le_data
100
# define cpu_ldsw_data cpu_ldsw_le_data
101
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
102
# define cpu_ldsw_mmuidx_ra cpu_ldsw_le_mmuidx_ra
103
# define cpu_ldl_mmuidx_ra cpu_ldl_le_mmuidx_ra
104
# define cpu_ldq_mmuidx_ra cpu_ldq_le_mmuidx_ra
105
-# define cpu_ldw_mmu cpu_ldw_le_mmu
106
-# define cpu_ldl_mmu cpu_ldl_le_mmu
107
-# define cpu_ldq_mmu cpu_ldq_le_mmu
108
# define cpu_stw_data cpu_stw_le_data
109
# define cpu_stl_data cpu_stl_le_data
110
# define cpu_stq_data cpu_stq_le_data
111
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
112
# define cpu_stw_mmuidx_ra cpu_stw_le_mmuidx_ra
113
# define cpu_stl_mmuidx_ra cpu_stl_le_mmuidx_ra
114
# define cpu_stq_mmuidx_ra cpu_stq_le_mmuidx_ra
115
-# define cpu_stw_mmu cpu_stw_le_mmu
116
-# define cpu_stl_mmu cpu_stl_le_mmu
117
-# define cpu_stq_mmu cpu_stq_le_mmu
118
#endif
119
120
uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
121
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/accel/tcg/cputlb.c
124
+++ b/accel/tcg/cputlb.c
125
@@ -XXX,XX +XXX,XX @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra)
126
return ret;
127
}
128
129
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
130
- MemOpIdx oi, uintptr_t ra)
131
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
132
+ MemOpIdx oi, uintptr_t ra)
133
{
134
uint16_t ret;
135
136
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW);
137
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
138
ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
139
plugin_load_cb(env, addr, oi);
140
return ret;
141
}
142
143
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
144
- MemOpIdx oi, uintptr_t ra)
145
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
146
+ MemOpIdx oi, uintptr_t ra)
147
{
148
uint32_t ret;
149
150
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL);
151
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
152
ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
153
plugin_load_cb(env, addr, oi);
154
return ret;
155
}
156
157
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
158
- MemOpIdx oi, uintptr_t ra)
159
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
160
+ MemOpIdx oi, uintptr_t ra)
161
{
162
uint64_t ret;
163
164
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUQ);
165
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
166
ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
167
plugin_load_cb(env, addr, oi);
168
return ret;
169
}
170
171
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
172
- MemOpIdx oi, uintptr_t ra)
173
-{
174
- uint16_t ret;
175
-
176
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW);
177
- ret = do_ld2_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
178
- plugin_load_cb(env, addr, oi);
179
- return ret;
180
-}
181
-
182
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
183
- MemOpIdx oi, uintptr_t ra)
184
-{
185
- uint32_t ret;
186
-
187
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL);
188
- ret = do_ld4_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
189
- plugin_load_cb(env, addr, oi);
190
- return ret;
191
-}
192
-
193
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
194
- MemOpIdx oi, uintptr_t ra)
195
-{
196
- uint64_t ret;
197
-
198
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ);
199
- ret = do_ld8_mmu(env, addr, oi, ra, MMU_DATA_LOAD);
200
- plugin_load_cb(env, addr, oi);
201
- return ret;
202
-}
203
-
204
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
205
- MemOpIdx oi, uintptr_t ra)
206
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
207
+ MemOpIdx oi, uintptr_t ra)
208
{
209
Int128 ret;
210
211
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128));
212
- ret = do_ld16_mmu(env, addr, oi, ra);
213
- plugin_load_cb(env, addr, oi);
214
- return ret;
215
-}
216
-
217
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
218
- MemOpIdx oi, uintptr_t ra)
219
-{
220
- Int128 ret;
221
-
222
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128));
223
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
224
ret = do_ld16_mmu(env, addr, oi, ra);
225
plugin_load_cb(env, addr, oi);
226
return ret;
227
@@ -XXX,XX +XXX,XX @@ void cpu_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
228
plugin_store_cb(env, addr, oi);
229
}
230
231
-void cpu_stw_be_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
232
- MemOpIdx oi, uintptr_t retaddr)
233
+void cpu_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
234
+ MemOpIdx oi, uintptr_t retaddr)
235
{
236
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUW);
237
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
238
do_st2_mmu(env, addr, val, oi, retaddr);
239
plugin_store_cb(env, addr, oi);
240
}
241
242
-void cpu_stl_be_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
243
+void cpu_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
244
MemOpIdx oi, uintptr_t retaddr)
245
{
246
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUL);
247
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
248
do_st4_mmu(env, addr, val, oi, retaddr);
249
plugin_store_cb(env, addr, oi);
250
}
251
252
-void cpu_stq_be_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
253
- MemOpIdx oi, uintptr_t retaddr)
254
+void cpu_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
255
+ MemOpIdx oi, uintptr_t retaddr)
256
{
257
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_BEUQ);
258
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
259
do_st8_mmu(env, addr, val, oi, retaddr);
260
plugin_store_cb(env, addr, oi);
261
}
262
263
-void cpu_stw_le_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
264
- MemOpIdx oi, uintptr_t retaddr)
265
+void cpu_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
266
+ MemOpIdx oi, uintptr_t retaddr)
267
{
268
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUW);
269
- do_st2_mmu(env, addr, val, oi, retaddr);
270
- plugin_store_cb(env, addr, oi);
271
-}
272
-
273
-void cpu_stl_le_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
274
- MemOpIdx oi, uintptr_t retaddr)
275
-{
276
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUL);
277
- do_st4_mmu(env, addr, val, oi, retaddr);
278
- plugin_store_cb(env, addr, oi);
279
-}
280
-
281
-void cpu_stq_le_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
282
- MemOpIdx oi, uintptr_t retaddr)
283
-{
284
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP | MO_SIZE)) == MO_LEUQ);
285
- do_st8_mmu(env, addr, val, oi, retaddr);
286
- plugin_store_cb(env, addr, oi);
287
-}
288
-
289
-void cpu_st16_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
290
- MemOpIdx oi, uintptr_t retaddr)
291
-{
292
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_BE|MO_128));
293
- do_st16_mmu(env, addr, val, oi, retaddr);
294
- plugin_store_cb(env, addr, oi);
295
-}
296
-
297
-void cpu_st16_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
298
- MemOpIdx oi, uintptr_t retaddr)
299
-{
300
- tcg_debug_assert((get_memop(oi) & (MO_BSWAP|MO_SIZE)) == (MO_LE|MO_128));
301
+ tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);
302
do_st16_mmu(env, addr, val, oi, retaddr);
303
plugin_store_cb(env, addr, oi);
304
}
305
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
306
index XXXXXXX..XXXXXXX 100644
307
--- a/accel/tcg/user-exec.c
308
+++ b/accel/tcg/user-exec.c
309
@@ -XXX,XX +XXX,XX @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
310
return ret;
311
}
312
313
-static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
314
- MemOp mop, uintptr_t ra)
315
+static uint16_t do_ld2_mmu(CPUArchState *env, abi_ptr addr,
316
+ MemOp mop, uintptr_t ra)
317
{
318
void *haddr;
319
uint16_t ret;
320
@@ -XXX,XX +XXX,XX @@ static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
321
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
322
ret = load_atom_2(env, ra, haddr, mop);
323
clear_helper_retaddr();
324
+
325
+ if (mop & MO_BSWAP) {
326
+ ret = bswap16(ret);
327
+ }
328
return ret;
329
}
330
331
tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
332
MemOpIdx oi, uintptr_t ra)
333
{
334
- MemOp mop = get_memop(oi);
335
- uint16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
336
-
337
- if (mop & MO_BSWAP) {
338
- ret = bswap16(ret);
339
- }
340
- return ret;
341
+ return do_ld2_mmu(env, addr, get_memop(oi), ra);
342
}
343
344
tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr,
345
MemOpIdx oi, uintptr_t ra)
346
{
347
- MemOp mop = get_memop(oi);
348
- int16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
349
+ return (int16_t)do_ld2_mmu(env, addr, get_memop(oi), ra);
350
+}
351
352
- if (mop & MO_BSWAP) {
353
- ret = bswap16(ret);
354
- }
355
+uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr,
356
+ MemOpIdx oi, uintptr_t ra)
19
+{
357
+{
20
+ Nios2CPU *cpu = NIOS2_CPU(cs);
358
+ uint16_t ret = do_ld2_mmu(env, addr, get_memop(oi), ra);
21
+ CPUNios2State *env = &cpu->env;
359
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
360
return ret;
361
}
362
363
-uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
364
- MemOpIdx oi, uintptr_t ra)
365
-{
366
- MemOp mop = get_memop(oi);
367
- uint16_t ret;
368
-
369
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
370
- ret = do_ld2_he_mmu(env, addr, mop, ra);
371
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
372
- return cpu_to_be16(ret);
373
-}
374
-
375
-uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
376
- MemOpIdx oi, uintptr_t ra)
377
-{
378
- MemOp mop = get_memop(oi);
379
- uint16_t ret;
380
-
381
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
382
- ret = do_ld2_he_mmu(env, addr, mop, ra);
383
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
384
- return cpu_to_le16(ret);
385
-}
386
-
387
-static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr,
388
- MemOp mop, uintptr_t ra)
389
+static uint32_t do_ld4_mmu(CPUArchState *env, abi_ptr addr,
390
+ MemOp mop, uintptr_t ra)
391
{
392
void *haddr;
393
uint32_t ret;
394
@@ -XXX,XX +XXX,XX @@ static uint32_t do_ld4_he_mmu(CPUArchState *env, abi_ptr addr,
395
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
396
ret = load_atom_4(env, ra, haddr, mop);
397
clear_helper_retaddr();
22
+
398
+
23
+ env->pc = data[0];
399
+ if (mop & MO_BSWAP) {
400
+ ret = bswap32(ret);
401
+ }
402
return ret;
403
}
404
405
tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
406
MemOpIdx oi, uintptr_t ra)
407
{
408
- MemOp mop = get_memop(oi);
409
- uint32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
410
-
411
- if (mop & MO_BSWAP) {
412
- ret = bswap32(ret);
413
- }
414
- return ret;
415
+ return do_ld4_mmu(env, addr, get_memop(oi), ra);
416
}
417
418
tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr,
419
MemOpIdx oi, uintptr_t ra)
420
{
421
- MemOp mop = get_memop(oi);
422
- int32_t ret = do_ld4_he_mmu(env, addr, mop, ra);
423
+ return (int32_t)do_ld4_mmu(env, addr, get_memop(oi), ra);
424
+}
425
426
- if (mop & MO_BSWAP) {
427
- ret = bswap32(ret);
428
- }
429
+uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr,
430
+ MemOpIdx oi, uintptr_t ra)
431
+{
432
+ uint32_t ret = do_ld4_mmu(env, addr, get_memop(oi), ra);
433
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
434
return ret;
435
}
436
437
-uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
438
- MemOpIdx oi, uintptr_t ra)
439
-{
440
- MemOp mop = get_memop(oi);
441
- uint32_t ret;
442
-
443
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
444
- ret = do_ld4_he_mmu(env, addr, mop, ra);
445
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
446
- return cpu_to_be32(ret);
447
-}
448
-
449
-uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
450
- MemOpIdx oi, uintptr_t ra)
451
-{
452
- MemOp mop = get_memop(oi);
453
- uint32_t ret;
454
-
455
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
456
- ret = do_ld4_he_mmu(env, addr, mop, ra);
457
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
458
- return cpu_to_le32(ret);
459
-}
460
-
461
-static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr,
462
- MemOp mop, uintptr_t ra)
463
+static uint64_t do_ld8_mmu(CPUArchState *env, abi_ptr addr,
464
+ MemOp mop, uintptr_t ra)
465
{
466
void *haddr;
467
uint64_t ret;
468
@@ -XXX,XX +XXX,XX @@ static uint64_t do_ld8_he_mmu(CPUArchState *env, abi_ptr addr,
469
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
470
ret = load_atom_8(env, ra, haddr, mop);
471
clear_helper_retaddr();
472
- return ret;
473
-}
474
-
475
-uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
476
- MemOpIdx oi, uintptr_t ra)
477
-{
478
- MemOp mop = get_memop(oi);
479
- uint64_t ret = do_ld8_he_mmu(env, addr, mop, ra);
480
481
if (mop & MO_BSWAP) {
482
ret = bswap64(ret);
483
@@ -XXX,XX +XXX,XX @@ uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
484
return ret;
485
}
486
487
-uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
488
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
489
MemOpIdx oi, uintptr_t ra)
490
{
491
- MemOp mop = get_memop(oi);
492
- uint64_t ret;
493
-
494
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
495
- ret = do_ld8_he_mmu(env, addr, mop, ra);
496
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
497
- return cpu_to_be64(ret);
498
+ return do_ld8_mmu(env, addr, get_memop(oi), ra);
499
}
500
501
-uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
502
- MemOpIdx oi, uintptr_t ra)
503
+uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr,
504
+ MemOpIdx oi, uintptr_t ra)
505
{
506
- MemOp mop = get_memop(oi);
507
- uint64_t ret;
508
-
509
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
510
- ret = do_ld8_he_mmu(env, addr, mop, ra);
511
+ uint64_t ret = do_ld8_mmu(env, addr, get_memop(oi), ra);
512
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
513
- return cpu_to_le64(ret);
514
+ return ret;
515
}
516
517
-static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr,
518
- MemOp mop, uintptr_t ra)
519
+static Int128 do_ld16_mmu(CPUArchState *env, abi_ptr addr,
520
+ MemOp mop, uintptr_t ra)
521
{
522
void *haddr;
523
Int128 ret;
524
@@ -XXX,XX +XXX,XX @@ static Int128 do_ld16_he_mmu(CPUArchState *env, abi_ptr addr,
525
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
526
ret = load_atom_16(env, ra, haddr, mop);
527
clear_helper_retaddr();
528
- return ret;
529
-}
530
-
531
-Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
532
- MemOpIdx oi, uintptr_t ra)
533
-{
534
- MemOp mop = get_memop(oi);
535
- Int128 ret = do_ld16_he_mmu(env, addr, mop, ra);
536
537
if (mop & MO_BSWAP) {
538
ret = bswap128(ret);
539
@@ -XXX,XX +XXX,XX @@ Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
540
return ret;
541
}
542
543
+Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
544
+ MemOpIdx oi, uintptr_t ra)
545
+{
546
+ return do_ld16_mmu(env, addr, get_memop(oi), ra);
24
+}
547
+}
25
+
548
+
26
static bool nios2_cpu_has_work(CPUState *cs)
549
Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi)
27
{
550
{
28
return cs->interrupt_request & CPU_INTERRUPT_HARD;
551
return helper_ld16_mmu(env, addr, oi, GETPC());
29
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps nios2_sysemu_ops = {
552
}
30
553
31
static const struct TCGCPUOps nios2_tcg_ops = {
554
-Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
32
.initialize = nios2_tcg_init,
555
- MemOpIdx oi, uintptr_t ra)
33
+ .restore_state_to_opc = nios2_restore_state_to_opc,
556
+Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
34
557
+ MemOpIdx oi, uintptr_t ra)
35
#ifndef CONFIG_USER_ONLY
558
{
36
.tlb_fill = nios2_cpu_tlb_fill,
559
- MemOp mop = get_memop(oi);
37
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
560
- Int128 ret;
561
-
562
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
563
- ret = do_ld16_he_mmu(env, addr, mop, ra);
564
+ Int128 ret = do_ld16_mmu(env, addr, get_memop(oi), ra);
565
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
566
- if (!HOST_BIG_ENDIAN) {
567
- ret = bswap128(ret);
568
- }
569
- return ret;
570
-}
571
-
572
-Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
573
- MemOpIdx oi, uintptr_t ra)
574
-{
575
- MemOp mop = get_memop(oi);
576
- Int128 ret;
577
-
578
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
579
- ret = do_ld16_he_mmu(env, addr, mop, ra);
580
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
581
- if (HOST_BIG_ENDIAN) {
582
- ret = bswap128(ret);
583
- }
584
return ret;
585
}
586
587
@@ -XXX,XX +XXX,XX @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
588
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
589
}
590
591
-static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
592
- MemOp mop, uintptr_t ra)
593
+static void do_st2_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
594
+ MemOp mop, uintptr_t ra)
595
{
596
void *haddr;
597
598
tcg_debug_assert((mop & MO_SIZE) == MO_16);
599
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
600
+
601
+ if (mop & MO_BSWAP) {
602
+ val = bswap16(val);
603
+ }
604
store_atom_2(env, ra, haddr, mop, val);
605
clear_helper_retaddr();
606
}
607
@@ -XXX,XX +XXX,XX @@ static void do_st2_he_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
608
void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
609
MemOpIdx oi, uintptr_t ra)
610
{
611
- MemOp mop = get_memop(oi);
612
-
613
- if (mop & MO_BSWAP) {
614
- val = bswap16(val);
615
- }
616
- do_st2_he_mmu(env, addr, val, mop, ra);
617
+ do_st2_mmu(env, addr, val, get_memop(oi), ra);
618
}
619
620
-void cpu_stw_be_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
621
+void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
622
MemOpIdx oi, uintptr_t ra)
623
{
624
- MemOp mop = get_memop(oi);
625
-
626
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
627
- do_st2_he_mmu(env, addr, be16_to_cpu(val), mop, ra);
628
+ do_st2_mmu(env, addr, val, get_memop(oi), ra);
629
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
630
}
631
632
-void cpu_stw_le_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
633
- MemOpIdx oi, uintptr_t ra)
634
-{
635
- MemOp mop = get_memop(oi);
636
-
637
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
638
- do_st2_he_mmu(env, addr, le16_to_cpu(val), mop, ra);
639
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
640
-}
641
-
642
-static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
643
- MemOp mop, uintptr_t ra)
644
+static void do_st4_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
645
+ MemOp mop, uintptr_t ra)
646
{
647
void *haddr;
648
649
tcg_debug_assert((mop & MO_SIZE) == MO_32);
650
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
651
+
652
+ if (mop & MO_BSWAP) {
653
+ val = bswap32(val);
654
+ }
655
store_atom_4(env, ra, haddr, mop, val);
656
clear_helper_retaddr();
657
}
658
@@ -XXX,XX +XXX,XX @@ static void do_st4_he_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
659
void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
660
MemOpIdx oi, uintptr_t ra)
661
{
662
- MemOp mop = get_memop(oi);
663
-
664
- if (mop & MO_BSWAP) {
665
- val = bswap32(val);
666
- }
667
- do_st4_he_mmu(env, addr, val, mop, ra);
668
+ do_st4_mmu(env, addr, val, get_memop(oi), ra);
669
}
670
671
-void cpu_stl_be_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
672
- MemOpIdx oi, uintptr_t ra)
673
+void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
674
+ MemOpIdx oi, uintptr_t ra)
675
{
676
- MemOp mop = get_memop(oi);
677
-
678
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
679
- do_st4_he_mmu(env, addr, be32_to_cpu(val), mop, ra);
680
+ do_st4_mmu(env, addr, val, get_memop(oi), ra);
681
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
682
}
683
684
-void cpu_stl_le_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
685
- MemOpIdx oi, uintptr_t ra)
686
-{
687
- MemOp mop = get_memop(oi);
688
-
689
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
690
- do_st4_he_mmu(env, addr, le32_to_cpu(val), mop, ra);
691
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
692
-}
693
-
694
-static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
695
- MemOp mop, uintptr_t ra)
696
+static void do_st8_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
697
+ MemOp mop, uintptr_t ra)
698
{
699
void *haddr;
700
701
tcg_debug_assert((mop & MO_SIZE) == MO_64);
702
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
703
+
704
+ if (mop & MO_BSWAP) {
705
+ val = bswap64(val);
706
+ }
707
store_atom_8(env, ra, haddr, mop, val);
708
clear_helper_retaddr();
709
}
710
@@ -XXX,XX +XXX,XX @@ static void do_st8_he_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
711
void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val,
712
MemOpIdx oi, uintptr_t ra)
713
{
714
- MemOp mop = get_memop(oi);
715
-
716
- if (mop & MO_BSWAP) {
717
- val = bswap64(val);
718
- }
719
- do_st8_he_mmu(env, addr, val, mop, ra);
720
+ do_st8_mmu(env, addr, val, get_memop(oi), ra);
721
}
722
723
-void cpu_stq_be_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
724
+void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
725
MemOpIdx oi, uintptr_t ra)
726
{
727
- MemOp mop = get_memop(oi);
728
-
729
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
730
- do_st8_he_mmu(env, addr, cpu_to_be64(val), mop, ra);
731
+ do_st8_mmu(env, addr, val, get_memop(oi), ra);
732
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
733
}
734
735
-void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
736
- MemOpIdx oi, uintptr_t ra)
737
-{
738
- MemOp mop = get_memop(oi);
739
-
740
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
741
- do_st8_he_mmu(env, addr, cpu_to_le64(val), mop, ra);
742
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
743
-}
744
-
745
-static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
746
- MemOp mop, uintptr_t ra)
747
+static void do_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
748
+ MemOp mop, uintptr_t ra)
749
{
750
void *haddr;
751
752
tcg_debug_assert((mop & MO_SIZE) == MO_128);
753
haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_STORE);
754
+
755
+ if (mop & MO_BSWAP) {
756
+ val = bswap128(val);
757
+ }
758
store_atom_16(env, ra, haddr, mop, val);
759
clear_helper_retaddr();
760
}
761
@@ -XXX,XX +XXX,XX @@ static void do_st16_he_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
762
void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val,
763
MemOpIdx oi, uintptr_t ra)
764
{
765
- MemOp mop = get_memop(oi);
766
-
767
- if (mop & MO_BSWAP) {
768
- val = bswap128(val);
769
- }
770
- do_st16_he_mmu(env, addr, val, mop, ra);
771
+ do_st16_mmu(env, addr, val, get_memop(oi), ra);
772
}
773
774
void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
775
@@ -XXX,XX +XXX,XX @@ void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
776
helper_st16_mmu(env, addr, val, oi, GETPC());
777
}
778
779
-void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr,
780
- Int128 val, MemOpIdx oi, uintptr_t ra)
781
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr,
782
+ Int128 val, MemOpIdx oi, uintptr_t ra)
783
{
784
- MemOp mop = get_memop(oi);
785
-
786
- tcg_debug_assert((mop & MO_BSWAP) == MO_BE);
787
- if (!HOST_BIG_ENDIAN) {
788
- val = bswap128(val);
789
- }
790
- do_st16_he_mmu(env, addr, val, mop, ra);
791
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
792
-}
793
-
794
-void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr,
795
- Int128 val, MemOpIdx oi, uintptr_t ra)
796
-{
797
- MemOp mop = get_memop(oi);
798
-
799
- tcg_debug_assert((mop & MO_BSWAP) == MO_LE);
800
- if (HOST_BIG_ENDIAN) {
801
- val = bswap128(val);
802
- }
803
- do_st16_he_mmu(env, addr, val, mop, ra);
804
+ do_st16_mmu(env, addr, val, get_memop(oi), ra);
805
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
806
}
807
808
diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c
38
index XXXXXXX..XXXXXXX 100644
809
index XXXXXXX..XXXXXXX 100644
39
--- a/target/nios2/translate.c
810
--- a/target/arm/tcg/m_helper.c
40
+++ b/target/nios2/translate.c
811
+++ b/target/arm/tcg/m_helper.c
41
@@ -XXX,XX +XXX,XX @@ void nios2_tcg_init(void)
812
@@ -XXX,XX +XXX,XX @@ static bool do_v7m_function_return(ARMCPU *cpu)
42
cpu_pc = tcg_global_mem_new(cpu_env,
813
*/
43
offsetof(CPUNios2State, pc), "pc");
814
mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
44
}
815
oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx));
45
-
816
- newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0);
46
-void restore_state_to_opc(CPUNios2State *env, TranslationBlock *tb,
817
- newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0);
47
- target_ulong *data)
818
+ newpc = cpu_ldl_mmu(env, frameptr, oi, 0);
48
-{
819
+ newpsr = cpu_ldl_mmu(env, frameptr + 4, oi, 0);
49
- env->pc = data[0];
820
50
-}
821
/* Consistency checks on new IPSR */
822
newpsr_exc = newpsr & XPSR_EXCP;
823
diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
824
index XXXXXXX..XXXXXXX 100644
825
--- a/target/sparc/ldst_helper.c
826
+++ b/target/sparc/ldst_helper.c
827
@@ -XXX,XX +XXX,XX @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
828
ret = cpu_ldb_mmu(env, addr, oi, GETPC());
829
break;
830
case 2:
831
- if (asi & 8) {
832
- ret = cpu_ldw_le_mmu(env, addr, oi, GETPC());
833
- } else {
834
- ret = cpu_ldw_be_mmu(env, addr, oi, GETPC());
835
- }
836
+ ret = cpu_ldw_mmu(env, addr, oi, GETPC());
837
break;
838
case 4:
839
- if (asi & 8) {
840
- ret = cpu_ldl_le_mmu(env, addr, oi, GETPC());
841
- } else {
842
- ret = cpu_ldl_be_mmu(env, addr, oi, GETPC());
843
- }
844
+ ret = cpu_ldl_mmu(env, addr, oi, GETPC());
845
break;
846
case 8:
847
- if (asi & 8) {
848
- ret = cpu_ldq_le_mmu(env, addr, oi, GETPC());
849
- } else {
850
- ret = cpu_ldq_be_mmu(env, addr, oi, GETPC());
851
- }
852
+ ret = cpu_ldq_mmu(env, addr, oi, GETPC());
853
break;
854
default:
855
g_assert_not_reached();
856
diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc
857
index XXXXXXX..XXXXXXX 100644
858
--- a/accel/tcg/ldst_common.c.inc
859
+++ b/accel/tcg/ldst_common.c.inc
860
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_lduw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
861
int mmu_idx, uintptr_t ra)
862
{
863
MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
864
- return cpu_ldw_be_mmu(env, addr, oi, ra);
865
+ return cpu_ldw_mmu(env, addr, oi, ra);
866
}
867
868
int cpu_ldsw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
869
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_ldl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
870
int mmu_idx, uintptr_t ra)
871
{
872
MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
873
- return cpu_ldl_be_mmu(env, addr, oi, ra);
874
+ return cpu_ldl_mmu(env, addr, oi, ra);
875
}
876
877
uint64_t cpu_ldq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr,
878
int mmu_idx, uintptr_t ra)
879
{
880
MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
881
- return cpu_ldq_be_mmu(env, addr, oi, ra);
882
+ return cpu_ldq_mmu(env, addr, oi, ra);
883
}
884
885
uint32_t cpu_lduw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
886
int mmu_idx, uintptr_t ra)
887
{
888
MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
889
- return cpu_ldw_le_mmu(env, addr, oi, ra);
890
+ return cpu_ldw_mmu(env, addr, oi, ra);
891
}
892
893
int cpu_ldsw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
894
@@ -XXX,XX +XXX,XX @@ uint32_t cpu_ldl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
895
int mmu_idx, uintptr_t ra)
896
{
897
MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
898
- return cpu_ldl_le_mmu(env, addr, oi, ra);
899
+ return cpu_ldl_mmu(env, addr, oi, ra);
900
}
901
902
uint64_t cpu_ldq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr,
903
int mmu_idx, uintptr_t ra)
904
{
905
MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
906
- return cpu_ldq_le_mmu(env, addr, oi, ra);
907
+ return cpu_ldq_mmu(env, addr, oi, ra);
908
}
909
910
void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
911
@@ -XXX,XX +XXX,XX @@ void cpu_stw_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
912
int mmu_idx, uintptr_t ra)
913
{
914
MemOpIdx oi = make_memop_idx(MO_BEUW | MO_UNALN, mmu_idx);
915
- cpu_stw_be_mmu(env, addr, val, oi, ra);
916
+ cpu_stw_mmu(env, addr, val, oi, ra);
917
}
918
919
void cpu_stl_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
920
int mmu_idx, uintptr_t ra)
921
{
922
MemOpIdx oi = make_memop_idx(MO_BEUL | MO_UNALN, mmu_idx);
923
- cpu_stl_be_mmu(env, addr, val, oi, ra);
924
+ cpu_stl_mmu(env, addr, val, oi, ra);
925
}
926
927
void cpu_stq_be_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
928
int mmu_idx, uintptr_t ra)
929
{
930
MemOpIdx oi = make_memop_idx(MO_BEUQ | MO_UNALN, mmu_idx);
931
- cpu_stq_be_mmu(env, addr, val, oi, ra);
932
+ cpu_stq_mmu(env, addr, val, oi, ra);
933
}
934
935
void cpu_stw_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
936
int mmu_idx, uintptr_t ra)
937
{
938
MemOpIdx oi = make_memop_idx(MO_LEUW | MO_UNALN, mmu_idx);
939
- cpu_stw_le_mmu(env, addr, val, oi, ra);
940
+ cpu_stw_mmu(env, addr, val, oi, ra);
941
}
942
943
void cpu_stl_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
944
int mmu_idx, uintptr_t ra)
945
{
946
MemOpIdx oi = make_memop_idx(MO_LEUL | MO_UNALN, mmu_idx);
947
- cpu_stl_le_mmu(env, addr, val, oi, ra);
948
+ cpu_stl_mmu(env, addr, val, oi, ra);
949
}
950
951
void cpu_stq_le_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
952
int mmu_idx, uintptr_t ra)
953
{
954
MemOpIdx oi = make_memop_idx(MO_LEUQ | MO_UNALN, mmu_idx);
955
- cpu_stq_le_mmu(env, addr, val, oi, ra);
956
+ cpu_stq_mmu(env, addr, val, oi, ra);
957
}
958
959
/*--------------------------*/
51
--
960
--
52
2.34.1
961
2.34.1
53
962
54
963
1
When PAGE_RESET is set, we are replacing pages with new
1
Use cpu_ld16_mmu and cpu_st16_mmu to eliminate the special case,
2
content, which means that we need to invalidate existing
2
and change all of the *_data_ra functions to match.
3
cached data, such as TranslationBlocks. Perform the
4
reset invalidate while we're doing other invalidates,
5
which allows us to remove the separate invalidates from
6
the user-only mmap/munmap/mprotect routines.
7
3
8
In addition, restrict invalidation to PAGE_EXEC pages.
4
Note that we check the alignment of both compare and store
9
Since cdf713085131, we have validated PAGE_EXEC is present
5
pointers at the top of the function, so MO_ALIGN* may be
10
before translation, which means we can assume that if the
6
safely removed from the individual memory operations.
11
bit is not present, there are no translations to invalidate.
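
For readers skimming the diff below, the new invalidation test in page_set_flags() can be read as one small predicate. This is an illustrative restatement only; the flag values are made up here and are not QEMU's actual definitions:

    #include <stdbool.h>

    /* Illustrative flag values only; not QEMU's actual definitions. */
    enum { PAGE_WRITE = 0x0002, PAGE_EXEC = 0x0004 };

    /* Condensed form of the new test: only pages that were executable can
     * hold cached translations, and they are flushed when the page is
     * reset, loses PAGE_EXEC, or gains PAGE_WRITE. */
    static bool need_tb_invalidate(int old_flags, int new_flags, bool reset)
    {
        return (old_flags & PAGE_EXEC)
               && (reset
                   || !(new_flags & PAGE_EXEC)
                   || (new_flags & ~old_flags & PAGE_WRITE));
    }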
12
7
13
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
8
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: David Hildenbrand <david@redhat.com>
14
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
11
---
16
accel/tcg/translate-all.c | 19 +++++++++++--------
12
target/s390x/tcg/mem_helper.c | 66 ++++++++++++++---------------------
17
bsd-user/mmap.c | 2 --
13
1 file changed, 27 insertions(+), 39 deletions(-)
18
linux-user/mmap.c | 4 ----
19
3 files changed, 11 insertions(+), 14 deletions(-)
20
14
21
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
15
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
22
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
23
--- a/accel/tcg/translate-all.c
17
--- a/target/s390x/tcg/mem_helper.c
24
+++ b/accel/tcg/translate-all.c
18
+++ b/target/s390x/tcg/mem_helper.c
25
@@ -XXX,XX +XXX,XX @@ int page_get_flags(target_ulong address)
19
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
26
void page_set_flags(target_ulong start, target_ulong end, int flags)
20
uint64_t a2, bool parallel)
27
{
21
{
28
target_ulong addr, len;
22
uint32_t mem_idx = cpu_mmu_index(env, false);
29
- bool reset_target_data;
23
+ MemOpIdx oi16 = make_memop_idx(MO_TE | MO_128, mem_idx);
30
+ bool reset;
24
+ MemOpIdx oi8 = make_memop_idx(MO_TE | MO_64, mem_idx);
31
25
+ MemOpIdx oi4 = make_memop_idx(MO_TE | MO_32, mem_idx);
32
/* This function should never be called with addresses outside the
26
+ MemOpIdx oi2 = make_memop_idx(MO_TE | MO_16, mem_idx);
33
guest address space. If this assert fires, it probably indicates
27
+ MemOpIdx oi1 = make_memop_idx(MO_8, mem_idx);
34
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
28
uintptr_t ra = GETPC();
35
if (flags & PAGE_WRITE) {
29
uint32_t fc = extract32(env->regs[0], 0, 8);
36
flags |= PAGE_WRITE_ORG;
30
uint32_t sc = extract32(env->regs[0], 8, 8);
37
}
31
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
38
- reset_target_data = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
39
+ reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
40
flags &= ~PAGE_RESET;
41
42
for (addr = start, len = end - start;
43
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
44
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
45
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
46
47
- /* If the write protection bit is set, then we invalidate
48
- the code inside. */
49
- if (!(p->flags & PAGE_WRITE) &&
50
- (flags & PAGE_WRITE) &&
51
- p->first_tb) {
52
+ /*
53
+ * If the page was executable, but is reset, or is no longer
54
+ * executable, or has become writable, then invalidate any code.
55
+ */
56
+ if ((p->flags & PAGE_EXEC)
57
+ && (reset ||
58
+ !(flags & PAGE_EXEC) ||
59
+ (flags & ~p->flags & PAGE_WRITE))) {
60
tb_invalidate_phys_page(addr);
61
}
62
- if (reset_target_data) {
63
+ if (reset) {
64
g_free(p->target_data);
65
p->target_data = NULL;
66
p->flags = flags;
67
diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
68
index XXXXXXX..XXXXXXX 100644
69
--- a/bsd-user/mmap.c
70
+++ b/bsd-user/mmap.c
71
@@ -XXX,XX +XXX,XX @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int prot,
72
page_dump(stdout);
73
printf("\n");
74
#endif
75
- tb_invalidate_phys_range(start, start + len);
76
mmap_unlock();
77
return start;
78
fail:
79
@@ -XXX,XX +XXX,XX @@ int target_munmap(abi_ulong start, abi_ulong len)
80
81
if (ret == 0) {
82
page_set_flags(start, start + len, 0);
83
- tb_invalidate_phys_range(start, start + len);
84
}
85
mmap_unlock();
86
return ret;
87
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
88
index XXXXXXX..XXXXXXX 100644
89
--- a/linux-user/mmap.c
90
+++ b/linux-user/mmap.c
91
@@ -XXX,XX +XXX,XX @@ int target_mprotect(abi_ulong start, abi_ulong len, int target_prot)
92
}
93
94
page_set_flags(start, start + len, page_flags);
95
- tb_invalidate_phys_range(start, start + len);
96
ret = 0;
97
98
error:
99
@@ -XXX,XX +XXX,XX @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
100
qemu_log_unlock(f);
101
}
32
}
102
}
33
}
103
- tb_invalidate_phys_range(start, start + len);
34
104
mmap_unlock();
35
- /* All loads happen before all stores. For simplicity, load the entire
105
return start;
36
- store value area from the parameter list. */
106
fail:
37
- svh = cpu_ldq_data_ra(env, pl + 16, ra);
107
@@ -XXX,XX +XXX,XX @@ int target_munmap(abi_ulong start, abi_ulong len)
38
- svl = cpu_ldq_data_ra(env, pl + 24, ra);
108
39
+ /*
109
if (ret == 0) {
40
+ * All loads happen before all stores. For simplicity, load the entire
110
page_set_flags(start, start + len, 0);
41
+ * store value area from the parameter list.
111
- tb_invalidate_phys_range(start, start + len);
42
+ */
112
}
43
+ svh = cpu_ldq_mmu(env, pl + 16, oi8, ra);
113
mmap_unlock();
44
+ svl = cpu_ldq_mmu(env, pl + 24, oi8, ra);
114
return ret;
45
115
@@ -XXX,XX +XXX,XX @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
46
switch (fc) {
116
page_set_flags(new_addr, new_addr + new_size,
47
case 0:
117
prot | PAGE_VALID | PAGE_RESET);
48
{
118
}
49
- uint32_t nv = cpu_ldl_data_ra(env, pl, ra);
119
- tb_invalidate_phys_range(new_addr, new_addr + new_size);
50
+ uint32_t nv = cpu_ldl_mmu(env, pl, oi4, ra);
120
mmap_unlock();
51
uint32_t cv = env->regs[r3];
121
return new_addr;
52
uint32_t ov;
122
}
53
54
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
55
ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi, ra);
56
#endif
57
} else {
58
- ov = cpu_ldl_data_ra(env, a1, ra);
59
- cpu_stl_data_ra(env, a1, (ov == cv ? nv : ov), ra);
60
+ ov = cpu_ldl_mmu(env, a1, oi4, ra);
61
+ cpu_stl_mmu(env, a1, (ov == cv ? nv : ov), oi4, ra);
62
}
63
cc = (ov != cv);
64
env->regs[r3] = deposit64(env->regs[r3], 32, 32, ov);
65
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
66
67
case 1:
68
{
69
- uint64_t nv = cpu_ldq_data_ra(env, pl, ra);
70
+ uint64_t nv = cpu_ldq_mmu(env, pl, oi8, ra);
71
uint64_t cv = env->regs[r3];
72
uint64_t ov;
73
74
if (parallel) {
75
#ifdef CONFIG_ATOMIC64
76
- MemOpIdx oi = make_memop_idx(MO_TEUQ | MO_ALIGN, mem_idx);
77
- ov = cpu_atomic_cmpxchgq_be_mmu(env, a1, cv, nv, oi, ra);
78
+ ov = cpu_atomic_cmpxchgq_be_mmu(env, a1, cv, nv, oi8, ra);
79
#else
80
/* Note that we asserted !parallel above. */
81
g_assert_not_reached();
82
#endif
83
} else {
84
- ov = cpu_ldq_data_ra(env, a1, ra);
85
- cpu_stq_data_ra(env, a1, (ov == cv ? nv : ov), ra);
86
+ ov = cpu_ldq_mmu(env, a1, oi8, ra);
87
+ cpu_stq_mmu(env, a1, (ov == cv ? nv : ov), oi8, ra);
88
}
89
cc = (ov != cv);
90
env->regs[r3] = ov;
91
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
92
93
case 2:
94
{
95
- uint64_t nvh = cpu_ldq_data_ra(env, pl, ra);
96
- uint64_t nvl = cpu_ldq_data_ra(env, pl + 8, ra);
97
- Int128 nv = int128_make128(nvl, nvh);
98
+ Int128 nv = cpu_ld16_mmu(env, pl, oi16, ra);
99
Int128 cv = int128_make128(env->regs[r3 + 1], env->regs[r3]);
100
Int128 ov;
101
102
if (!parallel) {
103
- uint64_t oh = cpu_ldq_data_ra(env, a1 + 0, ra);
104
- uint64_t ol = cpu_ldq_data_ra(env, a1 + 8, ra);
105
-
106
- ov = int128_make128(ol, oh);
107
+ ov = cpu_ld16_mmu(env, a1, oi16, ra);
108
cc = !int128_eq(ov, cv);
109
if (cc) {
110
nv = ov;
111
}
112
-
113
- cpu_stq_data_ra(env, a1 + 0, int128_gethi(nv), ra);
114
- cpu_stq_data_ra(env, a1 + 8, int128_getlo(nv), ra);
115
+ cpu_st16_mmu(env, a1, nv, oi16, ra);
116
} else if (HAVE_CMPXCHG128) {
117
- MemOpIdx oi = make_memop_idx(MO_TE | MO_128 | MO_ALIGN, mem_idx);
118
- ov = cpu_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi, ra);
119
+ ov = cpu_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi16, ra);
120
cc = !int128_eq(ov, cv);
121
} else {
122
/* Note that we asserted !parallel above. */
123
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
124
if (cc == 0) {
125
switch (sc) {
126
case 0:
127
- cpu_stb_data_ra(env, a2, svh >> 56, ra);
128
+ cpu_stb_mmu(env, a2, svh >> 56, oi1, ra);
129
break;
130
case 1:
131
- cpu_stw_data_ra(env, a2, svh >> 48, ra);
132
+ cpu_stw_mmu(env, a2, svh >> 48, oi2, ra);
133
break;
134
case 2:
135
- cpu_stl_data_ra(env, a2, svh >> 32, ra);
136
+ cpu_stl_mmu(env, a2, svh >> 32, oi4, ra);
137
break;
138
case 3:
139
- cpu_stq_data_ra(env, a2, svh, ra);
140
+ cpu_stq_mmu(env, a2, svh, oi8, ra);
141
break;
142
case 4:
143
- if (!parallel) {
144
- cpu_stq_data_ra(env, a2 + 0, svh, ra);
145
- cpu_stq_data_ra(env, a2 + 8, svl, ra);
146
- } else if (HAVE_ATOMIC128) {
147
- MemOpIdx oi = make_memop_idx(MO_TEUQ | MO_ALIGN_16, mem_idx);
148
- Int128 sv = int128_make128(svl, svh);
149
- cpu_atomic_sto_be_mmu(env, a2, sv, oi, ra);
150
- } else {
151
- /* Note that we asserted !parallel above. */
152
- g_assert_not_reached();
153
- }
154
+ cpu_st16_mmu(env, a2, int128_make128(svl, svh), oi16, ra);
155
break;
156
default:
157
g_assert_not_reached();
123
--
158
--
124
2.34.1
159
2.34.1
125
160
126
161
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
1
Eliminate the CONFIG_USER_ONLY specialization.
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: David Hildenbrand <david@redhat.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/riscv/cpu.c | 9 +++++++--
7
target/s390x/tcg/mem_helper.c | 8 +-------
5
1 file changed, 7 insertions(+), 2 deletions(-)
8
1 file changed, 1 insertion(+), 7 deletions(-)
6
9
7
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
10
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
8
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
9
--- a/target/riscv/cpu.c
12
--- a/target/s390x/tcg/mem_helper.c
10
+++ b/target/riscv/cpu.c
13
+++ b/target/s390x/tcg/mem_helper.c
11
@@ -XXX,XX +XXX,XX @@ static bool riscv_cpu_has_work(CPUState *cs)
14
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
12
#endif
15
uint32_t ov;
13
}
16
14
17
if (parallel) {
15
-void restore_state_to_opc(CPURISCVState *env, TranslationBlock *tb,
18
-#ifdef CONFIG_USER_ONLY
16
- target_ulong *data)
19
- uint32_t *haddr = g2h(env_cpu(env), a1);
17
+static void riscv_restore_state_to_opc(CPUState *cs,
20
- ov = qatomic_cmpxchg__nocheck(haddr, cv, nv);
18
+ const TranslationBlock *tb,
21
-#else
19
+ const uint64_t *data)
22
- MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mem_idx);
20
{
23
- ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi, ra);
21
+ RISCVCPU *cpu = RISCV_CPU(cs);
24
-#endif
22
+ CPURISCVState *env = &cpu->env;
25
+ ov = cpu_atomic_cmpxchgl_be_mmu(env, a1, cv, nv, oi4, ra);
23
RISCVMXL xl = FIELD_EX32(tb->flags, TB_FLAGS, XL);
26
} else {
24
+
27
ov = cpu_ldl_mmu(env, a1, oi4, ra);
25
if (xl == MXL_RV32) {
28
cpu_stl_mmu(env, a1, (ov == cv ? nv : ov), oi4, ra);
26
env->pc = (int32_t)data[0];
27
} else {
28
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps riscv_sysemu_ops = {
29
static const struct TCGCPUOps riscv_tcg_ops = {
30
.initialize = riscv_translate_init,
31
.synchronize_from_tb = riscv_cpu_synchronize_from_tb,
32
+ .restore_state_to_opc = riscv_restore_state_to_opc,
33
34
#ifndef CONFIG_USER_ONLY
35
.tlb_fill = riscv_cpu_tlb_fill,
36
--
29
--
37
2.34.1
30
2.34.1
38
31
39
32
diff view generated by jsdifflib
1
There are no users outside of accel/tcg; this function
1
Atomic load/store of 128-bit quantities is now handled
2
does not need to be defined in exec-all.h.
2
by cpu_{ld,st}16_mmu.
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
accel/tcg/internal.h | 5 +++++
7
accel/tcg/atomic_template.h | 61 +++--------------------------------
8
include/exec/exec-all.h | 8 --------
8
include/exec/cpu_ldst.h | 9 ------
9
2 files changed, 5 insertions(+), 8 deletions(-)
9
accel/tcg/atomic_common.c.inc | 14 --------
10
3 files changed, 4 insertions(+), 80 deletions(-)
10
11
11
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
12
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/internal.h
14
--- a/accel/tcg/atomic_template.h
14
+++ b/accel/tcg/internal.h
15
+++ b/accel/tcg/atomic_template.h
15
@@ -XXX,XX +XXX,XX @@ void do_assert_page_locked(const PageDesc *pd, const char *file, int line);
16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
16
void page_lock(PageDesc *pd);
17
return ret;
17
void page_unlock(PageDesc *pd);
18
}
18
#endif
19
19
+#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
20
-#if DATA_SIZE >= 16
20
+void assert_no_pages_locked(void);
21
-#if HAVE_ATOMIC128
21
+#else
22
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
22
+static inline void assert_no_pages_locked(void) { }
23
- MemOpIdx oi, uintptr_t retaddr)
23
+#endif
24
-{
24
25
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
25
TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
26
- PAGE_READ, retaddr);
26
target_ulong cs_base, uint32_t flags,
27
- DATA_TYPE val;
27
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
28
-
29
- val = atomic16_read(haddr);
30
- ATOMIC_MMU_CLEANUP;
31
- atomic_trace_ld_post(env, addr, oi);
32
- return val;
33
-}
34
-
35
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
36
- MemOpIdx oi, uintptr_t retaddr)
37
-{
38
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
39
- PAGE_WRITE, retaddr);
40
-
41
- atomic16_set(haddr, val);
42
- ATOMIC_MMU_CLEANUP;
43
- atomic_trace_st_post(env, addr, oi);
44
-}
45
-#endif
46
-#else
47
+#if DATA_SIZE < 16
48
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
49
MemOpIdx oi, uintptr_t retaddr)
50
{
51
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(smax_fetch, MAX, SDATA_TYPE, new)
52
GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new)
53
54
#undef GEN_ATOMIC_HELPER_FN
55
-#endif /* DATA SIZE >= 16 */
56
+#endif /* DATA SIZE < 16 */
57
58
#undef END
59
60
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
61
return BSWAP(ret);
62
}
63
64
-#if DATA_SIZE >= 16
65
-#if HAVE_ATOMIC128
66
-ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr,
67
- MemOpIdx oi, uintptr_t retaddr)
68
-{
69
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
70
- PAGE_READ, retaddr);
71
- DATA_TYPE val;
72
-
73
- val = atomic16_read(haddr);
74
- ATOMIC_MMU_CLEANUP;
75
- atomic_trace_ld_post(env, addr, oi);
76
- return BSWAP(val);
77
-}
78
-
79
-void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
80
- MemOpIdx oi, uintptr_t retaddr)
81
-{
82
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
83
- PAGE_WRITE, retaddr);
84
-
85
- val = BSWAP(val);
86
- atomic16_set(haddr, val);
87
- ATOMIC_MMU_CLEANUP;
88
- atomic_trace_st_post(env, addr, oi);
89
-}
90
-#endif
91
-#else
92
+#if DATA_SIZE < 16
93
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
94
MemOpIdx oi, uintptr_t retaddr)
95
{
96
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
97
#undef ADD
98
99
#undef GEN_ATOMIC_HELPER_FN
100
-#endif /* DATA_SIZE >= 16 */
101
+#endif /* DATA_SIZE < 16 */
102
103
#undef END
104
#endif /* DATA_SIZE > 1 */
105
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
28
index XXXXXXX..XXXXXXX 100644
106
index XXXXXXX..XXXXXXX 100644
29
--- a/include/exec/exec-all.h
107
--- a/include/exec/cpu_ldst.h
30
+++ b/include/exec/exec-all.h
108
+++ b/include/exec/cpu_ldst.h
31
@@ -XXX,XX +XXX,XX @@ extern __thread uintptr_t tci_tb_ptr;
109
@@ -XXX,XX +XXX,XX @@ Int128 cpu_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr,
32
smaller than 4 bytes, so we don't worry about special-casing this. */
110
Int128 cmpv, Int128 newv,
33
#define GETPC_ADJ 2
111
MemOpIdx oi, uintptr_t retaddr);
34
112
35
-#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
113
-Int128 cpu_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr,
36
-void assert_no_pages_locked(void);
114
- MemOpIdx oi, uintptr_t retaddr);
37
-#else
115
-Int128 cpu_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr,
38
-static inline void assert_no_pages_locked(void)
116
- MemOpIdx oi, uintptr_t retaddr);
117
-void cpu_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
118
- MemOpIdx oi, uintptr_t retaddr);
119
-void cpu_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
120
- MemOpIdx oi, uintptr_t retaddr);
121
-
122
#if defined(CONFIG_USER_ONLY)
123
124
extern __thread uintptr_t helper_retaddr;
125
diff --git a/accel/tcg/atomic_common.c.inc b/accel/tcg/atomic_common.c.inc
126
index XXXXXXX..XXXXXXX 100644
127
--- a/accel/tcg/atomic_common.c.inc
128
+++ b/accel/tcg/atomic_common.c.inc
129
@@ -XXX,XX +XXX,XX @@ static void atomic_trace_rmw_post(CPUArchState *env, uint64_t addr,
130
qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_RW);
131
}
132
133
-#if HAVE_ATOMIC128
134
-static void atomic_trace_ld_post(CPUArchState *env, uint64_t addr,
135
- MemOpIdx oi)
39
-{
136
-{
137
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
138
-}
139
-
140
-static void atomic_trace_st_post(CPUArchState *env, uint64_t addr,
141
- MemOpIdx oi)
142
-{
143
- qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
40
-}
144
-}
41
-#endif
145
-#endif
42
-
146
-
43
#if !defined(CONFIG_USER_ONLY)
147
/*
44
148
* Atomic helpers callable from TCG.
45
/**
149
* These have a common interface and all defer to cpu_atomic_*
46
--
150
--
47
2.34.1
151
2.34.1
48
152
49
153
1
Since "target data" is always user-only, move it out of
1
Now that load/store are gone, we're always passing
2
translate-all.c to user-exec.c.
2
PAGE_READ | PAGE_WRITE for RMW atomic operations.
3
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
accel/tcg/translate-all.c | 50 ---------------------------------------
7
accel/tcg/atomic_template.h | 32 ++++++--------
8
accel/tcg/user-exec.c | 50 +++++++++++++++++++++++++++++++++++++++
8
accel/tcg/cputlb.c | 85 ++++++++++++++-----------------------
9
2 files changed, 50 insertions(+), 50 deletions(-)
9
accel/tcg/user-exec.c | 8 +---
10
10
3 files changed, 45 insertions(+), 80 deletions(-)
11
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
11
12
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/translate-all.c
14
--- a/accel/tcg/atomic_template.h
14
+++ b/accel/tcg/translate-all.c
15
+++ b/accel/tcg/atomic_template.h
15
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
16
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
16
}
17
ABI_TYPE cmpv, ABI_TYPE newv,
17
}
18
MemOpIdx oi, uintptr_t retaddr)
18
19
{
19
-void page_reset_target_data(target_ulong start, target_ulong end)
20
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
20
-{
21
- PAGE_READ | PAGE_WRITE, retaddr);
21
-#ifdef TARGET_PAGE_DATA_SIZE
22
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
22
- target_ulong addr, len;
23
DATA_TYPE ret;
24
25
#if DATA_SIZE == 16
26
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
27
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
28
MemOpIdx oi, uintptr_t retaddr)
29
{
30
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
31
- PAGE_READ | PAGE_WRITE, retaddr);
32
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
33
DATA_TYPE ret;
34
35
ret = qatomic_xchg__nocheck(haddr, val);
36
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
37
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
38
ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
39
{ \
40
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
41
- PAGE_READ | PAGE_WRITE, retaddr); \
42
- DATA_TYPE ret; \
43
+ DATA_TYPE *haddr, ret; \
44
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
45
ret = qatomic_##X(haddr, val); \
46
ATOMIC_MMU_CLEANUP; \
47
atomic_trace_rmw_post(env, addr, oi); \
48
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER(xor_fetch)
49
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
50
ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
51
{ \
52
- XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
53
- PAGE_READ | PAGE_WRITE, retaddr); \
54
- XDATA_TYPE cmp, old, new, val = xval; \
55
+ XDATA_TYPE *haddr, cmp, old, new, val = xval; \
56
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
57
smp_mb(); \
58
cmp = qatomic_read__nocheck(haddr); \
59
do { \
60
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
61
ABI_TYPE cmpv, ABI_TYPE newv,
62
MemOpIdx oi, uintptr_t retaddr)
63
{
64
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
65
- PAGE_READ | PAGE_WRITE, retaddr);
66
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
67
DATA_TYPE ret;
68
69
#if DATA_SIZE == 16
70
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
71
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
72
MemOpIdx oi, uintptr_t retaddr)
73
{
74
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE,
75
- PAGE_READ | PAGE_WRITE, retaddr);
76
+ DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr);
77
ABI_TYPE ret;
78
79
ret = qatomic_xchg__nocheck(haddr, BSWAP(val));
80
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ABI_TYPE val,
81
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
82
ABI_TYPE val, MemOpIdx oi, uintptr_t retaddr) \
83
{ \
84
- DATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
85
- PAGE_READ | PAGE_WRITE, retaddr); \
86
- DATA_TYPE ret; \
87
+ DATA_TYPE *haddr, ret; \
88
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
89
ret = qatomic_##X(haddr, BSWAP(val)); \
90
ATOMIC_MMU_CLEANUP; \
91
atomic_trace_rmw_post(env, addr, oi); \
92
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER(xor_fetch)
93
ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
94
ABI_TYPE xval, MemOpIdx oi, uintptr_t retaddr) \
95
{ \
96
- XDATA_TYPE *haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, \
97
- PAGE_READ | PAGE_WRITE, retaddr); \
98
- XDATA_TYPE ldo, ldn, old, new, val = xval; \
99
+ XDATA_TYPE *haddr, ldo, ldn, old, new, val = xval; \
100
+ haddr = atomic_mmu_lookup(env, addr, oi, DATA_SIZE, retaddr); \
101
smp_mb(); \
102
ldn = qatomic_read__nocheck(haddr); \
103
do { \
104
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
105
index XXXXXXX..XXXXXXX 100644
106
--- a/accel/tcg/cputlb.c
107
+++ b/accel/tcg/cputlb.c
108
@@ -XXX,XX +XXX,XX @@ static bool mmu_lookup(CPUArchState *env, target_ulong addr, MemOpIdx oi,
109
/*
110
* Probe for an atomic operation. Do not allow unaligned operations,
111
* or io operations to proceed. Return the host address.
112
- *
113
- * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
114
*/
115
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
116
- MemOpIdx oi, int size, int prot,
117
- uintptr_t retaddr)
118
+ MemOpIdx oi, int size, uintptr_t retaddr)
119
{
120
uintptr_t mmu_idx = get_mmuidx(oi);
121
MemOp mop = get_memop(oi);
122
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
123
tlbe = tlb_entry(env, mmu_idx, addr);
124
125
/* Check TLB entry and enforce page permissions. */
126
- if (prot & PAGE_WRITE) {
127
- tlb_addr = tlb_addr_write(tlbe);
128
- if (!tlb_hit(tlb_addr, addr)) {
129
- if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_STORE,
130
- addr & TARGET_PAGE_MASK)) {
131
- tlb_fill(env_cpu(env), addr, size,
132
- MMU_DATA_STORE, mmu_idx, retaddr);
133
- index = tlb_index(env, mmu_idx, addr);
134
- tlbe = tlb_entry(env, mmu_idx, addr);
135
- }
136
- tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
137
- }
23
-
138
-
24
- /*
139
- if (prot & PAGE_READ) {
25
- * This function should never be called with addresses outside the
140
- /*
26
- * guest address space. If this assert fires, it probably indicates
141
- * Let the guest notice RMW on a write-only page.
27
- * a missing call to h2g_valid.
142
- * We have just verified that the page is writable.
28
- */
143
- * Subpage lookups may have left TLB_INVALID_MASK set,
29
- assert(end - 1 <= GUEST_ADDR_MAX);
144
- * but addr_read will only be -1 if PAGE_READ was unset.
30
- assert(start < end);
145
- */
31
- assert_memory_lock();
146
- if (unlikely(tlbe->addr_read == -1)) {
32
-
147
- tlb_fill(env_cpu(env), addr, size,
33
- start = start & TARGET_PAGE_MASK;
148
- MMU_DATA_LOAD, mmu_idx, retaddr);
34
- end = TARGET_PAGE_ALIGN(end);
149
- /*
35
-
150
- * Since we don't support reads and writes to different
36
- for (addr = start, len = end - start;
151
- * addresses, and we do have the proper page loaded for
37
- len != 0;
152
- * write, this shouldn't ever return. But just in case,
38
- len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
153
- * handle via stop-the-world.
39
- PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
154
- */
40
-
155
- goto stop_the_world;
41
- g_free(p->target_data);
156
- }
42
- p->target_data = NULL;
157
- /* Collect TLB_WATCHPOINT for read. */
43
- }
158
- tlb_addr |= tlbe->addr_read;
44
-#endif
45
-}
46
-
47
-#ifdef TARGET_PAGE_DATA_SIZE
48
-void *page_get_target_data(target_ulong address)
49
-{
50
- PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
51
- return p ? p->target_data : NULL;
52
-}
53
-
54
-void *page_alloc_target_data(target_ulong address)
55
-{
56
- PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
57
- void *ret = NULL;
58
-
59
- if (p->flags & PAGE_VALID) {
60
- ret = p->target_data;
61
- if (!ret) {
62
- p->target_data = ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
63
- }
159
- }
64
- }
160
- } else /* if (prot & PAGE_READ) */ {
65
- return ret;
161
- tlb_addr = tlbe->addr_read;
66
-}
162
- if (!tlb_hit(tlb_addr, addr)) {
67
-#endif /* TARGET_PAGE_DATA_SIZE */
163
- if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_LOAD,
68
-
164
- addr & TARGET_PAGE_MASK)) {
69
int page_check_range(target_ulong start, target_ulong len, int flags)
165
- tlb_fill(env_cpu(env), addr, size,
70
{
166
- MMU_DATA_LOAD, mmu_idx, retaddr);
71
PageDesc *p;
167
- index = tlb_index(env, mmu_idx, addr);
168
- tlbe = tlb_entry(env, mmu_idx, addr);
169
- }
170
- tlb_addr = tlbe->addr_read & ~TLB_INVALID_MASK;
171
+ tlb_addr = tlb_addr_write(tlbe);
172
+ if (!tlb_hit(tlb_addr, addr)) {
173
+ if (!victim_tlb_hit(env, mmu_idx, index, MMU_DATA_STORE,
174
+ addr & TARGET_PAGE_MASK)) {
175
+ tlb_fill(env_cpu(env), addr, size,
176
+ MMU_DATA_STORE, mmu_idx, retaddr);
177
+ index = tlb_index(env, mmu_idx, addr);
178
+ tlbe = tlb_entry(env, mmu_idx, addr);
179
}
180
+ tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
181
}
182
183
+ /*
184
+ * Let the guest notice RMW on a write-only page.
185
+ * We have just verified that the page is writable.
186
+ * Subpage lookups may have left TLB_INVALID_MASK set,
187
+ * but addr_read will only be -1 if PAGE_READ was unset.
188
+ */
189
+ if (unlikely(tlbe->addr_read == -1)) {
190
+ tlb_fill(env_cpu(env), addr, size, MMU_DATA_LOAD, mmu_idx, retaddr);
191
+ /*
192
+ * Since we don't support reads and writes to different
193
+ * addresses, and we do have the proper page loaded for
194
+ * write, this shouldn't ever return. But just in case,
195
+ * handle via stop-the-world.
196
+ */
197
+ goto stop_the_world;
198
+ }
199
+ /* Collect TLB_WATCHPOINT for read. */
200
+ tlb_addr |= tlbe->addr_read;
201
+
202
/* Notice an IO access or a needs-MMU-lookup access */
203
if (unlikely(tlb_addr & (TLB_MMIO | TLB_DISCARD_WRITE))) {
204
/* There's really nothing that can be done to
205
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
206
}
207
208
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
209
- QEMU_BUILD_BUG_ON(PAGE_READ != BP_MEM_READ);
210
- QEMU_BUILD_BUG_ON(PAGE_WRITE != BP_MEM_WRITE);
211
- /* therefore prot == watchpoint bits */
212
- cpu_check_watchpoint(env_cpu(env), addr, size,
213
- full->attrs, prot, retaddr);
214
+ cpu_check_watchpoint(env_cpu(env), addr, size, full->attrs,
215
+ BP_MEM_READ | BP_MEM_WRITE, retaddr);
216
}
217
218
return hostaddr;
72
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
219
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
73
index XXXXXXX..XXXXXXX 100644
220
index XXXXXXX..XXXXXXX 100644
74
--- a/accel/tcg/user-exec.c
221
--- a/accel/tcg/user-exec.c
75
+++ b/accel/tcg/user-exec.c
222
+++ b/accel/tcg/user-exec.c
76
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
223
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
77
return addr;
78
}
79
80
+void page_reset_target_data(target_ulong start, target_ulong end)
81
+{
82
+#ifdef TARGET_PAGE_DATA_SIZE
83
+ target_ulong addr, len;
84
+
85
+ /*
86
+ * This function should never be called with addresses outside the
87
+ * guest address space. If this assert fires, it probably indicates
88
+ * a missing call to h2g_valid.
89
+ */
90
+ assert(end - 1 <= GUEST_ADDR_MAX);
91
+ assert(start < end);
92
+ assert_memory_lock();
93
+
94
+ start = start & TARGET_PAGE_MASK;
95
+ end = TARGET_PAGE_ALIGN(end);
96
+
97
+ for (addr = start, len = end - start;
98
+ len != 0;
99
+ len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
100
+ PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
101
+
102
+ g_free(p->target_data);
103
+ p->target_data = NULL;
104
+ }
105
+#endif
106
+}
107
+
108
+#ifdef TARGET_PAGE_DATA_SIZE
109
+void *page_get_target_data(target_ulong address)
110
+{
111
+ PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
112
+ return p ? p->target_data : NULL;
113
+}
114
+
115
+void *page_alloc_target_data(target_ulong address)
116
+{
117
+ PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
118
+ void *ret = NULL;
119
+
120
+ if (p->flags & PAGE_VALID) {
121
+ ret = p->target_data;
122
+ if (!ret) {
123
+ p->target_data = ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
124
+ }
125
+ }
126
+ return ret;
127
+}
128
+#endif
129
+
130
/* The softmmu versions of these helpers are in cputlb.c. */
131
224
132
/*
225
/*
226
* Do not allow unaligned operations to proceed. Return the host address.
227
- *
228
- * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
229
*/
230
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
231
- MemOpIdx oi, int size, int prot,
232
- uintptr_t retaddr)
233
+ MemOpIdx oi, int size, uintptr_t retaddr)
234
{
235
MemOp mop = get_memop(oi);
236
int a_bits = get_alignment_bits(mop);
237
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
238
239
/* Enforce guest required alignment. */
240
if (unlikely(addr & ((1 << a_bits) - 1))) {
241
- MMUAccessType t = prot == PAGE_READ ? MMU_DATA_LOAD : MMU_DATA_STORE;
242
- cpu_loop_exit_sigbus(env_cpu(env), addr, t, retaddr);
243
+ cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, retaddr);
244
}
245
246
/* Enforce qemu required alignment. */
133
--
247
--
134
2.34.1
248
2.34.1
135
249
136
250
1
Since the only user, Arm MTE, always requires allocation,
1
These symbols will shortly become dynamic runtime tests and are
2
merge the get and alloc functions to always produce a
2
therefore not appropriate for the preprocessor. Use the
3
non-null result. Also assume that the user has already
3
matching CONFIG_* symbols for that purpose.
4
checked page validity.
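
A minimal sketch of the CONFIG_* versus HAVE_* distinction, with made-up symbol values (QEMU's real CONFIG_* definitions come from meson and the HAVE_* values from host cpuinfo detection):

    #include <stdbool.h>
    #include <stdio.h>

    #define CONFIG_CMPXCHG128 1        /* build-time fact: safe to test with #if */

    static bool have_atomic128_rw;     /* runtime fact: only known after CPU probing */

    int main(void)
    {
    #if defined(CONFIG_CMPXCHG128)     /* decide what gets compiled at all */
        have_atomic128_rw = true;      /* stand-in for a real host-capability probe */
        printf(have_atomic128_rw ? "native 16-byte atomics\n"
                                 : "cmpxchg fallback\n");
    #endif
        return 0;
    }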
5
4
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
7
---
9
include/exec/cpu-all.h | 21 ++++++---------------
8
host/include/aarch64/host/atomic128-cas.h | 2 ++
10
accel/tcg/user-exec.c | 16 ++++------------
9
host/include/generic/host/atomic128-ldst.h | 2 +-
11
target/arm/mte_helper.c | 4 ----
10
accel/tcg/cputlb.c | 2 +-
12
3 files changed, 10 insertions(+), 31 deletions(-)
11
accel/tcg/user-exec.c | 2 +-
12
4 files changed, 5 insertions(+), 3 deletions(-)
13
13
14
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
14
diff --git a/host/include/aarch64/host/atomic128-cas.h b/host/include/aarch64/host/atomic128-cas.h
15
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/exec/cpu-all.h
16
--- a/host/include/aarch64/host/atomic128-cas.h
17
+++ b/include/exec/cpu-all.h
17
+++ b/host/include/aarch64/host/atomic128-cas.h
18
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end);
18
@@ -XXX,XX +XXX,XX @@ static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
19
int page_check_range(target_ulong start, target_ulong len, int flags);
19
20
20
return int128_make128(oldl, oldh);
21
/**
21
}
22
- * page_alloc_target_data(address)
22
+
23
+ * page_get_target_data(address)
23
+# define CONFIG_CMPXCHG128 1
24
* @address: guest virtual address
24
# define HAVE_CMPXCHG128 1
25
*
26
- * Allocate TARGET_PAGE_DATA_SIZE bytes of out-of-band data to associate
27
- * with the guest page at @address. If the page is not mapped, NULL will
28
- * be returned. If there is existing data associated with @address,
29
- * no new memory will be allocated.
30
+ * Return TARGET_PAGE_DATA_SIZE bytes of out-of-band data to associate
31
+ * with the guest page at @address, allocating it if necessary. The
32
+ * caller should already have verified that the address is valid.
33
*
34
* The memory will be freed when the guest page is deallocated,
35
* e.g. with the munmap system call.
36
*/
37
-void *page_alloc_target_data(target_ulong address);
38
-
39
-/**
40
- * page_get_target_data(address)
41
- * @address: guest virtual address
42
- *
43
- * Return any out-of-bound memory assocated with the guest page
44
- * at @address, as per page_alloc_target_data.
45
- */
46
-void *page_get_target_data(target_ulong address);
47
+void *page_get_target_data(target_ulong address)
48
+ __attribute__((returns_nonnull));
49
#endif
25
#endif
50
26
51
CPUArchState *cpu_copy(CPUArchState *env);
27
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
28
index XXXXXXX..XXXXXXX 100644
29
--- a/host/include/generic/host/atomic128-ldst.h
30
+++ b/host/include/generic/host/atomic128-ldst.h
31
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
32
}
33
34
# define HAVE_ATOMIC128 1
35
-#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
36
+#elif defined(CONFIG_CMPXCHG128) && !defined(CONFIG_USER_ONLY)
37
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
38
atomic16_read(Int128 *ptr)
39
{
40
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
41
index XXXXXXX..XXXXXXX 100644
42
--- a/accel/tcg/cputlb.c
43
+++ b/accel/tcg/cputlb.c
44
@@ -XXX,XX +XXX,XX @@ void cpu_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
45
#include "atomic_template.h"
46
#endif
47
48
-#if HAVE_CMPXCHG128 || HAVE_ATOMIC128
49
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
50
#define DATA_SIZE 16
51
#include "atomic_template.h"
52
#endif
52
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
53
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
53
index XXXXXXX..XXXXXXX 100644
54
index XXXXXXX..XXXXXXX 100644
54
--- a/accel/tcg/user-exec.c
55
--- a/accel/tcg/user-exec.c
55
+++ b/accel/tcg/user-exec.c
56
+++ b/accel/tcg/user-exec.c
56
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end)
57
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
57
void *page_get_target_data(target_ulong address)
58
#include "atomic_template.h"
58
{
59
#endif
59
PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
60
60
- return p ? p->target_data : NULL;
61
-#if HAVE_ATOMIC128 || HAVE_CMPXCHG128
61
-}
62
+#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
62
+ void *ret = p->target_data;
63
#define DATA_SIZE 16
63
64
#include "atomic_template.h"
64
-void *page_alloc_target_data(target_ulong address)
65
#endif
65
-{
66
- PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
67
- void *ret = NULL;
68
-
69
- if (p->flags & PAGE_VALID) {
70
- ret = p->target_data;
71
- if (!ret) {
72
- p->target_data = ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
73
- }
74
+ if (!ret) {
75
+ ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
76
+ p->target_data = ret;
77
}
78
return ret;
79
}
80
diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
81
index XXXXXXX..XXXXXXX 100644
82
--- a/target/arm/mte_helper.c
83
+++ b/target/arm/mte_helper.c
84
@@ -XXX,XX +XXX,XX @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
85
}
86
87
tags = page_get_target_data(clean_ptr);
88
- if (tags == NULL) {
89
- tags = page_alloc_target_data(clean_ptr);
90
- assert(tags != NULL);
91
- }
92
93
index = extract32(ptr, LOG2_TAG_GRANULE + 1,
94
TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1);
95
--
66
--
96
2.34.1
67
2.34.1
97
68
98
69
diff view generated by jsdifflib
1
The results of the calls to cpu_get_tb_cpu_state,
1
Create both atomic16_read_ro and atomic16_read_rw.
2
current_{pc,cs_base,flags}, are not used.
2
Previously we pretended that we had atomic16_read in system mode,
3
In tb_invalidate_phys_page, use bool for current_tb_modified.
3
because we "know" that all ram is always writable to the host.
4
Now, expose read-only and read-write versions all of the time.
5
6
For aarch64, do not fall back to __atomic_load_16 even if
7
supported by the compiler, to work around a clang bug.
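
The read-write fallback in the generic header emulates a 16-byte atomic load with compare-and-swap; a rough standalone equivalent, assuming a host with a 16-byte CAS (e.g. x86-64 built with -mcx16 or linked against libatomic), looks like:

    /* Sketch only: a CAS of 0 with 0 returns the current value atomically,
     * but it needs the memory to be writable, hence the _rw naming. */
    static __int128 atomic16_read_rw_sketch(__int128 *ptr)
    {
        __int128 expected = 0;
        __atomic_compare_exchange_n(ptr, &expected, (__int128)0, false,
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        return expected;
    }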
4
8
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
9
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
11
---
8
accel/tcg/tb-maint.c | 25 ++-----------------------
12
host/include/aarch64/host/atomic128-ldst.h | 21 ++++++++-------
9
1 file changed, 2 insertions(+), 23 deletions(-)
13
host/include/generic/host/atomic128-ldst.h | 31 ++++++++++++++++------
14
target/s390x/tcg/mem_helper.c | 2 +-
15
3 files changed, 36 insertions(+), 18 deletions(-)
10
16
11
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
17
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
12
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/tb-maint.c
19
--- a/host/include/aarch64/host/atomic128-ldst.h
14
+++ b/accel/tcg/tb-maint.c
20
+++ b/host/include/aarch64/host/atomic128-ldst.h
15
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
21
@@ -XXX,XX +XXX,XX @@
16
int n;
22
#ifndef AARCH64_ATOMIC128_LDST_H
17
#ifdef TARGET_HAS_PRECISE_SMC
23
#define AARCH64_ATOMIC128_LDST_H
18
CPUState *cpu = current_cpu;
24
19
- CPUArchState *env = NULL;
25
-/* Through gcc 10, aarch64 has no support for 128-bit atomics. */
20
bool current_tb_not_found = retaddr != 0;
26
-#if !defined(CONFIG_ATOMIC128) && !defined(CONFIG_USER_ONLY)
21
bool current_tb_modified = false;
27
-/* We can do better than cmpxchg for AArch64. */
22
TranslationBlock *current_tb = NULL;
28
-static inline Int128 atomic16_read(Int128 *ptr)
23
- target_ulong current_pc = 0;
29
+/*
24
- target_ulong current_cs_base = 0;
30
+ * Through gcc 10, aarch64 has no support for 128-bit atomics.
25
- uint32_t current_flags = 0;
31
+ * Through clang 16, without -march=armv8.4-a, __atomic_load_16
26
#endif /* TARGET_HAS_PRECISE_SMC */
32
+ * is incorrectly expanded to a read-write operation.
27
33
+ */
28
assert_page_locked(p);
34
+
29
35
+#define HAVE_ATOMIC128_RO 0
30
-#if defined(TARGET_HAS_PRECISE_SMC)
36
+#define HAVE_ATOMIC128_RW 1
31
- if (cpu != NULL) {
37
+
32
- env = cpu->env_ptr;
38
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
33
- }
39
+
40
+static inline Int128 atomic16_read_rw(Int128 *ptr)
41
{
42
uint64_t l, h;
43
uint32_t tmp;
44
@@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val)
45
: [l] "r"(l), [h] "r"(h));
46
}
47
48
-# define HAVE_ATOMIC128 1
49
-#else
50
-#include "host/include/generic/host/atomic128-ldst.h"
34
-#endif
51
-#endif
35
-
52
-
36
/*
53
#endif /* AARCH64_ATOMIC128_LDST_H */
37
* We remove all the TBs in the range [start, end[.
54
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
38
* XXX: see if in some cases it could be faster to invalidate all the code
55
index XXXXXXX..XXXXXXX 100644
39
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
56
--- a/host/include/generic/host/atomic128-ldst.h
40
*/
57
+++ b/host/include/generic/host/atomic128-ldst.h
41
current_tb_modified = true;
58
@@ -XXX,XX +XXX,XX @@
42
cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
59
#define HOST_ATOMIC128_LDST_H
43
- cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
60
44
- &current_flags);
61
#if defined(CONFIG_ATOMIC128)
45
}
62
+# define HAVE_ATOMIC128_RO 1
46
#endif /* TARGET_HAS_PRECISE_SMC */
63
+# define HAVE_ATOMIC128_RW 1
47
tb_phys_invalidate__locked(tb);
64
+
48
@@ -XXX,XX +XXX,XX @@ bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
65
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
49
#ifdef TARGET_HAS_PRECISE_SMC
66
-atomic16_read(Int128 *ptr)
50
TranslationBlock *current_tb = NULL;
67
+atomic16_read_ro(const Int128 *ptr)
51
CPUState *cpu = current_cpu;
68
{
52
- CPUArchState *env = NULL;
69
- __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
53
- int current_tb_modified = 0;
70
+ const __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
54
- target_ulong current_pc = 0;
71
Int128Alias r;
55
- target_ulong current_cs_base = 0;
72
56
- uint32_t current_flags = 0;
73
r.i = qatomic_read__nocheck(ptr_align);
57
+ bool current_tb_modified = false;
74
return r.s;
75
}
76
77
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
78
+atomic16_read_rw(Int128 *ptr)
79
+{
80
+ return atomic16_read_ro(ptr);
81
+}
82
+
83
static inline void ATTRIBUTE_ATOMIC128_OPT
84
atomic16_set(Int128 *ptr, Int128 val)
85
{
86
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
87
qatomic_set__nocheck(ptr_align, v.i);
88
}
89
90
-# define HAVE_ATOMIC128 1
91
-#elif defined(CONFIG_CMPXCHG128) && !defined(CONFIG_USER_ONLY)
92
+#elif defined(CONFIG_CMPXCHG128)
93
+# define HAVE_ATOMIC128_RO 0
94
+# define HAVE_ATOMIC128_RW 1
95
+
96
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
97
+
98
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
99
-atomic16_read(Int128 *ptr)
100
+atomic16_read_rw(Int128 *ptr)
101
{
102
/* Maybe replace 0 with 0, returning the old value. */
103
Int128 z = int128_make64(0);
104
@@ -XXX,XX +XXX,XX @@ atomic16_set(Int128 *ptr, Int128 val)
105
} while (int128_ne(old, cmp));
106
}
107
108
-# define HAVE_ATOMIC128 1
109
#else
110
+# define HAVE_ATOMIC128_RO 0
111
+# define HAVE_ATOMIC128_RW 0
112
+
113
/* Fallback definitions that must be optimized away, or error. */
114
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read(Int128 *ptr);
115
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
116
+Int128 QEMU_ERROR("unsupported atomic") atomic16_read_rw(Int128 *ptr);
117
void QEMU_ERROR("unsupported atomic") atomic16_set(Int128 *ptr, Int128 val);
118
-# define HAVE_ATOMIC128 0
58
#endif
119
#endif
59
120
60
assert_memory_lock();
121
#endif /* HOST_ATOMIC128_LDST_H */
61
@@ -XXX,XX +XXX,XX @@ bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
122
diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
62
if (p->first_tb && pc != 0) {
123
index XXXXXXX..XXXXXXX 100644
63
current_tb = tcg_tb_lookup(pc);
124
--- a/target/s390x/tcg/mem_helper.c
125
+++ b/target/s390x/tcg/mem_helper.c
126
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
127
max = 3;
128
#endif
129
if ((HAVE_CMPXCHG128 ? 0 : fc + 2 > max) ||
130
- (HAVE_ATOMIC128 ? 0 : sc > max)) {
131
+ (HAVE_ATOMIC128_RW ? 0 : sc > max)) {
132
cpu_loop_exit_atomic(env_cpu(env), ra);
133
}
64
}
134
}
65
- if (cpu != NULL) {
66
- env = cpu->env_ptr;
67
- }
68
#endif
69
assert_page_locked(p);
70
PAGE_FOR_EACH_TB(p, tb, n) {
71
@@ -XXX,XX +XXX,XX @@ bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
72
* after the current PC, but it would require a specialized
73
* function to partially restore the CPU state.
74
*/
75
- current_tb_modified = 1;
76
+ current_tb_modified = true;
77
cpu_restore_state_from_tb(cpu, current_tb, pc, true);
78
- cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
79
- &current_flags);
80
}
81
#endif /* TARGET_HAS_PRECISE_SMC */
82
tb_phys_invalidate(tb, addr);
83
--
135
--
84
2.34.1
136
2.34.1
85
137
86
138
1
This function is never called with a real range,
1
Remove the locally defined load_atomic16 and store_atomic16,
2
only for a single page. Drop the second parameter
2
along with HAVE_al16 and HAVE_al16_fast in favor of the
3
and rename to tb_invalidate_phys_page.
3
routines defined in atomic128.h.
4
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
7
---
8
include/exec/translate-all.h | 2 +-
8
accel/tcg/cputlb.c | 2 +-
9
accel/tcg/tb-maint.c | 15 ++++++++-------
9
accel/tcg/ldst_atomicity.c.inc | 118 +++++++--------------------------
10
cpu.c | 4 ++--
10
2 files changed, 24 insertions(+), 96 deletions(-)
11
3 files changed, 11 insertions(+), 10 deletions(-)
11
12
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
13
diff --git a/include/exec/translate-all.h b/include/exec/translate-all.h
14
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
15
--- a/include/exec/translate-all.h
14
--- a/accel/tcg/cputlb.c
16
+++ b/include/exec/translate-all.h
15
+++ b/accel/tcg/cputlb.c
17
@@ -XXX,XX +XXX,XX @@ void page_collection_unlock(struct page_collection *set);
16
@@ -XXX,XX +XXX,XX @@ static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p,
18
void tb_invalidate_phys_page_fast(struct page_collection *pages,
17
19
tb_page_addr_t start, int len,
18
case MO_ATOM_WITHIN16_PAIR:
20
uintptr_t retaddr);
19
/* Since size > 8, this is the half that must be atomic. */
21
-void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
20
- if (!HAVE_al16) {
22
+void tb_invalidate_phys_page(tb_page_addr_t addr);
21
+ if (!HAVE_ATOMIC128_RW) {
23
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
22
cpu_loop_exit_atomic(env_cpu(env), ra);
23
}
24
return store_whole_le16(p->haddr, p->size, val_le);
25
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
26
index XXXXXXX..XXXXXXX 100644
27
--- a/accel/tcg/ldst_atomicity.c.inc
28
+++ b/accel/tcg/ldst_atomicity.c.inc
29
@@ -XXX,XX +XXX,XX @@
30
#endif
31
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)
32
33
-#if defined(CONFIG_ATOMIC128)
34
-# define HAVE_al16_fast true
35
-#else
36
-# define HAVE_al16_fast false
37
-#endif
38
-#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
39
-# define HAVE_al16 true
40
-#else
41
-# define HAVE_al16 false
42
-#endif
43
-
44
-
45
/**
46
* required_atomicity:
47
*
48
@@ -XXX,XX +XXX,XX @@ static inline uint64_t load_atomic8(void *pv)
49
return qatomic_read__nocheck(p);
50
}
51
52
-/**
53
- * load_atomic16:
54
- * @pv: host address
55
- *
56
- * Atomically load 16 aligned bytes from @pv.
57
- */
58
-static inline Int128 ATTRIBUTE_ATOMIC128_OPT
59
-load_atomic16(void *pv)
60
-{
61
-#ifdef CONFIG_ATOMIC128
62
- __uint128_t *p = __builtin_assume_aligned(pv, 16);
63
- Int128Alias r;
64
-
65
- r.u = qatomic_read__nocheck(p);
66
- return r.s;
67
-#else
68
- qemu_build_not_reached();
69
-#endif
70
-}
71
-
72
/**
73
* load_atomic8_or_exit:
74
* @env: cpu context
75
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
76
{
77
Int128 *p = __builtin_assume_aligned(pv, 16);
78
79
- if (HAVE_al16_fast) {
80
- return load_atomic16(p);
81
+ if (HAVE_ATOMIC128_RO) {
82
+ return atomic16_read_ro(p);
83
}
24
84
25
#ifdef CONFIG_USER_ONLY
85
#ifdef CONFIG_USER_ONLY
26
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
86
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
27
index XXXXXXX..XXXXXXX 100644
87
* In system mode all guest pages are writable, and for user-only
28
--- a/accel/tcg/tb-maint.c
88
* we have just checked writability. Try cmpxchg.
29
+++ b/accel/tcg/tb-maint.c
89
*/
30
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
90
-#if defined(CONFIG_CMPXCHG128)
91
- /* Swap 0 with 0, with the side-effect of returning the old value. */
92
- {
93
- Int128Alias r;
94
- r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
95
- return r.s;
96
+ if (HAVE_ATOMIC128_RW) {
97
+ return atomic16_read_rw(p);
98
}
99
-#endif
100
101
/* Ultimate fallback: re-execute in serial context. */
102
cpu_loop_exit_atomic(env_cpu(env), ra);
103
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
104
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
105
load_atom_extract_al16_or_al8(void *pv, int s)
106
{
107
-#if defined(CONFIG_ATOMIC128)
108
uintptr_t pi = (uintptr_t)pv;
109
int o = pi & 7;
110
int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
111
- __uint128_t r;
112
+ Int128 r;
113
114
pv = (void *)(pi & ~7);
115
if (pi & 8) {
116
@@ -XXX,XX +XXX,XX @@ load_atom_extract_al16_or_al8(void *pv, int s)
117
uint64_t b = qatomic_read__nocheck(p8 + 1);
118
119
if (HOST_BIG_ENDIAN) {
120
- r = ((__uint128_t)a << 64) | b;
121
+ r = int128_make128(b, a);
122
} else {
123
- r = ((__uint128_t)b << 64) | a;
124
+ r = int128_make128(a, b);
125
}
126
} else {
127
- __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
128
- r = qatomic_read__nocheck(p16);
129
+ r = atomic16_read_ro(pv);
130
}
131
- return r >> shr;
132
-#else
133
- qemu_build_not_reached();
134
-#endif
135
+ return int128_getlo(int128_urshift(r, shr));
31
}
136
}
32
137
33
/*
138
/**
34
- * Invalidate all TBs which intersect with the target physical address range
139
@@ -XXX,XX +XXX,XX @@ static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
35
- * [start;end[. NOTE: start and end must refer to the *same* physical page.
140
if (likely((pi & 1) == 0)) {
36
- * 'is_cpu_write_access' should be true if called from a real cpu write
141
return load_atomic2(pv);
37
- * access: the virtual CPU will exit the current TB if code is modified inside
142
}
38
- * this TB.
143
- if (HAVE_al16_fast) {
39
+ * Invalidate all TBs which intersect with the target physical
144
+ if (HAVE_ATOMIC128_RO) {
40
+ * address page @addr.
145
return load_atom_extract_al16_or_al8(pv, 2);
41
*
146
}
42
* Called with mmap_lock held for user-mode emulation
147
148
@@ -XXX,XX +XXX,XX @@ static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
149
if (likely((pi & 3) == 0)) {
150
return load_atomic4(pv);
151
}
152
- if (HAVE_al16_fast) {
153
+ if (HAVE_ATOMIC128_RO) {
154
return load_atom_extract_al16_or_al8(pv, 4);
155
}
156
157
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
158
if (HAVE_al8 && likely((pi & 7) == 0)) {
159
return load_atomic8(pv);
160
}
161
- if (HAVE_al16_fast) {
162
+ if (HAVE_ATOMIC128_RO) {
163
return load_atom_extract_al16_or_al8(pv, 8);
164
}
165
166
@@ -XXX,XX +XXX,XX @@ static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
167
* If the host does not support 16-byte atomics, wait until we have
168
* examined the atomicity parameters below.
169
*/
170
- if (HAVE_al16_fast && likely((pi & 15) == 0)) {
171
- return load_atomic16(pv);
172
+ if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
173
+ return atomic16_read_ro(pv);
174
}
175
176
atmax = required_atomicity(env, pi, memop);
177
@@ -XXX,XX +XXX,XX @@ static inline void store_atomic8(void *pv, uint64_t val)
178
qatomic_set__nocheck(p, val);
179
}
180
181
-/**
182
- * store_atomic16:
183
- * @pv: host address
184
- * @val: value to store
185
- *
186
- * Atomically store 16 aligned bytes to @pv.
187
- */
188
-static inline void ATTRIBUTE_ATOMIC128_OPT
189
-store_atomic16(void *pv, Int128Alias val)
190
-{
191
-#if defined(CONFIG_ATOMIC128)
192
- __uint128_t *pu = __builtin_assume_aligned(pv, 16);
193
- qatomic_set__nocheck(pu, val.u);
194
-#elif defined(CONFIG_CMPXCHG128)
195
- __uint128_t *pu = __builtin_assume_aligned(pv, 16);
196
- __uint128_t o;
197
-
198
- /*
199
- * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
200
- * defer to libatomic, so we must use __sync_*_compare_and_swap_16
201
- * and accept the sequential consistency that comes with it.
202
- */
203
- do {
204
- o = *pu;
205
- } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
206
-#else
207
- qemu_build_not_reached();
208
-#endif
209
-}
210
-
211
/**
212
* store_atom_4x2
43
*/
213
*/
44
-void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
214
@@ -XXX,XX +XXX,XX @@ static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
45
+void tb_invalidate_phys_page(tb_page_addr_t addr)
215
int sh = o * 8;
46
{
216
Int128 m, v;
47
struct page_collection *pages;
217
48
+ tb_page_addr_t start, end;
218
- qemu_build_assert(HAVE_al16);
49
PageDesc *p;
219
+ qemu_build_assert(HAVE_ATOMIC128_RW);
50
220
51
assert_memory_lock();
221
/* Like MAKE_64BIT_MASK(0, sz), but larger. */
52
222
if (sz <= 64) {
53
- p = page_find(start >> TARGET_PAGE_BITS);
223
@@ -XXX,XX +XXX,XX @@ static void store_atom_2(CPUArchState *env, uintptr_t ra,
54
+ p = page_find(addr >> TARGET_PAGE_BITS);
224
return;
55
if (p == NULL) {
225
}
226
} else if ((pi & 15) == 7) {
227
- if (HAVE_al16) {
228
+ if (HAVE_ATOMIC128_RW) {
229
Int128 v = int128_lshift(int128_make64(val), 56);
230
Int128 m = int128_lshift(int128_make64(0xffff), 56);
231
store_atom_insert_al16(pv - 7, v, m);
232
@@ -XXX,XX +XXX,XX @@ static void store_atom_4(CPUArchState *env, uintptr_t ra,
233
return;
234
}
235
} else {
236
- if (HAVE_al16) {
237
+ if (HAVE_ATOMIC128_RW) {
238
store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
239
return;
240
}
241
@@ -XXX,XX +XXX,XX @@ static void store_atom_8(CPUArchState *env, uintptr_t ra,
242
}
243
break;
244
case MO_64:
245
- if (HAVE_al16) {
246
+ if (HAVE_ATOMIC128_RW) {
247
store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
248
return;
249
}
250
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
251
uint64_t a, b;
252
int atmax;
253
254
- if (HAVE_al16_fast && likely((pi & 15) == 0)) {
255
- store_atomic16(pv, val);
256
+ if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
257
+ atomic16_set(pv, val);
56
return;
258
return;
57
}
259
}
58
+
260
59
+ start = addr & TARGET_PAGE_MASK;
261
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
60
+ end = start + TARGET_PAGE_SIZE;
262
}
61
pages = page_collection_lock(start, end);
263
break;
62
tb_invalidate_phys_page_range__locked(pages, p, start, end, 0);
264
case -MO_64:
63
page_collection_unlock(pages);
265
- if (HAVE_al16) {
64
diff --git a/cpu.c b/cpu.c
266
+ if (HAVE_ATOMIC128_RW) {
65
index XXXXXXX..XXXXXXX 100644
267
uint64_t val_le;
66
--- a/cpu.c
268
int s2 = pi & 15;
67
+++ b/cpu.c
269
int s1 = 16 - s2;
68
@@ -XXX,XX +XXX,XX @@ void list_cpus(const char *optarg)
270
@@ -XXX,XX +XXX,XX @@ static void store_atom_16(CPUArchState *env, uintptr_t ra,
69
void tb_invalidate_phys_addr(target_ulong addr)
271
}
70
{
272
break;
71
mmap_lock();
273
case MO_128:
72
- tb_invalidate_phys_page_range(addr, addr + 1);
274
- if (HAVE_al16) {
73
+ tb_invalidate_phys_page(addr);
275
- store_atomic16(pv, val);
74
mmap_unlock();
276
+ if (HAVE_ATOMIC128_RW) {
75
}
277
+ atomic16_set(pv, val);
76
#else
278
return;
77
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
279
}
78
return;
280
break;
79
}
80
ram_addr = memory_region_get_ram_addr(mr) + addr;
81
- tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
82
+ tb_invalidate_phys_page(ram_addr);
83
}
84
#endif
85
86
--
281
--
87
2.34.1
282
2.34.1
88
283
89
284
1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
---
4
target/openrisc/cpu.c | 13 +++++++++++++
5
include/tcg/debug-assert.h | 17 +++++++++++++++++
5
target/openrisc/translate.c | 10 ----------
6
include/tcg/tcg.h | 9 +--------
6
2 files changed, 13 insertions(+), 10 deletions(-)
7
MAINTAINERS | 1 +
8
3 files changed, 19 insertions(+), 8 deletions(-)
9
create mode 100644 include/tcg/debug-assert.h
7
10
8
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
11
diff --git a/include/tcg/debug-assert.h b/include/tcg/debug-assert.h
12
new file mode 100644
13
index XXXXXXX..XXXXXXX
14
--- /dev/null
15
+++ b/include/tcg/debug-assert.h
16
@@ -XXX,XX +XXX,XX @@
17
+/* SPDX-License-Identifier: MIT */
18
+/*
19
+ * Define tcg_debug_assert
20
+ * Copyright (c) 2008 Fabrice Bellard
21
+ */
22
+
23
+#ifndef TCG_DEBUG_ASSERT_H
24
+#define TCG_DEBUG_ASSERT_H
25
+
26
+#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
27
+# define tcg_debug_assert(X) do { assert(X); } while (0)
28
+#else
29
+# define tcg_debug_assert(X) \
30
+ do { if (!(X)) { __builtin_unreachable(); } } while (0)
31
+#endif
32
+
33
+#endif
34
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
9
index XXXXXXX..XXXXXXX 100644
35
index XXXXXXX..XXXXXXX 100644
10
--- a/target/openrisc/cpu.c
36
--- a/include/tcg/tcg.h
11
+++ b/target/openrisc/cpu.c
37
+++ b/include/tcg/tcg.h
12
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_synchronize_from_tb(CPUState *cs,
38
@@ -XXX,XX +XXX,XX @@
13
cpu->env.pc = tb_pc(tb);
39
#include "tcg/tcg-mo.h"
14
}
40
#include "tcg-target.h"
15
41
#include "tcg/tcg-cond.h"
16
+static void openrisc_restore_state_to_opc(CPUState *cs,
42
+#include "tcg/debug-assert.h"
17
+ const TranslationBlock *tb,
43
18
+ const uint64_t *data)
44
/* XXX: make safe guess about sizes */
19
+{
45
#define MAX_OP_PER_INSTR 266
20
+ OpenRISCCPU *cpu = OPENRISC_CPU(cs);
46
@@ -XXX,XX +XXX,XX @@ typedef uint64_t tcg_insn_unit;
21
+
47
/* The port better have done this. */
22
+ cpu->env.pc = data[0];
48
#endif
23
+ cpu->env.dflag = data[1] & 1;
49
24
+ if (data[1] & 2) {
50
-
25
+ cpu->env.ppc = cpu->env.pc - 4;
51
-#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
26
+ }
52
-# define tcg_debug_assert(X) do { assert(X); } while (0)
27
+}
53
-#else
28
54
-# define tcg_debug_assert(X) \
29
static bool openrisc_cpu_has_work(CPUState *cs)
55
- do { if (!(X)) { __builtin_unreachable(); } } while (0)
30
{
56
-#endif
31
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps openrisc_sysemu_ops = {
57
-
32
static const struct TCGCPUOps openrisc_tcg_ops = {
58
typedef struct TCGRelocation TCGRelocation;
33
.initialize = openrisc_translate_init,
59
struct TCGRelocation {
34
.synchronize_from_tb = openrisc_cpu_synchronize_from_tb,
60
QSIMPLEQ_ENTRY(TCGRelocation) next;
35
+ .restore_state_to_opc = openrisc_restore_state_to_opc,
61
diff --git a/MAINTAINERS b/MAINTAINERS
36
37
#ifndef CONFIG_USER_ONLY
38
.tlb_fill = openrisc_cpu_tlb_fill,
39
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
40
index XXXXXXX..XXXXXXX 100644
62
index XXXXXXX..XXXXXXX 100644
41
--- a/target/openrisc/translate.c
63
--- a/MAINTAINERS
42
+++ b/target/openrisc/translate.c
64
+++ b/MAINTAINERS
43
@@ -XXX,XX +XXX,XX @@ void openrisc_cpu_dump_state(CPUState *cs, FILE *f, int flags)
65
@@ -XXX,XX +XXX,XX @@ F: include/sysemu/tcg.h
44
(i % 4) == 3 ? '\n' : ' ');
66
F: include/hw/core/tcg-cpu-ops.h
45
}
67
F: host/include/*/host/cpuinfo.h
46
}
68
F: util/cpuinfo-*.c
47
-
69
+F: include/tcg/
48
-void restore_state_to_opc(CPUOpenRISCState *env, TranslationBlock *tb,
70
49
- target_ulong *data)
71
FPU emulation
50
-{
72
M: Aurelien Jarno <aurelien@aurel32.net>
51
- env->pc = data[0];
52
- env->dflag = data[1] & 1;
53
- if (data[1] & 2) {
54
- env->ppc = env->pc - 4;
55
- }
56
-}
57
--
73
--
58
2.34.1
74
2.34.1
59
75
60
76
1
Use the existing function for clearing target data.
1
Use __sync_bool_compare_and_swap_16 to control the loop,
2
rather than a separate comparison.
2
3
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
accel/tcg/translate-all.c | 13 +++++--------
7
host/include/generic/host/atomic128-ldst.h | 11 +++++++----
7
1 file changed, 5 insertions(+), 8 deletions(-)
8
1 file changed, 7 insertions(+), 4 deletions(-)
8
9
9
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
10
diff --git a/host/include/generic/host/atomic128-ldst.h b/host/include/generic/host/atomic128-ldst.h
10
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
11
--- a/accel/tcg/translate-all.c
12
--- a/host/include/generic/host/atomic128-ldst.h
12
+++ b/accel/tcg/translate-all.c
13
+++ b/host/include/generic/host/atomic128-ldst.h
13
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
14
@@ -XXX,XX +XXX,XX @@ atomic16_read_rw(Int128 *ptr)
14
flags |= PAGE_WRITE_ORG;
15
static inline void ATTRIBUTE_ATOMIC128_OPT
15
}
16
atomic16_set(Int128 *ptr, Int128 val)
16
reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
17
{
17
+ if (reset) {
18
- Int128 old = *ptr, cmp;
18
+ page_reset_target_data(start, end);
19
+ __int128_t *ptr_align = __builtin_assume_aligned(ptr, 16);
19
+ }
20
+ __int128_t old;
20
flags &= ~PAGE_RESET;
21
+ Int128Alias new;
21
22
+
22
for (addr = start, len = end - start;
23
+ new.s = val;
23
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
24
do {
24
(flags & ~p->flags & PAGE_WRITE))) {
25
- cmp = old;
25
tb_invalidate_phys_page(addr);
26
- old = atomic16_cmpxchg(ptr, cmp, val);
26
}
27
- } while (int128_ne(old, cmp));
27
- if (reset) {
28
+ old = *ptr_align;
28
- g_free(p->target_data);
29
+ } while (!__sync_bool_compare_and_swap_16(ptr_align, old, new.i));
29
- p->target_data = NULL;
30
- p->flags = flags;
31
- } else {
32
- /* Using mprotect on a page does not change sticky bits. */
33
- p->flags = (p->flags & PAGE_STICKY) | flags;
34
- }
35
+ /* Using mprotect on a page does not change sticky bits. */
36
+ p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
37
}
38
}
30
}
39
31
32
#else
40
--
33
--
41
2.34.1
34
2.34.1
42
35
43
36
1
Flush translation blocks in bulk, rather than page-by-page.
1
With FEAT_LSE2, load and store of int128 is directly supported.
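
For illustration only (the series itself funnels this through
host/cpuinfo.h and the CPUINFO_LSE2 bit rather than open-coding it):
on a Linux host the runtime check boils down to an auxv lookup.  The
HWCAP_USCAT name is my assumption for the hwcap bit that advertises
FEAT_LSE2.

    #include <stdbool.h>
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    /* Sketch: true when 16-byte aligned LDP/STP are single-copy atomic. */
    static bool host_has_lse2(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_USCAT) != 0;
    }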
2
2
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
5
---
6
accel/tcg/translate-all.c | 8 ++++++--
6
host/include/aarch64/host/atomic128-ldst.h | 53 ++++++++++++++++------
7
1 file changed, 6 insertions(+), 2 deletions(-)
7
1 file changed, 40 insertions(+), 13 deletions(-)
8
8
9
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
9
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
10
index XXXXXXX..XXXXXXX 100644
10
index XXXXXXX..XXXXXXX 100644
11
--- a/accel/tcg/translate-all.c
11
--- a/host/include/aarch64/host/atomic128-ldst.h
12
+++ b/accel/tcg/translate-all.c
12
+++ b/host/include/aarch64/host/atomic128-ldst.h
13
@@ -XXX,XX +XXX,XX @@ int page_get_flags(target_ulong address)
13
@@ -XXX,XX +XXX,XX @@
14
void page_set_flags(target_ulong start, target_ulong end, int flags)
14
#ifndef AARCH64_ATOMIC128_LDST_H
15
#define AARCH64_ATOMIC128_LDST_H
16
17
+#include "host/cpuinfo.h"
18
+#include "tcg/debug-assert.h"
19
+
20
/*
21
* Through gcc 10, aarch64 has no support for 128-bit atomics.
22
* Through clang 16, without -march=armv8.4-a, __atomic_load_16
23
* is incorrectly expanded to a read-write operation.
24
+ *
25
+ * Anyway, this method allows runtime detection of FEAT_LSE2.
26
*/
27
28
-#define HAVE_ATOMIC128_RO 0
29
+#define HAVE_ATOMIC128_RO (cpuinfo & CPUINFO_LSE2)
30
#define HAVE_ATOMIC128_RW 1
31
32
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
33
+static inline Int128 atomic16_read_ro(const Int128 *ptr)
34
+{
35
+ uint64_t l, h;
36
+
37
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
38
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
39
+ asm("ldp %[l], %[h], %[mem]"
40
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
41
+
42
+ return int128_make128(l, h);
43
+}
44
45
static inline Int128 atomic16_read_rw(Int128 *ptr)
15
{
46
{
16
target_ulong addr, len;
47
uint64_t l, h;
17
- bool reset;
48
uint32_t tmp;
18
+ bool reset, inval_tb = false;
49
19
50
- /* The load must be paired with the store to guarantee not tearing. */
20
/* This function should never be called with addresses outside the
51
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
21
guest address space. If this assert fires, it probably indicates
52
- "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
22
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
53
- "cbnz %w[tmp], 0b"
23
&& (reset ||
54
- : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
24
!(flags & PAGE_EXEC) ||
55
+ if (cpuinfo & CPUINFO_LSE2) {
25
(flags & ~p->flags & PAGE_WRITE))) {
56
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
26
- tb_invalidate_phys_page(addr);
57
+ asm("ldp %[l], %[h], %[mem]"
27
+ inval_tb = true;
58
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
28
}
59
+ } else {
29
/* Using mprotect on a page does not change sticky bits. */
60
+ /* The load must be paired with the store to guarantee not tearing. */
30
p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
61
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
31
}
62
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
32
+
63
+ "cbnz %w[tmp], 0b"
33
+ if (inval_tb) {
64
+ : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
34
+ tb_invalidate_phys_range(start, end);
65
+ }
66
67
return int128_make128(l, h);
68
}
69
@@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val)
70
uint64_t l = int128_getlo(val), h = int128_gethi(val);
71
uint64_t t1, t2;
72
73
- /* Load into temporaries to acquire the exclusive access lock. */
74
- asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
75
- "stxp %w[t1], %[l], %[h], %[mem]\n\t"
76
- "cbnz %w[t1], 0b"
77
- : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
78
- : [l] "r"(l), [h] "r"(h));
79
+ if (cpuinfo & CPUINFO_LSE2) {
80
+ /* With FEAT_LSE2, 16-byte aligned STP is atomic. */
81
+ asm("stp %[l], %[h], %[mem]"
82
+ : [mem] "=m"(*ptr) : [l] "r"(l), [h] "r"(h));
83
+ } else {
84
+ /* Load into temporaries to acquire the exclusive access lock. */
85
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
86
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
87
+ "cbnz %w[t1], 0b"
88
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
89
+ : [l] "r"(l), [h] "r"(h));
35
+ }
90
+ }
36
}
91
}
37
92
38
void page_reset_target_data(target_ulong start, target_ulong end)
93
#endif /* AARCH64_ATOMIC128_LDST_H */
39
--
94
--
40
2.34.1
95
2.34.1
41
96
42
97
1
This data structure will be replaced for user-only: add accessors.
1
This had been set since the beginning, is never undefined,
2
and undefining it would seem to be harmful to debugging.
2
3
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
include/exec/exec-all.h | 22 ++++++++++++++++++++++
7
include/exec/exec-all.h | 3 ---
7
accel/tcg/cpu-exec.c | 9 +++++----
8
accel/tcg/cpu-exec.c | 2 --
8
accel/tcg/tb-maint.c | 29 +++++++++++++++--------------
9
accel/tcg/translate-all.c | 2 --
9
accel/tcg/translate-all.c | 16 ++++++++--------
10
accel/tcg/translator.c | 2 --
10
accel/tcg/translator.c | 9 +++++----
11
target/sh4/translate.c | 2 --
11
5 files changed, 55 insertions(+), 30 deletions(-)
12
target/sparc/translate.c | 2 --
13
tcg/tcg.c | 9 +--------
14
7 files changed, 1 insertion(+), 21 deletions(-)
12
15
13
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
16
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
14
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
15
--- a/include/exec/exec-all.h
18
--- a/include/exec/exec-all.h
16
+++ b/include/exec/exec-all.h
19
+++ b/include/exec/exec-all.h
17
@@ -XXX,XX +XXX,XX @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
20
@@ -XXX,XX +XXX,XX @@
18
return qatomic_read(&tb->cflags);
21
#include "qemu/interval-tree.h"
19
}
22
#include "qemu/clang-tsa.h"
20
23
21
+static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb)
24
-/* allow to see translation results - the slowdown should be negligible, so we leave it */
22
+{
25
-#define DEBUG_DISAS
23
+ return tb->page_addr[0];
26
-
24
+}
27
/* Page tracking code uses ram addresses in system mode, and virtual
25
+
28
addresses in userspace mode. Define tb_page_addr_t to be an appropriate
26
+static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb)
29
type. */
27
+{
28
+ return tb->page_addr[1];
29
+}
30
+
31
+static inline void tb_set_page_addr0(TranslationBlock *tb,
32
+ tb_page_addr_t addr)
33
+{
34
+ tb->page_addr[0] = addr;
35
+}
36
+
37
+static inline void tb_set_page_addr1(TranslationBlock *tb,
38
+ tb_page_addr_t addr)
39
+{
40
+ tb->page_addr[1] = addr;
41
+}
42
+
43
/* current cflags for hashing/comparison */
44
uint32_t curr_cflags(CPUState *cpu);
45
46
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
30
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
47
index XXXXXXX..XXXXXXX 100644
31
index XXXXXXX..XXXXXXX 100644
48
--- a/accel/tcg/cpu-exec.c
32
--- a/accel/tcg/cpu-exec.c
49
+++ b/accel/tcg/cpu-exec.c
33
+++ b/accel/tcg/cpu-exec.c
50
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
34
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
51
const struct tb_desc *desc = d;
35
cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
52
36
tb->flags, tb->cflags, lookup_symbol(pc));
53
if ((TARGET_TB_PCREL || tb_pc(tb) == desc->pc) &&
37
54
- tb->page_addr[0] == desc->page_addr0 &&
38
-#if defined(DEBUG_DISAS)
55
+ tb_page_addr0(tb) == desc->page_addr0 &&
39
if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
56
tb->cs_base == desc->cs_base &&
40
FILE *logfile = qemu_log_trylock();
57
tb->flags == desc->flags &&
41
if (logfile) {
58
tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
42
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
59
tb_cflags(tb) == desc->cflags) {
43
qemu_log_unlock(logfile);
60
/* check next page if needed */
61
- if (tb->page_addr[1] == -1) {
62
+ tb_page_addr_t tb_phys_page1 = tb_page_addr1(tb);
63
+ if (tb_phys_page1 == -1) {
64
return true;
65
} else {
66
tb_page_addr_t phys_page1;
67
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
68
*/
69
virt_page1 = TARGET_PAGE_ALIGN(desc->pc);
70
phys_page1 = get_page_addr_code(desc->env, virt_page1);
71
- if (tb->page_addr[1] == phys_page1) {
72
+ if (tb_phys_page1 == phys_page1) {
73
return true;
74
}
44
}
75
}
45
}
76
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
46
-#endif /* DEBUG_DISAS */
77
* direct jump to a TB spanning two pages because the mapping
47
}
78
* for the second page can change.
79
*/
80
- if (tb->page_addr[1] != -1) {
81
+ if (tb_page_addr1(tb) != -1) {
82
last_tb = NULL;
83
}
84
#endif
85
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
86
index XXXXXXX..XXXXXXX 100644
87
--- a/accel/tcg/tb-maint.c
88
+++ b/accel/tcg/tb-maint.c
89
@@ -XXX,XX +XXX,XX @@ static bool tb_cmp(const void *ap, const void *bp)
90
a->flags == b->flags &&
91
(tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
92
a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
93
- a->page_addr[0] == b->page_addr[0] &&
94
- a->page_addr[1] == b->page_addr[1]);
95
+ tb_page_addr0(a) == tb_page_addr0(b) &&
96
+ tb_page_addr1(a) == tb_page_addr1(b));
97
}
48
}
98
49
99
void tb_htable_init(void)
100
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
101
qemu_spin_unlock(&tb->jmp_lock);
102
103
/* remove the TB from the hash list */
104
- phys_pc = tb->page_addr[0];
105
+ phys_pc = tb_page_addr0(tb);
106
h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
107
tb->flags, orig_cflags, tb->trace_vcpu_dstate);
108
if (!qht_remove(&tb_ctx.htable, tb, h)) {
109
@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
110
111
/* remove the TB from the page list */
112
if (rm_from_page_list) {
113
- p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
114
+ p = page_find(phys_pc >> TARGET_PAGE_BITS);
115
tb_page_remove(p, tb);
116
- if (tb->page_addr[1] != -1) {
117
- p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
118
+ phys_pc = tb_page_addr1(tb);
119
+ if (phys_pc != -1) {
120
+ p = page_find(phys_pc >> TARGET_PAGE_BITS);
121
tb_page_remove(p, tb);
122
}
123
}
124
@@ -XXX,XX +XXX,XX @@ static inline void page_unlock_tb(const TranslationBlock *tb) { }
125
/* lock the page(s) of a TB in the correct acquisition order */
126
static void page_lock_tb(const TranslationBlock *tb)
127
{
128
- page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false);
129
+ page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false);
130
}
131
132
static void page_unlock_tb(const TranslationBlock *tb)
133
{
134
- PageDesc *p1 = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
135
+ PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS);
136
137
page_unlock(p1);
138
- if (unlikely(tb->page_addr[1] != -1)) {
139
- PageDesc *p2 = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
140
+ if (unlikely(tb_page_addr1(tb) != -1)) {
141
+ PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS);
142
143
if (p2 != p1) {
144
page_unlock(p2);
145
@@ -XXX,XX +XXX,XX @@ static void page_unlock_tb(const TranslationBlock *tb)
146
*/
147
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
148
{
149
- if (page_addr == -1 && tb->page_addr[0] != -1) {
150
+ if (page_addr == -1 && tb_page_addr0(tb) != -1) {
151
page_lock_tb(tb);
152
do_tb_phys_invalidate(tb, true);
153
page_unlock_tb(tb);
154
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
155
if (n == 0) {
156
/* NOTE: tb_end may be after the end of the page, but
157
it is not a problem */
158
- tb_start = tb->page_addr[0];
159
+ tb_start = tb_page_addr0(tb);
160
tb_end = tb_start + tb->size;
161
} else {
162
- tb_start = tb->page_addr[1];
163
- tb_end = tb_start + ((tb->page_addr[0] + tb->size)
164
+ tb_start = tb_page_addr1(tb);
165
+ tb_end = tb_start + ((tb_page_addr0(tb) + tb->size)
166
& ~TARGET_PAGE_MASK);
167
}
168
if (!(tb_end <= start || tb_start >= end)) {
169
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
50
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
170
index XXXXXXX..XXXXXXX 100644
51
index XXXXXXX..XXXXXXX 100644
171
--- a/accel/tcg/translate-all.c
52
--- a/accel/tcg/translate-all.c
172
+++ b/accel/tcg/translate-all.c
53
+++ b/accel/tcg/translate-all.c
173
@@ -XXX,XX +XXX,XX @@ page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
54
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
55
qatomic_set(&prof->search_out_len, prof->search_out_len + search_size);
56
#endif
57
58
-#ifdef DEBUG_DISAS
59
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
60
qemu_log_in_addr_range(pc)) {
61
FILE *logfile = qemu_log_trylock();
62
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
63
qemu_log_unlock(logfile);
174
}
64
}
175
assert_page_locked(pd);
176
PAGE_FOR_EACH_TB(pd, tb, n) {
177
- if (page_trylock_add(set, tb->page_addr[0]) ||
178
- (tb->page_addr[1] != -1 &&
179
- page_trylock_add(set, tb->page_addr[1]))) {
180
+ if (page_trylock_add(set, tb_page_addr0(tb)) ||
181
+ (tb_page_addr1(tb) != -1 &&
182
+ page_trylock_add(set, tb_page_addr1(tb)))) {
183
/* drop all locks, and reacquire in order */
184
g_tree_foreach(set->tree, page_entry_unlock, NULL);
185
goto retry;
186
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
187
tb->flags = flags;
188
tb->cflags = cflags;
189
tb->trace_vcpu_dstate = *cpu->trace_dstate;
190
- tb->page_addr[0] = phys_pc;
191
- tb->page_addr[1] = -1;
192
+ tb_set_page_addr0(tb, phys_pc);
193
+ tb_set_page_addr1(tb, -1);
194
tcg_ctx->tb_cflags = cflags;
195
tb_overflow:
196
197
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
198
* a temporary one-insn TB, and we have nothing left to do. Return early
199
* before attempting to link to other TBs or add to the lookup table.
200
*/
201
- if (tb->page_addr[0] == -1) {
202
+ if (tb_page_addr0(tb) == -1) {
203
return tb;
204
}
65
}
205
66
-#endif
206
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
67
207
* No explicit memory barrier is required -- tb_link_page() makes the
68
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
208
* TB visible in a consistent state.
69
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
209
*/
210
- existing_tb = tb_link_page(tb, tb->page_addr[0], tb->page_addr[1]);
211
+ existing_tb = tb_link_page(tb, tb_page_addr0(tb), tb_page_addr1(tb));
212
/* if the TB already exists, discard what we just translated */
213
if (unlikely(existing_tb != tb)) {
214
uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
215
@@ -XXX,XX +XXX,XX @@ static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
216
if (tb->size > tst->max_target_size) {
217
tst->max_target_size = tb->size;
218
}
219
- if (tb->page_addr[1] != -1) {
220
+ if (tb_page_addr1(tb) != -1) {
221
tst->cross_page++;
222
}
223
if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
224
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
70
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
225
index XXXXXXX..XXXXXXX 100644
71
index XXXXXXX..XXXXXXX 100644
226
--- a/accel/tcg/translator.c
72
--- a/accel/tcg/translator.c
227
+++ b/accel/tcg/translator.c
73
+++ b/accel/tcg/translator.c
228
@@ -XXX,XX +XXX,XX @@ static void *translator_access(CPUArchState *env, DisasContextBase *db,
74
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
229
tb = db->tb;
75
tb->size = db->pc_next - db->pc_first;
230
76
tb->icount = db->num_insns;
231
/* Use slow path if first page is MMIO. */
77
232
- if (unlikely(tb->page_addr[0] == -1)) {
78
-#ifdef DEBUG_DISAS
233
+ if (unlikely(tb_page_addr0(tb) == -1)) {
79
if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
234
return NULL;
80
&& qemu_log_in_addr_range(db->pc_first)) {
81
FILE *logfile = qemu_log_trylock();
82
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
83
qemu_log_unlock(logfile);
84
}
235
}
85
}
236
86
-#endif
237
@@ -XXX,XX +XXX,XX @@ static void *translator_access(CPUArchState *env, DisasContextBase *db,
87
}
238
host = db->host_addr[1];
88
239
base = TARGET_PAGE_ALIGN(db->pc_first);
89
static void *translator_access(CPUArchState *env, DisasContextBase *db,
240
if (host == NULL) {
90
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
241
- tb->page_addr[1] =
91
index XXXXXXX..XXXXXXX 100644
242
+ tb_page_addr_t phys_page =
92
--- a/target/sh4/translate.c
243
get_page_addr_code_hostp(env, base, &db->host_addr[1]);
93
+++ b/target/sh4/translate.c
244
+ /* We cannot handle MMIO as second page. */
94
@@ -XXX,XX +XXX,XX @@
245
+ assert(phys_page != -1);
95
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
246
+ tb_set_page_addr1(tb, phys_page);
96
*/
247
#ifdef CONFIG_USER_ONLY
97
248
page_protect(end);
98
-#define DEBUG_DISAS
99
-
100
#include "qemu/osdep.h"
101
#include "cpu.h"
102
#include "disas/disas.h"
103
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
104
index XXXXXXX..XXXXXXX 100644
105
--- a/target/sparc/translate.c
106
+++ b/target/sparc/translate.c
107
@@ -XXX,XX +XXX,XX @@
108
#include "asi.h"
109
110
111
-#define DEBUG_DISAS
112
-
113
#define DYNAMIC_PC 1 /* dynamic pc value */
114
#define JUMP_PC 2 /* dynamic pc value which takes only two values
115
according to jump_pc[T2] */
116
diff --git a/tcg/tcg.c b/tcg/tcg.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/tcg/tcg.c
119
+++ b/tcg/tcg.c
120
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
121
(uintptr_t)s->code_buf, prologue_size);
249
#endif
122
#endif
250
- /* We cannot handle MMIO as second page. */
123
251
- assert(tb->page_addr[1] != -1);
124
-#ifdef DEBUG_DISAS
252
host = db->host_addr[1];
125
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
126
FILE *logfile = qemu_log_trylock();
127
if (logfile) {
128
@@ -XXX,XX +XXX,XX @@ void tcg_prologue_init(TCGContext *s)
129
qemu_log_unlock(logfile);
253
}
130
}
254
131
}
132
-#endif
133
134
#ifndef CONFIG_TCG_INTERPRETER
135
/*
136
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
137
}
138
#endif
139
140
-#ifdef DEBUG_DISAS
141
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
142
&& qemu_log_in_addr_range(pc_start))) {
143
FILE *logfile = qemu_log_trylock();
144
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
145
qemu_log_unlock(logfile);
146
}
147
}
148
-#endif
149
150
#ifdef CONFIG_DEBUG_TCG
151
/* Ensure all labels referenced have been emitted. */
152
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
153
liveness_pass_1(s);
154
155
if (s->nb_indirects > 0) {
156
-#ifdef DEBUG_DISAS
157
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
158
&& qemu_log_in_addr_range(pc_start))) {
159
FILE *logfile = qemu_log_trylock();
160
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
161
qemu_log_unlock(logfile);
162
}
163
}
164
-#endif
165
+
166
/* Replace indirect temps with direct temps. */
167
if (liveness_pass_2(s)) {
168
/* If changes were made, re-run liveness. */
169
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
170
qatomic_set(&prof->la_time, prof->la_time + profile_getclock());
171
#endif
172
173
-#ifdef DEBUG_DISAS
174
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
175
&& qemu_log_in_addr_range(pc_start))) {
176
FILE *logfile = qemu_log_trylock();
177
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
178
qemu_log_unlock(logfile);
179
}
180
}
181
-#endif
182
183
/* Initialize goto_tb jump offsets. */
184
tb->jmp_reset_offset[0] = TB_JMP_OFFSET_INVALID;
255
--
185
--
256
2.34.1
186
2.34.1
257
187
258
188
1
This differs from assert, in that with optimization enabled it
1
This is always defined, and the optimization pass is
2
triggers at build-time. It differs from QEMU_BUILD_BUG_ON,
2
essential to producing reasonable code.
3
aka _Static_assert, in that it is sensitive to control flow
4
and is subject to dead-code elimination.
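
To make the control-flow point concrete, here is a sketch (not taken
from the patch; HAVE_ATOMIC128_RW, Int128 and atomic16_set are
borrowed from elsewhere in this series, and the helper itself is
hypothetical).  The assertion sits inside an ordinary C if, so it is
discarded together with a branch the optimizer proves dead, whereas a
_Static_assert at the same spot would be evaluated unconditionally:

    static void store16_example(Int128 *pv, Int128 val)
    {
        if (HAVE_ATOMIC128_RW) {
            /* Only enforced if this branch survives dead-code elimination. */
            qemu_build_assert(sizeof(Int128) == 16);
            atomic16_set(pv, val);
            return;
        }
        /* A real caller would fall back to smaller stores here. */
    }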
5
3
6
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
7
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
---
6
---
10
include/qemu/osdep.h | 8 ++++++++
7
tcg/tcg.c | 5 -----
11
1 file changed, 8 insertions(+)
8
1 file changed, 5 deletions(-)
12
9
13
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
14
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
15
--- a/include/qemu/osdep.h
12
--- a/tcg/tcg.c
16
+++ b/include/qemu/osdep.h
13
+++ b/tcg/tcg.c
17
@@ -XXX,XX +XXX,XX @@ void QEMU_ERROR("code path is reachable")
14
@@ -XXX,XX +XXX,XX @@
18
#define qemu_build_not_reached() g_assert_not_reached()
15
* THE SOFTWARE.
16
*/
17
18
-/* define it to use liveness analysis (better code) */
19
-#define USE_TCG_OPTIMIZATIONS
20
-
21
#include "qemu/osdep.h"
22
23
/* Define to jump the ELF file used to communicate with GDB. */
24
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
25
qatomic_set(&prof->opt_time, prof->opt_time - profile_getclock());
19
#endif
26
#endif
20
27
21
+/**
28
-#ifdef USE_TCG_OPTIMIZATIONS
22
+ * qemu_build_assert()
29
tcg_optimize(s);
23
+ *
30
-#endif
24
+ * The compiler, during optimization, is expected to prove that the
31
25
+ * assertion is true.
32
#ifdef CONFIG_PROFILER
26
+ */
33
qatomic_set(&prof->opt_time, prof->opt_time + profile_getclock());
27
+#define qemu_build_assert(test) while (!(test)) qemu_build_not_reached()
28
+
29
/*
30
* According to waitpid man page:
31
* WCOREDUMP
32
--
34
--
33
2.34.1
35
2.34.1
34
36
35
37
Deleted patch
1
Change from QEMU_BUILD_BUG_ON, which requires ifdefs to avoid
2
problematic code, to qemu_build_assert, which can use C ifs.
3
1
4
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
include/qemu/atomic.h | 16 ++++++++--------
9
1 file changed, 8 insertions(+), 8 deletions(-)
10
11
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
12
index XXXXXXX..XXXXXXX 100644
13
--- a/include/qemu/atomic.h
14
+++ b/include/qemu/atomic.h
15
@@ -XXX,XX +XXX,XX @@
16
17
#define qatomic_read(ptr) \
18
({ \
19
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
20
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
21
qatomic_read__nocheck(ptr); \
22
})
23
24
@@ -XXX,XX +XXX,XX @@
25
__atomic_store_n(ptr, i, __ATOMIC_RELAXED)
26
27
#define qatomic_set(ptr, i) do { \
28
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
29
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
30
qatomic_set__nocheck(ptr, i); \
31
} while(0)
32
33
@@ -XXX,XX +XXX,XX @@
34
35
#define qatomic_rcu_read(ptr) \
36
({ \
37
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
38
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
39
typeof_strip_qual(*ptr) _val; \
40
qatomic_rcu_read__nocheck(ptr, &_val); \
41
_val; \
42
})
43
44
#define qatomic_rcu_set(ptr, i) do { \
45
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
46
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
47
__atomic_store_n(ptr, i, __ATOMIC_RELEASE); \
48
} while(0)
49
50
#define qatomic_load_acquire(ptr) \
51
({ \
52
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
53
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
54
typeof_strip_qual(*ptr) _val; \
55
__atomic_load(ptr, &_val, __ATOMIC_ACQUIRE); \
56
_val; \
57
})
58
59
#define qatomic_store_release(ptr, i) do { \
60
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
61
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
62
__atomic_store_n(ptr, i, __ATOMIC_RELEASE); \
63
} while(0)
64
65
@@ -XXX,XX +XXX,XX @@
66
})
67
68
#define qatomic_xchg(ptr, i) ({ \
69
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
70
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
71
qatomic_xchg__nocheck(ptr, i); \
72
})
73
74
@@ -XXX,XX +XXX,XX @@
75
})
76
77
#define qatomic_cmpxchg(ptr, old, new) ({ \
78
- QEMU_BUILD_BUG_ON(sizeof(*ptr) > ATOMIC_REG_SIZE); \
79
+ qemu_build_assert(sizeof(*ptr) <= ATOMIC_REG_SIZE); \
80
qatomic_cmpxchg__nocheck(ptr, old, new); \
81
})
82
83
--
84
2.34.1
85
86
diff view generated by jsdifflib
Deleted patch
1
Use qatomic_*, which expands to __atomic_* in preference
2
to the "legacy" __sync_* functions.
3
1
4
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
include/qemu/thread.h | 8 ++++----
8
1 file changed, 4 insertions(+), 4 deletions(-)
9
10
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
11
index XXXXXXX..XXXXXXX 100644
12
--- a/include/qemu/thread.h
13
+++ b/include/qemu/thread.h
14
@@ -XXX,XX +XXX,XX @@ struct QemuSpin {
15
16
static inline void qemu_spin_init(QemuSpin *spin)
17
{
18
- __sync_lock_release(&spin->value);
19
+ qatomic_set(&spin->value, 0);
20
#ifdef CONFIG_TSAN
21
__tsan_mutex_create(spin, __tsan_mutex_not_static);
22
#endif
23
@@ -XXX,XX +XXX,XX @@ static inline void qemu_spin_lock(QemuSpin *spin)
24
#ifdef CONFIG_TSAN
25
__tsan_mutex_pre_lock(spin, 0);
26
#endif
27
- while (unlikely(__sync_lock_test_and_set(&spin->value, true))) {
28
+ while (unlikely(qatomic_xchg(&spin->value, 1))) {
29
while (qatomic_read(&spin->value)) {
30
cpu_relax();
31
}
32
@@ -XXX,XX +XXX,XX @@ static inline bool qemu_spin_trylock(QemuSpin *spin)
33
#ifdef CONFIG_TSAN
34
__tsan_mutex_pre_lock(spin, __tsan_mutex_try_lock);
35
#endif
36
- bool busy = __sync_lock_test_and_set(&spin->value, true);
37
+ bool busy = qatomic_xchg(&spin->value, true);
38
#ifdef CONFIG_TSAN
39
unsigned flags = __tsan_mutex_try_lock;
40
flags |= busy ? __tsan_mutex_try_lock_failed : 0;
41
@@ -XXX,XX +XXX,XX @@ static inline void qemu_spin_unlock(QemuSpin *spin)
42
#ifdef CONFIG_TSAN
43
__tsan_mutex_pre_unlock(spin, 0);
44
#endif
45
- __sync_lock_release(&spin->value);
46
+ qatomic_store_release(&spin->value, 0);
47
#ifdef CONFIG_TSAN
48
__tsan_mutex_post_unlock(spin, 0);
49
#endif
50
--
51
2.34.1
Deleted patch
1
When we added the fast path, we initialized page_addr[] early.
2
These stores in and around tb_page_add() are redundant; remove them.
3
1
4
Fixes: 50627f1b7b1 ("accel/tcg: Add fast path for translator_ld*")
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
accel/tcg/tb-maint.c | 3 ---
9
1 file changed, 3 deletions(-)
10
11
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/accel/tcg/tb-maint.c
14
+++ b/accel/tcg/tb-maint.c
15
@@ -XXX,XX +XXX,XX @@ static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
16
17
assert_page_locked(p);
18
19
- tb->page_addr[n] = page_addr;
20
tb->page_next[n] = p->first_tb;
21
#ifndef CONFIG_USER_ONLY
22
page_already_protected = p->first_tb != (uintptr_t)NULL;
23
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
24
tb_page_add(p, tb, 0, phys_pc);
25
if (p2) {
26
tb_page_add(p2, tb, 1, phys_page2);
27
- } else {
28
- tb->page_addr[1] = -1;
29
}
30
31
/* add in the hash table */
32
--
33
2.34.1
34
35
Deleted patch
1
Add a tcg_ops hook to replace the restore_state_to_opc
2
function call. Because these generic hooks cannot depend
3
on target-specific types, temporarily, copy the current
4
target_ulong data[] into uint64_t d64[].
5
1
6
Reviewed-by: Claudio Fontana <cfontana@suse.de>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
include/exec/exec-all.h | 2 +-
10
include/hw/core/tcg-cpu-ops.h | 11 +++++++++++
11
accel/tcg/translate-all.c | 24 ++++++++++++++++++++++--
12
3 files changed, 34 insertions(+), 3 deletions(-)
13
14
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/exec/exec-all.h
17
+++ b/include/exec/exec-all.h
18
@@ -XXX,XX +XXX,XX @@ typedef ram_addr_t tb_page_addr_t;
19
#endif
20
21
void restore_state_to_opc(CPUArchState *env, TranslationBlock *tb,
22
- target_ulong *data);
23
+ target_ulong *data) __attribute__((weak));
24
25
/**
26
* cpu_restore_state:
27
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
28
index XXXXXXX..XXXXXXX 100644
29
--- a/include/hw/core/tcg-cpu-ops.h
30
+++ b/include/hw/core/tcg-cpu-ops.h
31
@@ -XXX,XX +XXX,XX @@ struct TCGCPUOps {
32
* function to restore all the state, and register it here.
33
*/
34
void (*synchronize_from_tb)(CPUState *cpu, const TranslationBlock *tb);
35
+ /**
36
+ * @restore_state_to_opc: Synchronize state from INDEX_op_start_insn
37
+ *
38
+ * This is called when we unwind state in the middle of a TB,
39
+ * usually before raising an exception. Set all part of the CPU
40
+ * state which are tracked insn-by-insn in the target-specific
41
+ * arguments to start_insn, passed as @data.
42
+ */
43
+ void (*restore_state_to_opc)(CPUState *cpu, const TranslationBlock *tb,
44
+ const uint64_t *data);
45
+
46
/** @cpu_exec_enter: Callback for cpu_exec preparation */
47
void (*cpu_exec_enter)(CPUState *cpu);
48
/** @cpu_exec_exit: Callback for cpu_exec cleanup */
49
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
50
index XXXXXXX..XXXXXXX 100644
51
--- a/accel/tcg/translate-all.c
52
+++ b/accel/tcg/translate-all.c
53
@@ -XXX,XX +XXX,XX @@ int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
54
{
55
target_ulong data[TARGET_INSN_START_WORDS];
56
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
57
- CPUArchState *env = cpu->env_ptr;
58
const uint8_t *p = tb->tc.ptr + tb->tc.size;
59
int i, j, num_insns = tb->icount;
60
#ifdef CONFIG_PROFILER
61
@@ -XXX,XX +XXX,XX @@ int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
62
and shift if to the number of actually executed instructions */
63
cpu_neg(cpu)->icount_decr.u16.low += num_insns - i;
64
}
65
- restore_state_to_opc(env, tb, data);
66
+
67
+ {
68
+ const struct TCGCPUOps *ops = cpu->cc->tcg_ops;
69
+ __typeof(ops->restore_state_to_opc) restore = ops->restore_state_to_opc;
70
+ if (restore) {
71
+ uint64_t d64[TARGET_INSN_START_WORDS];
72
+ for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
73
+ d64[i] = data[i];
74
+ }
75
+ restore(cpu, tb, d64);
76
+ } else {
77
+ restore_state_to_opc(cpu->env_ptr, tb, data);
78
+ }
79
+ }
80
81
#ifdef CONFIG_PROFILER
82
qatomic_set(&prof->restore_time,
83
@@ -XXX,XX +XXX,XX @@ int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
84
85
bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc, bool will_exit)
86
{
87
+ /*
88
+ * The pc update associated with restore without exit will
89
+ * break the relative pc adjustments performed by TARGET_TB_PCREL.
90
+ */
91
+ if (TARGET_TB_PCREL) {
92
+ assert(will_exit);
93
+ }
94
+
95
/*
96
* The host_pc has to be in the rx region of the code buffer.
97
* If it is not we will not be able to resolve it here.
98
--
99
2.34.1
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/alpha/cpu.c | 9 +++++++++
5
target/alpha/translate.c | 6 ------
6
2 files changed, 9 insertions(+), 6 deletions(-)
7
1
8
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/alpha/cpu.c
11
+++ b/target/alpha/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static vaddr alpha_cpu_get_pc(CPUState *cs)
13
return cpu->env.pc;
14
}
15
16
+static void alpha_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ AlphaCPU *cpu = ALPHA_CPU(cs);
21
+
22
+ cpu->env.pc = data[0];
23
+}
24
25
static bool alpha_cpu_has_work(CPUState *cs)
26
{
27
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
28
29
static const struct TCGCPUOps alpha_tcg_ops = {
30
.initialize = alpha_translate_init,
31
+ .restore_state_to_opc = alpha_restore_state_to_opc,
32
33
#ifdef CONFIG_USER_ONLY
34
.record_sigsegv = alpha_cpu_record_sigsegv,
35
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
36
index XXXXXXX..XXXXXXX 100644
37
--- a/target/alpha/translate.c
38
+++ b/target/alpha/translate.c
39
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns,
40
DisasContext dc;
41
translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base);
42
}
43
-
44
-void restore_state_to_opc(CPUAlphaState *env, TranslationBlock *tb,
45
- target_ulong *data)
46
-{
47
- env->pc = data[0];
48
-}
49
--
50
2.34.1
51
52
diff view generated by jsdifflib
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/arm/cpu.c | 26 ++++++++++++++++++++++++++
5
target/arm/translate.c | 22 ----------------------
6
2 files changed, 26 insertions(+), 22 deletions(-)
7
1
8
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/arm/cpu.c
11
+++ b/target/arm/cpu.c
12
@@ -XXX,XX +XXX,XX @@ void arm_cpu_synchronize_from_tb(CPUState *cs,
13
}
14
}
15
}
16
+
17
+static void arm_restore_state_to_opc(CPUState *cs,
18
+ const TranslationBlock *tb,
19
+ const uint64_t *data)
20
+{
21
+ CPUARMState *env = cs->env_ptr;
22
+
23
+ if (is_a64(env)) {
24
+ if (TARGET_TB_PCREL) {
25
+ env->pc = (env->pc & TARGET_PAGE_MASK) | data[0];
26
+ } else {
27
+ env->pc = data[0];
28
+ }
29
+ env->condexec_bits = 0;
30
+ env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT;
31
+ } else {
32
+ if (TARGET_TB_PCREL) {
33
+ env->regs[15] = (env->regs[15] & TARGET_PAGE_MASK) | data[0];
34
+ } else {
35
+ env->regs[15] = data[0];
36
+ }
37
+ env->condexec_bits = data[1];
38
+ env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT;
39
+ }
40
+}
41
#endif /* CONFIG_TCG */
42
43
static bool arm_cpu_has_work(CPUState *cs)
44
@@ -XXX,XX +XXX,XX @@ static const struct TCGCPUOps arm_tcg_ops = {
45
.initialize = arm_translate_init,
46
.synchronize_from_tb = arm_cpu_synchronize_from_tb,
47
.debug_excp_handler = arm_debug_excp_handler,
48
+ .restore_state_to_opc = arm_restore_state_to_opc,
49
50
#ifdef CONFIG_USER_ONLY
51
.record_sigsegv = arm_cpu_record_sigsegv,
52
diff --git a/target/arm/translate.c b/target/arm/translate.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/target/arm/translate.c
55
+++ b/target/arm/translate.c
56
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns,
57
58
translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base);
59
}
60
-
61
-void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb,
62
- target_ulong *data)
63
-{
64
- if (is_a64(env)) {
65
- if (TARGET_TB_PCREL) {
66
- env->pc = (env->pc & TARGET_PAGE_MASK) | data[0];
67
- } else {
68
- env->pc = data[0];
69
- }
70
- env->condexec_bits = 0;
71
- env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT;
72
- } else {
73
- if (TARGET_TB_PCREL) {
74
- env->regs[15] = (env->regs[15] & TARGET_PAGE_MASK) | data[0];
75
- } else {
76
- env->regs[15] = data[0];
77
- }
78
- env->condexec_bits = data[1];
79
- env->exception.syndrome = data[2] << ARM_INSN_START_WORD2_SHIFT;
80
- }
81
-}
82
--
83
2.34.1
84
85
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/avr/cpu.c | 11 +++++++++++
5
target/avr/translate.c | 6 ------
6
2 files changed, 11 insertions(+), 6 deletions(-)
7
1
8
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/avr/cpu.c
11
+++ b/target/avr/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_synchronize_from_tb(CPUState *cs,
13
env->pc_w = tb_pc(tb) / 2; /* internally PC points to words */
14
}
15
16
+static void avr_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ AVRCPU *cpu = AVR_CPU(cs);
21
+ CPUAVRState *env = &cpu->env;
22
+
23
+ env->pc_w = data[0];
24
+}
25
+
26
static void avr_cpu_reset(DeviceState *ds)
27
{
28
CPUState *cs = CPU(ds);
29
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps avr_sysemu_ops = {
30
static const struct TCGCPUOps avr_tcg_ops = {
31
.initialize = avr_cpu_tcg_init,
32
.synchronize_from_tb = avr_cpu_synchronize_from_tb,
33
+ .restore_state_to_opc = avr_restore_state_to_opc,
34
.cpu_exec_interrupt = avr_cpu_exec_interrupt,
35
.tlb_fill = avr_cpu_tlb_fill,
36
.do_interrupt = avr_cpu_do_interrupt,
37
diff --git a/target/avr/translate.c b/target/avr/translate.c
38
index XXXXXXX..XXXXXXX 100644
39
--- a/target/avr/translate.c
40
+++ b/target/avr/translate.c
41
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
42
DisasContext dc = { };
43
translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base);
44
}
45
-
46
-void restore_state_to_opc(CPUAVRState *env, TranslationBlock *tb,
47
- target_ulong *data)
48
-{
49
- env->pc_w = data[0];
50
-}
51
--
52
2.34.1
53
54
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/cris/cpu.c | 11 +++++++++++
5
target/cris/translate.c | 6 ------
6
2 files changed, 11 insertions(+), 6 deletions(-)
7
1
8
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/cris/cpu.c
11
+++ b/target/cris/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static vaddr cris_cpu_get_pc(CPUState *cs)
13
return cpu->env.pc;
14
}
15
16
+static void cris_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ CRISCPU *cpu = CRIS_CPU(cs);
21
+
22
+ cpu->env.pc = data[0];
23
+}
24
+
25
static bool cris_cpu_has_work(CPUState *cs)
26
{
27
return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI);
28
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps cris_sysemu_ops = {
29
30
static const struct TCGCPUOps crisv10_tcg_ops = {
31
.initialize = cris_initialize_crisv10_tcg,
32
+ .restore_state_to_opc = cris_restore_state_to_opc,
33
34
#ifndef CONFIG_USER_ONLY
35
.tlb_fill = cris_cpu_tlb_fill,
36
@@ -XXX,XX +XXX,XX @@ static const struct TCGCPUOps crisv10_tcg_ops = {
37
38
static const struct TCGCPUOps crisv32_tcg_ops = {
39
.initialize = cris_initialize_tcg,
40
+ .restore_state_to_opc = cris_restore_state_to_opc,
41
42
#ifndef CONFIG_USER_ONLY
43
.tlb_fill = cris_cpu_tlb_fill,
44
diff --git a/target/cris/translate.c b/target/cris/translate.c
45
index XXXXXXX..XXXXXXX 100644
46
--- a/target/cris/translate.c
47
+++ b/target/cris/translate.c
48
@@ -XXX,XX +XXX,XX @@ void cris_initialize_tcg(void)
49
pregnames_v32[i]);
50
}
51
}
52
-
53
-void restore_state_to_opc(CPUCRISState *env, TranslationBlock *tb,
54
- target_ulong *data)
55
-{
56
- env->pc = data[0];
57
-}
58
--
59
2.34.1
60
61
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/hexagon/cpu.c | 9 +++++++--
5
1 file changed, 7 insertions(+), 2 deletions(-)
6
1
7
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
8
index XXXXXXX..XXXXXXX 100644
9
--- a/target/hexagon/cpu.c
10
+++ b/target/hexagon/cpu.c
11
@@ -XXX,XX +XXX,XX @@ static bool hexagon_cpu_has_work(CPUState *cs)
12
return true;
13
}
14
15
-void restore_state_to_opc(CPUHexagonState *env, TranslationBlock *tb,
16
- target_ulong *data)
17
+static void hexagon_restore_state_to_opc(CPUState *cs,
18
+ const TranslationBlock *tb,
19
+ const uint64_t *data)
20
{
21
+ HexagonCPU *cpu = HEXAGON_CPU(cs);
22
+ CPUHexagonState *env = &cpu->env;
23
+
24
env->gpr[HEX_REG_PC] = data[0];
25
}
26
27
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_init(Object *obj)
28
static const struct TCGCPUOps hexagon_tcg_ops = {
29
.initialize = hexagon_translate_init,
30
.synchronize_from_tb = hexagon_cpu_synchronize_from_tb,
31
+ .restore_state_to_opc = hexagon_restore_state_to_opc,
32
};
33
34
static void hexagon_cpu_class_init(ObjectClass *c, void *data)
35
--
36
2.34.1
37
38
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/hppa/cpu.c | 19 +++++++++++++++++++
5
target/hppa/translate.c | 13 -------------
6
2 files changed, 19 insertions(+), 13 deletions(-)
7
1
8
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/hppa/cpu.c
11
+++ b/target/hppa/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
13
cpu->env.psw_n = (tb->flags & PSW_N) != 0;
14
}
15
16
+static void hppa_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ HPPACPU *cpu = HPPA_CPU(cs);
21
+
22
+ cpu->env.iaoq_f = data[0];
23
+ if (data[1] != (target_ureg)-1) {
24
+ cpu->env.iaoq_b = data[1];
25
+ }
26
+ /*
27
+ * Since we were executing the instruction at IAOQ_F, and took some
28
+ * sort of action that provoked the cpu_restore_state, we can infer
29
+ * that the instruction was not nullified.
30
+ */
31
+ cpu->env.psw_n = 0;
32
+}
33
+
34
static bool hppa_cpu_has_work(CPUState *cs)
35
{
36
return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI);
37
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps hppa_sysemu_ops = {
38
static const struct TCGCPUOps hppa_tcg_ops = {
39
.initialize = hppa_translate_init,
40
.synchronize_from_tb = hppa_cpu_synchronize_from_tb,
41
+ .restore_state_to_opc = hppa_restore_state_to_opc,
42
43
#ifndef CONFIG_USER_ONLY
44
.tlb_fill = hppa_cpu_tlb_fill,
45
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/target/hppa/translate.c
48
+++ b/target/hppa/translate.c
49
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns,
50
DisasContext ctx;
51
translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base);
52
}
53
-
54
-void restore_state_to_opc(CPUHPPAState *env, TranslationBlock *tb,
55
- target_ulong *data)
56
-{
57
- env->iaoq_f = data[0];
58
- if (data[1] != (target_ureg)-1) {
59
- env->iaoq_b = data[1];
60
- }
61
- /* Since we were executing the instruction at IAOQ_F, and took some
62
- sort of action that provoked the cpu_restore_state, we can infer
63
- that the instruction was not nullified. */
64
- env->psw_n = 0;
65
-}
66
--
67
2.34.1
68
69
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/i386/tcg/tcg-cpu.c | 19 +++++++++++++++++++
5
target/i386/tcg/translate.c | 15 ---------------
6
2 files changed, 19 insertions(+), 15 deletions(-)
7
1
8
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/i386/tcg/tcg-cpu.c
11
+++ b/target/i386/tcg/tcg-cpu.c
12
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
13
}
14
}
15
16
+static void x86_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ X86CPU *cpu = X86_CPU(cs);
21
+ CPUX86State *env = &cpu->env;
22
+ int cc_op = data[1];
23
+
24
+ if (TARGET_TB_PCREL) {
25
+ env->eip = (env->eip & TARGET_PAGE_MASK) | data[0];
26
+ } else {
27
+ env->eip = data[0] - tb->cs_base;
28
+ }
29
+ if (cc_op != CC_OP_DYNAMIC) {
30
+ env->cc_op = cc_op;
31
+ }
32
+}
33
+
34
#ifndef CONFIG_USER_ONLY
35
static bool x86_debug_check_breakpoint(CPUState *cs)
36
{
37
@@ -XXX,XX +XXX,XX @@ static bool x86_debug_check_breakpoint(CPUState *cs)
38
static const struct TCGCPUOps x86_tcg_ops = {
39
.initialize = tcg_x86_init,
40
.synchronize_from_tb = x86_cpu_synchronize_from_tb,
41
+ .restore_state_to_opc = x86_restore_state_to_opc,
42
.cpu_exec_enter = x86_cpu_exec_enter,
43
.cpu_exec_exit = x86_cpu_exec_exit,
44
#ifdef CONFIG_USER_ONLY
45
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/target/i386/tcg/translate.c
48
+++ b/target/i386/tcg/translate.c
49
@@ -XXX,XX +XXX,XX @@ void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns,
50
51
translator_loop(cpu, tb, max_insns, pc, host_pc, &i386_tr_ops, &dc.base);
52
}
53
-
54
-void restore_state_to_opc(CPUX86State *env, TranslationBlock *tb,
55
- target_ulong *data)
56
-{
57
- int cc_op = data[1];
58
-
59
- if (TARGET_TB_PCREL) {
60
- env->eip = (env->eip & TARGET_PAGE_MASK) | data[0];
61
- } else {
62
- env->eip = data[0] - tb->cs_base;
63
- }
64
- if (cc_op != CC_OP_DYNAMIC) {
65
- env->cc_op = cc_op;
66
- }
67
-}
68
--
69
2.34.1
70
71
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/loongarch/cpu.c | 11 +++++++++++
5
target/loongarch/translate.c | 6 ------
6
2 files changed, 11 insertions(+), 6 deletions(-)
7
1
8
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/loongarch/cpu.c
11
+++ b/target/loongarch/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_synchronize_from_tb(CPUState *cs,
13
14
env->pc = tb_pc(tb);
15
}
16
+
17
+static void loongarch_restore_state_to_opc(CPUState *cs,
18
+ const TranslationBlock *tb,
19
+ const uint64_t *data)
20
+{
21
+ LoongArchCPU *cpu = LOONGARCH_CPU(cs);
22
+ CPULoongArchState *env = &cpu->env;
23
+
24
+ env->pc = data[0];
25
+}
26
#endif /* CONFIG_TCG */
27
28
static bool loongarch_cpu_has_work(CPUState *cs)
29
@@ -XXX,XX +XXX,XX @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
30
static struct TCGCPUOps loongarch_tcg_ops = {
31
.initialize = loongarch_translate_init,
32
.synchronize_from_tb = loongarch_cpu_synchronize_from_tb,
33
+ .restore_state_to_opc = loongarch_restore_state_to_opc,
34
35
#ifndef CONFIG_USER_ONLY
36
.tlb_fill = loongarch_cpu_tlb_fill,
37
diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
38
index XXXXXXX..XXXXXXX 100644
39
--- a/target/loongarch/translate.c
40
+++ b/target/loongarch/translate.c
41
@@ -XXX,XX +XXX,XX @@ void loongarch_translate_init(void)
42
cpu_llval = tcg_global_mem_new(cpu_env,
43
offsetof(CPULoongArchState, llval), "llval");
44
}
45
-
46
-void restore_state_to_opc(CPULoongArchState *env, TranslationBlock *tb,
47
- target_ulong *data)
48
-{
49
- env->pc = data[0];
50
-}
51
--
52
2.34.1
53
54
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/microblaze/cpu.c | 11 +++++++++++
5
target/microblaze/translate.c | 7 -------
6
2 files changed, 11 insertions(+), 7 deletions(-)
7
1
8
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/microblaze/cpu.c
11
+++ b/target/microblaze/cpu.c
12
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_synchronize_from_tb(CPUState *cs,
13
cpu->env.iflags = tb->flags & IFLAGS_TB_MASK;
14
}
15
16
+static void mb_restore_state_to_opc(CPUState *cs,
17
+ const TranslationBlock *tb,
18
+ const uint64_t *data)
19
+{
20
+ MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs);
21
+
22
+ cpu->env.pc = data[0];
23
+ cpu->env.iflags = data[1];
24
+}
25
+
26
static bool mb_cpu_has_work(CPUState *cs)
27
{
28
return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI);
29
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps mb_sysemu_ops = {
30
static const struct TCGCPUOps mb_tcg_ops = {
31
.initialize = mb_tcg_init,
32
.synchronize_from_tb = mb_cpu_synchronize_from_tb,
33
+ .restore_state_to_opc = mb_restore_state_to_opc,
34
35
#ifndef CONFIG_USER_ONLY
36
.tlb_fill = mb_cpu_tlb_fill,
37
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
38
index XXXXXXX..XXXXXXX 100644
39
--- a/target/microblaze/translate.c
40
+++ b/target/microblaze/translate.c
41
@@ -XXX,XX +XXX,XX @@ void mb_tcg_init(void)
42
cpu_res_addr =
43
tcg_global_mem_new(cpu_env, offsetof(CPUMBState, res_addr), "res_addr");
44
}
45
-
46
-void restore_state_to_opc(CPUMBState *env, TranslationBlock *tb,
47
- target_ulong *data)
48
-{
49
- env->pc = data[0];
50
- env->iflags = data[1];
51
-}
52
--
53
2.34.1
54
55
Deleted patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
target/mips/tcg/tcg-internal.h | 3 +++
5
target/mips/cpu.c | 1 +
6
target/mips/tcg/translate.c | 8 ++++++--
7
3 files changed, 10 insertions(+), 2 deletions(-)
8
1
9
diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h
10
index XXXXXXX..XXXXXXX 100644
11
--- a/target/mips/tcg/tcg-internal.h
12
+++ b/target/mips/tcg/tcg-internal.h
13
@@ -XXX,XX +XXX,XX @@ void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb);
14
G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
15
MMUAccessType access_type, int mmu_idx,
16
uintptr_t retaddr);
17
+void mips_restore_state_to_opc(CPUState *cs,
18
+ const TranslationBlock *tb,
19
+ const uint64_t *data);
20
21
const char *mips_exception_name(int32_t exception);
22
23
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
24
index XXXXXXX..XXXXXXX 100644
25
--- a/target/mips/cpu.c
26
+++ b/target/mips/cpu.c
27
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps mips_sysemu_ops = {
28
static const struct TCGCPUOps mips_tcg_ops = {
29
.initialize = mips_tcg_init,
30
.synchronize_from_tb = mips_cpu_synchronize_from_tb,
31
+ .restore_state_to_opc = mips_restore_state_to_opc,
32
33
#if !defined(CONFIG_USER_ONLY)
34
.tlb_fill = mips_cpu_tlb_fill,
35
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
36
index XXXXXXX..XXXXXXX 100644
37
--- a/target/mips/tcg/translate.c
38
+++ b/target/mips/tcg/translate.c
39
@@ -XXX,XX +XXX,XX @@ void mips_tcg_init(void)
40
}
41
}
42
43
-void restore_state_to_opc(CPUMIPSState *env, TranslationBlock *tb,
44
- target_ulong *data)
45
+void mips_restore_state_to_opc(CPUState *cs,
46
+ const TranslationBlock *tb,
47
+ const uint64_t *data)
48
{
49
+ MIPSCPU *cpu = MIPS_CPU(cs);
50
+ CPUMIPSState *env = &cpu->env;
51
+
52
env->active_tc.PC = data[0];
53
env->hflags &= ~MIPS_HFLAG_BMASK;
54
env->hflags |= data[1];
55
--
56
2.34.1
57
58
Deleted patch
1
All targets have been updated. Use the tcg_ops target hook
2
exclusively, which allows the compat code to be removed.
3
1
4
Reviewed-by: Claudio Fontana <cfontana@suse.de>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
include/exec/exec-all.h | 3 ---
8
accel/tcg/translate-all.c | 16 ++--------------
9
2 files changed, 2 insertions(+), 17 deletions(-)
10
11
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
12
index XXXXXXX..XXXXXXX 100644
13
--- a/include/exec/exec-all.h
14
+++ b/include/exec/exec-all.h
15
@@ -XXX,XX +XXX,XX @@ typedef ram_addr_t tb_page_addr_t;
16
#define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
17
#endif
18
19
-void restore_state_to_opc(CPUArchState *env, TranslationBlock *tb,
20
- target_ulong *data) __attribute__((weak));
21
-
22
/**
23
* cpu_restore_state:
24
* @cpu: the vCPU state is to be restore to
25
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
26
index XXXXXXX..XXXXXXX 100644
27
--- a/accel/tcg/translate-all.c
28
+++ b/accel/tcg/translate-all.c
29
@@ -XXX,XX +XXX,XX @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
30
int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
31
uintptr_t searched_pc, bool reset_icount)
32
{
33
- target_ulong data[TARGET_INSN_START_WORDS];
34
+ uint64_t data[TARGET_INSN_START_WORDS];
35
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
36
const uint8_t *p = tb->tc.ptr + tb->tc.size;
37
int i, j, num_insns = tb->icount;
38
@@ -XXX,XX +XXX,XX @@ int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
39
cpu_neg(cpu)->icount_decr.u16.low += num_insns - i;
40
}
41
42
- {
43
- const struct TCGCPUOps *ops = cpu->cc->tcg_ops;
44
- __typeof(ops->restore_state_to_opc) restore = ops->restore_state_to_opc;
45
- if (restore) {
46
- uint64_t d64[TARGET_INSN_START_WORDS];
47
- for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
48
- d64[i] = data[i];
49
- }
50
- restore(cpu, tb, d64);
51
- } else {
52
- restore_state_to_opc(cpu->env_ptr, tb, data);
53
- }
54
- }
55
+ cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
56
57
#ifdef CONFIG_PROFILER
58
qatomic_set(&prof->restore_time,
59
--
60
2.34.1
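With every target converted, this final patch makes the hook the only path: cpu_restore_state_from_tb() decodes the insn-start words into a uint64_t array and invokes the hook unconditionally, so any target built with TCG must now set .restore_state_to_opc. A condensed, illustrative view of the resulting call site (icount adjustment and profiling elided, not a literal excerpt):

    uint64_t data[TARGET_INSN_START_WORDS];

    /* ... walk the stored insn-start records backwards from searched_pc,
       filling data[] for the guest insn being unwound ... */

    cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);  /* now mandatory */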