1 | The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4: | 1 | v2: Testing revealed a missing earlyclobber in the aa64 inline asm, |
---|---|---|---|
2 | which showed up with macos testing. | ||
2 | 3 | ||
3 | Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000) | 4 | r~ |
5 | |||
6 | The following changes since commit aa33508196f4e2da04625bee36e1f7be5b9267e7: | ||
7 | |||
8 | Merge tag 'mem-2023-05-23' of https://github.com/davidhildenbrand/qemu into staging (2023-05-23 10:57:25 -0700) | ||
4 | 9 | ||
5 | are available in the Git repository at: | 10 | are available in the Git repository at: |
6 | 11 | ||
7 | https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314 | 12 | https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230523-2 |
8 | 13 | ||
9 | for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9: | 14 | for you to fetch changes up to a57663c5a38c26516bde24ecb3992adff4861a31: |
10 | 15 | ||
11 | tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700) | 16 | tcg: Remove USE_TCG_OPTIMIZATIONS (2023-05-24 01:10:44 +0000) |
12 | 17 | ||
13 | ---------------------------------------------------------------- | 18 | ---------------------------------------------------------------- |
14 | Fixes for s390x host vectors | 19 | util: Host cpu detection for x86 and aa64 |
15 | Fix for arm ldrd unpredictable case | 20 | util: Use cpu detection for bufferiszero |
21 | migration: Use cpu detection for xbzrle | ||
22 | tcg: Replace and remove cpu_atomic_{ld,st}o* | ||
23 | host/include: Split qemu/atomic128.h | ||
24 | tcg: Remove DEBUG_DISAS | ||
25 | tcg: Remove USE_TCG_OPTIMIZATIONS | ||
16 | 26 | ||
17 | ---------------------------------------------------------------- | 27 | ---------------------------------------------------------------- |
18 | Richard Henderson (4): | 28 | Richard Henderson (28): |
19 | tcg/s390x: Fix tcg_out_dupi_vec vs VGM | 29 | util: Introduce host-specific cpuinfo.h |
20 | tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL | 30 | util: Add cpuinfo-i386.c |
21 | tcg/s390x: Fix tcg_out_dup_vec vs general registers | 31 | util: Add i386 CPUINFO_ATOMIC_VMOVDQU |
22 | tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 | 32 | tcg/i386: Use host/cpuinfo.h |
33 | util/bufferiszero: Use i386 host/cpuinfo.h | ||
34 | migration/xbzrle: Shuffle function order | ||
35 | migration/xbzrle: Use i386 host/cpuinfo.h | ||
36 | migration: Build migration_files once | ||
37 | util: Add cpuinfo-aarch64.c | ||
38 | include/host: Split out atomic128-cas.h | ||
39 | include/host: Split out atomic128-ldst.h | ||
40 | meson: Fix detect atomic128 support with optimization | ||
41 | include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h | ||
42 | target/ppc: Use tcg_gen_qemu_{ld,st}_i128 for LQARX, LQ, STQ | ||
43 | target/s390x: Use tcg_gen_qemu_{ld,st}_i128 for LPQ, STPQ | ||
44 | accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu | ||
45 | target/s390x: Use cpu_{ld,st}*_mmu in do_csst | ||
46 | target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu in do_csst | ||
47 | accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu | ||
48 | accel/tcg: Remove prot argument to atomic_mmu_lookup | ||
49 | accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128 | ||
50 | qemu/atomic128: Split atomic16_read | ||
51 | accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc | ||
52 | tcg: Split out tcg/debug-assert.h | ||
53 | qemu/atomic128: Improve cmpxchg fallback for atomic16_set | ||
54 | qemu/atomic128: Add runtime test for FEAT_LSE2 | ||
55 | tcg: Remove DEBUG_DISAS | ||
56 | tcg: Remove USE_TCG_OPTIMIZATIONS | ||
23 | 57 | ||
24 | tcg/arm/tcg-target.c.inc | 17 +++++++++++++++-- | 58 | accel/tcg/atomic_template.h | 93 +----- |
25 | tcg/s390x/tcg-target.c.inc | 7 ++++--- | 59 | host/include/aarch64/host/atomic128-cas.h | 45 +++ |
26 | 2 files changed, 19 insertions(+), 5 deletions(-) | 60 | host/include/aarch64/host/atomic128-ldst.h | 79 +++++ |
61 | host/include/aarch64/host/cpuinfo.h | 22 ++ | ||
62 | host/include/generic/host/atomic128-cas.h | 47 +++ | ||
63 | host/include/generic/host/atomic128-ldst.h | 81 +++++ | ||
64 | host/include/generic/host/cpuinfo.h | 4 + | ||
65 | host/include/i386/host/cpuinfo.h | 39 +++ | ||
66 | host/include/x86_64/host/cpuinfo.h | 1 + | ||
67 | include/exec/cpu_ldst.h | 67 +---- | ||
68 | include/exec/exec-all.h | 3 - | ||
69 | include/qemu/atomic128.h | 146 ++------- | ||
70 | include/tcg/debug-assert.h | 17 ++ | ||
71 | include/tcg/tcg.h | 9 +- | ||
72 | migration/xbzrle.h | 5 +- | ||
73 | target/ppc/cpu.h | 1 - | ||
74 | target/ppc/helper.h | 9 - | ||
75 | target/s390x/cpu.h | 3 - | ||
76 | target/s390x/helper.h | 4 - | ||
77 | tcg/aarch64/tcg-target.h | 6 +- | ||
78 | tcg/i386/tcg-target.h | 28 +- | ||
79 | accel/tcg/cpu-exec.c | 2 - | ||
80 | accel/tcg/cputlb.c | 211 ++++--------- | ||
81 | accel/tcg/translate-all.c | 2 - | ||
82 | accel/tcg/translator.c | 2 - | ||
83 | accel/tcg/user-exec.c | 332 ++++++-------------- | ||
84 | migration/ram.c | 34 +-- | ||
85 | migration/xbzrle.c | 268 +++++++++-------- | ||
86 | target/arm/tcg/m_helper.c | 4 +- | ||
87 | target/ppc/mem_helper.c | 48 --- | ||
88 | target/ppc/translate.c | 34 +-- | ||
89 | target/s390x/tcg/mem_helper.c | 137 ++------- | ||
90 | target/s390x/tcg/translate.c | 30 +- | ||
91 | target/sh4/translate.c | 2 - | ||
92 | target/sparc/ldst_helper.c | 18 +- | ||
93 | target/sparc/translate.c | 2 - | ||
94 | tcg/tcg.c | 14 +- | ||
95 | tests/bench/xbzrle-bench.c | 469 ----------------------------- | ||
96 | tests/unit/test-xbzrle.c | 49 +-- | ||
97 | util/bufferiszero.c | 127 +++----- | ||
98 | util/cpuinfo-aarch64.c | 67 +++++ | ||
99 | util/cpuinfo-i386.c | 99 ++++++ | ||
100 | MAINTAINERS | 3 + | ||
101 | accel/tcg/atomic_common.c.inc | 14 - | ||
102 | accel/tcg/ldst_atomicity.c.inc | 135 ++------- | ||
103 | accel/tcg/ldst_common.c.inc | 24 +- | ||
104 | meson.build | 12 +- | ||
105 | migration/meson.build | 1 - | ||
106 | target/ppc/translate/fixedpoint-impl.c.inc | 51 +--- | ||
107 | target/s390x/tcg/insn-data.h.inc | 2 +- | ||
108 | tcg/aarch64/tcg-target.c.inc | 40 --- | ||
109 | tcg/i386/tcg-target.c.inc | 123 +------- | ||
110 | tests/bench/meson.build | 6 - | ||
111 | util/meson.build | 6 + | ||
112 | 54 files changed, 1035 insertions(+), 2042 deletions(-) | ||
113 | create mode 100644 host/include/aarch64/host/atomic128-cas.h | ||
114 | create mode 100644 host/include/aarch64/host/atomic128-ldst.h | ||
115 | create mode 100644 host/include/aarch64/host/cpuinfo.h | ||
116 | create mode 100644 host/include/generic/host/atomic128-cas.h | ||
117 | create mode 100644 host/include/generic/host/atomic128-ldst.h | ||
118 | create mode 100644 host/include/generic/host/cpuinfo.h | ||
119 | create mode 100644 host/include/i386/host/cpuinfo.h | ||
120 | create mode 100644 host/include/x86_64/host/cpuinfo.h | ||
121 | create mode 100644 include/tcg/debug-assert.h | ||
122 | delete mode 100644 tests/bench/xbzrle-bench.c | ||
123 | create mode 100644 util/cpuinfo-aarch64.c | ||
124 | create mode 100644 util/cpuinfo-i386.c | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The immediate operands to VGM were in the wrong order, | ||
2 | producing an inverse mask. | ||
3 | 1 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/s390x/tcg-target.c.inc | 4 ++-- | ||
7 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/s390x/tcg-target.c.inc | ||
12 | +++ b/tcg/s390x/tcg-target.c.inc | ||
13 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
14 | msb = clz32(val); | ||
15 | lsb = 31 - ctz32(val); | ||
16 | } | ||
17 | - tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32); | ||
18 | + tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32); | ||
19 | return; | ||
20 | } | ||
21 | } else { | ||
22 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
23 | msb = clz64(val); | ||
24 | lsb = 63 - ctz64(val); | ||
25 | } | ||
26 | - tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64); | ||
27 | + tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64); | ||
28 | return; | ||
29 | } | ||
30 | } | ||
31 | -- | ||
32 | 2.25.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The operands are output in the wrong order: the tcg selector | ||
2 | argument is first, whereas the s390x selector argument is last. | ||
3 | 1 | ||
4 | Tested-by: Thomas Huth <thuth@redhat.com> | ||
5 | Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898 | ||
6 | Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec") | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/s390x/tcg-target.c.inc | 2 +- | ||
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
11 | |||
12 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/s390x/tcg-target.c.inc | ||
15 | +++ b/tcg/s390x/tcg-target.c.inc | ||
16 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
17 | break; | ||
18 | |||
19 | case INDEX_op_bitsel_vec: | ||
20 | - tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]); | ||
21 | + tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1); | ||
22 | break; | ||
23 | |||
24 | case INDEX_op_cmp_vec: | ||
25 | -- | ||
26 | 2.25.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | We copied the data from the general register input to the | ||
2 | vector register output, but have not yet replicated it. | ||
3 | We intended to fall through into the vector-vector case, | ||
4 | but failed to redirect the input register. | ||
5 | 1 | ||
6 | This is caught by an assertion failure in tcg_out_insn_VRIc, | ||
7 | which diagnosed the incorrect register class. | ||
8 | |||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | tcg/s390x/tcg-target.c.inc | 1 + | ||
12 | 1 file changed, 1 insertion(+) | ||
13 | |||
14 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/tcg/s390x/tcg-target.c.inc | ||
17 | +++ b/tcg/s390x/tcg-target.c.inc | ||
18 | @@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, | ||
19 | if (vece == MO_64) { | ||
20 | return true; | ||
21 | } | ||
22 | + src = dst; | ||
23 | } | ||
24 | |||
25 | /* | ||
26 | -- | ||
27 | 2.25.1 | diff view generated by jsdifflib |
1 | The LDRD (register) instruction is UNPREDICTABLE if the Rm register | 1 | With FEAT_LSE2, load and store of int128 is directly supported. |
---|---|---|---|
2 | is the same as either Rt or Rt+1 (the two registers being loaded to). | ||
3 | We weren't making sure we avoided this, with the result that on some | ||
4 | host CPUs like the Cortex-A7 we would get a SIGILL because the CPU | ||
5 | chooses to UNDEF for this particular UNPREDICTABLE case. | ||
6 | |||
7 | Since we've already checked that datalo is aligned, we can simplify | ||
8 | the test vs the Rm operand by aligning it before comparison. Check | ||
9 | for the two orderings before falling back to two ldr instructions. | ||
10 | |||
11 | We don't bother to do anything similar for tcg_out_ldrd_rwb(), | ||
12 | because it is only used in tcg_out_tlb_read() with a fixed set of | ||
13 | registers which don't overlap. | ||
14 | |||
15 | There is no equivalent UNPREDICTABLE case for STRD. | ||
16 | 2 | ||
17 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | 3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
18 | Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896 | ||
19 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
20 | --- | 5 | --- |
21 | tcg/arm/tcg-target.c.inc | 17 +++++++++++++++-- | 6 | host/include/aarch64/host/atomic128-ldst.h | 53 ++++++++++++++++------ |
22 | 1 file changed, 15 insertions(+), 2 deletions(-) | 7 | 1 file changed, 40 insertions(+), 13 deletions(-) |
23 | 8 | ||
24 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 9 | diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h |
25 | index XXXXXXX..XXXXXXX 100644 | 10 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/tcg/arm/tcg-target.c.inc | 11 | --- a/host/include/aarch64/host/atomic128-ldst.h |
27 | +++ b/tcg/arm/tcg-target.c.inc | 12 | +++ b/host/include/aarch64/host/atomic128-ldst.h |
28 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc, | 13 | @@ -XXX,XX +XXX,XX @@ |
29 | /* LDRD requires alignment; double-check that. */ | 14 | #ifndef AARCH64_ATOMIC128_LDST_H |
30 | if (get_alignment_bits(opc) >= MO_64 | 15 | #define AARCH64_ATOMIC128_LDST_H |
31 | && (datalo & 1) == 0 && datahi == datalo + 1) { | 16 | |
32 | - tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend); | 17 | +#include "host/cpuinfo.h" |
33 | - } else if (scratch_addend) { | 18 | +#include "tcg/debug-assert.h" |
34 | + /* | 19 | + |
35 | + * Rm (the second address op) must not overlap Rt or Rt + 1. | 20 | /* |
36 | + * Since datalo is aligned, we can simplify the test via alignment. | 21 | * Through gcc 10, aarch64 has no support for 128-bit atomics. |
37 | + * Flip the two address arguments if that works. | 22 | * Through clang 16, without -march=armv8.4-a, __atomic_load_16 |
38 | + */ | 23 | * is incorrectly expanded to a read-write operation. |
39 | + if ((addend & ~1) != datalo) { | 24 | + * |
40 | + tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend); | 25 | + * Anyway, this method allows runtime detection of FEAT_LSE2. |
41 | + break; | 26 | */ |
42 | + } | 27 | |
43 | + if ((addrlo & ~1) != datalo) { | 28 | -#define HAVE_ATOMIC128_RO 0 |
44 | + tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo); | 29 | +#define HAVE_ATOMIC128_RO (cpuinfo & CPUINFO_LSE2) |
45 | + break; | 30 | #define HAVE_ATOMIC128_RW 1 |
46 | + } | 31 | |
47 | + } | 32 | -Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr); |
48 | + if (scratch_addend) { | 33 | +static inline Int128 atomic16_read_ro(const Int128 *ptr) |
49 | tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo); | 34 | +{ |
50 | tcg_out_ld32_12(s, COND_AL, datahi, addend, 4); | 35 | + uint64_t l, h; |
51 | } else { | 36 | + |
37 | + tcg_debug_assert(HAVE_ATOMIC128_RO); | ||
38 | + /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */ | ||
39 | + asm("ldp %[l], %[h], %[mem]" | ||
40 | + : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr)); | ||
41 | + | ||
42 | + return int128_make128(l, h); | ||
43 | +} | ||
44 | |||
45 | static inline Int128 atomic16_read_rw(Int128 *ptr) | ||
46 | { | ||
47 | uint64_t l, h; | ||
48 | uint32_t tmp; | ||
49 | |||
50 | - /* The load must be paired with the store to guarantee not tearing. */ | ||
51 | - asm("0: ldxp %[l], %[h], %[mem]\n\t" | ||
52 | - "stxp %w[tmp], %[l], %[h], %[mem]\n\t" | ||
53 | - "cbnz %w[tmp], 0b" | ||
54 | - : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h)); | ||
55 | + if (cpuinfo & CPUINFO_LSE2) { | ||
56 | + /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */ | ||
57 | + asm("ldp %[l], %[h], %[mem]" | ||
58 | + : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr)); | ||
59 | + } else { | ||
60 | + /* The load must be paired with the store to guarantee not tearing. */ | ||
61 | + asm("0: ldxp %[l], %[h], %[mem]\n\t" | ||
62 | + "stxp %w[tmp], %[l], %[h], %[mem]\n\t" | ||
63 | + "cbnz %w[tmp], 0b" | ||
64 | + : [mem] "+m"(*ptr), [tmp] "=&r"(tmp), [l] "=&r"(l), [h] "=&r"(h)); | ||
65 | + } | ||
66 | |||
67 | return int128_make128(l, h); | ||
68 | } | ||
69 | @@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val) | ||
70 | uint64_t l = int128_getlo(val), h = int128_gethi(val); | ||
71 | uint64_t t1, t2; | ||
72 | |||
73 | - /* Load into temporaries to acquire the exclusive access lock. */ | ||
74 | - asm("0: ldxp %[t1], %[t2], %[mem]\n\t" | ||
75 | - "stxp %w[t1], %[l], %[h], %[mem]\n\t" | ||
76 | - "cbnz %w[t1], 0b" | ||
77 | - : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2) | ||
78 | - : [l] "r"(l), [h] "r"(h)); | ||
79 | + if (cpuinfo & CPUINFO_LSE2) { | ||
80 | + /* With FEAT_LSE2, 16-byte aligned STP is atomic. */ | ||
81 | + asm("stp %[l], %[h], %[mem]" | ||
82 | + : [mem] "=m"(*ptr) : [l] "r"(l), [h] "r"(h)); | ||
83 | + } else { | ||
84 | + /* Load into temporaries to acquire the exclusive access lock. */ | ||
85 | + asm("0: ldxp %[t1], %[t2], %[mem]\n\t" | ||
86 | + "stxp %w[t1], %[l], %[h], %[mem]\n\t" | ||
87 | + "cbnz %w[t1], 0b" | ||
88 | + : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2) | ||
89 | + : [l] "r"(l), [h] "r"(h)); | ||
90 | + } | ||
91 | } | ||
92 | |||
93 | #endif /* AARCH64_ATOMIC128_LDST_H */ | ||
52 | -- | 94 | -- |
53 | 2.25.1 | 95 | 2.34.1 |
54 | 96 | ||
55 | 97 | diff view generated by jsdifflib |