1
The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4:
1
v2: Testing revealed a missing earlyclobber in the aa64 inline asm,
2
which showed up with macOS testing.
2
3
3
Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000)
4
r~
5
6
The following changes since commit aa33508196f4e2da04625bee36e1f7be5b9267e7:
7
8
Merge tag 'mem-2023-05-23' of https://github.com/davidhildenbrand/qemu into staging (2023-05-23 10:57:25 -0700)
4
9
5
are available in the Git repository at:
10
are available in the Git repository at:
6
11
7
https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314
12
https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230523-2
8
13
9
for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9:
14
for you to fetch changes up to a57663c5a38c26516bde24ecb3992adff4861a31:
10
15
11
tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700)
16
tcg: Remove USE_TCG_OPTIMIZATIONS (2023-05-24 01:10:44 +0000)
12
17
13
----------------------------------------------------------------
18
----------------------------------------------------------------
14
Fixes for s390x host vectors
19
util: Host cpu detection for x86 and aa64
15
Fix for arm ldrd unpredictable case
20
util: Use cpu detection for bufferiszero
21
migration: Use cpu detection for xbzrle
22
tcg: Replace and remove cpu_atomic_{ld,st}o*
23
host/include: Split qemu/atomic128.h
24
tcg: Remove DEBUG_DISAS
25
tcg: Remove USE_TCG_OPTIMIZATIONS
16
26
17
----------------------------------------------------------------
27
----------------------------------------------------------------
18
Richard Henderson (4):
28
Richard Henderson (28):
19
tcg/s390x: Fix tcg_out_dupi_vec vs VGM
29
util: Introduce host-specific cpuinfo.h
20
tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL
30
util: Add cpuinfo-i386.c
21
tcg/s390x: Fix tcg_out_dup_vec vs general registers
31
util: Add i386 CPUINFO_ATOMIC_VMOVDQU
22
tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1
32
tcg/i386: Use host/cpuinfo.h
33
util/bufferiszero: Use i386 host/cpuinfo.h
34
migration/xbzrle: Shuffle function order
35
migration/xbzrle: Use i386 host/cpuinfo.h
36
migration: Build migration_files once
37
util: Add cpuinfo-aarch64.c
38
include/host: Split out atomic128-cas.h
39
include/host: Split out atomic128-ldst.h
40
meson: Fix detect atomic128 support with optimization
41
include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h
42
target/ppc: Use tcg_gen_qemu_{ld,st}_i128 for LQARX, LQ, STQ
43
target/s390x: Use tcg_gen_qemu_{ld,st}_i128 for LPQ, STPQ
44
accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu
45
target/s390x: Use cpu_{ld,st}*_mmu in do_csst
46
target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu in do_csst
47
accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu
48
accel/tcg: Remove prot argument to atomic_mmu_lookup
49
accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128
50
qemu/atomic128: Split atomic16_read
51
accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc
52
tcg: Split out tcg/debug-assert.h
53
qemu/atomic128: Improve cmpxchg fallback for atomic16_set
54
qemu/atomic128: Add runtime test for FEAT_LSE2
55
tcg: Remove DEBUG_DISAS
56
tcg: Remove USE_TCG_OPTIMIZATIONS
23
57
24
tcg/arm/tcg-target.c.inc | 17 +++++++++++++++--
58
accel/tcg/atomic_template.h | 93 +-----
25
tcg/s390x/tcg-target.c.inc | 7 ++++---
59
host/include/aarch64/host/atomic128-cas.h | 45 +++
26
2 files changed, 19 insertions(+), 5 deletions(-)
60
host/include/aarch64/host/atomic128-ldst.h | 79 +++++
61
host/include/aarch64/host/cpuinfo.h | 22 ++
62
host/include/generic/host/atomic128-cas.h | 47 +++
63
host/include/generic/host/atomic128-ldst.h | 81 +++++
64
host/include/generic/host/cpuinfo.h | 4 +
65
host/include/i386/host/cpuinfo.h | 39 +++
66
host/include/x86_64/host/cpuinfo.h | 1 +
67
include/exec/cpu_ldst.h | 67 +----
68
include/exec/exec-all.h | 3 -
69
include/qemu/atomic128.h | 146 ++-------
70
include/tcg/debug-assert.h | 17 ++
71
include/tcg/tcg.h | 9 +-
72
migration/xbzrle.h | 5 +-
73
target/ppc/cpu.h | 1 -
74
target/ppc/helper.h | 9 -
75
target/s390x/cpu.h | 3 -
76
target/s390x/helper.h | 4 -
77
tcg/aarch64/tcg-target.h | 6 +-
78
tcg/i386/tcg-target.h | 28 +-
79
accel/tcg/cpu-exec.c | 2 -
80
accel/tcg/cputlb.c | 211 ++++---------
81
accel/tcg/translate-all.c | 2 -
82
accel/tcg/translator.c | 2 -
83
accel/tcg/user-exec.c | 332 ++++++--------------
84
migration/ram.c | 34 +--
85
migration/xbzrle.c | 268 +++++++++--------
86
target/arm/tcg/m_helper.c | 4 +-
87
target/ppc/mem_helper.c | 48 ---
88
target/ppc/translate.c | 34 +--
89
target/s390x/tcg/mem_helper.c | 137 ++-------
90
target/s390x/tcg/translate.c | 30 +-
91
target/sh4/translate.c | 2 -
92
target/sparc/ldst_helper.c | 18 +-
93
target/sparc/translate.c | 2 -
94
tcg/tcg.c | 14 +-
95
tests/bench/xbzrle-bench.c | 469 -----------------------------
96
tests/unit/test-xbzrle.c | 49 +--
97
util/bufferiszero.c | 127 +++-----
98
util/cpuinfo-aarch64.c | 67 +++++
99
util/cpuinfo-i386.c | 99 ++++++
100
MAINTAINERS | 3 +
101
accel/tcg/atomic_common.c.inc | 14 -
102
accel/tcg/ldst_atomicity.c.inc | 135 ++-------
103
accel/tcg/ldst_common.c.inc | 24 +-
104
meson.build | 12 +-
105
migration/meson.build | 1 -
106
target/ppc/translate/fixedpoint-impl.c.inc | 51 +---
107
target/s390x/tcg/insn-data.h.inc | 2 +-
108
tcg/aarch64/tcg-target.c.inc | 40 ---
109
tcg/i386/tcg-target.c.inc | 123 +-------
110
tests/bench/meson.build | 6 -
111
util/meson.build | 6 +
112
54 files changed, 1035 insertions(+), 2042 deletions(-)
113
create mode 100644 host/include/aarch64/host/atomic128-cas.h
114
create mode 100644 host/include/aarch64/host/atomic128-ldst.h
115
create mode 100644 host/include/aarch64/host/cpuinfo.h
116
create mode 100644 host/include/generic/host/atomic128-cas.h
117
create mode 100644 host/include/generic/host/atomic128-ldst.h
118
create mode 100644 host/include/generic/host/cpuinfo.h
119
create mode 100644 host/include/i386/host/cpuinfo.h
120
create mode 100644 host/include/x86_64/host/cpuinfo.h
121
create mode 100644 include/tcg/debug-assert.h
122
delete mode 100644 tests/bench/xbzrle-bench.c
123
create mode 100644 util/cpuinfo-aarch64.c
124
create mode 100644 util/cpuinfo-i386.c
diff view generated by jsdifflib
Deleted patch
1
The immediate operands to VGM were in the wrong order,
2
producing an inverse mask.
3
1
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/s390x/tcg-target.c.inc | 4 ++--
7
1 file changed, 2 insertions(+), 2 deletions(-)
8
9
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
10
index XXXXXXX..XXXXXXX 100644
11
--- a/tcg/s390x/tcg-target.c.inc
12
+++ b/tcg/s390x/tcg-target.c.inc
13
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
14
msb = clz32(val);
15
lsb = 31 - ctz32(val);
16
}
17
- tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32);
18
+ tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32);
19
return;
20
}
21
} else {
22
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
23
msb = clz64(val);
24
lsb = 63 - ctz64(val);
25
}
26
- tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64);
27
+ tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64);
28
return;
29
}
30
}
31
--
32
2.25.1
diff view generated by jsdifflib
Deleted patch
1
The operands are output in the wrong order: the tcg selector
2
argument is first, whereas the s390x selector argument is last.
3
1
4
Tested-by: Thomas Huth <thuth@redhat.com>
5
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898
6
Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec")
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/s390x/tcg-target.c.inc | 2 +-
10
1 file changed, 1 insertion(+), 1 deletion(-)
11
12
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
13
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/s390x/tcg-target.c.inc
15
+++ b/tcg/s390x/tcg-target.c.inc
16
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
17
break;
18
19
case INDEX_op_bitsel_vec:
20
- tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]);
21
+ tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1);
22
break;
23
24
case INDEX_op_cmp_vec:
25
--
26
2.25.1
diff view generated by jsdifflib
Deleted patch
1
We copied the data from the general register input to the
2
vector register output, but have not yet replicated it.
3
We intended to fall through into the vector-vector case,
4
but failed to redirect the input register.
5
1
6
This is caught by an assertion failure in tcg_out_insn_VRIc,
7
which diagnosed the incorrect register class.
8
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
11
tcg/s390x/tcg-target.c.inc | 1 +
12
1 file changed, 1 insertion(+)
13
14
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
15
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/s390x/tcg-target.c.inc
17
+++ b/tcg/s390x/tcg-target.c.inc
18
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
19
if (vece == MO_64) {
20
return true;
21
}
22
+ src = dst;
23
}
24
25
/*
26
--
27
2.25.1
diff view generated by jsdifflib
1
The LDRD (register) instruction is UNPREDICTABLE if the Rm register
1
With FEAT_LSE2, load and store of int128 is directly supported.
2
is the same as either Rt or Rt+1 (the two registers being loaded to).
3
We weren't making sure we avoided this, with the result that on some
4
host CPUs like the Cortex-A7 we would get a SIGILL because the CPU
5
chooses to UNDEF for this particular UNPREDICTABLE case.
6
7
Since we've already checked that datalo is aligned, we can simplify
8
the test vs the Rm operand by aligning it before comparison. Check
9
for the two orderings before falling back to two ldr instructions.
10
11
We don't bother to do anything similar for tcg_out_ldrd_rwb(),
12
because it is only used in tcg_out_tlb_read() with a fixed set of
13
registers which don't overlap.
14
15
There is no equivalent UNPREDICTABLE case for STRD.
16
2
17
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
18
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896
19
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
20
---
5
---
21
tcg/arm/tcg-target.c.inc | 17 +++++++++++++++--
6
host/include/aarch64/host/atomic128-ldst.h | 53 ++++++++++++++++------
22
1 file changed, 15 insertions(+), 2 deletions(-)
7
1 file changed, 40 insertions(+), 13 deletions(-)
23
8
24
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
9
diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h
25
index XXXXXXX..XXXXXXX 100644
10
index XXXXXXX..XXXXXXX 100644
26
--- a/tcg/arm/tcg-target.c.inc
11
--- a/host/include/aarch64/host/atomic128-ldst.h
27
+++ b/tcg/arm/tcg-target.c.inc
12
+++ b/host/include/aarch64/host/atomic128-ldst.h
28
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
13
@@ -XXX,XX +XXX,XX @@
29
/* LDRD requires alignment; double-check that. */
14
#ifndef AARCH64_ATOMIC128_LDST_H
30
if (get_alignment_bits(opc) >= MO_64
15
#define AARCH64_ATOMIC128_LDST_H
31
&& (datalo & 1) == 0 && datahi == datalo + 1) {
16
32
- tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
17
+#include "host/cpuinfo.h"
33
- } else if (scratch_addend) {
18
+#include "tcg/debug-assert.h"
34
+ /*
19
+
35
+ * Rm (the second address op) must not overlap Rt or Rt + 1.
20
/*
36
+ * Since datalo is aligned, we can simplify the test via alignment.
21
* Through gcc 10, aarch64 has no support for 128-bit atomics.
37
+ * Flip the two address arguments if that works.
22
* Through clang 16, without -march=armv8.4-a, __atomic_load_16
38
+ */
23
* is incorrectly expanded to a read-write operation.
39
+ if ((addend & ~1) != datalo) {
24
+ *
40
+ tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
25
+ * Anyway, this method allows runtime detection of FEAT_LSE2.
41
+ break;
26
*/
42
+ }
27
43
+ if ((addrlo & ~1) != datalo) {
28
-#define HAVE_ATOMIC128_RO 0
44
+ tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo);
29
+#define HAVE_ATOMIC128_RO (cpuinfo & CPUINFO_LSE2)
45
+ break;
30
#define HAVE_ATOMIC128_RW 1
46
+ }
31
47
+ }
32
-Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr);
48
+ if (scratch_addend) {
33
+static inline Int128 atomic16_read_ro(const Int128 *ptr)
49
tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
34
+{
50
tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
35
+ uint64_t l, h;
51
} else {
36
+
37
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
38
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
39
+ asm("ldp %[l], %[h], %[mem]"
40
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
41
+
42
+ return int128_make128(l, h);
43
+}
44
45
static inline Int128 atomic16_read_rw(Int128 *ptr)
46
{
47
uint64_t l, h;
48
uint32_t tmp;
49
50
- /* The load must be paired with the store to guarantee not tearing. */
51
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
52
- "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
53
- "cbnz %w[tmp], 0b"
54
- : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
55
+ if (cpuinfo & CPUINFO_LSE2) {
56
+ /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */
57
+ asm("ldp %[l], %[h], %[mem]"
58
+ : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr));
59
+ } else {
60
+ /* The load must be paired with the store to guarantee not tearing. */
61
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
62
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
63
+ "cbnz %w[tmp], 0b"
64
+ : [mem] "+m"(*ptr), [tmp] "=&r"(tmp), [l] "=&r"(l), [h] "=&r"(h));
65
+ }
66
67
return int128_make128(l, h);
68
}
69
@@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val)
70
uint64_t l = int128_getlo(val), h = int128_gethi(val);
71
uint64_t t1, t2;
72
73
- /* Load into temporaries to acquire the exclusive access lock. */
74
- asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
75
- "stxp %w[t1], %[l], %[h], %[mem]\n\t"
76
- "cbnz %w[t1], 0b"
77
- : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
78
- : [l] "r"(l), [h] "r"(h));
79
+ if (cpuinfo & CPUINFO_LSE2) {
80
+ /* With FEAT_LSE2, 16-byte aligned STP is atomic. */
81
+ asm("stp %[l], %[h], %[mem]"
82
+ : [mem] "=m"(*ptr) : [l] "r"(l), [h] "r"(h));
83
+ } else {
84
+ /* Load into temporaries to acquire the exclusive access lock. */
85
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
86
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
87
+ "cbnz %w[t1], 0b"
88
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
89
+ : [l] "r"(l), [h] "r"(h));
90
+ }
91
}
92
93
#endif /* AARCH64_ATOMIC128_LDST_H */
52
--
94
--
53
2.25.1
95
2.34.1
54
96
55
97
diff view generated by jsdifflib