1 | The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4: | 1 | v2: Testing revealed a missing earlyclobber in the aa64 inline asm, |
---|---|---|---|
2 | which showed up with macos testing. | ||
2 | 3 | ||
3 | Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000) | 4 | r~ |
5 | |||
6 | The following changes since commit aa33508196f4e2da04625bee36e1f7be5b9267e7: | ||
7 | |||
8 | Merge tag 'mem-2023-05-23' of https://github.com/davidhildenbrand/qemu into staging (2023-05-23 10:57:25 -0700) | ||
4 | 9 | ||
5 | are available in the Git repository at: | 10 | are available in the Git repository at: |
6 | 11 | ||
7 | https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314 | 12 | https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230523-2 |
8 | 13 | ||
9 | for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9: | 14 | for you to fetch changes up to a57663c5a38c26516bde24ecb3992adff4861a31: |
10 | 15 | ||
11 | tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700) | 16 | tcg: Remove USE_TCG_OPTIMIZATIONS (2023-05-24 01:10:44 +0000) |
12 | 17 | ||
13 | ---------------------------------------------------------------- | 18 | ---------------------------------------------------------------- |
14 | Fixes for s390x host vectors | 19 | util: Host cpu detection for x86 and aa64 |
15 | Fix for arm ldrd unpredictable case | 20 | util: Use cpu detection for bufferiszero |
21 | migration: Use cpu detection for xbzrle | ||
22 | tcg: Replace and remove cpu_atomic_{ld,st}o* | ||
23 | host/include: Split qemu/atomic128.h | ||
24 | tcg: Remove DEBUG_DISAS | ||
25 | tcg: Remove USE_TCG_OPTIMIZATIONS | ||
16 | 26 | ||
17 | ---------------------------------------------------------------- | 27 | ---------------------------------------------------------------- |
18 | Richard Henderson (4): | 28 | Richard Henderson (28): |
19 | tcg/s390x: Fix tcg_out_dupi_vec vs VGM | 29 | util: Introduce host-specific cpuinfo.h |
20 | tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL | 30 | util: Add cpuinfo-i386.c |
21 | tcg/s390x: Fix tcg_out_dup_vec vs general registers | 31 | util: Add i386 CPUINFO_ATOMIC_VMOVDQU |
22 | tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 | 32 | tcg/i386: Use host/cpuinfo.h |
33 | util/bufferiszero: Use i386 host/cpuinfo.h | ||
34 | migration/xbzrle: Shuffle function order | ||
35 | migration/xbzrle: Use i386 host/cpuinfo.h | ||
36 | migration: Build migration_files once | ||
37 | util: Add cpuinfo-aarch64.c | ||
38 | include/host: Split out atomic128-cas.h | ||
39 | include/host: Split out atomic128-ldst.h | ||
40 | meson: Fix detect atomic128 support with optimization | ||
41 | include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h | ||
42 | target/ppc: Use tcg_gen_qemu_{ld,st}_i128 for LQARX, LQ, STQ | ||
43 | target/s390x: Use tcg_gen_qemu_{ld,st}_i128 for LPQ, STPQ | ||
44 | accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu | ||
45 | target/s390x: Use cpu_{ld,st}*_mmu in do_csst | ||
46 | target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu in do_csst | ||
47 | accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu | ||
48 | accel/tcg: Remove prot argument to atomic_mmu_lookup | ||
49 | accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128 | ||
50 | qemu/atomic128: Split atomic16_read | ||
51 | accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc | ||
52 | tcg: Split out tcg/debug-assert.h | ||
53 | qemu/atomic128: Improve cmpxchg fallback for atomic16_set | ||
54 | qemu/atomic128: Add runtime test for FEAT_LSE2 | ||
55 | tcg: Remove DEBUG_DISAS | ||
56 | tcg: Remove USE_TCG_OPTIMIZATIONS | ||
23 | 57 | ||
24 | tcg/arm/tcg-target.c.inc | 17 +++++++++++++++-- | 58 | accel/tcg/atomic_template.h | 93 +----- |
25 | tcg/s390x/tcg-target.c.inc | 7 ++++--- | 59 | host/include/aarch64/host/atomic128-cas.h | 45 +++ |
26 | 2 files changed, 19 insertions(+), 5 deletions(-) | 60 | host/include/aarch64/host/atomic128-ldst.h | 79 +++++ |
61 | host/include/aarch64/host/cpuinfo.h | 22 ++ | ||
62 | host/include/generic/host/atomic128-cas.h | 47 +++ | ||
63 | host/include/generic/host/atomic128-ldst.h | 81 +++++ | ||
64 | host/include/generic/host/cpuinfo.h | 4 + | ||
65 | host/include/i386/host/cpuinfo.h | 39 +++ | ||
66 | host/include/x86_64/host/cpuinfo.h | 1 + | ||
67 | include/exec/cpu_ldst.h | 67 +---- | ||
68 | include/exec/exec-all.h | 3 - | ||
69 | include/qemu/atomic128.h | 146 ++------- | ||
70 | include/tcg/debug-assert.h | 17 ++ | ||
71 | include/tcg/tcg.h | 9 +- | ||
72 | migration/xbzrle.h | 5 +- | ||
73 | target/ppc/cpu.h | 1 - | ||
74 | target/ppc/helper.h | 9 - | ||
75 | target/s390x/cpu.h | 3 - | ||
76 | target/s390x/helper.h | 4 - | ||
77 | tcg/aarch64/tcg-target.h | 6 +- | ||
78 | tcg/i386/tcg-target.h | 28 +- | ||
79 | accel/tcg/cpu-exec.c | 2 - | ||
80 | accel/tcg/cputlb.c | 211 ++++--------- | ||
81 | accel/tcg/translate-all.c | 2 - | ||
82 | accel/tcg/translator.c | 2 - | ||
83 | accel/tcg/user-exec.c | 332 ++++++-------------- | ||
84 | migration/ram.c | 34 +-- | ||
85 | migration/xbzrle.c | 268 +++++++++-------- | ||
86 | target/arm/tcg/m_helper.c | 4 +- | ||
87 | target/ppc/mem_helper.c | 48 --- | ||
88 | target/ppc/translate.c | 34 +-- | ||
89 | target/s390x/tcg/mem_helper.c | 137 ++------- | ||
90 | target/s390x/tcg/translate.c | 30 +- | ||
91 | target/sh4/translate.c | 2 - | ||
92 | target/sparc/ldst_helper.c | 18 +- | ||
93 | target/sparc/translate.c | 2 - | ||
94 | tcg/tcg.c | 14 +- | ||
95 | tests/bench/xbzrle-bench.c | 469 ----------------------------- | ||
96 | tests/unit/test-xbzrle.c | 49 +-- | ||
97 | util/bufferiszero.c | 127 +++----- | ||
98 | util/cpuinfo-aarch64.c | 67 +++++ | ||
99 | util/cpuinfo-i386.c | 99 ++++++ | ||
100 | MAINTAINERS | 3 + | ||
101 | accel/tcg/atomic_common.c.inc | 14 - | ||
102 | accel/tcg/ldst_atomicity.c.inc | 135 ++------- | ||
103 | accel/tcg/ldst_common.c.inc | 24 +- | ||
104 | meson.build | 12 +- | ||
105 | migration/meson.build | 1 - | ||
106 | target/ppc/translate/fixedpoint-impl.c.inc | 51 +--- | ||
107 | target/s390x/tcg/insn-data.h.inc | 2 +- | ||
108 | tcg/aarch64/tcg-target.c.inc | 40 --- | ||
109 | tcg/i386/tcg-target.c.inc | 123 +------- | ||
110 | tests/bench/meson.build | 6 - | ||
111 | util/meson.build | 6 + | ||
112 | 54 files changed, 1035 insertions(+), 2042 deletions(-) | ||
113 | create mode 100644 host/include/aarch64/host/atomic128-cas.h | ||
114 | create mode 100644 host/include/aarch64/host/atomic128-ldst.h | ||
115 | create mode 100644 host/include/aarch64/host/cpuinfo.h | ||
116 | create mode 100644 host/include/generic/host/atomic128-cas.h | ||
117 | create mode 100644 host/include/generic/host/atomic128-ldst.h | ||
118 | create mode 100644 host/include/generic/host/cpuinfo.h | ||
119 | create mode 100644 host/include/i386/host/cpuinfo.h | ||
120 | create mode 100644 host/include/x86_64/host/cpuinfo.h | ||
121 | create mode 100644 include/tcg/debug-assert.h | ||
122 | delete mode 100644 tests/bench/xbzrle-bench.c | ||
123 | create mode 100644 util/cpuinfo-aarch64.c | ||
124 | create mode 100644 util/cpuinfo-i386.c | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The immediate operands to VGM were in the wrong order, | ||
2 | producing an inverse mask. | ||
3 | 1 | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/s390x/tcg-target.c.inc | 4 ++-- | ||
7 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
8 | |||
9 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/s390x/tcg-target.c.inc | ||
12 | +++ b/tcg/s390x/tcg-target.c.inc | ||
13 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
14 | msb = clz32(val); | ||
15 | lsb = 31 - ctz32(val); | ||
16 | } | ||
17 | - tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32); | ||
18 | + tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32); | ||
19 | return; | ||
20 | } | ||
21 | } else { | ||
22 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
23 | msb = clz64(val); | ||
24 | lsb = 63 - ctz64(val); | ||
25 | } | ||
26 | - tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64); | ||
27 | + tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64); | ||
28 | return; | ||
29 | } | ||
30 | } | ||
31 | -- | ||
32 | 2.25.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The operands are output in the wrong order: the tcg selector | ||
2 | argument is first, whereas the s390x selector argument is last. | ||
3 | 1 | ||
4 | Tested-by: Thomas Huth <thuth@redhat.com> | ||
5 | Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898 | ||
6 | Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec") | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/s390x/tcg-target.c.inc | 2 +- | ||
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
11 | |||
12 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/s390x/tcg-target.c.inc | ||
15 | +++ b/tcg/s390x/tcg-target.c.inc | ||
16 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
17 | break; | ||
18 | |||
19 | case INDEX_op_bitsel_vec: | ||
20 | - tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]); | ||
21 | + tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1); | ||
22 | break; | ||
23 | |||
24 | case INDEX_op_cmp_vec: | ||
25 | -- | ||
26 | 2.25.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | We copied the data from the general register input to the | ||
2 | vector register output, but have not yet replicated it. | ||
3 | We intended to fall through into the vector-vector case, | ||
4 | but failed to redirect the input register. | ||
5 | 1 | ||
6 | This is caught by an assertion failure in tcg_out_insn_VRIc, | ||
7 | which diagnosed the incorrect register class. | ||
8 | |||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | tcg/s390x/tcg-target.c.inc | 1 + | ||
12 | 1 file changed, 1 insertion(+) | ||
13 | |||
14 | diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/tcg/s390x/tcg-target.c.inc | ||
17 | +++ b/tcg/s390x/tcg-target.c.inc | ||
18 | @@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, | ||
19 | if (vece == MO_64) { | ||
20 | return true; | ||
21 | } | ||
22 | + src = dst; | ||
23 | } | ||
24 | |||
25 | /* | ||
26 | -- | ||
27 | 2.25.1 | diff view generated by jsdifflib |
1 | The LDRD (register) instruction is UNPREDICTABLE if the Rm register | 1 | With FEAT_LSE2, load and store of int128 is directly supported. |
---|---|---|---|
2 | is the same as either Rt or Rt+1 (the two registers being loaded to). | ||
3 | We weren't making sure we avoided this, with the result that on some | ||
4 | host CPUs like the Cortex-A7 we would get a SIGILL because the CPU | ||
5 | chooses to UNDEF for this particular UNPREDICTABLE case. | ||
6 | |||
7 | Since we've already checked that datalo is aligned, we can simplify | ||
8 | the test vs the Rm operand by aligning it before comparison. Check | ||
9 | for the two orderings before falling back to two ldr instructions. | ||
10 | |||
11 | We don't bother to do anything similar for tcg_out_ldrd_rwb(), | ||
12 | because it is only used in tcg_out_tlb_read() with a fixed set of | ||
13 | registers which don't overlap. | ||
14 | |||
15 | There is no equivalent UNPREDICTABLE case for STRD. | ||
16 | 2 | ||
17 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | 3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
18 | Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896 | ||
19 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
20 | --- | 5 | --- |
21 | tcg/arm/tcg-target.c.inc | 17 +++++++++++++++-- | 6 | host/include/aarch64/host/atomic128-ldst.h | 53 ++++++++++++++++------ |
22 | 1 file changed, 15 insertions(+), 2 deletions(-) | 7 | 1 file changed, 40 insertions(+), 13 deletions(-) |
23 | 8 | ||
24 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 9 | diff --git a/host/include/aarch64/host/atomic128-ldst.h b/host/include/aarch64/host/atomic128-ldst.h |
25 | index XXXXXXX..XXXXXXX 100644 | 10 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/tcg/arm/tcg-target.c.inc | 11 | --- a/host/include/aarch64/host/atomic128-ldst.h |
27 | +++ b/tcg/arm/tcg-target.c.inc | 12 | +++ b/host/include/aarch64/host/atomic128-ldst.h |
28 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc, | 13 | @@ -XXX,XX +XXX,XX @@ |
29 | /* LDRD requires alignment; double-check that. */ | 14 | #ifndef AARCH64_ATOMIC128_LDST_H |
30 | if (get_alignment_bits(opc) >= MO_64 | 15 | #define AARCH64_ATOMIC128_LDST_H |
31 | && (datalo & 1) == 0 && datahi == datalo + 1) { | 16 | |
32 | - tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend); | 17 | +#include "host/cpuinfo.h" |
33 | - } else if (scratch_addend) { | 18 | +#include "tcg/debug-assert.h" |
34 | + /* | 19 | + |
35 | + * Rm (the second address op) must not overlap Rt or Rt + 1. | 20 | /* |
36 | + * Since datalo is aligned, we can simplify the test via alignment. | 21 | * Through gcc 10, aarch64 has no support for 128-bit atomics. |
37 | + * Flip the two address arguments if that works. | 22 | * Through clang 16, without -march=armv8.4-a, __atomic_load_16 |
38 | + */ | 23 | * is incorrectly expanded to a read-write operation. |
39 | + if ((addend & ~1) != datalo) { | 24 | + * |
40 | + tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend); | 25 | + * Anyway, this method allows runtime detection of FEAT_LSE2. |
41 | + break; | 26 | */ |
42 | + } | 27 | |
43 | + if ((addrlo & ~1) != datalo) { | 28 | -#define HAVE_ATOMIC128_RO 0 |
44 | + tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo); | 29 | +#define HAVE_ATOMIC128_RO (cpuinfo & CPUINFO_LSE2) |
45 | + break; | 30 | #define HAVE_ATOMIC128_RW 1 |
46 | + } | 31 | |
47 | + } | 32 | -Int128 QEMU_ERROR("unsupported atomic") atomic16_read_ro(const Int128 *ptr); |
48 | + if (scratch_addend) { | 33 | +static inline Int128 atomic16_read_ro(const Int128 *ptr) |
49 | tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo); | 34 | +{ |
50 | tcg_out_ld32_12(s, COND_AL, datahi, addend, 4); | 35 | + uint64_t l, h; |
51 | } else { | 36 | + |
37 | + tcg_debug_assert(HAVE_ATOMIC128_RO); | ||
38 | + /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */ | ||
39 | + asm("ldp %[l], %[h], %[mem]" | ||
40 | + : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr)); | ||
41 | + | ||
42 | + return int128_make128(l, h); | ||
43 | +} | ||
44 | |||
45 | static inline Int128 atomic16_read_rw(Int128 *ptr) | ||
46 | { | ||
47 | uint64_t l, h; | ||
48 | uint32_t tmp; | ||
49 | |||
50 | - /* The load must be paired with the store to guarantee not tearing. */ | ||
51 | - asm("0: ldxp %[l], %[h], %[mem]\n\t" | ||
52 | - "stxp %w[tmp], %[l], %[h], %[mem]\n\t" | ||
53 | - "cbnz %w[tmp], 0b" | ||
54 | - : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h)); | ||
55 | + if (cpuinfo & CPUINFO_LSE2) { | ||
56 | + /* With FEAT_LSE2, 16-byte aligned LDP is atomic. */ | ||
57 | + asm("ldp %[l], %[h], %[mem]" | ||
58 | + : [l] "=r"(l), [h] "=r"(h) : [mem] "m"(*ptr)); | ||
59 | + } else { | ||
60 | + /* The load must be paired with the store to guarantee not tearing. */ | ||
61 | + asm("0: ldxp %[l], %[h], %[mem]\n\t" | ||
62 | + "stxp %w[tmp], %[l], %[h], %[mem]\n\t" | ||
63 | + "cbnz %w[tmp], 0b" | ||
64 | + : [mem] "+m"(*ptr), [tmp] "=&r"(tmp), [l] "=&r"(l), [h] "=&r"(h)); | ||
65 | + } | ||
66 | |||
67 | return int128_make128(l, h); | ||
68 | } | ||
69 | @@ -XXX,XX +XXX,XX @@ static inline void atomic16_set(Int128 *ptr, Int128 val) | ||
70 | uint64_t l = int128_getlo(val), h = int128_gethi(val); | ||
71 | uint64_t t1, t2; | ||
72 | |||
73 | - /* Load into temporaries to acquire the exclusive access lock. */ | ||
74 | - asm("0: ldxp %[t1], %[t2], %[mem]\n\t" | ||
75 | - "stxp %w[t1], %[l], %[h], %[mem]\n\t" | ||
76 | - "cbnz %w[t1], 0b" | ||
77 | - : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2) | ||
78 | - : [l] "r"(l), [h] "r"(h)); | ||
79 | + if (cpuinfo & CPUINFO_LSE2) { | ||
80 | + /* With FEAT_LSE2, 16-byte aligned STP is atomic. */ | ||
81 | + asm("stp %[l], %[h], %[mem]" | ||
82 | + : [mem] "=m"(*ptr) : [l] "r"(l), [h] "r"(h)); | ||
83 | + } else { | ||
84 | + /* Load into temporaries to acquire the exclusive access lock. */ | ||
85 | + asm("0: ldxp %[t1], %[t2], %[mem]\n\t" | ||
86 | + "stxp %w[t1], %[l], %[h], %[mem]\n\t" | ||
87 | + "cbnz %w[t1], 0b" | ||
88 | + : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2) | ||
89 | + : [l] "r"(l), [h] "r"(h)); | ||
90 | + } | ||
91 | } | ||
92 | |||
93 | #endif /* AARCH64_ATOMIC128_LDST_H */ | ||
52 | -- | 94 | -- |
53 | 2.25.1 | 95 | 2.34.1 |
54 | 96 | ||
55 | 97 | diff view generated by jsdifflib |