The following changes since commit 3dd23a4fb8fd72d2220a90a809f213999ffe7f3a:

  Merge remote-tracking branch 'remotes/legoater/tags/pull-aspeed-20200901' into staging (2020-09-03 14:12:48 +0100)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20200903

for you to fetch changes up to fe4b0b5bfa96c38ad1cad0689a86cca9f307e353:

  tcg: Implement 256-bit dup for tcg_gen_gvec_dup_mem (2020-09-03 13:13:58 -0700)

----------------------------------------------------------------
Improve inlining in cputlb.c.
Fix vector abs fallback.
Only set parallel_cpus for SMP.
Add vector dupm for 256-bit elements.

----------------------------------------------------------------
Richard Henderson (4):
      cputlb: Make store_helper less fragile to compiler optimizations
      softmmu/cpus: Only set parallel_cpus for SMP
      tcg: Eliminate one store for in-place 128-bit dup_mem
      tcg: Implement 256-bit dup for tcg_gen_gvec_dup_mem

Stephen Long (1):
      tcg: Fix tcg gen for vectorized absolute value

 accel/tcg/cputlb.c | 138 ++++++++++++++++++++++++++++++-----------------------
 softmmu/cpus.c     |  11 ++++-
 tcg/tcg-op-gvec.c  |  61 ++++++++++++++++++++---
 3 files changed, 143 insertions(+), 67 deletions(-)

The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4:

  Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314

for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9:

  tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700)

----------------------------------------------------------------
Fixes for s390x host vectors
Fix for arm ldrd unpredictable case

----------------------------------------------------------------
Richard Henderson (4):
      tcg/s390x: Fix tcg_out_dupi_vec vs VGM
      tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL
      tcg/s390x: Fix tcg_out_dup_vec vs general registers
      tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1

 tcg/arm/tcg-target.c.inc   | 17 +++++++++++++++--
 tcg/s390x/tcg-target.c.inc |  7 ++++---
 2 files changed, 19 insertions(+), 5 deletions(-)
Deleted patch
This has no functional change.

The current function structure is:

    inline QEMU_ALWAYSINLINE
    store_memop() {
        switch () {
        ...
        default:
            qemu_build_not_reached();
        }
    }
    inline QEMU_ALWAYSINLINE
    store_helper() {
        ...
        if (span_two_pages_or_io) {
            ...
            helper_ret_stb_mmu();
        }
        store_memop();
    }
    helper_ret_stb_mmu() {
        store_helper();
    }

Whereas GCC will generate an error at compile-time when an always_inline
function is not inlined, Clang does not.  Nor does Clang prioritize the
inlining of always_inline functions.  Both of these are arguably bugs.

Both `store_memop` and `store_helper` need to be inlined so that constant
propagation can eliminate the `qemu_build_not_reached` call.

However, if the compiler instead chooses to inline helper_ret_stb_mmu
into store_helper, then store_helper becomes self-recursive and the
compiler is no longer able to propagate the constant in the same way.

This does not reproduce at current QEMU head, but was reproducible
at v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.

The inline recursion problem can be fixed solely by marking
helper_ret_stb_mmu as noinline, so the compiler does not make an
incorrect decision about which functions to inline.

In addition, extract store_helper_unaligned as a noinline subroutine
that can be shared by all of the helpers.  This saves about 6k code
size in an optimized x86_64 build.

Reported-by: Shu-Chun Weng <scw@google.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 138 ++++++++++++++++++++++++++-------------------
 1 file changed, 79 insertions(+), 59 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ store_memop(void *haddr, uint64_t val, MemOp op)
     }
 }
 
+static void __attribute__((noinline))
+store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
+                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
+                       bool big_endian)
+{
+    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+    uintptr_t index, index2;
+    CPUTLBEntry *entry, *entry2;
+    target_ulong page2, tlb_addr, tlb_addr2;
+    TCGMemOpIdx oi;
+    size_t size2;
+    int i;
+
+    /*
+     * Ensure the second page is in the TLB.  Note that the first page
+     * is already guaranteed to be filled, and that the second page
+     * cannot evict the first.
+     */
+    page2 = (addr + size) & TARGET_PAGE_MASK;
+    size2 = (addr + size) & ~TARGET_PAGE_MASK;
+    index2 = tlb_index(env, mmu_idx, page2);
+    entry2 = tlb_entry(env, mmu_idx, page2);
+
+    tlb_addr2 = tlb_addr_write(entry2);
+    if (!tlb_hit_page(tlb_addr2, page2)) {
+        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
+            index2 = tlb_index(env, mmu_idx, page2);
+            entry2 = tlb_entry(env, mmu_idx, page2);
+        }
+        tlb_addr2 = tlb_addr_write(entry2);
+    }
+
+    index = tlb_index(env, mmu_idx, addr);
+    entry = tlb_entry(env, mmu_idx, addr);
+    tlb_addr = tlb_addr_write(entry);
+
+    /*
+     * Handle watchpoints.  Since this may trap, all checks
+     * must happen before any store.
+     */
+    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), page2, size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+
+    /*
+     * XXX: not efficient, but simple.
+     * This loop must go in the forward direction to avoid issues
+     * with self-modifying code in Windows 64-bit.
+     */
+    oi = make_memop_idx(MO_UB, mmu_idx);
+    if (big_endian) {
+        for (i = 0; i < size; ++i) {
+            /* Big-endian extract.  */
+            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    } else {
+        for (i = 0; i < size; ++i) {
+            /* Little-endian extract.  */
+            uint8_t val8 = val >> (i * 8);
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    }
+}
+
 static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     if (size > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
                     >= TARGET_PAGE_SIZE)) {
-        int i;
-        uintptr_t index2;
-        CPUTLBEntry *entry2;
-        target_ulong page2, tlb_addr2;
-        size_t size2;
-
     do_unaligned_access:
-        /*
-         * Ensure the second page is in the TLB.  Note that the first page
-         * is already guaranteed to be filled, and that the second page
-         * cannot evict the first.
-         */
-        page2 = (addr + size) & TARGET_PAGE_MASK;
-        size2 = (addr + size) & ~TARGET_PAGE_MASK;
-        index2 = tlb_index(env, mmu_idx, page2);
-        entry2 = tlb_entry(env, mmu_idx, page2);
-        tlb_addr2 = tlb_addr_write(entry2);
-        if (!tlb_hit_page(tlb_addr2, page2)) {
-            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
-                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
-                         mmu_idx, retaddr);
-                index2 = tlb_index(env, mmu_idx, page2);
-                entry2 = tlb_entry(env, mmu_idx, page2);
-            }
-            tlb_addr2 = tlb_addr_write(entry2);
-        }
-
-        /*
-         * Handle watchpoints.  Since this may trap, all checks
-         * must happen before any store.
-         */
-        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), page2, size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-
-        /*
-         * XXX: not efficient, but simple.
-         * This loop must go in the forward direction to avoid issues
-         * with self-modifying code in Windows 64-bit.
-         */
-        for (i = 0; i < size; ++i) {
-            uint8_t val8;
-            if (memop_big_endian(op)) {
-                /* Big-endian extract.  */
-                val8 = val >> (((size - 1) * 8) - (i * 8));
-            } else {
-                /* Little-endian extract.  */
-                val8 = val >> (i * 8);
-            }
-            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
-        }
+        store_helper_unaligned(env, addr, val, retaddr, size,
+                               mmu_idx, memop_big_endian(op));
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     store_memop(haddr, val, op);
 }
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        TCGMemOpIdx oi, uintptr_t retaddr)
+void __attribute__((noinline))
+helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+                   TCGMemOpIdx oi, uintptr_t retaddr)
 {
     store_helper(env, addr, val, oi, retaddr, MO_UB);
 }
--
2.25.1
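The call-graph shape above is easier to see in isolation.  The following is
a minimal sketch, not the real cputlb.c code: store_generic() and
store_byte() are hypothetical stand-ins for store_helper()/store_memop()
and helper_ret_stb_mmu().  The point is that store_generic() must be
specialized for the constant size at each call site, yet it also calls the
byte helper on its slow path; if the compiler inlines the byte helper back
into it, store_generic() becomes self-recursive and the specialization can
be lost, which the noinline marking prevents.

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for qemu_build_not_reached(): a call that must be
     * optimized away, or the build fails at link time. */
    extern void link_error_not_reached(void);

    static void store_byte(uint8_t *p, uint64_t val);

    /*
     * Stand-in for store_helper()/store_memop(): 'size' is a compile-time
     * constant at every call site, so the default case must fold away.
     */
    static inline __attribute__((always_inline)) void
    store_generic(uint8_t *p, uint64_t val, size_t size, int unaligned)
    {
        if (unaligned) {
            /* Slow path: byte-by-byte, little-endian extract. */
            for (size_t i = 0; i < size; i++) {
                store_byte(p + i, val >> (i * 8));
            }
            return;
        }
        switch (size) {
        case 1: *p = val; break;
        case 2: __builtin_memcpy(p, &val, 2); break;
        case 4: __builtin_memcpy(p, &val, 4); break;
        case 8: __builtin_memcpy(p, &val, 8); break;
        default: link_error_not_reached();
        }
    }

    /*
     * Stand-in for helper_ret_stb_mmu().  Without the noinline, the
     * compiler may inline this back into store_generic(), which makes
     * store_generic() self-recursive and can defeat the folding of 'size'.
     */
    static __attribute__((noinline)) void
    store_byte(uint8_t *p, uint64_t val)
    {
        store_generic(p, val, 1, 0);
    }

    void store_u32(uint8_t *p, uint32_t val, int unaligned)
    {
        store_generic(p, val, 4, unaligned);
    }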
Do not store back to the exact memory from which we just loaded.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
 
             tcg_gen_ld_vec(in, cpu_env, aofs);
-            for (i = 0; i < oprsz; i += 16) {
+            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                 tcg_gen_st_vec(in, cpu_env, dofs + i);
             }
             tcg_temp_free_vec(in);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
 
             tcg_gen_ld_i64(in0, cpu_env, aofs);
             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
-            for (i = 0; i < oprsz; i += 16) {
+            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
             }
--
2.25.1

The immediate operands to VGM were in the wrong order,
producing an inverse mask.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz32(val);
                 lsb = 31 - ctz32(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32);
             return;
         }
     } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz64(val);
                 lsb = 63 - ctz64(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64);
             return;
         }
     }
--
2.25.1
We already support duplication of 128-bit blocks.  This extends
that support to 256-bit blocks.  This will be needed by SVE2.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 52 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
             tcg_temp_free_i64(in);
         }
-    } else {
+    } else if (vece == 4) {
         /* 128-bit duplicate. */
-        /* ??? Dup to 256-bit vector. */
         int i;
 
-        tcg_debug_assert(vece == 4);
         tcg_debug_assert(oprsz >= 16);
         if (TCG_TARGET_HAS_v128) {
             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
         if (oprsz < maxsz) {
             expand_clr(dofs + oprsz, maxsz - oprsz);
         }
+    } else if (vece == 5) {
+        /* 256-bit duplicate. */
+        int i;
+
+        tcg_debug_assert(oprsz >= 32);
+        tcg_debug_assert(oprsz % 32 == 0);
+        if (TCG_TARGET_HAS_v256) {
+            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
+
+            tcg_gen_ld_vec(in, cpu_env, aofs);
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                tcg_gen_st_vec(in, cpu_env, dofs + i);
+            }
+            tcg_temp_free_vec(in);
+        } else if (TCG_TARGET_HAS_v128) {
+            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
+            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
+
+            tcg_gen_ld_vec(in0, cpu_env, aofs);
+            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                tcg_gen_st_vec(in0, cpu_env, dofs + i);
+                tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
+            }
+            tcg_temp_free_vec(in0);
+            tcg_temp_free_vec(in1);
+        } else {
+            TCGv_i64 in[4];
+            int j;
+
+            for (j = 0; j < 4; ++j) {
+                in[j] = tcg_temp_new_i64();
+                tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
+            }
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                for (j = 0; j < 4; ++j) {
+                    tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
+                }
+            }
+            for (j = 0; j < 4; ++j) {
+                tcg_temp_free_i64(in[j]);
+            }
+        }
+        if (oprsz < maxsz) {
+            expand_clr(dofs + oprsz, maxsz - oprsz);
+        }
+    } else {
+        g_assert_not_reached();
     }
 }
--
2.25.1

The operands are output in the wrong order: the tcg selector
argument is first, whereas the s390x selector argument is last.

Tested-by: Thomas Huth <thuth@redhat.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898
Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bitsel_vec:
-        tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]);
+        tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1);
         break;
 
     case INDEX_op_cmp_vec:
--
2.25.1
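A host-independent model of the operand-order bug in the VSEL patch above.
The bit-select semantics below are assumptions inferred from the commit
message and the one-line fix (TCG takes the selector as the first input,
the s390x instruction takes it last); the helper names are hypothetical.

    #include <assert.h>
    #include <stdint.h>

    /* TCG bitsel, selector first (assumed): d = (b & sel) | (c & ~sel). */
    static uint64_t tcg_bitsel(uint64_t sel, uint64_t b, uint64_t c)
    {
        return (b & sel) | (c & ~sel);
    }

    /* s390x VSEL, selector last (assumed): v1 = (v2 & v4) | (v3 & ~v4). */
    static uint64_t s390x_vsel(uint64_t v2, uint64_t v3, uint64_t v4)
    {
        return (v2 & v4) | (v3 & ~v4);
    }

    int main(void)
    {
        uint64_t sel = 0x00ff00ff00ff00ffull;
        uint64_t b = 0x1111111111111111ull;
        uint64_t c = 0x2222222222222222ull;
        uint64_t want = tcg_bitsel(sel, b, c);

        /* Buggy emission: TCG operand order passed straight through. */
        assert(s390x_vsel(sel, b, c) != want);
        /* Fixed emission: a2, args[3], a1 -- i.e. (b, c, sel). */
        assert(s390x_vsel(b, c, sel) == want);
        return 0;
    }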
From: Stephen Long <steplong@quicinc.com>

The fallback inline expansion for vectorized absolute value, used
when the host doesn't support such an insn, was flawed.

E.g. when a vector of bytes has all elements negative, mask
will be 0xffff_ffff_ffff_ffff.  Subtracting mask only adds 1
to the low element instead of all elements because -mask is 1
and not 0x0101_0101_0101_0101.

Signed-off-by: Stephen Long <steplong@quicinc.com>
Message-Id: <20200813161818.190-1-steplong@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
 
     /*
-     * Invert (via xor -1) and add one (via sub -1).
+     * Invert (via xor -1) and add one.
      * Because of the ordering the msb is cleared,
      * so we never have carry into the next element.
      */
     tcg_gen_xor_i64(d, b, t);
-    tcg_gen_sub_i64(d, d, t);
+    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
+    tcg_gen_add_i64(d, d, t);
 
     tcg_temp_free_i64(t);
 }
--
2.25.1

We copied the data from the general register input to the
vector register output, but have not yet replicated it.
We intended to fall through into the vector-vector case,
but failed to redirect the input register.

This is caught by an assertion failure in tcg_out_insn_VRIc,
which diagnosed the incorrect register class.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
         if (vece == MO_64) {
             return true;
         }
+        src = dst;
     }
 
     /*
--
2.25.1
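A scalar demonstration of the arithmetic in the absolute-value fallback fix
above, modelling the per-byte case on a plain uint64_t (hypothetical
helpers, not the TCG ops themselves): with every byte negative the sign
mask is all-ones, so subtracting it adds 1 only to the low element, while
masking it down to 0x0101_0101_0101_0101 and adding gives the per-element
+1 that the negation needs.

    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define DUP8(x)  ((uint64_t)(x) * 0x0101010101010101ull)

    /* Flawed fallback: d = (b ^ t) - t, with t = -1 in each negative byte. */
    static uint64_t absv8_old(uint64_t b)
    {
        uint64_t t = ((b >> 7) & DUP8(1)) * 0xff;
        return (b ^ t) - t;
    }

    /* Fixed fallback: d = (b ^ t) + (t & 0x0101_0101_0101_0101). */
    static uint64_t absv8_new(uint64_t b)
    {
        uint64_t t = ((b >> 7) & DUP8(1)) * 0xff;
        return (b ^ t) + (t & DUP8(1));
    }

    int main(void)
    {
        uint64_t b = DUP8(0xff);                 /* eight bytes, each -1 */

        printf("old: %016" PRIx64 "\n", absv8_old(b)); /* 0000000000000001 */
        printf("new: %016" PRIx64 "\n", absv8_new(b)); /* 0101010101010101 */
        assert(absv8_new(b) == DUP8(1));         /* |-1| == 1 per element */
        return 0;
    }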
Do not set parallel_cpus if there is only one cpu instantiated.
This will allow tcg to use serial code to implement atomics.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/cpus.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
     if (!tcg_region_inited) {
         tcg_region_inited = 1;
         tcg_region_init();
+        /*
+         * If MTTCG, and we will create multiple cpus,
+         * then we will have cpus running in parallel.
+         */
+        if (qemu_tcg_mttcg_enabled()) {
+            MachineState *ms = MACHINE(qdev_get_machine());
+            if (ms->smp.max_cpus > 1) {
+                parallel_cpus = true;
+            }
+        }
     }
 
     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
 
         if (qemu_tcg_mttcg_enabled()) {
             /* create a thread per vCPU with TCG (MTTCG) */
-            parallel_cpus = true;
             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                      cpu->cpu_index);
 
--
2.25.1

The LDRD (register) instruction is UNPREDICTABLE if the Rm register
is the same as either Rt or Rt+1 (the two registers being loaded to).
We weren't making sure we avoided this, with the result that on some
host CPUs like the Cortex-A7 we would get a SIGILL because the CPU
chooses to UNDEF for this particular UNPREDICTABLE case.

Since we've already checked that datalo is aligned, we can simplify
the test vs the Rm operand by aligning it before comparison.  Check
for the two orderings before falling back to two ldr instructions.

We don't bother to do anything similar for tcg_out_ldrd_rwb(),
because it is only used in tcg_out_tlb_read() with a fixed set of
registers which don't overlap.

There is no equivalent UNPREDICTABLE case for STRD.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         /* LDRD requires alignment; double-check that. */
         if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
-            tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
-        } else if (scratch_addend) {
+            /*
+             * Rm (the second address op) must not overlap Rt or Rt + 1.
+             * Since datalo is aligned, we can simplify the test via alignment.
+             * Flip the two address arguments if that works.
+             */
+            if ((addend & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+                break;
+            }
+            if ((addrlo & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo);
+                break;
+            }
+        }
+        if (scratch_addend) {
             tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
             tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
         } else {
--
2.25.1
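A standalone check of the register-aliasing simplification used in the LDRD
patch above: with Rt (datalo) even and Rt+1 == datalo + 1, testing
(rm & ~1) != datalo is exactly "Rm is neither Rt nor Rt+1".  A brute-force
sketch over the 16 core registers (hypothetical helpers, not the tcg/arm
backend code):

    #include <assert.h>
    #include <stdbool.h>

    /* UNPREDICTABLE condition for LDRD (register): Rm == Rt || Rm == Rt+1. */
    static bool ldrd_overlaps(int rt, int rm)
    {
        return rm == rt || rm == rt + 1;
    }

    /* The simplified test used in the patch, valid because rt is even. */
    static bool ldrd_overlaps_aligned(int rt, int rm)
    {
        return (rm & ~1) == rt;
    }

    int main(void)
    {
        for (int rt = 0; rt < 16; rt += 2) {        /* datalo is even */
            for (int rm = 0; rm < 16; rm++) {
                assert(ldrd_overlaps(rt, rm) == ldrd_overlaps_aligned(rt, rm));
            }
        }
        return 0;
    }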