The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4:

  Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314

for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9:

  tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700)

----------------------------------------------------------------
Fixes for s390x host vectors
Fix for arm ldrd unpredictable case

----------------------------------------------------------------
Richard Henderson (4):
      tcg/s390x: Fix tcg_out_dupi_vec vs VGM
      tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL
      tcg/s390x: Fix tcg_out_dup_vec vs general registers
      tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1

 tcg/arm/tcg-target.c.inc   | 17 +++++++++++++++--
 tcg/s390x/tcg-target.c.inc |  7 ++++---
 2 files changed, 19 insertions(+), 5 deletions(-)
Deleted patch

This has no functional change.

The current function structure is:

    inline QEMU_ALWAYSINLINE
    store_memop() {
        switch () {
        ...
        default:
            qemu_build_not_reached();
        }
    }
    inline QEMU_ALWAYSINLINE
    store_helper() {
        ...
        if (span_two_pages_or_io) {
            ...
            helper_ret_stb_mmu();
        }
        store_memop();
    }
    helper_ret_stb_mmu() {
        store_helper();
    }

Whereas GCC will generate an error at compile-time when an always_inline
function is not inlined, Clang does not.  Nor does Clang prioritize the
inlining of always_inline functions.  Both of these are arguably bugs.

Both `store_memop` and `store_helper` need to be inlined and allow
constant propagation to eliminate the `qemu_build_not_reached` call.

However, if the compiler instead chooses to inline helper_ret_stb_mmu
into store_helper, then store_helper is now self-recursive and the
compiler is no longer able to propagate the constant in the same way.

This does not reproduce at current QEMU head, but was reproducible
at v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.

The inline recursion problem can be fixed solely by marking
helper_ret_stb_mmu as noinline, so the compiler does not make an
incorrect decision about which functions to inline.

In addition, extract store_helper_unaligned as a noinline subroutine
that can be shared by all of the helpers.  This saves about 6k code
size in an optimized x86_64 build.

Reported-by: Shu-Chun Weng <scw@google.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 138 ++++++++++++++++++++++++++-------------------
 1 file changed, 79 insertions(+), 59 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ store_memop(void *haddr, uint64_t val, MemOp op)
     }
 }
 
+static void __attribute__((noinline))
+store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
+                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
+                       bool big_endian)
+{
+    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+    uintptr_t index, index2;
+    CPUTLBEntry *entry, *entry2;
+    target_ulong page2, tlb_addr, tlb_addr2;
+    TCGMemOpIdx oi;
+    size_t size2;
+    int i;
+
+    /*
+     * Ensure the second page is in the TLB.  Note that the first page
+     * is already guaranteed to be filled, and that the second page
+     * cannot evict the first.
+     */
+    page2 = (addr + size) & TARGET_PAGE_MASK;
+    size2 = (addr + size) & ~TARGET_PAGE_MASK;
+    index2 = tlb_index(env, mmu_idx, page2);
+    entry2 = tlb_entry(env, mmu_idx, page2);
+
+    tlb_addr2 = tlb_addr_write(entry2);
+    if (!tlb_hit_page(tlb_addr2, page2)) {
+        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
+            index2 = tlb_index(env, mmu_idx, page2);
+            entry2 = tlb_entry(env, mmu_idx, page2);
+        }
+        tlb_addr2 = tlb_addr_write(entry2);
+    }
+
+    index = tlb_index(env, mmu_idx, addr);
+    entry = tlb_entry(env, mmu_idx, addr);
+    tlb_addr = tlb_addr_write(entry);
+
+    /*
+     * Handle watchpoints.  Since this may trap, all checks
+     * must happen before any store.
+     */
+    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), page2, size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+
+    /*
+     * XXX: not efficient, but simple.
+     * This loop must go in the forward direction to avoid issues
+     * with self-modifying code in Windows 64-bit.
+     */
+    oi = make_memop_idx(MO_UB, mmu_idx);
+    if (big_endian) {
+        for (i = 0; i < size; ++i) {
+            /* Big-endian extract.  */
+            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    } else {
+        for (i = 0; i < size; ++i) {
+            /* Little-endian extract.  */
+            uint8_t val8 = val >> (i * 8);
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    }
+}
+
 static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     if (size > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
                     >= TARGET_PAGE_SIZE)) {
-        int i;
-        uintptr_t index2;
-        CPUTLBEntry *entry2;
-        target_ulong page2, tlb_addr2;
-        size_t size2;
-
     do_unaligned_access:
-        /*
-         * Ensure the second page is in the TLB.  Note that the first page
-         * is already guaranteed to be filled, and that the second page
-         * cannot evict the first.
-         */
-        page2 = (addr + size) & TARGET_PAGE_MASK;
-        size2 = (addr + size) & ~TARGET_PAGE_MASK;
-        index2 = tlb_index(env, mmu_idx, page2);
-        entry2 = tlb_entry(env, mmu_idx, page2);
-        tlb_addr2 = tlb_addr_write(entry2);
-        if (!tlb_hit_page(tlb_addr2, page2)) {
-            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
-                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
-                         mmu_idx, retaddr);
-                index2 = tlb_index(env, mmu_idx, page2);
-                entry2 = tlb_entry(env, mmu_idx, page2);
-            }
-            tlb_addr2 = tlb_addr_write(entry2);
-        }
-
-        /*
-         * Handle watchpoints.  Since this may trap, all checks
-         * must happen before any store.
-         */
-        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), page2, size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-
-        /*
-         * XXX: not efficient, but simple.
-         * This loop must go in the forward direction to avoid issues
-         * with self-modifying code in Windows 64-bit.
-         */
-        for (i = 0; i < size; ++i) {
-            uint8_t val8;
-            if (memop_big_endian(op)) {
-                /* Big-endian extract.  */
-                val8 = val >> (((size - 1) * 8) - (i * 8));
-            } else {
-                /* Little-endian extract.  */
-                val8 = val >> (i * 8);
-            }
-            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
-        }
+        store_helper_unaligned(env, addr, val, retaddr, size,
+                               mmu_idx, memop_big_endian(op));
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     store_memop(haddr, val, op);
 }
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        TCGMemOpIdx oi, uintptr_t retaddr)
+void __attribute__((noinline))
+helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+                   TCGMemOpIdx oi, uintptr_t retaddr)
 {
     store_helper(env, addr, val, oi, retaddr, MO_UB);
 }
-- 
2.25.1
The immediate operands to VGM were in the wrong order,
producing an inverse mask.
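
As a rough standalone illustration of the mask semantics involved (plain C,
not the s390x backend; the MSB-first bit numbering and the wraparound
behaviour are simplifying assumptions for this sketch):

    #include <stdint.h>
    #include <stdio.h>

    /* Build a 32-bit mask whose set bits run from position 'from' through
     * 'to' inclusive, with bit 0 being the most significant bit.  When
     * from > to, the range wraps around. */
    static uint32_t gen_mask32(unsigned from, unsigned to)
    {
        uint32_t m = 0;
        for (unsigned i = from; ; i = (i + 1) & 31) {
            m |= 0x80000000u >> i;
            if (i == to) {
                break;
            }
        }
        return m;
    }

    int main(void)
    {
        uint32_t val = 0x00ffff00;                   /* contiguous run of ones */
        unsigned msb = __builtin_clz(val);           /* 8  */
        unsigned lsb = 31 - __builtin_ctz(val);      /* 23 */

        printf("%08x\n", gen_mask32(msb, lsb));      /* 00ffff00: intended mask */
        /* Swapping the two positions makes the range wrap, producing
         * (roughly) the inverse of the intended mask: */
        printf("%08x\n", gen_mask32(lsb, msb));      /* ff8001ff */
        return 0;
    }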

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz32(val);
                 lsb = 31 - ctz32(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32);
             return;
         }
     } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz64(val);
                 lsb = 63 - ctz64(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64);
             return;
         }
     }
-- 
2.25.1
The operands are output in the wrong order: the tcg selector
argument is first, whereas the s390x selector argument is last.
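
As a rough standalone illustration of the bitwise-select operation and why
the operand order matters (plain C, not the backend code; the semantics
shown follow the tcg bitsel definition assumed from the message above):

    #include <stdint.h>
    #include <assert.h>

    /* Bit-by-bit select: where 'sel' has a 1 take the bit from 'b',
     * otherwise take it from 'c'. */
    static uint64_t bitsel(uint64_t sel, uint64_t b, uint64_t c)
    {
        return (b & sel) | (c & ~sel);
    }

    int main(void)
    {
        uint64_t sel = 0x00ff00ff00ff00ffull;
        uint64_t b   = 0x1111111111111111ull;
        uint64_t c   = 0x2222222222222222ull;

        /* Selector passed in the selector position: the expected result. */
        assert(bitsel(sel, b, c) == 0x2211221122112211ull);

        /* Passing the selector where a data operand belongs, as the buggy
         * operand ordering effectively did, gives a different answer. */
        assert(bitsel(b, c, sel) != bitsel(sel, b, c));
        return 0;
    }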

Tested-by: Thomas Huth <thuth@redhat.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898
Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bitsel_vec:
-        tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]);
+        tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1);
         break;
 
     case INDEX_op_cmp_vec:
-- 
2.25.1
We copied the data from the general register input to the
vector register output, but have not yet replicated it.
We intended to fall through into the vector-vector case,
but failed to redirect the input register.

This is caught by an assertion failure in tcg_out_insn_VRIc,
which diagnosed the incorrect register class.
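
As a loose standalone sketch of the "insert, then fall through and
replicate" pattern being fixed (plain C on an array, not the s390x backend;
the structure is assumed for illustration only):

    #include <stdint.h>
    #include <assert.h>

    /* Shared tail: replicate element 0 across the whole vector. */
    static void replicate16(uint16_t v[4])
    {
        for (int i = 1; i < 4; i++) {
            v[i] = v[0];
        }
    }

    /* Duplicate a scalar into all elements.  The scalar is first moved
     * into element 0 of dst; from that point on the shared replicate
     * step must read from dst itself -- the missing "src = dst". */
    static void dup_from_scalar(uint16_t dst[4], uint16_t scalar)
    {
        dst[0] = scalar;
        replicate16(dst);
    }

    int main(void)
    {
        uint16_t v[4] = { 0, 0, 0, 0 };
        dup_from_scalar(v, 0xabcd);
        for (int i = 0; i < 4; i++) {
            assert(v[i] == 0xabcd);
        }
        return 0;
    }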

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
         if (vece == MO_64) {
             return true;
         }
+        src = dst;
     }
 
     /*
-- 
2.25.1
1 | Do not set parallel_cpus if there is only one cpu instantiated. | 1 | The LDRD (register) instruction is UNPREDICTABLE if the Rm register |
---|---|---|---|
2 | This will allow tcg to use serial code to implement atomics. | 2 | is the same as either Rt or Rt+1 (the two registers being loaded to). |
3 | We weren't making sure we avoided this, with the result that on some | ||
4 | host CPUs like the Cortex-A7 we would get a SIGILL because the CPU | ||
5 | chooses to UNDEF for this particular UNPREDICTABLE case. | ||
3 | 6 | ||
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | 7 | Since we've already checked that datalo is aligned, we can simplify |
8 | the test vs the Rm operand by aligning it before comparison. Check | ||
9 | for the two orderings before falling back to two ldr instructions. | ||
10 | |||
11 | We don't bother to do anything similar for tcg_out_ldrd_rwb(), | ||
12 | because it is only used in tcg_out_tlb_read() with a fixed set of | ||
13 | registers which don't overlap. | ||
14 | |||
15 | There is no equivalent UNPREDICTABLE case for STRD. | ||
16 | |||
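
A rough standalone sketch of the operand check being added (register
numbers as plain ints, not the actual tcg/arm backend; it relies on the
constraint stated above that datalo is even and datahi == datalo + 1, so
"Rm overlaps {Rt, Rt+1}" reduces to "(Rm & ~1) == datalo"):

    #include <stdio.h>

    enum load_kind {
        LDRD_BASE_INDEX,   /* ldrd datalo, [addrlo, addend] */
        LDRD_INDEX_BASE,   /* ldrd datalo, [addend, addrlo] */
        TWO_LDR            /* both orderings would be UNPREDICTABLE */
    };

    static enum load_kind pick_ldrd_form(int datalo, int addrlo, int addend)
    {
        if ((addend & ~1) != datalo) {
            return LDRD_BASE_INDEX;
        }
        if ((addrlo & ~1) != datalo) {
            return LDRD_INDEX_BASE;
        }
        return TWO_LDR;
    }

    int main(void)
    {
        /* addend (r5) overlaps datalo/datahi (r4/r5), so the address
         * operands are swapped; addend r0 does not overlap, so the
         * normal form is fine. */
        printf("%d\n", pick_ldrd_form(4, 2, 5));   /* 1: LDRD_INDEX_BASE */
        printf("%d\n", pick_ldrd_form(4, 2, 0));   /* 0: LDRD_BASE_INDEX */
        return 0;
    }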

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         /* LDRD requires alignment; double-check that. */
         if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
-            tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
-        } else if (scratch_addend) {
+            /*
+             * Rm (the second address op) must not overlap Rt or Rt + 1.
+             * Since datalo is aligned, we can simplify the test via alignment.
+             * Flip the two address arguments if that works.
+             */
+            if ((addend & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+                break;
+            }
+            if ((addrlo & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo);
+                break;
+            }
+        }
+        if (scratch_addend) {
             tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
             tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
         } else {
-- 
2.25.1