The following changes since commit d1181d29370a4318a9f11ea92065bea6bb159f83:

  Merge tag 'pull-nbd-2023-07-19' of https://repo.or.cz/qemu/ericb into staging (2023-07-20 09:54:07 +0100)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230724

for you to fetch changes up to 32b120394c578bc824f1db4835b3bffbeca88fae:

  accel/tcg: Fix type of 'last' for pageflags_{find,next} (2023-07-24 09:48:49 +0100)

----------------------------------------------------------------
accel/tcg: Zero-pad vaddr in tlb debug output
accel/tcg: Fix type of 'last' for pageflags_{find,next}
accel/tcg: Fix sense of read-only probes in ldst_atomicity
accel/tcg: Take mmap_lock in load_atomic*_or_exit
tcg: Add earlyclobber to op_add2 for x86 and s390x
tcg/ppc: Fix race in goto_tb implementation

----------------------------------------------------------------
Anton Johansson (1):
      accel/tcg: Zero-pad vaddr in tlb_debug output

Ilya Leoshkevich (1):
      tcg/{i386, s390x}: Add earlyclobber to the op_add2's first output

Jordan Niethe (1):
      tcg/ppc: Fix race in goto_tb implementation

Luca Bonissi (1):
      accel/tcg: Fix type of 'last' for pageflags_{find,next}

Richard Henderson (3):
      include/exec: Add WITH_MMAP_LOCK_GUARD
      accel/tcg: Fix sense of read-only probes in ldst_atomicity
      accel/tcg: Take mmap_lock in load_atomic*_or_exit

 include/exec/exec-all.h        | 10 ++++++++++
 tcg/i386/tcg-target-con-set.h  |  5 ++++-
 tcg/s390x/tcg-target-con-set.h |  8 +++++---
 accel/tcg/cputlb.c             | 20 ++++++++++----------
 accel/tcg/user-exec.c          |  4 ++--
 bsd-user/mmap.c                |  1 +
 linux-user/mmap.c              |  1 +
 tcg/tcg.c                      |  8 +++++++-
 accel/tcg/ldst_atomicity.c.inc | 32 ++++++++++++++++++--------------
 tcg/i386/tcg-target.c.inc      |  2 +-
 tcg/ppc/tcg-target.c.inc       |  9 +++++----
 tcg/s390x/tcg-target.c.inc     |  4 ++--
 12 files changed, 66 insertions(+), 38 deletions(-)
From: Jordan Niethe <jniethe5@gmail.com>

Commit 20b6643324 ("tcg/ppc: Reorg goto_tb implementation") modified
goto_tb to ensure only a single instruction was patched to prevent
incorrect behavior if a thread was in the middle of multiple
instructions when they were replaced. However this introduced a race
between loading the jmp target into TCG_REG_TB and patching and
executing the direct branch.

The relevant part of the goto_tb implementation:

    ld TCG_REG_TB, TARGET_ADDR_LOCATION(TCG_REG_TB)
  patch_location:
    mtctr TCG_REG_TB
    bctr

tb_target_set_jmp_target() will replace 'patch_location' with a direct
branch if the target is in range. The direct branch now relies on
TCG_REG_TB being set up correctly by the ld. Prior to this commit
multiple instructions were patched in for the direct branch case; these
instructions would initialize TCG_REG_TB to the same value as the branch
target.

Imagine the following sequence:

1) Thread A is executing the goto_tb sequence and loads the jmp
   target into TCG_REG_TB.

2) Thread B updates the jmp target address and calls
   tb_target_set_jmp_target(). This patches a new direct branch into the
   goto_tb sequence.

3) Thread A executes the newly patched direct branch. The value in
   TCG_REG_TB still contains the old jmp target.

TCG_REG_TB MUST contain the translation block's tc.ptr. Execution will
eventually crash after performing memory accesses generated from a
faulty value in TCG_REG_TB.

This presents as segfaults or illegal instruction exceptions.

Do not revert commit 20b6643324 as it did fix a different race
condition. Instead remove the direct branch optimization and always use
indirect branches.

The direct branch optimization can be re-added later with a race free
sequence.

Fixes: 20b6643324 ("tcg/ppc: Reorg goto_tb implementation")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1726
Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
Tested-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
Tested-by: Michael Tokarev <mjt@tls.msk.ru>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Co-developed-by: Benjamin Gray <bgray@linux.ibm.com>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Message-Id: <20230717093001.13167-1-jniethe5@gmail.com>
---
 tcg/ppc/tcg-target.c.inc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_tb(TCGContext *s, int which)
         ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
         tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
 
-        /* Direct branch will be patched by tb_target_set_jmp_target. */
+        /* TODO: Use direct branches when possible. */
         set_jmp_insn_offset(s, which);
         tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
 
-        /* When branch is out of range, fall through to indirect. */
         tcg_out32(s, BCCTR | BO_ALWAYS);
 
         /* For the unlinked case, need to reset TCG_REG_TB. */
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
     intptr_t diff = addr - jmp_rx;
     tcg_insn_unit insn;
 
+    if (USE_REG_TB) {
+        return;
+    }
+
     if (in_range_b(diff)) {
         insn = B | (diff & 0x3fffffc);
-    } else if (USE_REG_TB) {
-        insn = MTSPR | RS(TCG_REG_TB) | CTR;
     } else {
         insn = NOP;
     }
--
2.34.1
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h | 10 ++++++++++
 bsd-user/mmap.c         |  1 +
 linux-user/mmap.c       |  1 +
 3 files changed, 12 insertions(+)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ void TSA_NO_TSA mmap_lock(void);
 void TSA_NO_TSA mmap_unlock(void);
 bool have_mmap_lock(void);
 
+static inline void mmap_unlock_guard(void *unused)
+{
+    mmap_unlock();
+}
+
+#define WITH_MMAP_LOCK_GUARD() \
+    for (int _mmap_lock_iter __attribute__((cleanup(mmap_unlock_guard))) \
+         = (mmap_lock(), 0); _mmap_lock_iter == 0; _mmap_lock_iter = 1)
+
 /**
  * adjust_signal_pc:
  * @pc: raw pc from the host signal ucontext_t.
@@ -XXX,XX +XXX,XX @@ G_NORETURN void cpu_loop_exit_sigbus(CPUState *cpu, target_ulong addr,
 #else
 static inline void mmap_lock(void) {}
 static inline void mmap_unlock(void) {}
+#define WITH_MMAP_LOCK_GUARD()
 
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length);
 void tlb_set_dirty(CPUState *cpu, vaddr addr);
diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
index XXXXXXX..XXXXXXX 100644
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -XXX,XX +XXX,XX @@ void mmap_lock(void)
 
 void mmap_unlock(void)
 {
+    assert(mmap_lock_count > 0);
     if (--mmap_lock_count == 0) {
         pthread_mutex_unlock(&mmap_mutex);
     }
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -XXX,XX +XXX,XX @@ void mmap_lock(void)
 
 void mmap_unlock(void)
 {
+    assert(mmap_lock_count > 0);
     if (--mmap_lock_count == 0) {
         pthread_mutex_unlock(&mmap_mutex);
     }
--
2.34.1
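
The guard above is the GCC/Clang cleanup-attribute scope-guard pattern wrapped
in a one-iteration for loop. A standalone sketch of the same pattern (plain C,
illustration only -- the names below are invented, not QEMU API) shows why a
return from inside the guarded block still releases the lock:

    /* demo.c -- illustration only, not part of this series */
    #include <stdio.h>

    static void unlock_guard(int *unused)
    {
        printf("unlock\n");
    }

    #define WITH_LOCK_GUARD() \
        for (int _iter __attribute__((cleanup(unlock_guard))) = \
             (printf("lock\n"), 0); _iter == 0; _iter = 1)

    static int demo(int bail_out)
    {
        WITH_LOCK_GUARD() {
            if (bail_out) {
                return -1;          /* "unlock" is still printed on this path */
            }
            printf("work under lock\n");
        }
        return 0;
    }

    int main(void)
    {
        demo(0);    /* lock, work under lock, unlock */
        demo(1);    /* lock, unlock                  */
        return 0;
    }

The cleanup handler runs whenever the loop variable goes out of scope, which is
what lets later patches in this series return a loaded value from inside
WITH_MMAP_LOCK_GUARD() without leaking the lock.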
In the initial commit, cdfac37be0d, the sense of the test is incorrect,
as the -1/0 return was confusing. In bef6f008b981, we mechanically
invert all callers while changing to false/true return, preserving the
incorrectness of the test.

Now that the return sense is sane, it's easy to see that if !write,
then the page is not modifiable (i.e. most likely read-only, with
PROT_NONE handled via SIGSEGV).

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/ldst_atomicity.c.inc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
      * another process, because the fallback start_exclusive solution
      * provides no protection across processes.
      */
-    if (page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
+    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
         uint64_t *p = __builtin_assume_aligned(pv, 8);
         return *p;
     }
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
      * another process, because the fallback start_exclusive solution
      * provides no protection across processes.
      */
-    if (page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
+    if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
         return *p;
     }
 #endif
--
2.34.1
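
Restating what the corrected test relies on (illustration only, mirroring the
hunks above, and assuming page_check_range() returns true when the whole range
has the requested protection):

    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
        /* The guest cannot write this page, so its contents cannot change
         * underneath us; a plain aligned load is sufficient. */
    } else {
        /* Writable: fall back to the cmpxchg / start_exclusive paths. */
    }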
For user-only, the probe for page writability may race with another
thread's mprotect. Take the mmap_lock around the operation. This
is still faster than the start/end_exclusive fallback.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/ldst_atomicity.c.inc | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -XXX,XX +XXX,XX @@ static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
      * another process, because the fallback start_exclusive solution
      * provides no protection across processes.
      */
-    if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
-        uint64_t *p = __builtin_assume_aligned(pv, 8);
-        return *p;
+    WITH_MMAP_LOCK_GUARD() {
+        if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
+            uint64_t *p = __builtin_assume_aligned(pv, 8);
+            return *p;
+        }
     }
 #endif
 
@@ -XXX,XX +XXX,XX @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
         return atomic16_read_ro(p);
     }
 
-#ifdef CONFIG_USER_ONLY
     /*
      * We can only use cmpxchg to emulate a load if the page is writable.
      * If the page is not writable, then assume the value is immutable
      * and requires no locking. This ignores the case of MAP_SHARED with
      * another process, because the fallback start_exclusive solution
      * provides no protection across processes.
+     *
+     * In system mode all guest pages are writable. For user mode,
+     * we must take mmap_lock so that the query remains valid until
+     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
+     * is an example that can race.
      */
-    if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
-        return *p;
-    }
+    WITH_MMAP_LOCK_GUARD() {
+#ifdef CONFIG_USER_ONLY
+        if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
+            return *p;
+        }
 #endif
-
-    /*
-     * In system mode all guest pages are writable, and for user-only
-     * we have just checked writability. Try cmpxchg.
-     */
-    if (HAVE_ATOMIC128_RW) {
-        return atomic16_read_rw(p);
+        if (HAVE_ATOMIC128_RW) {
+            return atomic16_read_rw(p);
+        }
     }
 
     /* Ultimate fallback: re-execute in serial context. */
--
2.34.1
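
The interleaving this closes, sketched for the user-only case (the thread
labels are illustrative; guest mprotect/munmap in user-mode emulation also
take mmap_lock):

    vCPU doing the load                     another guest thread
    -----------------------------           ------------------------------------
    page_check_range() -> not writable
                                            mprotect(page, PROT_READ|PROT_WRITE)
                                            *page = ...;         (plain store)
    plain load of *page                     <- probe result no longer valid

Holding mmap_lock from the probe through the load makes the pair atomic with
respect to such address-space changes, and is still cheaper than falling back
to start/end_exclusive.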
From: Ilya Leoshkevich <iii@linux.ibm.com>

i386 and s390x implementations of op_add2 require an earlyclobber,
which is currently missing. This breaks VCKSM in s390x guests. E.g., on
x86_64 the following op:

    add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2 dead: 0 2 3 4 5 pref=none,0xffff

is translated to:

    addl %ebx, %r12d
    adcl %r12d, %ebx

Introduce a new C_N1_O1_I4 constraint, and make sure that earlyclobber
of aliased outputs is honored.

Cc: qemu-stable@nongnu.org
Fixes: 82790a870992 ("tcg: Add markup for output requires new register")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230719221310.1968845-7-iii@linux.ibm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target-con-set.h  | 5 ++++-
 tcg/s390x/tcg-target-con-set.h | 8 +++++---
 tcg/tcg.c                      | 8 +++++++-
 tcg/i386/tcg-target.c.inc      | 2 +-
 tcg/s390x/tcg-target.c.inc     | 4 ++--
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target-con-set.h
+++ b/tcg/i386/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@
  *
  * C_N1_Im(...) defines a constraint set with 1 output and <m> inputs,
  * except that the output must use a new register.
+ *
+ * C_Nn_Om_Ik(...) defines a constraint set with <n + m> outputs and <k>
+ * inputs, except that the first <n> outputs must use new registers.
  */
 C_O0_I1(r)
 C_O0_I2(L, L)
@@ -XXX,XX +XXX,XX @@ C_O2_I1(r, r, L)
 C_O2_I2(a, d, a, r)
 C_O2_I2(r, r, L, L)
 C_O2_I3(a, d, 0, 1, r)
-C_O2_I4(r, r, 0, 1, re, re)
+C_N1_O1_I4(r, r, 0, 1, re, re)
diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@
  * C_On_Im(...) defines a constraint set with <n> outputs and <m> inputs.
  * Each operand should be a sequence of constraint letters as defined by
  * tcg-target-con-str.h; the constraint combination is inclusive or.
+ *
+ * C_Nn_Om_Ik(...) defines a constraint set with <n + m> outputs and <k>
+ * inputs, except that the first <n> outputs must use new registers.
  */
 C_O0_I1(r)
 C_O0_I2(r, r)
@@ -XXX,XX +XXX,XX @@ C_O2_I1(o, m, r)
 C_O2_I2(o, m, 0, r)
 C_O2_I2(o, m, r, r)
 C_O2_I3(o, m, 0, 1, r)
-C_O2_I4(r, r, 0, 1, rA, r)
-C_O2_I4(r, r, 0, 1, ri, r)
-C_O2_I4(r, r, 0, 1, r, r)
+C_N1_O1_I4(r, r, 0, 1, ri, r)
+C_N1_O1_I4(r, r, 0, 1, rA, r)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movext3(TCGContext *s, const TCGMovExtend *i1,
 #define C_O2_I2(O1, O2, I1, I2) C_PFX4(c_o2_i2_, O1, O2, I1, I2),
 #define C_O2_I3(O1, O2, I1, I2, I3) C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3),
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4),
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, I3, I4),
 
 typedef enum {
 #include "tcg-target-con-set.h"
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Put all of the constraint sets into an array, indexed by the enum. */
 
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #define C_O2_I2(O1, O2, I1, I2) { .args_ct_str = { #O1, #O2, #I1, #I2 } },
 #define C_O2_I3(O1, O2, I1, I2, I3) { .args_ct_str = { #O1, #O2, #I1, #I2, #I3 } },
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { #O1, #O2, #I1, #I2, #I3, #I4 } },
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { "&" #O1, #O2, #I1, #I2, #I3, #I4 } },
 
 static const TCGTargetOpDef constraint_sets[] = {
 #include "tcg-target-con-set.h"
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef constraint_sets[] = {
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Expand the enumerator to be returned from tcg_target_op_def(). */
 
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef constraint_sets[] = {
 #define C_O2_I2(O1, O2, I1, I2) C_PFX4(c_o2_i2_, O1, O2, I1, I2)
 #define C_O2_I3(O1, O2, I1, I2, I3) C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3)
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4)
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, I3, I4)
 
 #include "tcg-target.c.inc"
 
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                  * dead after the instruction, we must allocate a new
                  * register and move it.
                  */
-                if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
+                if (temp_readonly(ts) || !IS_DEAD_ARG(i)
+                    || def->args_ct[arg_ct->alias_index].newreg) {
                     allocate_new_reg = true;
                 } else if (ts->val_type == TEMP_VAL_REG) {
                     /*
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i32:
     case INDEX_op_sub2_i64:
-        return C_O2_I4(r, r, 0, 1, re, re);
+        return C_N1_O1_I4(r, r, 0, 1, re, re);
 
     case INDEX_op_ctz_i32:
     case INDEX_op_ctz_i64:
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-        return C_O2_I4(r, r, 0, 1, ri, r);
+        return C_N1_O1_I4(r, r, 0, 1, ri, r);
 
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i64:
-        return C_O2_I4(r, r, 0, 1, rA, r);
+        return C_N1_O1_I4(r, r, 0, 1, rA, r);
 
     case INDEX_op_st_vec:
         return C_O0_I2(v, r);
--
2.34.1
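A plain-C sketch (illustration only, not QEMU code) of why the first output of
add2 needs the earlyclobber: the two-instruction expansion writes the low
result before it has finished reading the high inputs, so the low output must
not share a register with a still-live input -- exactly what happens in the
addl/adcl pair quoted in the commit message.

    #include <stdint.h>

    static void add2_expansion_sketch(uint32_t al, uint32_t ah,
                                      uint32_t bl, uint32_t bh,
                                      uint32_t *out_lo, uint32_t *out_hi)
    {
        uint32_t lo = al + bl;          /* step 1: writes the low result  */
        uint32_t carry = lo < al;       /* models the host carry flag     */
        uint32_t hi = ah + bh + carry;  /* step 2: still needs ah and bh  */

        /* If "lo" were allocated to the register holding bh, step 2 would
         * read a clobbered value; C_N1_O1_I4 forbids that allocation. */
        *out_lo = lo;
        *out_hi = hi;
    }
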
1
Partial cleanup from the CONFIG_VECTOR16 removal.
1
From: Anton Johansson <anjo@rev.ng>
2
Replace DO_CMP0 with its scalar expansion, a simple negation.
3
2
3
In replacing target_ulong with vaddr and TARGET_FMT_lx with VADDR_PRIx,
4
the zero-padding of TARGET_FMT_lx got lost. Readd 16-wide zero-padding
5
for logging consistency.
6
7
Suggested-by: Peter Maydell <peter.maydell@linaro.org>
8
Signed-off-by: Anton Johansson <anjo@rev.ng>
9
Message-Id: <20230713120746.26897-1-anjo@rev.ng>
10
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
12
---
6
accel/tcg/tcg-runtime-gvec.c | 5 +----
13
accel/tcg/cputlb.c | 20 ++++++++++----------
7
1 file changed, 1 insertion(+), 4 deletions(-)
14
1 file changed, 10 insertions(+), 10 deletions(-)
8
15
9
From: Anton Johansson <anjo@rev.ng>

In replacing target_ulong with vaddr and TARGET_FMT_lx with VADDR_PRIx,
the zero-padding of TARGET_FMT_lx got lost. Readd 16-wide zero-padding
for logging consistency.

Suggested-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Anton Johansson <anjo@rev.ng>
Message-Id: <20230713120746.26897-1-anjo@rev.ng>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_locked(CPUArchState *env, int midx, vaddr page)

     /* Check if we need to flush due to large pages. */
     if ((page & lp_mask) == lp_addr) {
-        tlb_debug("forcing full flush midx %d (%"
-                  VADDR_PRIx "/%" VADDR_PRIx ")\n",
+        tlb_debug("forcing full flush midx %d (%016"
+                  VADDR_PRIx "/%016" VADDR_PRIx ")\n",
                   midx, lp_addr, lp_mask);
         tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
     } else {
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,

     assert_cpu_is_self(cpu);

-    tlb_debug("page addr: %" VADDR_PRIx " mmu_map:0x%x\n", addr, idxmap);
+    tlb_debug("page addr: %016" VADDR_PRIx " mmu_map:0x%x\n", addr, idxmap);

     qemu_spin_lock(&env_tlb(env)->c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_2(CPUState *cpu,

 void tlb_flush_page_by_mmuidx(CPUState *cpu, vaddr addr, uint16_t idxmap)
 {
-    tlb_debug("addr: %" VADDR_PRIx " mmu_idx:%" PRIx16 "\n", addr, idxmap);
+    tlb_debug("addr: %016" VADDR_PRIx " mmu_idx:%" PRIx16 "\n", addr, idxmap);

     /* This should already be page aligned */
     addr &= TARGET_PAGE_MASK;
@@ -XXX,XX +XXX,XX @@ void tlb_flush_page(CPUState *cpu, vaddr addr)
 void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, vaddr addr,
                                        uint16_t idxmap)
 {
-    tlb_debug("addr: %" VADDR_PRIx " mmu_idx:%"PRIx16"\n", addr, idxmap);
+    tlb_debug("addr: %016" VADDR_PRIx " mmu_idx:%"PRIx16"\n", addr, idxmap);

     /* This should already be page aligned */
     addr &= TARGET_PAGE_MASK;
@@ -XXX,XX +XXX,XX @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
                                               vaddr addr,
                                               uint16_t idxmap)
 {
-    tlb_debug("addr: %" VADDR_PRIx " mmu_idx:%"PRIx16"\n", addr, idxmap);
+    tlb_debug("addr: %016" VADDR_PRIx " mmu_idx:%"PRIx16"\n", addr, idxmap);

     /* This should already be page aligned */
     addr &= TARGET_PAGE_MASK;
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_locked(CPUArchState *env, int midx,
      */
     if (mask < f->mask || len > f->mask) {
         tlb_debug("forcing full flush midx %d ("
-                  "%" VADDR_PRIx "/%" VADDR_PRIx "+%" VADDR_PRIx ")\n",
+                  "%016" VADDR_PRIx "/%016" VADDR_PRIx "+%016" VADDR_PRIx ")\n",
                   midx, addr, mask, len);
         tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
         return;
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_locked(CPUArchState *env, int midx,
      */
     if (((addr + len - 1) & d->large_page_mask) == d->large_page_addr) {
         tlb_debug("forcing full flush midx %d ("
-                  "%" VADDR_PRIx "/%" VADDR_PRIx ")\n",
+                  "%016" VADDR_PRIx "/%016" VADDR_PRIx ")\n",
                   midx, d->large_page_addr, d->large_page_mask);
         tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
         return;
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,

     assert_cpu_is_self(cpu);

-    tlb_debug("range: %" VADDR_PRIx "/%u+%" VADDR_PRIx " mmu_map:0x%x\n",
+    tlb_debug("range: %016" VADDR_PRIx "/%u+%016" VADDR_PRIx " mmu_map:0x%x\n",
               d.addr, d.bits, d.len, d.idxmap);

     qemu_spin_lock(&env_tlb(env)->c.lock);
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
                                 &xlat, &sz, full->attrs, &prot);
     assert(sz >= TARGET_PAGE_SIZE);

-    tlb_debug("vaddr=%" VADDR_PRIx " paddr=0x" HWADDR_FMT_plx
+    tlb_debug("vaddr=%016" VADDR_PRIx " paddr=0x" HWADDR_FMT_plx
               " prot=%x idx=%d\n",
               addr, full->phys_addr, prot, mmu_idx);

--
2.34.1
diff view generated by jsdifflib
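Two notes ahead of the next pair of patches, each with a minimal stand-alone sketch (illustrative only, not QEMU code). First, the CONFIG_VECTOR16 probe removed below tested GCC's generic vector extension, which lets ordinary C operators work element-wise on 16-byte types; on an SSE-only x86_64 host the compiler may emit aligned 16-byte loads and stores for such types, which is exactly why an 8-byte-aligned buffer can fault, as the commit message explains:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t vec32 __attribute__((vector_size(16)));   /* 4 x uint32_t */

int main(void)
{
    vec32 a = { 1, 2, 3, 4 };
    vec32 b = { 10, 20, 30, 40 };
    vec32 c = a + b;            /* element-wise add: 11, 22, 33, 44 */

    printf("%u %u %u %u\n", c[0], c[1], c[2], c[3]);
    return 0;
}

Second, the pageflags fix below is about widening: on typical compilers, converting a signed 32-bit value to a 64-bit unsigned type sign-extends it, so an address in the upper 2GB picks up 32 leading one bits and no longer matches:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t last = 0xbfffffffu;            /* an address above 2GB */

    uint64_t via_signed = (int32_t)last;    /* sign-extended: ffffffffbfffffff */
    uint64_t via_unsigned = last;           /* zero-extended: 00000000bfffffff */

    printf("signed:   %016" PRIx64 "\n", via_signed);
    printf("unsigned: %016" PRIx64 "\n", via_unsigned);
    return 0;
}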
The comment in tcg-runtime-gvec.c about CONFIG_VECTOR16 says that
tcg-op-gvec.c has eliminated size 8 vectors, and only passes on
multiples of 16. This may have been true of the first few operations,
but is not true of all operations.

In particular, multiply, shift by scalar, and compare of 8- and 16-bit
elements are not expanded inline if host vector operations are not
supported.

For an x86_64 host that does not support AVX, this means that we will
fall back to the helper, which will attempt to use SSE instructions,
which will SEGV on an invalid 8-byte aligned memory operation.

This patch simply removes the CONFIG_VECTOR16 code and configuration
without further simplification.

Buglink: https://bugs.launchpad.net/bugs/1863508
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure                    | 56 ------------------------------------
 accel/tcg/tcg-runtime-gvec.c | 35 +---------------------
 2 files changed, 1 insertion(+), 90 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if test "$plugins" = "yes" &&
        "for this purpose. You can't build with --static."
 fi

-########################################
-# See if 16-byte vector operations are supported.
-# Even without a vector unit the compiler may expand these.
-# There is a bug in old GCC for PPC that crashes here.
-# Unfortunately it's the system compiler for Centos 7.
-
-cat > $TMPC << EOF
-typedef unsigned char U1 __attribute__((vector_size(16)));
-typedef unsigned short U2 __attribute__((vector_size(16)));
-typedef unsigned int U4 __attribute__((vector_size(16)));
-typedef unsigned long long U8 __attribute__((vector_size(16)));
-typedef signed char S1 __attribute__((vector_size(16)));
-typedef signed short S2 __attribute__((vector_size(16)));
-typedef signed int S4 __attribute__((vector_size(16)));
-typedef signed long long S8 __attribute__((vector_size(16)));
-static U1 a1, b1;
-static U2 a2, b2;
-static U4 a4, b4;
-static U8 a8, b8;
-static S1 c1;
-static S2 c2;
-static S4 c4;
-static S8 c8;
-static int i;
-void helper(void *d, void *a, int shift, int i);
-void helper(void *d, void *a, int shift, int i)
-{
-  *(U1 *)(d + i) = *(U1 *)(a + i) << shift;
-  *(U2 *)(d + i) = *(U2 *)(a + i) << shift;
-  *(U4 *)(d + i) = *(U4 *)(a + i) << shift;
-  *(U8 *)(d + i) = *(U8 *)(a + i) << shift;
-}
-int main(void)
-{
-  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
-  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
-  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
-  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
-  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
-  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
-  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
-  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
-  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
-  return 0;
-}
-EOF
-
-vector16=no
-if compile_prog "" "" ; then
-  vector16=yes
-fi
-
 ########################################
 # See if __attribute__((alias)) is supported.
 # This false for Xcode 9, but has been remedied for Xcode 10.
@@ -XXX,XX +XXX,XX @@ if test "$atomic64" = "yes" ; then
   echo "CONFIG_ATOMIC64=y" >> $config_host_mak
 fi

-if test "$vector16" = "yes" ; then
-  echo "CONFIG_VECTOR16=y" >> $config_host_mak
-fi
-
 if test "$attralias" = "yes" ; then
   echo "CONFIG_ATTRIBUTE_ALIAS=y" >> $config_host_mak
 fi
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-gvec-desc.h"


-/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
- * them via GCC's generic vector extension. This turns out to be simpler and
- * more reliable than getting the compiler to autovectorize.
- *
- * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
- * are multiples of 16.
- *
- * When the compiler does not support all of the operations we require, the
- * loops are written so that we can always fall back on the base types.
- */
-#ifdef CONFIG_VECTOR16
-typedef uint8_t vec8 __attribute__((vector_size(16)));
-typedef uint16_t vec16 __attribute__((vector_size(16)));
-typedef uint32_t vec32 __attribute__((vector_size(16)));
-typedef uint64_t vec64 __attribute__((vector_size(16)));
-
-typedef int8_t svec8 __attribute__((vector_size(16)));
-typedef int16_t svec16 __attribute__((vector_size(16)));
-typedef int32_t svec32 __attribute__((vector_size(16)));
-typedef int64_t svec64 __attribute__((vector_size(16)));
-
-#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
-#define DUP8(X) { X, X, X, X, X, X, X, X }
-#define DUP4(X) { X, X, X, X }
-#define DUP2(X) { X, X }
-#else
 typedef uint8_t vec8;
 typedef uint16_t vec16;
 typedef uint32_t vec32;
 typedef uint64_t vec64;
@@ -XXX,XX +XXX,XX @@ typedef int64_t svec64;
 #define DUP8(X) X
 #define DUP4(X) X
 #define DUP2(X) X
-#endif /* CONFIG_VECTOR16 */

 static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
 {
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }

-/* If vectors are enabled, the compiler fills in -1 for true.
-   Otherwise, we must take care of this by hand. */
-#ifdef CONFIG_VECTOR16
-# define DO_CMP0(X) X
-#else
-# define DO_CMP0(X) -(X)
-#endif
+#define DO_CMP0(X) -(X)

 #define DO_CMP1(NAME, TYPE, OP) \
 void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc) \
--
2.20.1

From: Luca Bonissi <qemu@bonslack.org>

These should match 'start' as target_ulong, not target_long.

On 32bit targets, the parameter was sign-extended to uint64_t,
so only the first mmap within the upper 2GB memory can succeed.

Signed-off-by: Luca Bonissi <qemu@bonslack.org>
Message-Id: <327460e2-0ebd-9edb-426b-1df80d16c32a@bonslack.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/user-exec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ typedef struct PageFlagsNode {

 static IntervalTreeRoot pageflags_root;

-static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
+static PageFlagsNode *pageflags_find(target_ulong start, target_ulong last)
 {
     IntervalTreeNode *n;

@@ -XXX,XX +XXX,XX @@ static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
 }

 static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
-                                     target_long last)
+                                     target_ulong last)
 {
     IntervalTreeNode *n;

--
2.34.1
diff view generated by jsdifflib