v2: Fix incorrectly resolved rebase conflict in patch 16.

r~

The following changes since commit 61fd710b8da8aedcea9b4f197283dc38638e4b60:

  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2022-09-02 13:24:28 -0400)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220904

for you to fetch changes up to cc64de1fdeb81bc1ab8bb6c7c24bfd4fc9b28ef2:

  target/riscv: Make translator stop before the end of a page (2022-09-03 09:27:05 +0100)

----------------------------------------------------------------
Respect PROT_EXEC in user-only mode.
Fix s390x, i386 and riscv for translations crossing a page.
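Much of the series turns on one predicate, is_same_page(), added by the
"accel/tcg: Introduce is_same_page()" patch below.  A minimal sketch of
the idea (assuming the usual TARGET_PAGE_MASK semantics; not the literal
committed code):

    /* True if addr lies on the page where disassembly started. */
    static inline bool is_same_page(const DisasContextBase *db,
                                    target_ulong addr)
    {
        return ((addr ^ db->pc_first) & TARGET_PAGE_MASK) == 0;
    }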

----------------------------------------------------------------
Ilya Leoshkevich (4):
      linux-user: Clear translations on mprotect()
      accel/tcg: Introduce is_same_page()
      target/s390x: Make translator stop before the end of a page
      target/i386: Make translator stop before the end of a page

Richard Henderson (16):
      linux-user/arm: Mark the commpage executable
      linux-user/hppa: Allocate page zero as a commpage
      linux-user/x86_64: Allocate vsyscall page as a commpage
      linux-user: Honor PT_GNU_STACK
      tests/tcg/i386: Move smc_code2 to an executable section
      accel/tcg: Properly implement get_page_addr_code for user-only
      accel/tcg: Unlock mmap_lock after longjmp
      accel/tcg: Make tb_htable_lookup static
      accel/tcg: Move qemu_ram_addr_from_host_nofail to physmem.c
      accel/tcg: Use probe_access_internal for softmmu get_page_addr_code_hostp
      accel/tcg: Document the faulting lookup in tb_lookup_cmp
      accel/tcg: Remove translator_ldsw
      accel/tcg: Add pc and host_pc params to gen_intermediate_code
      accel/tcg: Add fast path for translator_ld*
      target/riscv: Add MAX_INSN_LEN and insn_len
      target/riscv: Make translator stop before the end of a page

 include/elf.h                     |   1 +
 include/exec/cpu-common.h         |   1 +
 include/exec/exec-all.h           |  89 ++++++----------------
 include/exec/translator.h         |  96 ++++++++++++++---------
 linux-user/arm/target_cpu.h       |   4 +-
 linux-user/qemu.h                 |   1 +
 accel/tcg/cpu-exec.c              | 143 ++++++++++++++++++++------------------
 accel/tcg/cputlb.c                |  93 +++++++------------------
 accel/tcg/translate-all.c         |  29 ++++----
 accel/tcg/translator.c            | 135 ++++++++++++++++++++++++++---------
 accel/tcg/user-exec.c             |  17 ++++-
 linux-user/elfload.c              |  82 ++++++++++++++++++++--
 linux-user/mmap.c                 |   6 +-
 softmmu/physmem.c                 |  12 ++++
 target/alpha/translate.c          |   5 +-
 target/arm/translate.c            |   5 +-
 target/avr/translate.c            |   5 +-
 target/cris/translate.c           |   5 +-
 target/hexagon/translate.c        |   6 +-
 target/hppa/translate.c           |   5 +-
 target/i386/tcg/translate.c       |  71 +++++++++++--------
 target/loongarch/translate.c      |   6 +-
 target/m68k/translate.c           |   5 +-
 target/microblaze/translate.c     |   5 +-
 target/mips/tcg/translate.c       |   5 +-
 target/nios2/translate.c          |   5 +-
 target/openrisc/translate.c       |   6 +-
 target/ppc/translate.c            |   5 +-
 target/riscv/translate.c          |  32 +++++++--
 target/rx/translate.c             |   5 +-
 target/s390x/tcg/translate.c      |  20 ++++--
 target/sh4/translate.c            |   5 +-
 target/sparc/translate.c          |   5 +-
 target/tricore/translate.c        |   6 +-
 target/xtensa/translate.c         |   6 +-
 tests/tcg/i386/test-i386.c        |   2 +-
 tests/tcg/riscv64/noexec.c        |  79 +++++++++++++++++++++
 tests/tcg/s390x/noexec.c          | 106 ++++++++++++++++++++++++++++
 tests/tcg/x86_64/noexec.c         |  75 ++++++++++++++++++++
 tests/tcg/multiarch/noexec.c.inc  | 139 ++++++++++++++++++++++++++++++++++++
 tests/tcg/riscv64/Makefile.target |   1 +
 tests/tcg/s390x/Makefile.target   |   1 +
 tests/tcg/x86_64/Makefile.target  |   3 +-
 43 files changed, 966 insertions(+), 367 deletions(-)
 create mode 100644 tests/tcg/riscv64/noexec.c
 create mode 100644 tests/tcg/s390x/noexec.c
 create mode 100644 tests/tcg/x86_64/noexec.c
 create mode 100644 tests/tcg/multiarch/noexec.c.inc
Cache the translation from guest to host address, so we may
use direct loads when we hit on the primary translation page.

Look up the second translation page only once, during translation.
This obviates another lookup of the second page within tb_gen_code
after translation.
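In outline, the cached lookup behaves like this sketch (simplified from
the translator_access() added below; slow-path details omitted):

    static void *access_sketch(DisasContextBase *db, target_ulong pc,
                               size_t len)
    {
        target_ulong end = pc + len - 1;

        if (is_same_page(db, end)) {
            /* Primary page: host address was cached at translation start. */
            return (uint8_t *)db->host_addr[0] + (pc - db->pc_first);
        }
        /* Otherwise the second page is resolved at most once per TB and
           cached in db->host_addr[1]; NULL means "take the slow path". */
        return NULL;
    }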

Fixes a bug in that plugin_insn_append should be passed the bytes
in the original memory order, not bswapped by pieces.
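The byte-order point is easy to demonstrate standalone (illustration
only; assumes a big-endian guest running on a little-endian host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint8_t guest_mem[2] = { 0x12, 0x34 };    /* insn bytes, guest order */
        uint16_t ret = (guest_mem[0] << 8) | guest_mem[1];   /* load: 0x1234 */
        uint16_t plug = (uint16_t)((ret >> 8) | (ret << 8)); /* back to memory order */
        uint8_t out[2];

        memcpy(out, &plug, sizeof(plug));
        printf("%02x %02x\n", out[0], out[1]);    /* 12 34: what plugins must see */
        return 0;
    }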

Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/translator.h |  63 +++++++++++--------
 accel/tcg/translate-all.c |  23 +++----
 accel/tcg/translator.c    | 126 +++++++++++++++++++++++++++++---------
 3 files changed, 141 insertions(+), 71 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ typedef enum DisasJumpType {
  * Architecture-agnostic disassembly context.
  */
 typedef struct DisasContextBase {
-    const TranslationBlock *tb;
+    TranslationBlock *tb;
     target_ulong pc_first;
     target_ulong pc_next;
     DisasJumpType is_jmp;
     int num_insns;
     int max_insns;
     bool singlestep_enabled;
-#ifdef CONFIG_USER_ONLY
-    /*
-     * Guest address of the last byte of the last protected page.
-     *
-     * Pages containing the translated instructions are made non-writable in
-     * order to achieve consistency in case another thread is modifying the
-     * code while translate_insn() fetches the instruction bytes piecemeal.
-     * Such writer threads are blocked on mmap_lock() in page_unprotect().
-     */
-    target_ulong page_protect_end;
-#endif
+    void *host_addr[2];
 } DisasContextBase;
 
 /**
@@ -XXX,XX +XXX,XX @@ bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest);
  * the relevant information at translation time.
  */
 
-#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn)             \
-    type fullname ## _swap(CPUArchState *env, DisasContextBase *dcbase, \
-                           abi_ptr pc, bool do_swap);                   \
-    static inline type fullname(CPUArchState *env,                      \
-                                DisasContextBase *dcbase, abi_ptr pc)   \
-    {                                                                   \
-        return fullname ## _swap(env, dcbase, pc, false);               \
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+
+static inline uint16_t
+translator_lduw_swap(CPUArchState *env, DisasContextBase *db,
+                     abi_ptr pc, bool do_swap)
+{
+    uint16_t ret = translator_lduw(env, db, pc);
+    if (do_swap) {
+        ret = bswap16(ret);
     }
+    return ret;
+}
 
-#define FOR_EACH_TRANSLATOR_LD(F)                                       \
-    F(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */)           \
-    F(translator_lduw, uint16_t, cpu_lduw_code, bswap16)                \
-    F(translator_ldl, uint32_t, cpu_ldl_code, bswap32)                  \
-    F(translator_ldq, uint64_t, cpu_ldq_code, bswap64)
+static inline uint32_t
+translator_ldl_swap(CPUArchState *env, DisasContextBase *db,
+                    abi_ptr pc, bool do_swap)
+{
+    uint32_t ret = translator_ldl(env, db, pc);
+    if (do_swap) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
 
-FOR_EACH_TRANSLATOR_LD(GEN_TRANSLATOR_LD)
-
-#undef GEN_TRANSLATOR_LD
+static inline uint64_t
+translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
+                    abi_ptr pc, bool do_swap)
+{
+    uint64_t ret = translator_ldq(env, db, pc);
+    if (do_swap) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
 
 /*
  * Return whether addr is on the same page as where disassembly started.
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 {
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb, *existing_tb;
-    tb_page_addr_t phys_pc, phys_page2;
-    target_ulong virt_page2;
+    tb_page_addr_t phys_pc;
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size, max_insns;
 #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tb->flags = flags;
     tb->cflags = cflags;
     tb->trace_vcpu_dstate = *cpu->trace_dstate;
+    tb->page_addr[0] = phys_pc;
+    tb->page_addr[1] = -1;
     tcg_ctx->tb_cflags = cflags;
  tb_overflow:
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 
     /*
-     * If the TB is not associated with a physical RAM page then
-     * it must be a temporary one-insn TB, and we have nothing to do
-     * except fill in the page_addr[] fields. Return early before
-     * attempting to link to other TBs or add to the lookup table.
+     * If the TB is not associated with a physical RAM page then it must be
+     * a temporary one-insn TB, and we have nothing left to do. Return early
+     * before attempting to link to other TBs or add to the lookup table.
      */
-    if (phys_pc == -1) {
-        tb->page_addr[0] = tb->page_addr[1] = -1;
+    if (tb->page_addr[0] == -1) {
         return tb;
     }
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
      */
     tcg_tb_insert(tb);
 
-    /* check next page if needed */
-    virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
-    phys_page2 = -1;
-    if ((pc & TARGET_PAGE_MASK) != virt_page2) {
-        phys_page2 = get_page_addr_code(env, virt_page2);
-    }
     /*
      * No explicit memory barrier is required -- tb_link_page() makes the
      * TB visible in a consistent state.
      */
-    existing_tb = tb_link_page(tb, phys_pc, phys_page2);
+    existing_tb = tb_link_page(tb, tb->page_addr[0], tb->page_addr[1]);
     /* if the TB already exists, discard what we just translated */
     if (unlikely(existing_tb != tb)) {
         uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@ bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
     return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
 }
 
-static inline void translator_page_protect(DisasContextBase *dcbase,
-                                           target_ulong pc)
-{
-#ifdef CONFIG_USER_ONLY
-    dcbase->page_protect_end = pc | ~TARGET_PAGE_MASK;
-    page_protect(pc);
-#endif
-}
-
 void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
                      target_ulong pc, void *host_pc,
                      const TranslatorOps *ops, DisasContextBase *db)
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
     db->num_insns = 0;
     db->max_insns = max_insns;
     db->singlestep_enabled = cflags & CF_SINGLE_STEP;
-    translator_page_protect(db, db->pc_next);
+    db->host_addr[0] = host_pc;
+    db->host_addr[1] = NULL;
+
+#ifdef CONFIG_USER_ONLY
+    page_protect(pc);
+#endif
 
     ops->init_disas_context(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
 #endif
 }
 
-static inline void translator_maybe_page_protect(DisasContextBase *dcbase,
-                                                 target_ulong pc, size_t len)
+static void *translator_access(CPUArchState *env, DisasContextBase *db,
+                               target_ulong pc, size_t len)
 {
-#ifdef CONFIG_USER_ONLY
-    target_ulong end = pc + len - 1;
+    void *host;
+    target_ulong base, end;
+    TranslationBlock *tb;
 
-    if (end > dcbase->page_protect_end) {
-        translator_page_protect(dcbase, end);
+    tb = db->tb;
+
+    /* Use slow path if first page is MMIO. */
+    if (unlikely(tb->page_addr[0] == -1)) {
+        return NULL;
     }
+
+    end = pc + len - 1;
+    if (likely(is_same_page(db, end))) {
+        host = db->host_addr[0];
+        base = db->pc_first;
+    } else {
+        host = db->host_addr[1];
+        base = TARGET_PAGE_ALIGN(db->pc_first);
+        if (host == NULL) {
+            tb->page_addr[1] =
+                get_page_addr_code_hostp(env, base, &db->host_addr[1]);
+#ifdef CONFIG_USER_ONLY
+            page_protect(end);
 #endif
+            /* We cannot handle MMIO as second page. */
+            assert(tb->page_addr[1] != -1);
+            host = db->host_addr[1];
+        }
+
+        /* Use slow path when crossing pages. */
+        if (is_same_page(db, pc)) {
+            return NULL;
+        }
+    }
+
+    tcg_debug_assert(pc >= base);
+    return host + (pc - base);
 }
 
-#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn)             \
-    type fullname ## _swap(CPUArchState *env, DisasContextBase *dcbase, \
-                           abi_ptr pc, bool do_swap)                    \
-    {                                                                   \
-        translator_maybe_page_protect(dcbase, pc, sizeof(type));        \
-        type ret = load_fn(env, pc);                                    \
-        if (do_swap) {                                                  \
-            ret = swap_fn(ret);                                         \
-        }                                                               \
-        plugin_insn_append(pc, &ret, sizeof(ret));                      \
-        return ret;                                                     \
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint8_t ret;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldub_p(p);
     }
+    ret = cpu_ldub_code(env, pc);
+    plugin_insn_append(pc, &ret, sizeof(ret));
+    return ret;
+}
 
-FOR_EACH_TRANSLATOR_LD(GEN_TRANSLATOR_LD)
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint16_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
 
-#undef GEN_TRANSLATOR_LD
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return lduw_p(p);
+    }
+    ret = cpu_lduw_code(env, pc);
+    plug = tswap16(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
+
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint32_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldl_p(p);
+    }
+    ret = cpu_ldl_code(env, pc);
+    plug = tswap32(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
+
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint64_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldq_p(p);
+    }
+    ret = cpu_ldq_code(env, pc);
+    plug = tswap64(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
-- 
2.34.1
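For reference, a target front end consumes the new helpers roughly as
below; decode_next() and the fixed 4-byte insn width are hypothetical,
not part of this patch:

    static uint32_t decode_next(CPUArchState *env, DisasContextBase *db)
    {
        /* Direct host load when the bytes sit on a cached page,
           falling back to cpu_ldl_code() otherwise. */
        uint32_t insn = translator_ldl(env, db, db->pc_next);
        db->pc_next += 4;
        return insn;
    }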
Deleted patch
From: Stephen Long <steplong@quicinc.com>

The fallback inline expansion for vectorized absolute value, used when
the host doesn't support such an insn, was flawed.

E.g. when a vector of bytes has all elements negative, mask
will be 0xffff_ffff_ffff_ffff.  Subtracting mask only adds 1
to the low element instead of all elements, because -mask is 1
and not 0x0101_0101_0101_0101.
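A plain-C model of the byte-element case shows both behaviors (sketch
only; absv8() is an illustrative name, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    #define DUP1 0x0101010101010101ull

    static uint64_t absv8(uint64_t b, int fixed)
    {
        uint64_t t = (b >> 7) & DUP1; /* sign bit of each byte */
        t *= 0xff;                    /* mask: 0xff per negative byte */
        uint64_t d = b ^ t;           /* invert the negative bytes */
        return fixed ? d + (t & DUP1) /* add one to each negative byte */
                     : d - t;         /* buggy: adds 1 to the low byte only */
    }

    int main(void)
    {
        uint64_t b = 0xffffffffffffffffull; /* eight bytes of -1 */
        printf("buggy %016llx fixed %016llx\n",
               (unsigned long long)absv8(b, 0),  /* 0000000000000001 */
               (unsigned long long)absv8(b, 1)); /* 0101010101010101 */
        return 0;
    }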

Signed-off-by: Stephen Long <steplong@quicinc.com>
Message-Id: <20200813161818.190-1-steplong@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
 
     /*
-     * Invert (via xor -1) and add one (via sub -1).
+     * Invert (via xor -1) and add one.
      * Because of the ordering the msb is cleared,
      * so we never have carry into the next element.
      */
     tcg_gen_xor_i64(d, b, t);
-    tcg_gen_sub_i64(d, d, t);
+    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
+    tcg_gen_add_i64(d, d, t);
 
     tcg_temp_free_i64(t);
 }
-- 
2.25.1
Deleted patch
Do not set parallel_cpus if there is only one cpu instantiated.
This will allow tcg to use serial code to implement atomics.
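The payoff is easiest to see in a sketch (hypothetical helper, not QEMU
code): with a single vCPU there is nothing to race against, so an atomic
operation can be implemented serially:

    /* With parallel_cpus false, generated code may use the serial form. */
    static uint32_t atomic_fetch_add_serial(uint32_t *p, uint32_t v)
    {
        uint32_t old = *p; /* no lock prefix or cmpxchg retry loop */
        *p = old + v;
        return old;
    }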

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/cpus.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
     if (!tcg_region_inited) {
         tcg_region_inited = 1;
         tcg_region_init();
+        /*
+         * If MTTCG, and we will create multiple cpus,
+         * then we will have cpus running in parallel.
+         */
+        if (qemu_tcg_mttcg_enabled()) {
+            MachineState *ms = MACHINE(qdev_get_machine());
+            if (ms->smp.max_cpus > 1) {
+                parallel_cpus = true;
+            }
+        }
     }
 
     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
 
         if (qemu_tcg_mttcg_enabled()) {
             /* create a thread per vCPU with TCG (MTTCG) */
-            parallel_cpus = true;
             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                      cpu->cpu_index);
 
-- 
2.25.1
Deleted patch
Do not store back to the exact memory from which we just loaded.
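The change amounts to skipping iteration zero when the operation is in
place; a plain-C model of the loop (dup128_model() is illustrative, not
the generated code):

    #include <stdint.h>
    #include <string.h>

    /* Duplicate the 16-byte block at aofs across oprsz bytes.  Starting
       at (aofs == dofs) * 16 skips the redundant self-copy of lane 0. */
    static void dup128_model(uint8_t *env, uint32_t dofs, uint32_t aofs,
                             uint32_t oprsz)
    {
        uint8_t in[16];

        memcpy(in, env + aofs, 16);                  /* the one load */
        for (uint32_t i = (aofs == dofs) * 16; i < oprsz; i += 16) {
            memcpy(env + dofs + i, in, 16);          /* the stores */
        }
    }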

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
 
             tcg_gen_ld_vec(in, cpu_env, aofs);
-            for (i = 0; i < oprsz; i += 16) {
+            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                 tcg_gen_st_vec(in, cpu_env, dofs + i);
             }
             tcg_temp_free_vec(in);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
 
             tcg_gen_ld_i64(in0, cpu_env, aofs);
             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
-            for (i = 0; i < oprsz; i += 16) {
+            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
             }
-- 
2.25.1
Deleted patch
We already support duplication of 128-bit blocks.  This extends
that support to 256-bit blocks.  This will be needed by SVE2.
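In plain C the intended semantics are simply the following (model only,
assuming oprsz is a multiple of 32 as the asserts below require):

    #include <stdint.h>
    #include <string.h>

    /* Replicate the 32-byte block at aofs across oprsz destination bytes. */
    static void dup256_model(uint8_t *env, uint32_t dofs, uint32_t aofs,
                             uint32_t oprsz)
    {
        uint8_t in[32];

        memcpy(in, env + aofs, 32);
        for (uint32_t i = (aofs == dofs) * 32; i < oprsz; i += 32) {
            memcpy(env + dofs + i, in, 32);
        }
    }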

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 52 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
             tcg_temp_free_i64(in);
         }
-    } else {
+    } else if (vece == 4) {
         /* 128-bit duplicate. */
-        /* ??? Dup to 256-bit vector. */
         int i;
 
-        tcg_debug_assert(vece == 4);
         tcg_debug_assert(oprsz >= 16);
         if (TCG_TARGET_HAS_v128) {
             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
         if (oprsz < maxsz) {
             expand_clr(dofs + oprsz, maxsz - oprsz);
         }
+    } else if (vece == 5) {
+        /* 256-bit duplicate. */
+        int i;
+
+        tcg_debug_assert(oprsz >= 32);
+        tcg_debug_assert(oprsz % 32 == 0);
+        if (TCG_TARGET_HAS_v256) {
+            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
+
+            tcg_gen_ld_vec(in, cpu_env, aofs);
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                tcg_gen_st_vec(in, cpu_env, dofs + i);
+            }
+            tcg_temp_free_vec(in);
+        } else if (TCG_TARGET_HAS_v128) {
+            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
+            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
+
+            tcg_gen_ld_vec(in0, cpu_env, aofs);
+            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                tcg_gen_st_vec(in0, cpu_env, dofs + i);
+                tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
+            }
+            tcg_temp_free_vec(in0);
+            tcg_temp_free_vec(in1);
+        } else {
+            TCGv_i64 in[4];
+            int j;
+
+            for (j = 0; j < 4; ++j) {
+                in[j] = tcg_temp_new_i64();
+                tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
+            }
+            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
+                for (j = 0; j < 4; ++j) {
+                    tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
+                }
+            }
+            for (j = 0; j < 4; ++j) {
+                tcg_temp_free_i64(in[j]);
+            }
+        }
+        if (oprsz < maxsz) {
+            expand_clr(dofs + oprsz, maxsz - oprsz);
+        }
+    } else {
+        g_assert_not_reached();
     }
 }
 
-- 
2.25.1