v2: Fix incorrectly resolved rebase conflict in patch 16.

r~


The following changes since commit 61fd710b8da8aedcea9b4f197283dc38638e4b60:

  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2022-09-02 13:24:28 -0400)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220904

for you to fetch changes up to cc64de1fdeb81bc1ab8bb6c7c24bfd4fc9b28ef2:

  target/riscv: Make translator stop before the end of a page (2022-09-03 09:27:05 +0100)

----------------------------------------------------------------
Respect PROT_EXEC in user-only mode.
Fix s390x, i386 and riscv for translations crossing a page.

----------------------------------------------------------------
Ilya Leoshkevich (4):
      linux-user: Clear translations on mprotect()
      accel/tcg: Introduce is_same_page()
      target/s390x: Make translator stop before the end of a page
      target/i386: Make translator stop before the end of a page

Richard Henderson (16):
      linux-user/arm: Mark the commpage executable
      linux-user/hppa: Allocate page zero as a commpage
      linux-user/x86_64: Allocate vsyscall page as a commpage
      linux-user: Honor PT_GNU_STACK
      tests/tcg/i386: Move smc_code2 to an executable section
      accel/tcg: Properly implement get_page_addr_code for user-only
      accel/tcg: Unlock mmap_lock after longjmp
      accel/tcg: Make tb_htable_lookup static
      accel/tcg: Move qemu_ram_addr_from_host_nofail to physmem.c
      accel/tcg: Use probe_access_internal for softmmu get_page_addr_code_hostp
      accel/tcg: Document the faulting lookup in tb_lookup_cmp
      accel/tcg: Remove translator_ldsw
      accel/tcg: Add pc and host_pc params to gen_intermediate_code
      accel/tcg: Add fast path for translator_ld*
      target/riscv: Add MAX_INSN_LEN and insn_len
      target/riscv: Make translator stop before the end of a page

 include/elf.h                     |   1 +
 include/exec/cpu-common.h         |   1 +
 include/exec/exec-all.h           |  89 ++++++----------------
 include/exec/translator.h         |  96 ++++++++++++++++---------
 linux-user/arm/target_cpu.h       |   4 +-
 linux-user/qemu.h                 |   1 +
 accel/tcg/cpu-exec.c              | 143 ++++++++++++++++++------------------
 accel/tcg/cputlb.c                |  93 +++++++------------------
 accel/tcg/translate-all.c         |  29 ++++----
 accel/tcg/translator.c            | 135 ++++++++++++++++++++++++++---------
 accel/tcg/user-exec.c             |  17 ++++-
 linux-user/elfload.c              |  82 ++++++++++++++++++++--
 linux-user/mmap.c                 |   6 +-
 softmmu/physmem.c                 |  12 ++++
 target/alpha/translate.c          |   5 +-
 target/arm/translate.c            |   5 +-
 target/avr/translate.c            |   5 +-
 target/cris/translate.c           |   5 +-
 target/hexagon/translate.c        |   6 +-
 target/hppa/translate.c           |   5 +-
 target/i386/tcg/translate.c       |  71 +++++++++++--------
 target/loongarch/translate.c      |   6 +-
 target/m68k/translate.c           |   5 +-
 target/microblaze/translate.c     |   5 +-
 target/mips/tcg/translate.c       |   5 +-
 target/nios2/translate.c          |   5 +-
 target/openrisc/translate.c       |   6 +-
 target/ppc/translate.c            |   5 +-
 target/riscv/translate.c          |  32 +++++++--
 target/rx/translate.c             |   5 +-
 target/s390x/tcg/translate.c      |  20 ++++--
 target/sh4/translate.c            |   5 +-
 target/sparc/translate.c          |   5 +-
 target/tricore/translate.c        |   6 +-
 target/xtensa/translate.c         |   6 +-
 tests/tcg/i386/test-i386.c        |   2 +-
 tests/tcg/riscv64/noexec.c        |  79 +++++++++++++++++++++
 tests/tcg/s390x/noexec.c          | 106 ++++++++++++++++++++++++++++
 tests/tcg/x86_64/noexec.c         |  75 ++++++++++++++++++++
 tests/tcg/multiarch/noexec.c.inc  | 139 ++++++++++++++++++++++++++++++++++++
 tests/tcg/riscv64/Makefile.target |   1 +
 tests/tcg/s390x/Makefile.target   |   1 +
 tests/tcg/x86_64/Makefile.target  |   3 +-
 43 files changed, 966 insertions(+), 367 deletions(-)
 create mode 100644 tests/tcg/riscv64/noexec.c
 create mode 100644 tests/tcg/s390x/noexec.c
 create mode 100644 tests/tcg/x86_64/noexec.c
 create mode 100644 tests/tcg/multiarch/noexec.c.inc
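
The first theme above is covered by the new tests/tcg/*/noexec.c tests. The gist of what they check can be sketched in a few lines; the program below is only an illustration along those lines, assuming an x86-64 Linux guest (0xc3 encodes 'ret'), and is not copied from the tests themselves.

/*
 * Rough sketch (not one of the tests added here): code in a page
 * mapped without PROT_EXEC must fault rather than run.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static void handler(int sig)
{
    static const char msg[] = "SIGSEGV as expected\n";
    write(STDOUT_FILENO, msg, sizeof(msg) - 1);  /* async-signal-safe */
    _exit(EXIT_SUCCESS);
}

int main(void)
{
    signal(SIGSEGV, handler);

    /* Readable and writable, but deliberately not executable. */
    unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    p[0] = 0xc3;                     /* x86-64 'ret' */

    ((void (*)(void))p)();           /* should fault, not return */
    fprintf(stderr, "executed a non-executable page!\n");
    return EXIT_FAILURE;
}

The expectation is that, with the PROT_EXEC fixes in this series, qemu-user delivers the fault just as real hardware would, instead of letting the final fprintf run.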
Cache the translation from guest to host address, so we may
use direct loads when we hit on the primary translation page.

Look up the second translation page only once, during translation.
This obviates another lookup of the second page within tb_gen_code
after translation.

Fixes a bug in that plugin_insn_append should be passed the bytes
in the original memory order, not bswapped by pieces.

Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/translator.h |  63 +++++++++++--------
 accel/tcg/translate-all.c |  23 +++----
 accel/tcg/translator.c    | 126 +++++++++++++++++++++++++++---------
 3 files changed, 141 insertions(+), 71 deletions(-)

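Before reading the diff, it may help to see the scheme end to end in one place. The following is a minimal, self-contained sketch of the idea described above: a two-entry host-address cache, direct host loads (whose raw bytes also feed the plugin) on a hit, and a slow path whose target-order result must be swapped back to memory order before being recorded. The types, the identity guest-to-host mapping, and the big-endian-guest-on-little-endian-host assumption are all inventions for this illustration, not the code in the patch:

/*
 * Illustrative model only -- not QEMU code.  A translation caches the
 * host addresses of up to two guest pages; a load contained in a
 * cached page becomes a direct host load, and the plugin sees the raw
 * bytes.  The slow path produces a target-order value, so its bytes
 * must be swapped back to memory order before being recorded.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 0x1000u
#define PAGE_MASK (~(PAGE_SIZE - 1))

static uint8_t guest_mem[2 * PAGE_SIZE];    /* toy guest address space */

typedef struct {
    uint32_t pc_first;       /* guest pc where translation started */
    uint8_t *host_addr[2];   /* cached host addresses of the two pages */
} DisasCache;

/* Stand-in for QEMU's lookup (get_page_addr_code_hostp). */
static uint8_t *lookup_host(uint32_t page)
{
    return &guest_mem[page];                /* identity mapping in the toy */
}

/* Return a direct host pointer, or NULL to request the slow path. */
static uint8_t *access_fast(DisasCache *c, uint32_t pc, size_t len)
{
    uint32_t first_page = c->pc_first & PAGE_MASK;
    uint32_t end = pc + len - 1;

    if ((end & PAGE_MASK) == first_page) {
        return c->host_addr[0] + (pc - first_page);
    }
    if (!c->host_addr[1]) {                 /* fill second page only once */
        c->host_addr[1] = lookup_host(first_page + PAGE_SIZE);
    }
    if ((pc & PAGE_MASK) == first_page) {
        return NULL;                        /* load crosses the boundary */
    }
    return c->host_addr[1] + (pc - (first_page + PAGE_SIZE));
}

/* 32-bit code load for a big-endian guest on a little-endian host. */
static uint32_t load_l(DisasCache *c, uint32_t pc, uint32_t *plugin_bytes)
{
    uint8_t *p = access_fast(c, pc, 4);
    uint32_t ret;

    if (p) {
        memcpy(plugin_bytes, p, 4);         /* raw bytes, memory order */
        memcpy(&ret, p, 4);
        return __builtin_bswap32(ret);      /* value in target order */
    }
    ret = 0;                                /* slow path: byte-wise load */
    for (int i = 0; i < 4; i++) {
        ret = (ret << 8) | guest_mem[pc + i];
    }
    uint32_t plug = __builtin_bswap32(ret); /* back to memory order (cf. tswap32) */
    memcpy(plugin_bytes, &plug, 4);
    return ret;
}

int main(void)
{
    DisasCache c = { .pc_first = PAGE_SIZE - 6, .host_addr = { guest_mem } };
    uint32_t raw;

    memcpy(&guest_mem[PAGE_SIZE - 6], "\x12\x34\x56\x78", 4);
    memcpy(&guest_mem[PAGE_SIZE - 2], "\x9a\xbc\xde\xf0", 4);

    printf("fast, first page : %08x\n", load_l(&c, PAGE_SIZE - 6, &raw));
    printf("slow, page-cross : %08x\n", load_l(&c, PAGE_SIZE - 2, &raw));
    printf("fast, second page: %08x\n", load_l(&c, PAGE_SIZE + 2, &raw));
    return 0;
}

In the patch itself, DisasContextBase.host_addr[] plays the role of this cache, translator_access() that of access_fast(), and cpu_ld*_code() plus tswap*() form the slow path.
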
diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ typedef enum DisasJumpType {
  * Architecture-agnostic disassembly context.
  */
 typedef struct DisasContextBase {
-    const TranslationBlock *tb;
+    TranslationBlock *tb;
     target_ulong pc_first;
     target_ulong pc_next;
     DisasJumpType is_jmp;
     int num_insns;
     int max_insns;
     bool singlestep_enabled;
-#ifdef CONFIG_USER_ONLY
-    /*
-     * Guest address of the last byte of the last protected page.
-     *
-     * Pages containing the translated instructions are made non-writable in
-     * order to achieve consistency in case another thread is modifying the
-     * code while translate_insn() fetches the instruction bytes piecemeal.
-     * Such writer threads are blocked on mmap_lock() in page_unprotect().
-     */
-    target_ulong page_protect_end;
-#endif
+    void *host_addr[2];
 } DisasContextBase;
 
 /**
@@ -XXX,XX +XXX,XX @@ bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest);
  * the relevant information at translation time.
  */
 
-#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn) \
-    type fullname ## _swap(CPUArchState *env, DisasContextBase *dcbase, \
-                           abi_ptr pc, bool do_swap); \
-    static inline type fullname(CPUArchState *env, \
-                                DisasContextBase *dcbase, abi_ptr pc) \
-    { \
-        return fullname ## _swap(env, dcbase, pc, false); \
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+
+static inline uint16_t
+translator_lduw_swap(CPUArchState *env, DisasContextBase *db,
+                     abi_ptr pc, bool do_swap)
+{
+    uint16_t ret = translator_lduw(env, db, pc);
+    if (do_swap) {
+        ret = bswap16(ret);
     }
+    return ret;
+}
 
-#define FOR_EACH_TRANSLATOR_LD(F) \
-    F(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */) \
-    F(translator_lduw, uint16_t, cpu_lduw_code, bswap16) \
-    F(translator_ldl, uint32_t, cpu_ldl_code, bswap32) \
-    F(translator_ldq, uint64_t, cpu_ldq_code, bswap64)
+static inline uint32_t
+translator_ldl_swap(CPUArchState *env, DisasContextBase *db,
+                    abi_ptr pc, bool do_swap)
+{
+    uint32_t ret = translator_ldl(env, db, pc);
+    if (do_swap) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
 
-FOR_EACH_TRANSLATOR_LD(GEN_TRANSLATOR_LD)
-
-#undef GEN_TRANSLATOR_LD
+static inline uint64_t
+translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
+                    abi_ptr pc, bool do_swap)
+{
+    uint64_t ret = translator_ldq(env, db, pc);
+    if (do_swap) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
 
 /*
  * Return whether addr is on the same page as where disassembly started.
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 {
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb, *existing_tb;
-    tb_page_addr_t phys_pc, phys_page2;
-    target_ulong virt_page2;
+    tb_page_addr_t phys_pc;
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size, max_insns;
 #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tb->flags = flags;
     tb->cflags = cflags;
     tb->trace_vcpu_dstate = *cpu->trace_dstate;
+    tb->page_addr[0] = phys_pc;
+    tb->page_addr[1] = -1;
     tcg_ctx->tb_cflags = cflags;
 tb_overflow:
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 
     /*
-     * If the TB is not associated with a physical RAM page then
-     * it must be a temporary one-insn TB, and we have nothing to do
-     * except fill in the page_addr[] fields. Return early before
-     * attempting to link to other TBs or add to the lookup table.
+     * If the TB is not associated with a physical RAM page then it must be
+     * a temporary one-insn TB, and we have nothing left to do. Return early
+     * before attempting to link to other TBs or add to the lookup table.
      */
-    if (phys_pc == -1) {
-        tb->page_addr[0] = tb->page_addr[1] = -1;
+    if (tb->page_addr[0] == -1) {
         return tb;
     }
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     */
     tcg_tb_insert(tb);
 
-    /* check next page if needed */
-    virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
-    phys_page2 = -1;
-    if ((pc & TARGET_PAGE_MASK) != virt_page2) {
-        phys_page2 = get_page_addr_code(env, virt_page2);
-    }
     /*
      * No explicit memory barrier is required -- tb_link_page() makes the
      * TB visible in a consistent state.
      */
-    existing_tb = tb_link_page(tb, phys_pc, phys_page2);
+    existing_tb = tb_link_page(tb, tb->page_addr[0], tb->page_addr[1]);
     /* if the TB already exists, discard what we just translated */
     if (unlikely(existing_tb != tb)) {
         uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@ bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
     return ((db->pc_first ^ dest) & TARGET_PAGE_MASK) == 0;
 }
 
-static inline void translator_page_protect(DisasContextBase *dcbase,
-                                           target_ulong pc)
-{
-#ifdef CONFIG_USER_ONLY
-    dcbase->page_protect_end = pc | ~TARGET_PAGE_MASK;
-    page_protect(pc);
-#endif
-}
-
 void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
                      target_ulong pc, void *host_pc,
                      const TranslatorOps *ops, DisasContextBase *db)
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
     db->num_insns = 0;
     db->max_insns = max_insns;
     db->singlestep_enabled = cflags & CF_SINGLE_STEP;
-    translator_page_protect(db, db->pc_next);
+    db->host_addr[0] = host_pc;
+    db->host_addr[1] = NULL;
+
+#ifdef CONFIG_USER_ONLY
+    page_protect(pc);
+#endif
 
     ops->init_disas_context(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
 #endif
 }
 
-static inline void translator_maybe_page_protect(DisasContextBase *dcbase,
-                                                 target_ulong pc, size_t len)
+static void *translator_access(CPUArchState *env, DisasContextBase *db,
+                               target_ulong pc, size_t len)
 {
-#ifdef CONFIG_USER_ONLY
-    target_ulong end = pc + len - 1;
+    void *host;
+    target_ulong base, end;
+    TranslationBlock *tb;
 
-    if (end > dcbase->page_protect_end) {
-        translator_page_protect(dcbase, end);
+    tb = db->tb;
+
+    /* Use slow path if first page is MMIO. */
+    if (unlikely(tb->page_addr[0] == -1)) {
+        return NULL;
     }
+
+    end = pc + len - 1;
+    if (likely(is_same_page(db, end))) {
+        host = db->host_addr[0];
+        base = db->pc_first;
+    } else {
+        host = db->host_addr[1];
+        base = TARGET_PAGE_ALIGN(db->pc_first);
+        if (host == NULL) {
+            tb->page_addr[1] =
+                get_page_addr_code_hostp(env, base, &db->host_addr[1]);
+#ifdef CONFIG_USER_ONLY
+            page_protect(end);
 #endif
+            /* We cannot handle MMIO as second page. */
+            assert(tb->page_addr[1] != -1);
+            host = db->host_addr[1];
+        }
+
+        /* Use slow path when crossing pages. */
+        if (is_same_page(db, pc)) {
+            return NULL;
+        }
+    }
+
+    tcg_debug_assert(pc >= base);
+    return host + (pc - base);
 }
 
-#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn) \
-    type fullname ## _swap(CPUArchState *env, DisasContextBase *dcbase, \
-                           abi_ptr pc, bool do_swap) \
-    { \
-        translator_maybe_page_protect(dcbase, pc, sizeof(type)); \
-        type ret = load_fn(env, pc); \
-        if (do_swap) { \
-            ret = swap_fn(ret); \
-        } \
-        plugin_insn_append(pc, &ret, sizeof(ret)); \
-        return ret; \
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint8_t ret;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldub_p(p);
     }
+    ret = cpu_ldub_code(env, pc);
+    plugin_insn_append(pc, &ret, sizeof(ret));
+    return ret;
+}
 
-FOR_EACH_TRANSLATOR_LD(GEN_TRANSLATOR_LD)
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint16_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
 
-#undef GEN_TRANSLATOR_LD
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return lduw_p(p);
+    }
+    ret = cpu_lduw_code(env, pc);
+    plug = tswap16(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
+
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint32_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldl_p(p);
+    }
+    ret = cpu_ldl_code(env, pc);
+    plug = tswap32(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
+
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+{
+    uint64_t ret, plug;
+    void *p = translator_access(env, db, pc, sizeof(ret));
+
+    if (p) {
+        plugin_insn_append(pc, p, sizeof(ret));
+        return ldq_p(p);
+    }
+    ret = cpu_ldq_code(env, pc);
+    plug = tswap64(ret);
+    plugin_insn_append(pc, &plug, sizeof(ret));
+    return ret;
+}
--
2.34.1