The following changes since commit a36d64f43325fa503075cc9408ddabb69b32f829:

  Merge remote-tracking branch 'remotes/stsquad/tags/pull-testing-and-gdbstub-060520-1' into staging (2020-05-06 14:06:00 +0100)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20200506

for you to fetch changes up to 07dada0336a83002dfa8673a9220a88e13d9a45c:

  tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64} (2020-05-06 09:25:10 -0700)

----------------------------------------------------------------
Add tcg_gen_gvec_dup_imm
Misc tcg patches

----------------------------------------------------------------
Richard Henderson (10):
      tcg: Add tcg_gen_gvec_dup_imm
      target/s390x: Use tcg_gen_gvec_dup_imm
      target/ppc: Use tcg_gen_gvec_dup_imm
      target/arm: Use tcg_gen_gvec_dup_imm
      tcg: Use tcg_gen_gvec_dup_imm in logical simplifications
      tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i
      tcg: Add tcg_gen_gvec_dup_tl
      tcg: Improve vector tail clearing
      tcg: Add load_dest parameter to GVecGen2
      tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64}

 include/tcg/tcg-op-gvec.h           |  13 ++-
 include/tcg/tcg-op.h                |   8 +-
 target/arm/translate-a64.c          |  10 +--
 target/arm/translate-sve.c          |  12 ++-
 target/arm/translate.c              |   9 +-
 target/ppc/translate/vmx-impl.inc.c |  32 +++----
 target/ppc/translate/vsx-impl.inc.c |   2 +-
 target/s390x/translate_vx.inc.c     |  41 ++-------
 tcg/tcg-op-gvec.c                   | 162 ++++++++++++++++++++++++-----------
 tcg/tcg-op.c                        |  16 ++--
 10 files changed, 166 insertions(+), 139 deletions(-)
Subject: [PULL 01/10] tcg: Add tcg_gen_gvec_dup_imm

Add a version of tcg_gen_dup_* that takes both immediate and
a vector element size operand.  This will replace the set of
tcg_gen_gvec_dup{8,16,32,64}i functions that encode the element
size within the function name.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
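[Editor's note, not part of the commit: a minimal usage sketch.  The
offset `dofs` and the 16-byte lengths are illustrative assumptions,
not taken from the patch.

    /* Old interface: element size encoded in the function name. */
    tcg_gen_gvec_dup32i(dofs, 16, 16, 0x01020304);

    /* New interface: element size passed as a MO_* constant. */
    tcg_gen_gvec_dup_imm(MO_32, dofs, 16, 16, 0x01020304);

Both store four copies of 0x01020304 into the 16 bytes at dofs.]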
 include/tcg/tcg-op-gvec.h | 2 ++
 tcg/tcg-op-gvec.c         | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t s, uint32_t m);
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
+                          uint32_t m, uint64_t imm);
 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i32);
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
     do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
 }
 
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
+                          uint32_t maxsz, uint64_t x)
+{
+    check_size_align(oprsz, maxsz, dofs);
+    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t oprsz, uint32_t maxsz)
 {
--
2.20.1
Subject: [PULL 02/10] target/s390x: Use tcg_gen_gvec_dup_imm

The gen_gvec_dupi switch is unnecessary with the new function.
Replace it with a local gen_gvec_dup_imm that takes care of the
register to offset conversion and length arguments.

Drop zero_vec and use gen_gvec_dup_imm with 0.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
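[Editor's note, not part of the commit: what the local wrapper buys,
with v1 standing for a vector register number.  A call such as

    gen_gvec_dup_imm(ES_8, v1, 0);

expands, for the fixed 16-byte vector registers, to

    tcg_gen_gvec_dup_imm(ES_8, vec_full_reg_offset(v1), 16, 16, 0);

so every caller states the element size explicitly instead of
routing through the old gen_gvec_dupi switch.]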
 target/s390x/translate_vx.inc.c | 41 +++++++--------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/translate_vx.inc.c
+++ b/target/s390x/translate_vx.inc.c
@@ -XXX,XX +XXX,XX @@ static void get_vec_element_ptr_i64(TCGv_ptr ptr, uint8_t reg, TCGv_i64 enr,
 #define gen_gvec_mov(v1, v2) \
     tcg_gen_gvec_mov(0, vec_full_reg_offset(v1), vec_full_reg_offset(v2), 16, \
                      16)
-#define gen_gvec_dup64i(v1, c) \
-    tcg_gen_gvec_dup64i(vec_full_reg_offset(v1), 16, 16, c)
+#define gen_gvec_dup_imm(es, v1, c) \
+    tcg_gen_gvec_dup_imm(es, vec_full_reg_offset(v1), 16, 16, c);
 #define gen_gvec_fn_2(fn, es, v1, v2) \
     tcg_gen_gvec_##fn(es, vec_full_reg_offset(v1), vec_full_reg_offset(v2), \
                       16, 16)
@@ -XXX,XX +XXX,XX @@ static void gen_gvec128_4_i64(gen_gvec128_4_i64_fn fn, uint8_t d, uint8_t a,
     tcg_temp_free_i64(cl);
 }
 
-static void gen_gvec_dupi(uint8_t es, uint8_t reg, uint64_t c)
-{
-    switch (es) {
-    case ES_8:
-        tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_16:
-        tcg_gen_gvec_dup16i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_32:
-        tcg_gen_gvec_dup32i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_64:
-        gen_gvec_dup64i(reg, c);
-        break;
-    default:
-        g_assert_not_reached();
-    }
-}
-
-static void zero_vec(uint8_t reg)
-{
-    tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, 0);
-}
-
 static void gen_addi2_i64(TCGv_i64 dl, TCGv_i64 dh, TCGv_i64 al, TCGv_i64 ah,
                           uint64_t b)
 {
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
          * Masks for both 64 bit elements of the vector are the same.
          * Trust tcg to produce a good constant loading.
          */
-        gen_gvec_dup64i(get_field(s, v1),
-                        generate_byte_mask(i2 & 0xff));
+        gen_gvec_dup_imm(ES_64, get_field(s, v1),
+                         generate_byte_mask(i2 & 0xff));
     } else {
         TCGv_i64 t = tcg_temp_new_i64();
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vgm(DisasContext *s, DisasOps *o)
         }
     }
 
-    gen_gvec_dupi(es, get_field(s, v1), mask);
+    gen_gvec_dup_imm(es, get_field(s, v1), mask);
     return DISAS_NEXT;
 }
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vllez(DisasContext *s, DisasOps *o)
 
     t = tcg_temp_new_i64();
     tcg_gen_qemu_ld_i64(t, o->addr1, get_mem_index(s), MO_TE | es);
-    zero_vec(get_field(s, v1));
+    gen_gvec_dup_imm(es, get_field(s, v1), 0);
     write_vec_element_i64(t, get_field(s, v1), enr, es);
     tcg_temp_free_i64(t);
     return DISAS_NEXT;
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vrepi(DisasContext *s, DisasOps *o)
         return DISAS_NORETURN;
     }
 
-    gen_gvec_dupi(es, get_field(s, v1), data);
+    gen_gvec_dup_imm(es, get_field(s, v1), data);
     return DISAS_NEXT;
 }
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vcksm(DisasContext *s, DisasOps *o)
         read_vec_element_i32(tmp, get_field(s, v2), i, ES_32);
         tcg_gen_add2_i32(tmp, sum, sum, sum, tmp, tmp);
     }
-    zero_vec(get_field(s, v1));
+    gen_gvec_dup_imm(ES_32, get_field(s, v1), 0);
     write_vec_element_i32(sum, get_field(s, v1), 1, ES_32);
 
     tcg_temp_free_i32(tmp);
--
2.20.1
Subject: [PULL 03/10] target/ppc: Use tcg_gen_gvec_dup_imm

We can now unify the implementation of the 3 VSPLTI instructions.

Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/translate/vmx-impl.inc.c | 32 ++++++++++++++++-------------
 target/ppc/translate/vsx-impl.inc.c |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -XXX,XX +XXX,XX @@ GEN_VXRFORM_DUAL(vcmpbfp, PPC_ALTIVEC, PPC_NONE, \
 GEN_VXRFORM_DUAL(vcmpgtfp, PPC_ALTIVEC, PPC_NONE, \
                  vcmpgtud, PPC_NONE, PPC2_ALTIVEC_207)
 
-#define GEN_VXFORM_DUPI(name, tcg_op, opc2, opc3) \
-static void glue(gen_, name)(DisasContext *ctx) \
-    { \
-        int simm; \
-        if (unlikely(!ctx->altivec_enabled)) { \
-            gen_exception(ctx, POWERPC_EXCP_VPU); \
-            return; \
-        } \
-        simm = SIMM5(ctx->opcode); \
-        tcg_op(avr_full_offset(rD(ctx->opcode)), 16, 16, simm); \
+static void gen_vsplti(DisasContext *ctx, int vece)
+{
+    int simm;
+
+    if (unlikely(!ctx->altivec_enabled)) {
+        gen_exception(ctx, POWERPC_EXCP_VPU);
+        return;
     }
 
-GEN_VXFORM_DUPI(vspltisb, tcg_gen_gvec_dup8i, 6, 12);
-GEN_VXFORM_DUPI(vspltish, tcg_gen_gvec_dup16i, 6, 13);
-GEN_VXFORM_DUPI(vspltisw, tcg_gen_gvec_dup32i, 6, 14);
+    simm = SIMM5(ctx->opcode);
+    tcg_gen_gvec_dup_imm(vece, avr_full_offset(rD(ctx->opcode)), 16, 16, simm);
+}
+
+#define GEN_VXFORM_VSPLTI(name, vece, opc2, opc3) \
+static void glue(gen_, name)(DisasContext *ctx) { gen_vsplti(ctx, vece); }
+
+GEN_VXFORM_VSPLTI(vspltisb, MO_8, 6, 12);
+GEN_VXFORM_VSPLTI(vspltish, MO_16, 6, 13);
+GEN_VXFORM_VSPLTI(vspltisw, MO_32, 6, 14);
 
 #define GEN_VXFORM_NOA(name, opc2, opc3) \
 static void glue(gen_, name)(DisasContext *ctx) \
@@ -XXX,XX +XXX,XX @@ GEN_VXFORM_DUAL(vsldoi, PPC_ALTIVEC, PPC_NONE,
 #undef GEN_VXRFORM_DUAL
 #undef GEN_VXRFORM1
 #undef GEN_VXRFORM
-#undef GEN_VXFORM_DUPI
+#undef GEN_VXFORM_VSPLTI
 #undef GEN_VXFORM_NOA
 #undef GEN_VXFORM_UIMM
 #undef GEN_VAFORM_PAIRED
diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate/vsx-impl.inc.c
+++ b/target/ppc/translate/vsx-impl.inc.c
@@ -XXX,XX +XXX,XX @@ static void gen_xxspltib(DisasContext *ctx)
             return;
         }
     }
-    tcg_gen_gvec_dup8i(vsr_full_offset(rt), 16, 16, uim8);
+    tcg_gen_gvec_dup_imm(MO_8, vsr_full_offset(rt), 16, 16, uim8);
 }
 
 static void gen_xxsldwi(DisasContext *ctx)
--
2.20.1
Subject: [PULL 04/10] target/arm: Use tcg_gen_gvec_dup_imm

In a few cases, we're able to remove some manual replication.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
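[Editor's note, not part of the commit: the "manual replication" being
removed is the dup_const() step.  tcg_gen_gvec_dup_imm replicates the
immediate according to its vece argument itself, so trans_FDUP goes from

    imm = dup_const(a->esz, imm);
    tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);

to the single call

    tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);]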
 target/arm/translate-a64.c | 10 +++++-----
 target/arm/translate-sve.c | 12 +++++-------
 target/arm/translate.c     |  9 ++++++---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
         tcg_temp_free_i64(tcg_zero);
     }
     if (vsz > 16) {
-        tcg_gen_gvec_dup8i(ofs + 16, vsz - 16, vsz - 16, 0);
+        tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
 
     if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
         /* MOVI or MVNI, with MVNI negation handled above. */
-        tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8,
-                            vec_full_reg_size(s), imm);
+        tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8,
+                             vec_full_reg_size(s), imm);
     } else {
         /* ORR or BIC, with BIC negation to AND handled above. */
         if (is_neg) {
@@ -XXX,XX +XXX,XX @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
     if (is_u) {
         if (shift == 8 << size) {
             /* Shift count the same size as element size produces zero. */
-            tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd),
-                               is_q ? 16 : 8, vec_full_reg_size(s), 0);
+            tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd),
+                                 is_q ? 16 : 8, vec_full_reg_size(s), 0);
         } else {
             gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size);
         }
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -XXX,XX +XXX,XX @@ static bool do_mov_z(DisasContext *s, int rd, int rn)
 static void do_dupi_z(DisasContext *s, int rd, uint64_t word)
 {
     unsigned vsz = vec_full_reg_size(s);
-    tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), vsz, vsz, word);
+    tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word);
 }
 
 /* Invoke a vector expander on two Pregs. */
@@ -XXX,XX +XXX,XX @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag)
         unsigned oprsz = size_for_gvec(setsz / 8);
 
         if (oprsz * 8 == setsz) {
-            tcg_gen_gvec_dup64i(ofs, oprsz, maxsz, word);
+            tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word);
             goto done;
         }
     }
@@ -XXX,XX +XXX,XX @@ static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a)
             unsigned nofs = vec_reg_offset(s, a->rn, index, esz);
             tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz);
         } else {
-            tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0);
+            tcg_gen_gvec_dup_imm(esz, dofs, vsz, vsz, 0);
         }
     }
     return true;
@@ -XXX,XX +XXX,XX @@ static bool trans_FDUP(DisasContext *s, arg_FDUP *a)
 
         /* Decode the VFP immediate. */
         imm = vfp_expand_imm(a->esz, a->imm);
-        imm = dup_const(a->esz, imm);
-
-        tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);
+        tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);
     }
     return true;
 }
@@ -XXX,XX +XXX,XX @@ static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a)
         unsigned vsz = vec_full_reg_size(s);
         int dofs = vec_full_reg_offset(s, a->rd);
 
-        tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm));
+        tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm);
     }
     return true;
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                                   MIN(shift, (8 << size) - 1),
                                   vec_size, vec_size);
                 } else if (shift >= 8 << size) {
-                    tcg_gen_gvec_dup8i(rd_ofs, vec_size, vec_size, 0);
+                    tcg_gen_gvec_dup_imm(MO_8, rd_ofs, vec_size,
+                                         vec_size, 0);
                 } else {
                     tcg_gen_gvec_shri(size, rd_ofs, rm_ofs, shift,
                                       vec_size, vec_size);
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                          * architecturally valid and results in zero.
                          */
                         if (shift >= 8 << size) {
-                            tcg_gen_gvec_dup8i(rd_ofs, vec_size, vec_size, 0);
+                            tcg_gen_gvec_dup_imm(size, rd_ofs,
+                                                 vec_size, vec_size, 0);
                         } else {
                             tcg_gen_gvec_shli(size, rd_ofs, rm_ofs, shift,
                                               vec_size, vec_size);
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 }
                 tcg_temp_free_i64(t64);
             } else {
-                tcg_gen_gvec_dup32i(reg_ofs, vec_size, vec_size, imm);
+                tcg_gen_gvec_dup_imm(MO_32, reg_ofs, vec_size,
+                                     vec_size, imm);
             }
         }
     }
--
2.20.1
Subject: [PULL 05/10] tcg: Use tcg_gen_gvec_dup_imm in logical simplifications

Replace the outgoing interface.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
--
2.20.1
Subject: [PULL 06/10] tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i

These interfaces are now unused.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  5 -----
 tcg/tcg-op-gvec.c         | 28 ----------------------------
 2 files changed, 33 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i64);
 
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
-
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
     }
 }
 
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint64_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint32_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint16_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
-                        uint32_t maxsz, uint8_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                           uint32_t maxsz, uint64_t x)
 {
--
2.20.1
Subject: [PULL 07/10] tcg: Add tcg_gen_gvec_dup_tl

For use when a target needs to pass a configure-specific
target_ulong value to duplicate.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
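[Editor's note, not part of the commit: a hedged usage sketch.  The
vece/dofs/oprsz/maxsz values are placeholders, and the helper flow is
an assumption of this example.  TCGv is i32 or i64 depending on
TARGET_LONG_BITS, which is exactly what the macro tracks:

    TCGv t = tcg_temp_new();
    tcg_gen_movi_tl(t, some_target_ulong_value);   /* hypothetical */
    tcg_gen_gvec_dup_tl(vece, dofs, oprsz, maxsz, t);
    tcg_temp_free(t);]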
 include/tcg/tcg-op-gvec.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i64);
 
+#if TARGET_LONG_BITS == 64
+# define tcg_gen_gvec_dup_tl tcg_gen_gvec_dup_i64
+#else
+# define tcg_gen_gvec_dup_tl tcg_gen_gvec_dup_i32
+#endif
+
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
--
2.20.1
Subject: [PULL 08/10] tcg: Improve vector tail clearing

Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
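[Editor's note, not part of the commit: a worked example of the new
check_size_impl() accounting.  With lnsz == 32 and oprsz == 88:

    q = 88 / 32 = 2,  r = 88 % 32 = 24 (binary 11000)
    q += ctpop32(24) = 2,  giving q = 4

i.e. the operation is expanded as 2x32 + 1x16 + 1x8 and accepted when
4 <= MAX_UNROLL.  For lnsz < 16 any remainder still rejects.]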
 tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-    if (oprsz % lnsz == 0) {
-        uint32_t lnct = oprsz / lnsz;
-        return lnct >= 1 && lnct <= MAX_UNROLL;
+    uint32_t q, r;
+
+    if (oprsz < lnsz) {
+        return false;
     }
-    return false;
+
+    q = oprsz / lnsz;
+    r = oprsz % lnsz;
+    tcg_debug_assert((r & 7) == 0);
+
+    if (lnsz < 16) {
+        /* For sizes below 16, accept no remainder. */
+        if (r != 0) {
+            return false;
+        }
+    } else {
+        /*
+         * Recall that ARM SVE allows vector sizes that are not a
+         * power of 2, but always a multiple of 16.  The intent is
+         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+         * In addition, expand_clr needs to handle a multiple of 8.
+         * Thus we can handle the tail with one more operation per
+         * diminishing power of 2.
+         */
+        q += ctpop32(r);
+    }
+
+    return q <= MAX_UNROLL;
 }
 
 static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -XXX,XX +XXX,XX @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                   uint32_t size, bool prefer_i64)
 {
-    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-        /*
-         * Recall that ARM SVE allows vector sizes that are not a
-         * power of 2, but always a multiple of 16.  The intent is
-         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
-         * It is hard to imagine a case in which v256 is supported
-         * but v128 is not, but check anyway.
-         */
-        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-            && (size % 32 == 0
-                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-            return TCG_TYPE_V256;
-        }
+    /*
+     * Recall that ARM SVE allows vector sizes that are not a
+     * power of 2, but always a multiple of 16.  The intent is
+     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+     * It is hard to imagine a case in which v256 is supported
+     * but v128 is not, but check anyway.
+     * In addition, expand_clr needs to handle a multiple of 8.
+     */
+    if (TCG_TARGET_HAS_v256 &&
+        check_size_impl(size, 32) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+        (!(size & 16) ||
+         (TCG_TARGET_HAS_v128 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+        return TCG_TYPE_V256;
     }
-    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+    if (TCG_TARGET_HAS_v128 &&
+        check_size_impl(size, 16) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
         return TCG_TYPE_V128;
     }
     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -XXX,XX +XXX,XX @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 {
     uint32_t i = 0;
 
+    tcg_debug_assert(oprsz >= 8);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
+     * are misaligned wrt the maximum vector size, so do that first.
+     */
+    if (dofs & 8) {
+        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+        i += 8;
+    }
+
     switch (type) {
     case TCG_TYPE_V256:
         /*
--
2.20.1
Subject: [PULL 09/10] tcg: Add load_dest parameter to GVecGen2

We have this same parameter for GVecGen2i, GVecGen3,
and GVecGen3i.  This will make some SVE2 insns easier
to parameterize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
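[Editor's note, not part of the commit: a hedged sketch of how an
expander opts in.  The gen_foo_i64 callback name is hypothetical.

    static const GVecGen2 g = {
        .fni8 = gen_foo_i64,   /* hypothetical per-element generator */
        .load_dest = true,     /* dest is read as a second input */
        .vece = MO_64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);

With load_dest set, each destination element is loaded into the output
temporary before the callback runs, so an operation that accumulates
into the destination can read the old value.]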
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c         | 45 ++++++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ typedef struct {
     uint8_t vece;
     /* Prefer i64 to v64. */
     bool prefer_i64;
+    /* Load dest as a 2nd source operand. */
+    bool load_dest;
 } GVecGen2;
 
 typedef struct {
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static void expand_clr(uint32_t dofs, uint32_t maxsz)
 
 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGv_i32, TCGv_i32))
+                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 {
     TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 4) {
         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
-        fni(t0, t0);
-        tcg_gen_st_i32(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
+        }
+        fni(t1, t0);
+        tcg_gen_st_i32(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
 }
 
 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -XXX,XX +XXX,XX @@ static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 
 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGv_i64, TCGv_i64))
+                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 {
     TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 8) {
         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
-        fni(t0, t0);
-        tcg_gen_st_i64(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
+        }
+        fni(t1, t0);
+        tcg_gen_st_i64(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
 }
 
 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -XXX,XX +XXX,XX @@ static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */
 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
+                         bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 {
     TCGv_vec t0 = tcg_temp_new_vec(type);
+    TCGv_vec t1 = tcg_temp_new_vec(type);
     uint32_t i;
 
     for (i = 0; i < oprsz; i += tysz) {
         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
-        fni(vece, t0, t0);
-        tcg_gen_st_vec(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
+        }
+        fni(vece, t1, t0);
+        tcg_gen_st_vec(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_vec(t0);
+    tcg_temp_free_vec(t1);
 }
 
 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
          */
         some = QEMU_ALIGN_DOWN(oprsz, 32);
-        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
         if (some == oprsz) {
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
         maxsz -= some;
         /* fallthru */
     case TCG_TYPE_V128:
-        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
         break;
     case TCG_TYPE_V64:
-        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+                     g->load_dest, g->fniv);
         break;
 
     case 0:
         if (g->fni8 && check_size_impl(oprsz, 8)) {
-            expand_2_i64(dofs, aofs, oprsz, g->fni8);
+            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
-            expand_2_i32(dofs, aofs, oprsz, g->fni4);
+            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
         } else {
             assert(g->fno != NULL);
             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
--
2.20.1
Subject: [PULL 10/10] tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64}

For the benefit of compatibility of function pointer types,
we have standardized on int32_t and int64_t as the integral
argument to tcg expanders.

We converted most of them in 474b2e8f0f7, but missed the rotates.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
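[Editor's note, not part of the commit: why the signature matters.
Shift and rotate expanders are commonly stored in shared function
pointer tables, which only type-check if every entry agrees:

    typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t);
    static ShiftImmFn * const fns[] = {
        tcg_gen_shli_i32,    /* already int32_t since 474b2e8f0f7 */
        tcg_gen_rotli_i32,   /* int32_t as of this patch */
    };

The added assert (arg2 >= 0) preserves the old unsigned-range check.]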
 include/tcg/tcg-op.h |  8 ++++----
 tcg/tcg-op.c         | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
 void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
 void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
                          unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
 void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
 void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
                          unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
     }
 }
 
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-    tcg_debug_assert(arg2 < 32);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 32);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 }
 
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-    tcg_debug_assert(arg2 < 32);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 32);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     }
 }
 
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-    tcg_debug_assert(arg2 < 64);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 64);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     }
 }
 
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-    tcg_debug_assert(arg2 < 64);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 64);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
--
2.20.1