1 | Following the reviews on the previous version: | 1 | Previous versions: |
---|---|---|---|
2 | 2 | ||
3 | - RFC v1: https://lore.kernel.org/all/20241218170840.1090473-1-paolo.savini@embecosm.com/ | 3 | - RFC v1: https://lore.kernel.org/all/20241218170840.1090473-1-paolo.savini@embecosm.com/ |
4 | - Review: https://lore.kernel.org/all/e8fb908d-4723-417a-bf88-b4050432ddad@linaro.org/ | 4 | - RFC v2: https://lore.kernel.org/all/20241220153834.16302-1-paolo.savini@embecosm.com/ |
5 | - RFC v3: https://lore.kernel.org/all/20250122164905.13615-1-paolo.savini@embecosm.com/ | ||
5 | 6 | ||
6 | we apply the following fixes: | 7 | Version v4 of this patch brings the following changes: |
7 | 8 | ||
8 | - Fall back to using the helper function if vstart != 0 at the beginning | 9 | - removed the host specific conditions so that the behaviour of the emulation |
9 | of the iterations and refactor the setting of the function arguments | 10 | doesn't depend on the host we are running on. |
10 | accordignly. | 11 | The intruduction of this extra complexity is not worth the very marginal |
11 | - Add mark_vs_dirty before performing the memory operations. | 12 | performance improvement, when the overall performance improves anyway |
12 | - Loosen the atomicity constraints and apply only MO_ATOM_IFALIGN_PAIR | 13 | considerably without. |
13 | for element sizes MO_16, MO_32 and MO_64. | 14 | - added reviewers contacts (thanks all for reviewing the work). |
14 | - Change the way we update vstart in order to set vstart to 0 if it's the last | 15 | - changed the header from RFC to PATCH. |
15 | iteration. | ||
16 | - Fix the indentation. | ||
17 | |||
18 | We also rephrase the commit message to better reflect the new behaviour of the | ||
19 | patch. | ||
20 | |||
21 | Many thanks Richard for the thorough review and explanations. | ||
22 | 16 | ||
23 | Cc: Richard Handerson <richard.henderson@linaro.org> | 17 | Cc: Richard Handerson <richard.henderson@linaro.org> |
24 | Cc: Palmer Dabbelt <palmer@dabbelt.com> | 18 | Cc: Palmer Dabbelt <palmer@dabbelt.com> |
25 | Cc: Alistair Francis <alistair.francis@wdc.com> | 19 | Cc: Alistair Francis <alistair.francis@wdc.com> |
26 | Cc: Bin Meng <bmeng.cn@gmail.com> | 20 | Cc: Bin Meng <bmeng.cn@gmail.com> |
... | ... | ||
36 | 30 | ||
37 | Paolo Savini (1): | 31 | Paolo Savini (1): |
38 | target/riscv: use tcg ops generation to emulate whole reg rvv | 32 | target/riscv: use tcg ops generation to emulate whole reg rvv |
39 | loads/stores. | 33 | loads/stores. |
40 | 34 | ||
41 | target/riscv/insn_trans/trans_rvv.c.inc | 125 +++++++++++++++--------- | 35 | target/riscv/insn_trans/trans_rvv.c.inc | 155 +++++++++++++++++------- |
42 | 1 file changed, 78 insertions(+), 47 deletions(-) | 36 | 1 file changed, 108 insertions(+), 47 deletions(-) |
43 | 37 | ||
44 | -- | 38 | -- |
45 | 2.34.1 | 39 | 2.34.1 | diff view generated by jsdifflib |
1 | This patch replaces the use of a helper function with direct tcg ops generation | 1 | This patch replaces the use of a helper function with direct tcg ops generation |
---|---|---|---|
2 | in order to emulate whole register loads and stores. This is done in order to | 2 | in order to emulate whole register loads and stores. This is done in order to |
3 | improve the performance of QEMU. | 3 | improve the performance of QEMU. |
4 | We still use the helper function when vstart is not 0 at the beginning of the | 4 | We still use the helper function when vstart is not 0 at the beginning of the |
5 | emulation of the whole register load or store. | 5 | emulation of the whole register load or store or when we would end up generating |
6 | partial loads or stores of vector elements (e.g. emulating 64 bits element loads | ||
7 | with pairs of 32 bits loads on hosts with 32 bits registers). | ||
8 | The latter condition ensures that we are not surprised by a trap in mid-element | ||
9 | and consecutively that we can update vstart correctly. | ||
10 | We also use the helper function when it performs better than tcg for specific | ||
11 | combinations of vector length, number of fields and element size. | ||
6 | 12 | ||
7 | Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> | 13 | Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> |
14 | Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> | ||
15 | Reviewed-by: Richard Handerson <richard.henderson@linaro.org> | ||
16 | Reviewed-by: Max Chou <max.chou@sifive.com> | ||
17 | Reviewed-by: "Alex Bennée" <alex.bennee@linaro.org> | ||
8 | --- | 18 | --- |
9 | target/riscv/insn_trans/trans_rvv.c.inc | 125 +++++++++++++++--------- | 19 | target/riscv/insn_trans/trans_rvv.c.inc | 155 +++++++++++++++++------- |
10 | 1 file changed, 78 insertions(+), 47 deletions(-) | 20 | 1 file changed, 108 insertions(+), 47 deletions(-) |
11 | 21 | ||
12 | diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc | 22 | diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc |
13 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/target/riscv/insn_trans/trans_rvv.c.inc | 24 | --- a/target/riscv/insn_trans/trans_rvv.c.inc |
15 | +++ b/target/riscv/insn_trans/trans_rvv.c.inc | 25 | +++ b/target/riscv/insn_trans/trans_rvv.c.inc |
... | ... | ||
37 | - | 47 | - |
38 | mark_vs_dirty(s); | 48 | mark_vs_dirty(s); |
39 | 49 | ||
40 | - fn(dest, base, tcg_env, desc); | 50 | - fn(dest, base, tcg_env, desc); |
41 | + /* | 51 | + /* |
42 | + * Load/store minimum vlenb bytes per iteration. | 52 | + * Load/store multiple bytes per iteration. |
43 | + * When possible do this atomically. | 53 | + * When possible do this atomically. |
44 | + * Update vstart with the number of processed elements. | 54 | + * Update vstart with the number of processed elements. |
55 | + * Use the helper function if either: | ||
56 | + * - vstart is not 0. | ||
57 | + * - the target has 32 bit registers and we are loading/storing 64 bit long | ||
58 | + * elements. This is to ensure that we process every element with a single | ||
59 | + * memory instruction. | ||
45 | + */ | 60 | + */ |
46 | + if (s->vstart_eq_zero) { | 61 | + |
62 | + bool use_helper_fn = !(s->vstart_eq_zero) || | ||
63 | + (TCG_TARGET_REG_BITS == 32 && log2_esz == 3); | ||
64 | + | ||
65 | + if (!use_helper_fn) { | ||
47 | + TCGv addr = tcg_temp_new(); | 66 | + TCGv addr = tcg_temp_new(); |
48 | + uint32_t size = s->cfg_ptr->vlenb * nf; | 67 | + uint32_t size = s->cfg_ptr->vlenb * nf; |
49 | + TCGv_i128 t16 = tcg_temp_new_i128(); | 68 | + TCGv_i64 t8 = tcg_temp_new_i64(); |
69 | + TCGv_i32 t4 = tcg_temp_new_i32(); | ||
50 | + MemOp atomicity = MO_ATOM_NONE; | 70 | + MemOp atomicity = MO_ATOM_NONE; |
51 | + if (log2_esz == 0) { | 71 | + if (log2_esz == 0) { |
52 | + atomicity = MO_ATOM_NONE; | 72 | + atomicity = MO_ATOM_NONE; |
53 | + } else { | 73 | + } else { |
54 | + atomicity = MO_ATOM_IFALIGN_PAIR; | 74 | + atomicity = MO_ATOM_IFALIGN_PAIR; |
55 | + } | 75 | + } |
56 | + for (int i = 0; i < size; i += 16) { | 76 | + if (TCG_TARGET_REG_BITS == 64) { |
57 | + addr = get_address(s, rs1, i); | 77 | + for (int i = 0; i < size; i += 8) { |
58 | + if (is_load) { | 78 | + addr = get_address(s, rs1, i); |
59 | + tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx, | 79 | + if (is_load) { |
60 | + MO_LE | MO_128 | atomicity); | 80 | + tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx, |
61 | + tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i); | 81 | + MO_LE | MO_64 | atomicity); |
62 | + } else { | 82 | + tcg_gen_st_i64(t8, tcg_env, vreg_ofs(s, vd) + i); |
63 | + tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i); | 83 | + } else { |
64 | + tcg_gen_qemu_st_i128(t16, addr, s->mem_idx, | 84 | + tcg_gen_ld_i64(t8, tcg_env, vreg_ofs(s, vd) + i); |
65 | + MO_LE | MO_128 | atomicity); | 85 | + tcg_gen_qemu_st_i64(t8, addr, s->mem_idx, |
86 | + MO_LE | MO_64 | atomicity); | ||
87 | + } | ||
88 | + if (i == size - 8) { | ||
89 | + tcg_gen_movi_tl(cpu_vstart, 0); | ||
90 | + } else { | ||
91 | + tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 8 >> log2_esz); | ||
92 | + } | ||
66 | + } | 93 | + } |
67 | + if (i == size - 16) { | 94 | + } else { |
68 | + tcg_gen_movi_tl(cpu_vstart, 0); | 95 | + for (int i = 0; i < size; i += 4) { |
69 | + } else { | 96 | + addr = get_address(s, rs1, i); |
70 | + tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz); | 97 | + if (is_load) { |
98 | + tcg_gen_qemu_ld_i32(t4, addr, s->mem_idx, | ||
99 | + MO_LE | MO_32 | atomicity); | ||
100 | + tcg_gen_st_i32(t4, tcg_env, vreg_ofs(s, vd) + i); | ||
101 | + } else { | ||
102 | + tcg_gen_ld_i32(t4, tcg_env, vreg_ofs(s, vd) + i); | ||
103 | + tcg_gen_qemu_st_i32(t4, addr, s->mem_idx, | ||
104 | + MO_LE | MO_32 | atomicity); | ||
105 | + } | ||
106 | + if (i == size - 4) { | ||
107 | + tcg_gen_movi_tl(cpu_vstart, 0); | ||
108 | + } else { | ||
109 | + tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 4 >> log2_esz); | ||
110 | + } | ||
71 | + } | 111 | + } |
72 | + } | 112 | + } |
73 | + } else { | 113 | + } else { |
74 | + TCGv_ptr dest; | 114 | + TCGv_ptr dest; |
75 | + TCGv base; | 115 | + TCGv base; |
... | ... | ||
160 | 200 | ||
161 | /* | 201 | /* |
162 | *** Vector Integer Arithmetic Instructions | 202 | *** Vector Integer Arithmetic Instructions |
163 | -- | 203 | -- |
164 | 2.34.1 | 204 | 2.34.1 |
205 | |||
206 | diff view generated by jsdifflib |