This patch improves the performance of the emulation of the RVV unit-stride
loads and stores in the following cases:
- when the data being loaded/stored per iteration amounts to 8 bytes or less.
- when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
vector registers (LMUL=1).
The optimization avoids the overhead of probing the RAM of the host machine;
instead, the load/store is performed in a loop over the input data, split into
chunks of as many bytes as possible (8, 4, 2 or 1 bytes).
Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
---
target/riscv/vector_helper.c | 47 ++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 4479726acf..75c24653f0 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -635,6 +635,53 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
VSTART_CHECK_EARLY_EXIT(env);
+#if defined(CONFIG_USER_ONLY) && !HOST_BIG_ENDIAN
+ /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
+ * better performance by doing a simple simulation of the load/store
+ * without the overhead of prodding the host RAM */
+ if ((nf == 1) && ((evl << log2_esz) <= 8 ||
+ ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
+
+ uint32_t evl_b = evl << log2_esz;
+
+ for (uint32_t j = env->vstart; j < evl_b;) {
+ addr = base + j;
+ if ((evl_b - j) >= 8) {
+ if (is_load)
+ lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 8;
+ }
+ else if ((evl_b - j) >= 4) {
+ if (is_load)
+ lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 4;
+ }
+ else if ((evl_b - j) >= 2) {
+ if (is_load)
+ lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 2;
+ }
+ else {
+ if (is_load)
+ lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 1;
+ }
+ }
+
+ env->vstart = 0;
+ vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
+ return;
+ }
+#endif
+
vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc,
log2_esz, false);
/* Probe the page(s). Exit with exception for any invalid page. */
--
2.34.1
On 10/29/24 4:43 PM, Paolo Savini wrote: > This patch improves the performance of the emulation of the RVV unit-stride > loads and stores in the following cases: > > - when the data being loaded/stored per iteration amounts to 8 bytes or less. > - when the vector length is 16 bytes (VLEN=128) and there's no grouping of the > vector registers (LMUL=1). > > The optimization consists of avoiding the overhead of probing the RAM of the > host machine and doing a loop load/store on the input data grouped in chunks > of as many bytes as possible (8,4,2,1 bytes). > > Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com> > Co-authored-by: Paolo Savini <paolo.savini@embecosm.com> > > Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com> > --- Paolo, To merge this patch we need your Signed-off-by tag since you're marked as the author. When sending a new version please add your Signed-off-by as well, and also add a: Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> > target/riscv/vector_helper.c | 47 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 47 insertions(+) > > diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c > index 4479726acf..75c24653f0 100644 > --- a/target/riscv/vector_helper.c > +++ b/target/riscv/vector_helper.c > @@ -635,6 +635,53 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, > > VSTART_CHECK_EARLY_EXIT(env); > > +#if defined(CONFIG_USER_ONLY) && !HOST_BIG_ENDIAN > + /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a > + * better performance by doing a simple simulation of the load/store > + * without the overhead of prodding the host RAM */ > + if ((nf == 1) && ((evl << log2_esz) <= 8 || > + ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) { > + > + uint32_t evl_b = evl << log2_esz; > + > + for (uint32_t j = env->vstart; j < evl_b;) { > + addr = base + j; > + if ((evl_b - j) >= 8) { > + if (is_load) > + lde_d_tlb(env, adjust_addr(env, addr), j, vd, 
ra); > + else > + ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra); > + j += 8; > + } > + else if ((evl_b - j) >= 4) { > + if (is_load) > + lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra); > + else > + ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra); > + j += 4; > + } > + else if ((evl_b - j) >= 2) { > + if (is_load) > + lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra); > + else > + ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra); > + j += 2; > + } > + else { > + if (is_load) > + lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra); > + else > + ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra); > + j += 1; > + } > + } > + > + env->vstart = 0; > + vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems); > + return; > + } > +#endif > + > vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc, > log2_esz, false); > /* Probe the page(s). Exit with exception for any invalid page. */
© 2016 - 2024 Red Hat, Inc.