From: Helene CHELIN <helene.chelin@embecosm.com>
This patch improves the performance of the emulation of the RVV unit-stride
loads and stores in the following cases:
- when the data being loaded/stored per iteration amounts to 8 bytes or less.
- when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
vector registers (LMUL=1).
The optimization consists of avoiding the overhead of probing the RAM of the
host machine and doing a loop load/store on the input data grouped in chunks
of as many bytes as possible (8,4,2,1 bytes).
Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
---
target/riscv/vector_helper.c | 46 ++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 29849a8b66..4b444c6bc5 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -633,6 +633,52 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
VSTART_CHECK_EARLY_EXIT(env);
+ /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
+ * better performance by doing a simple simulation of the load/store
+ * without the overhead of probing the host RAM */
+ if ((nf == 1) && ((evl << log2_esz) <= 8 ||
+ ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
+
+ uint32_t evl_b = evl << log2_esz;
+
+ for (uint32_t j = env->vstart; j < evl_b;) {
+ addr = base + j;
+ if ((evl_b - j) >= 8) {
+ if (is_load)
+ lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 8;
+ }
+ else if ((evl_b - j) >= 4) {
+ if (is_load)
+ lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 4;
+ }
+ else if ((evl_b - j) >= 2) {
+ if (is_load)
+ lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 2;
+ }
+ else {
+ if (is_load)
+ lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ else
+ ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+ j += 1;
+ }
+ }
+
+ env->vstart = 0;
+ vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
+ return;
+ }
+
+
vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc,
log2_esz, false);
/* Probe the page(s). Exit with exception for any invalid page. */
--
2.17.1
On 7/18/24 01:30, Paolo Savini wrote:
> From: Helene CHELIN <helene.chelin@embecosm.com>
>
> This patch improves the performance of the emulation of the RVV unit-stride
> loads and stores in the following cases:
>
> - when the data being loaded/stored per iteration amounts to 8 bytes or less.
> - when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
> vector registers (LMUL=1).
>
> The optimization consists of avoiding the overhead of probing the RAM of the
> host machine and doing a loop load/store on the input data grouped in chunks
> of as many bytes as possible (8,4,2,1 bytes).
>
> Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
> Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
>
> Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
> ---
> target/riscv/vector_helper.c | 46 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 46 insertions(+)
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 29849a8b66..4b444c6bc5 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -633,6 +633,52 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>
> VSTART_CHECK_EARLY_EXIT(env);
>
> + /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
> + * better performance by doing a simple simulation of the load/store
> + * without the overhead of probing the host RAM */
> + if ((nf == 1) && ((evl << log2_esz) <= 8 ||
> + ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
> +
> + uint32_t evl_b = evl << log2_esz;
> +
> + for (uint32_t j = env->vstart; j < evl_b;) {
> + addr = base + j;
> + if ((evl_b - j) >= 8) {
> + if (is_load)
> + lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 8;
> + }
> + else if ((evl_b - j) >= 4) {
> + if (is_load)
> + lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 4;
> + }
> + else if ((evl_b - j) >= 2) {
> + if (is_load)
> + lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 2;
> + }
> + else {
> + if (is_load)
> + lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 1;
> + }
> + }
For system mode, this performs the tlb lookup N times, and so will not be an improvement.
This will not work on a big-endian host.
r~
On 7/27/24 4:13 AM, Richard Henderson wrote:
> On 7/18/24 01:30, Paolo Savini wrote:
>> From: Helene CHELIN <helene.chelin@embecosm.com>
>>
>> This patch improves the performance of the emulation of the RVV unit-stride
>> loads and stores in the following cases:
>>
>> - when the data being loaded/stored per iteration amounts to 8 bytes or less.
>> - when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
>> vector registers (LMUL=1).
>>
>> The optimization consists of avoiding the overhead of probing the RAM of the
>> host machine and doing a loop load/store on the input data grouped in chunks
>> of as many bytes as possible (8,4,2,1 bytes).
>>
>> Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
>> Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
>>
>> Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
>> ---
>> target/riscv/vector_helper.c | 46 ++++++++++++++++++++++++++++++++++++
>> 1 file changed, 46 insertions(+)
>>
>> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
>> index 29849a8b66..4b444c6bc5 100644
>> --- a/target/riscv/vector_helper.c
>> +++ b/target/riscv/vector_helper.c
>> @@ -633,6 +633,52 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>> VSTART_CHECK_EARLY_EXIT(env);
>> + /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
>> + * better performance by doing a simple simulation of the load/store
>> + * without the overhead of probing the host RAM */
>> + if ((nf == 1) && ((evl << log2_esz) <= 8 ||
>> + ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
>> +
>> + uint32_t evl_b = evl << log2_esz;
>> +
>> + for (uint32_t j = env->vstart; j < evl_b;) {
>> + addr = base + j;
>> + if ((evl_b - j) >= 8) {
>> + if (is_load)
>> + lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + else
>> + ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + j += 8;
>> + }
>> + else if ((evl_b - j) >= 4) {
>> + if (is_load)
>> + lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + else
>> + ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + j += 4;
>> + }
>> + else if ((evl_b - j) >= 2) {
>> + if (is_load)
>> + lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + else
>> + ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + j += 2;
>> + }
>> + else {
>> + if (is_load)
>> + lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + else
>> + ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
>> + j += 1;
>> + }
>> + }
>
> For system mode, this performs the tlb lookup N times, and so will not be an improvement.
I believe we can wrap this up in an "#ifdef CONFIG_USER_ONLY" block to allow
linux-user mode to benefit from it. We would still need to take care of the
host endianness, though.
Thanks,
Daniel
>
> This will not work on a big-endian host.
>
>
> r~
On 7/17/24 12:30 PM, Paolo Savini wrote:
> From: Helene CHELIN <helene.chelin@embecosm.com>
>
> This patch improves the performance of the emulation of the RVV unit-stride
> loads and stores in the following cases:
>
> - when the data being loaded/stored per iteration amounts to 8 bytes or less.
> - when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
> vector registers (LMUL=1).
>
> The optimization consists of avoiding the overhead of probing the RAM of the
> host machine and doing a loop load/store on the input data grouped in chunks
> of as many bytes as possible (8,4,2,1 bytes).
>
> Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
> Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
>
> Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
> ---
> target/riscv/vector_helper.c | 46 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 46 insertions(+)
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 29849a8b66..4b444c6bc5 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -633,6 +633,52 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>
> VSTART_CHECK_EARLY_EXIT(env);
>
> + /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
> + * better performance by doing a simple simulation of the load/store
> + * without the overhead of probing the host RAM */
> + if ((nf == 1) && ((evl << log2_esz) <= 8 ||
> + ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
> +
> + uint32_t evl_b = evl << log2_esz;
> +
> + for (uint32_t j = env->vstart; j < evl_b;) {
> + addr = base + j;
> + if ((evl_b - j) >= 8) {
> + if (is_load)
> + lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 8;
> + }
> + else if ((evl_b - j) >= 4) {
> + if (is_load)
> + lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 4;
> + }
> + else if ((evl_b - j) >= 2) {
> + if (is_load)
> + lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 2;
> + }
> + else {
> + if (is_load)
> + lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + else
> + ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> + j += 1;
> + }
> + }
> +
> + env->vstart = 0;
> + vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
> + return;
> + }
> +
Aside from the code style remarks that ./scripts/checkpatch.pl will make here (we always
use curly braces in all ifs and elses, regardless of being a single statement or not),
LGTM.
Thanks,
Daniel
> +
> vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc,
> log2_esz, false);
> /* Probe the page(s). Exit with exception for any invalid page. */
© 2016 - 2026 Red Hat, Inc.