[RFC PATCH 3/6] target/riscv: Inline vext_ldst_us and corresponding function for performance

Max Chou posted 6 patches 8 months, 2 weeks ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, Riku Voipio <riku.voipio@iki.fi>, Palmer Dabbelt <palmer@dabbelt.com>, Alistair Francis <alistair.francis@wdc.com>, Bin Meng <bin.meng@windriver.com>, Weiwei Li <liwei1518@gmail.com>, Daniel Henrique Barboza <dbarboza@ventanamicro.com>, Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
[RFC PATCH 3/6] target/riscv: Inline vext_ldst_us and corresponding function for performance
Posted by Max Chou 8 months, 2 weeks ago
In the vector unit-stride load/store helper functions, the vext_ldst_us
function accounts for most of the execution time. Inlining the functions
can avoid the function call overhead and improve the helper function
performance.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index e8fbb921449..866f77d321d 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
                                uint32_t idx, void *vd, uintptr_t retaddr);
 
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
-static void NAME(CPURISCVState *env, abi_ptr addr,         \
-                 uint32_t idx, void *vd, uintptr_t retaddr)\
-{                                                          \
-    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
-    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
-}                                                          \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)         \
+static inline QEMU_ALWAYS_INLINE                        \
+void NAME(CPURISCVState *env, abi_ptr addr,             \
+          uint32_t idx, void *vd, uintptr_t retaddr)    \
+{                                                       \
+    ETYPE *cur = ((ETYPE *)vd + H(idx));                \
+    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);   \
+}                                                       \
 
 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
 
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
-static void NAME(CPURISCVState *env, abi_ptr addr,         \
-                 uint32_t idx, void *vd, uintptr_t retaddr)\
-{                                                          \
-    ETYPE data = *((ETYPE *)vd + H(idx));                  \
-    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)         \
+static inline QEMU_ALWAYS_INLINE                        \
+void NAME(CPURISCVState *env, abi_ptr addr,             \
+          uint32_t idx, void *vd, uintptr_t retaddr)    \
+{                                                       \
+    ETYPE data = *((ETYPE *)vd + H(idx));               \
+    cpu_##STSUF##_data_ra(env, addr, data, retaddr);    \
 }
 
 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
@@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
  */
 
 /* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
              uintptr_t ra)
-- 
2.34.1
Re: [RFC PATCH 3/6] target/riscv: Inline vext_ldst_us and corresponding function for performance
Posted by Daniel Henrique Barboza 8 months, 2 weeks ago

On 2/15/24 16:28, Max Chou wrote:
> In the vector unit-stride load/store helper functions. the vext_ldst_us
> function corresponding most of the execution time. Inline the functions
> can avoid the function call overhead to imperove the helper function
> performance.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---

The inline is a good idea but I think we can do better. I mentioned in a thread
last year [1] about the time we're spending in single byte loads/stores, even
for strided instructions.

E.g. in vext_ldst_stride():


     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
         k = 0;
         while (k < nf) {
             if (!vm && !vext_elem_mask(v0, i)) {
                 /* set masked-off elements to 1s */
                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                   (i + k * max_elems + 1) * esz);
                 k++;
                 continue;
             }
             target_ulong addr = base + stride * i + (k << log2_esz);
             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
             k++;
         }
     }

We're doing single byte load/stores in ldst_elem() when, in this case, we could do
it in a whole block only once. ARM does something similar in SVE.

I updated the gitlab bug https://gitlab.com/qemu-project/qemu/-/issues/2137 with this
additional info too.



Thanks,

Daniel


[1] https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e21187e@ventanamicro.com/


>   target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
>   1 file changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index e8fbb921449..866f77d321d 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
>   typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
>                                  uint32_t idx, void *vd, uintptr_t retaddr);
>   
> -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
> -static void NAME(CPURISCVState *env, abi_ptr addr,         \
> -                 uint32_t idx, void *vd, uintptr_t retaddr)\
> -{                                                          \
> -    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
> -    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
> -}                                                          \
> +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)         \
> +static inline QEMU_ALWAYS_INLINE                        \
> +void NAME(CPURISCVState *env, abi_ptr addr,             \
> +          uint32_t idx, void *vd, uintptr_t retaddr)    \
> +{                                                       \
> +    ETYPE *cur = ((ETYPE *)vd + H(idx));                \
> +    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);   \
> +}                                                       \
>   
>   GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
>   GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
>   GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
>   GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
>   
> -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
> -static void NAME(CPURISCVState *env, abi_ptr addr,         \
> -                 uint32_t idx, void *vd, uintptr_t retaddr)\
> -{                                                          \
> -    ETYPE data = *((ETYPE *)vd + H(idx));                  \
> -    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
> +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)         \
> +static inline QEMU_ALWAYS_INLINE                        \
> +void NAME(CPURISCVState *env, abi_ptr addr,             \
> +          uint32_t idx, void *vd, uintptr_t retaddr)    \
> +{                                                       \
> +    ETYPE data = *((ETYPE *)vd + H(idx));               \
> +    cpu_##STSUF##_data_ra(env, addr, data, retaddr);    \
>   }
>   
>   GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
> @@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
>    */
>   
>   /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
>                uintptr_t ra)
Re: [RFC PATCH 3/6] target/riscv: Inline vext_ldst_us and corresponding function for performance
Posted by Max Chou 8 months, 2 weeks ago
Hi Daniel,

Thank you for the information and suggestion.
Yes, we can do it better if we load/store more bytes at a time.
I'll try to improve the RFC in this way.

Thanks,

Max

On 2024/2/16 5:11 AM, Daniel Henrique Barboza wrote:
>
>
> On 2/15/24 16:28, Max Chou wrote:
>> In the vector unit-stride load/store helper functions. the vext_ldst_us
>> function corresponding most of the execution time. Inline the functions
>> can avoid the function call overhead to imperove the helper function
>> performance.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> ---
>
> The inline is a good idea but I think we can do better. I mentioned in 
> a thread
> last year [1] about the time we're spending in single byte 
> loads/stores, even
> for strided instructions.
>
> E.g. in vext_ldst_stride():
>
>
>     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
>         k = 0;
>         while (k < nf) {
>             if (!vm && !vext_elem_mask(v0, i)) {
>                 /* set masked-off elements to 1s */
>                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
>                                   (i + k * max_elems + 1) * esz);
>                 k++;
>                 continue;
>             }
>             target_ulong addr = base + stride * i + (k << log2_esz);
>             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, 
> vd, ra);
>             k++;
>         }
>     }
>
> We're doing single byte load/stores in ldst_elem() when, in this case, 
> we could do
> it in a whole block only once. ARM does something similar in SVE.
>
> I update the gitlab bug 
> https://gitlab.com/qemu-project/qemu/-/issues/2137 with this
> additional info too.
>
>
>
> Thanks,
>
> Daniel
>
>
> [1] 
> https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e21187e@ventanamicro.com/
>
>
>>   target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
>>   1 file changed, 16 insertions(+), 14 deletions(-)
>>
>> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
>> index e8fbb921449..866f77d321d 100644
>> --- a/target/riscv/vector_helper.c
>> +++ b/target/riscv/vector_helper.c
>> @@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, 
>> int index,
>>   typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
>>                                  uint32_t idx, void *vd, uintptr_t 
>> retaddr);
>>   -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
>> -static void NAME(CPURISCVState *env, abi_ptr addr,         \
>> -                 uint32_t idx, void *vd, uintptr_t retaddr)\
>> -{                                                          \
>> -    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
>> -    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
>> -}                                                          \
>> +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)         \
>> +static inline QEMU_ALWAYS_INLINE                        \
>> +void NAME(CPURISCVState *env, abi_ptr addr,             \
>> +          uint32_t idx, void *vd, uintptr_t retaddr)    \
>> +{                                                       \
>> +    ETYPE *cur = ((ETYPE *)vd + H(idx));                \
>> +    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);   \
>> +}                                                       \
>>     GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
>>   GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
>>   GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
>>   GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
>>   -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
>> -static void NAME(CPURISCVState *env, abi_ptr addr,         \
>> -                 uint32_t idx, void *vd, uintptr_t retaddr)\
>> -{                                                          \
>> -    ETYPE data = *((ETYPE *)vd + H(idx));                  \
>> -    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
>> +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)         \
>> +static inline QEMU_ALWAYS_INLINE                        \
>> +void NAME(CPURISCVState *env, abi_ptr addr,             \
>> +          uint32_t idx, void *vd, uintptr_t retaddr)    \
>> +{                                                       \
>> +    ETYPE data = *((ETYPE *)vd + H(idx));               \
>> +    cpu_##STSUF##_data_ra(env, addr, data, retaddr);    \
>>   }
>>     GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
>> @@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
>>    */
>>     /* unmasked unit-stride load and store operation */
>> -static void
>> +static inline QEMU_ALWAYS_INLINE void
>>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, 
>> uint32_t desc,
>>                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, 
>> uint32_t evl,
>>                uintptr_t ra)

Re: [RFC PATCH 3/6] target/riscv: Inline vext_ldst_us and corresponding function for performance
Posted by Richard Henderson 8 months, 2 weeks ago
On 2/15/24 09:28, Max Chou wrote:
>   /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
>                uintptr_t ra)

Yes, this is important so that ldst_elem itself can always be inlined.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~