In the vector unit-stride load/store helper functions, the vext_ldst_us
function accounts for most of the execution time. Inlining the functions
avoids the function call overhead and improves the helper function
performance.
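
For context, QEMU_ALWAYS_INLINE comes from include/qemu/compiler.h and
expands, roughly, to the compiler's always_inline attribute on optimized
builds (sketch of the definition, from memory):

  #ifdef __OPTIMIZE__
  #define QEMU_ALWAYS_INLINE  __attribute__((always_inline))
  #else
  #define QEMU_ALWAYS_INLINE
  #endif
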
Signed-off-by: Max Chou <max.chou@sifive.com>
---
target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index e8fbb921449..866f77d321d 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
uint32_t idx, void *vd, uintptr_t retaddr);
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE *cur = ((ETYPE *)vd + H(idx)); \
- *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
-} \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE *cur = ((ETYPE *)vd + H(idx)); \
+ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
+} \
GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE data = *((ETYPE *)vd + H(idx)); \
- cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE data = *((ETYPE *)vd + H(idx)); \
+ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
}
GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
@@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
*/
/* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
uintptr_t ra)
--
2.34.1
On 2/15/24 16:28, Max Chou wrote:
> In the vector unit-stride load/store helper functions, the vext_ldst_us
> function accounts for most of the execution time. Inlining the functions
> avoids the function call overhead and improves the helper function
> performance.
>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---
The inline is a good idea, but I think we can do better. I mentioned in a thread
last year [1] the amount of time we're spending in single-byte loads/stores, even
for strided instructions.
E.g. in vext_ldst_stride():
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
We're doing single-byte loads/stores in ldst_elem() when, in this case, we could
do the whole contiguous block in a single access. ARM does something similar in SVE.
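
Roughly, the idea would be something like the sketch below (hypothetical
helper name, ignoring masking, endianness and page-crossing for clarity):
resolve the contiguous guest range once and copy it in one block instead of
issuing one access per element:

    /*
     * Sketch only: unmasked unit-stride load done as a single block copy.
     * get_host_ptr() is a hypothetical stand-in for translating the guest
     * address range once (e.g. via the TLB fast path).
     */
    static void ld_us_block(void *vd, target_ulong base, uint32_t evl,
                            uint32_t esz, CPURISCVState *env, uintptr_t ra)
    {
        size_t len = (size_t)evl * esz;
        void *host = get_host_ptr(env, base, len, ra);   /* hypothetical */

        if (host) {
            memcpy(vd, host, len);   /* whole block at once */
        } else {
            /* fall back to the existing per-element ldst_elem() loop */
        }
    }
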
I updated the gitlab bug https://gitlab.com/qemu-project/qemu/-/issues/2137 with this
additional info too.
Thanks,
Daniel
[1] https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e21187e@ventanamicro.com/
> target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
> 1 file changed, 16 insertions(+), 14 deletions(-)
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index e8fbb921449..866f77d321d 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
> typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
> uint32_t idx, void *vd, uintptr_t retaddr);
>
> -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
> -static void NAME(CPURISCVState *env, abi_ptr addr, \
> - uint32_t idx, void *vd, uintptr_t retaddr)\
> -{ \
> - ETYPE *cur = ((ETYPE *)vd + H(idx)); \
> - *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
> -} \
> +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
> +static inline QEMU_ALWAYS_INLINE \
> +void NAME(CPURISCVState *env, abi_ptr addr, \
> + uint32_t idx, void *vd, uintptr_t retaddr) \
> +{ \
> + ETYPE *cur = ((ETYPE *)vd + H(idx)); \
> + *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
> +} \
>
> GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
> GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
> GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
> GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
>
> -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
> -static void NAME(CPURISCVState *env, abi_ptr addr, \
> - uint32_t idx, void *vd, uintptr_t retaddr)\
> -{ \
> - ETYPE data = *((ETYPE *)vd + H(idx)); \
> - cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
> +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
> +static inline QEMU_ALWAYS_INLINE \
> +void NAME(CPURISCVState *env, abi_ptr addr, \
> + uint32_t idx, void *vd, uintptr_t retaddr) \
> +{ \
> + ETYPE data = *((ETYPE *)vd + H(idx)); \
> + cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
> }
>
> GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
> @@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
> */
>
> /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
> vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
> vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
> uintptr_t ra)
Hi Daniel,
Thank you for the information and suggestion.
Yes, we can do it better if we load/store more bytes at a time.
I'll try to improve the RFC in this direction.
Thanks,
Max
On 2024/2/16 5:11 AM, Daniel Henrique Barboza wrote:
>
>
> On 2/15/24 16:28, Max Chou wrote:
>> In the vector unit-stride load/store helper functions, the vext_ldst_us
>> function accounts for most of the execution time. Inlining the functions
>> avoids the function call overhead and improves the helper function
>> performance.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> ---
>
> The inline is a good idea, but I think we can do better. I mentioned in a thread
> last year [1] the amount of time we're spending in single-byte loads/stores, even
> for strided instructions.
>
> E.g. in vext_ldst_stride():
>
>
>     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
>         k = 0;
>         while (k < nf) {
>             if (!vm && !vext_elem_mask(v0, i)) {
>                 /* set masked-off elements to 1s */
>                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
>                                   (i + k * max_elems + 1) * esz);
>                 k++;
>                 continue;
>             }
>             target_ulong addr = base + stride * i + (k << log2_esz);
>             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
>             k++;
>         }
>     }
>
> We're doing single-byte loads/stores in ldst_elem() when, in this case, we could
> do the whole contiguous block in a single access. ARM does something similar in SVE.
>
> I updated the gitlab bug https://gitlab.com/qemu-project/qemu/-/issues/2137 with this
> additional info too.
>
>
>
> Thanks,
>
> Daniel
>
>
> [1] https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e21187e@ventanamicro.com/
>
>
>> target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
>> 1 file changed, 16 insertions(+), 14 deletions(-)
>>
>> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
>> index e8fbb921449..866f77d321d 100644
>> --- a/target/riscv/vector_helper.c
>> +++ b/target/riscv/vector_helper.c
>> @@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
>> typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
>>                                uint32_t idx, void *vd, uintptr_t retaddr);
>> -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
>> -static void NAME(CPURISCVState *env, abi_ptr addr, \
>> - uint32_t idx, void *vd, uintptr_t retaddr)\
>> -{ \
>> - ETYPE *cur = ((ETYPE *)vd + H(idx)); \
>> - *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
>> -} \
>> +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
>> +static inline QEMU_ALWAYS_INLINE \
>> +void NAME(CPURISCVState *env, abi_ptr addr, \
>> + uint32_t idx, void *vd, uintptr_t retaddr) \
>> +{ \
>> + ETYPE *cur = ((ETYPE *)vd + H(idx)); \
>> + *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
>> +} \
>> GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
>> GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
>> GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
>> GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
>> -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
>> -static void NAME(CPURISCVState *env, abi_ptr addr, \
>> - uint32_t idx, void *vd, uintptr_t retaddr)\
>> -{ \
>> - ETYPE data = *((ETYPE *)vd + H(idx)); \
>> - cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
>> +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
>> +static inline QEMU_ALWAYS_INLINE \
>> +void NAME(CPURISCVState *env, abi_ptr addr, \
>> + uint32_t idx, void *vd, uintptr_t retaddr) \
>> +{ \
>> + ETYPE data = *((ETYPE *)vd + H(idx)); \
>> + cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
>> }
>> GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
>> @@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
>> */
>> /* unmasked unit-stride load and store operation */
>> -static void
>> +static inline QEMU_ALWAYS_INLINE void
>> vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>>              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
>>              uintptr_t ra)
On 2/15/24 09:28, Max Chou wrote:
> /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
> vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
>              uintptr_t ra)

Yes, this is important so that ldst_elem itself can always be inlined.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
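
The reason this matters: each generated unit-stride helper passes a
compile-time-constant element callback, so once vext_ldst_us() is
force-inlined into the helper, the indirect ldst_elem call becomes a direct
call and can be inlined as well. A simplified sketch (hypothetical helper
name, arguments following the signature above, fault handling omitted):

    /* Sketch only, not the actual QEMU macro expansion. */
    void helper_vle32_v_sketch(void *vd, target_ulong base,
                               CPURISCVState *env, uint32_t desc)
    {
        /* constant callback -> direct (and inlinable) call after inlining */
        vext_ldst_us(vd, base, env, desc, lde_w, 2, env->vl, GETPC());
    }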