In the vector unit-stride load/store helper functions, the vext_ldst_us
and vext_ldst_whole functions account for most of the execution time.
Inlining these functions avoids the function call overhead and improves
helper function performance.
Signed-off-by: Max Chou <max.chou@sifive.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
1 file changed, 30 insertions(+), 26 deletions(-)
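
Note: QEMU_ALWAYS_INLINE is the portability macro from
include/qemu/compiler.h; on GCC/Clang it expands, roughly, to the
always_inline function attribute (simplified sketch, not the exact
definition):

    #ifdef __GNUC__
    # define QEMU_ALWAYS_INLINE __attribute__((always_inline))
    #else
    # define QEMU_ALWAYS_INLINE
    #endif

Combined with plain "inline", this asks the compiler to inline the marked
helpers even when its heuristics would otherwise decline, which is what
removes the per-element call overhead here.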
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 2e675b4220c..95394c425ed 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -150,18 +150,20 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
uint32_t idx, void *vd, uintptr_t retaddr);
typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
- uint32_t byte_off, void *vd, uintptr_t retaddr) \
-{ \
- ETYPE *cur = vd + byte_off; \
- *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
-} \
- \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host) \
-{ \
- ETYPE val = LDSUF##_p(host); \
- *(ETYPE *)(vd + byte_off) = val; \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
+ uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE *cur = vd + byte_off; \
+ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
+} \
+ \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_host(void *vd, uint32_t byte_off, void *host) \
+{ \
+ ETYPE val = LDSUF##_p(host); \
+ *(ETYPE *)(vd + byte_off) = val; \
}
GEN_VEXT_LD_ELEM(lde_b, uint8_t, H1, ldub)
@@ -169,18 +171,20 @@ GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
- uint32_t byte_off, void *vd, uintptr_t retaddr) \
-{ \
- ETYPE data = *(ETYPE *)(vd + byte_off); \
- cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
-} \
- \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host) \
-{ \
- ETYPE val = *(ETYPE *)(vd + byte_off); \
- STSUF##_p(host, val); \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
+ uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE data = *(ETYPE *)(vd + byte_off); \
+ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
+} \
+ \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_host(void *vd, uint32_t byte_off, void *host) \
+{ \
+ ETYPE val = *(ETYPE *)(vd + byte_off); \
+ STSUF##_p(host, val); \
}
GEN_VEXT_ST_ELEM(ste_b, uint8_t, H1, stb)
@@ -366,7 +370,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
}
}
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn_tlb *ldst_tlb,
vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
@@ -695,7 +699,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
/*
* load and store whole register instructions
*/
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn_tlb *ldst_tlb,
vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
--
2.34.1
On 7/17/24 23:39, Max Chou wrote:
> In the vector unit-stride load/store helper functions, the vext_ldst_us
> and vext_ldst_whole functions account for most of the execution time.
> Inlining these functions avoids the function call overhead and improves
> helper function performance.
>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
> 1 file changed, 30 insertions(+), 26 deletions(-)
You'll want to mark vext_page_ldst_us similarly.
r~
On 2024/7/25 2:05 PM, Richard Henderson wrote:
> On 7/17/24 23:39, Max Chou wrote:
>> In the vector unit-stride load/store helper functions, the vext_ldst_us
>> and vext_ldst_whole functions account for most of the execution time.
>> Inlining these functions avoids the function call overhead and improves
>> helper function performance.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>  target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
>>  1 file changed, 30 insertions(+), 26 deletions(-)
>
> You'll want to mark vext_page_ldst_us similarly.
>
> r~

Yes, I'll mark vext_page_ldst_us in v6.
Thanks.
Max.