[PATCH v6 7/7] target/riscv: Inline unit-stride ld/st and corresponding functions for performance

Max Chou posted 7 patches 2 months ago
[PATCH v6 7/7] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
Posted by Max Chou 2 months ago
In the vector unit-stride load/store helper functions, the vext_ldst_us
and vext_ldst_whole functions account for most of the execution time.
Inlining these functions avoids the function call overhead and improves
the helper function performance.

Signed-off-by: Max Chou <max.chou@sifive.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/vector_helper.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 654d5e111f3..0d5ed950486 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -152,14 +152,16 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
 
 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,    \
+static inline QEMU_ALWAYS_INLINE                            \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
                 uint32_t idx, void *vd, uintptr_t retaddr)  \
 {                                                           \
     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
 }                                                           \
                                                             \
-static void NAME##_host(void *vd, uint32_t idx, void *host) \
+static inline QEMU_ALWAYS_INLINE                            \
+void NAME##_host(void *vd, uint32_t idx, void *host)        \
 {                                                           \
     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
     *cur = (ETYPE)LDSUF##_p(host);                          \
@@ -171,14 +173,16 @@ GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
 
 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,    \
+static inline QEMU_ALWAYS_INLINE                            \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
                 uint32_t idx, void *vd, uintptr_t retaddr)  \
 {                                                           \
     ETYPE data = *((ETYPE *)vd + H(idx));                   \
     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
 }                                                           \
                                                             \
-static void NAME##_host(void *vd, uint32_t idx, void *host) \
+static inline QEMU_ALWAYS_INLINE                            \
+void NAME##_host(void *vd, uint32_t idx, void *host)        \
 {                                                           \
     ETYPE data = *((ETYPE *)vd + H(idx));                   \
     STSUF##_p(host, data);                                  \
@@ -317,7 +321,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
  */
 
 /* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
                   uint32_t elems, uint32_t nf, uint32_t max_elems,
                   uint32_t log2_esz, bool is_load, int mmu_index,
@@ -369,7 +373,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
     }
 }
 
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
              vext_ldst_elem_fn_tlb *ldst_tlb,
              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
@@ -756,7 +760,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
 /*
  * load and store whole register instructions
  */
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
                 vext_ldst_elem_fn_tlb *ldst_tlb,
                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
-- 
2.34.1
Re: [PATCH v6 7/7] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
Posted by Daniel Henrique Barboza 3 weeks, 3 days ago

On 9/18/24 2:14 PM, Max Chou wrote:
> In the vector unit-stride load/store helper functions. the vext_ldst_us
> & vext_ldst_whole functions corresponding most of the execution time.
> Inline the functions can avoid the function call overhead to improve the
> helper function performance.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> ---

Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

>   target/riscv/vector_helper.c | 18 +++++++++++-------
>   1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 654d5e111f3..0d5ed950486 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -152,14 +152,16 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
>   typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
>   
>   #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
> -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,    \
> +static inline QEMU_ALWAYS_INLINE                            \
> +void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
>                   uint32_t idx, void *vd, uintptr_t retaddr)  \
>   {                                                           \
>       ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
>       *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
>   }                                                           \
>                                                               \
> -static void NAME##_host(void *vd, uint32_t idx, void *host) \
> +static inline QEMU_ALWAYS_INLINE                            \
> +void NAME##_host(void *vd, uint32_t idx, void *host)        \
>   {                                                           \
>       ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
>       *cur = (ETYPE)LDSUF##_p(host);                          \
> @@ -171,14 +173,16 @@ GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
>   GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
>   
>   #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
> -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,    \
> +static inline QEMU_ALWAYS_INLINE                            \
> +void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
>                   uint32_t idx, void *vd, uintptr_t retaddr)  \
>   {                                                           \
>       ETYPE data = *((ETYPE *)vd + H(idx));                   \
>       cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
>   }                                                           \
>                                                               \
> -static void NAME##_host(void *vd, uint32_t idx, void *host) \
> +static inline QEMU_ALWAYS_INLINE                            \
> +void NAME##_host(void *vd, uint32_t idx, void *host)        \
>   {                                                           \
>       ETYPE data = *((ETYPE *)vd + H(idx));                   \
>       STSUF##_p(host, data);                                  \
> @@ -317,7 +321,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
>    */
>   
>   /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
>                     uint32_t elems, uint32_t nf, uint32_t max_elems,
>                     uint32_t log2_esz, bool is_load, int mmu_index,
> @@ -369,7 +373,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
>       }
>   }
>   
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                vext_ldst_elem_fn_tlb *ldst_tlb,
>                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
> @@ -756,7 +760,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
>   /*
>    * load and store whole register instructions
>    */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                   vext_ldst_elem_fn_tlb *ldst_tlb,
>                   vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,