[RFC PATCH v5 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance

Max Chou posted 5 patches 2 months ago
Maintainers: Palmer Dabbelt <palmer@dabbelt.com>, Alistair Francis <alistair.francis@wdc.com>, Bin Meng <bmeng.cn@gmail.com>, Weiwei Li <liwei1518@gmail.com>, Daniel Henrique Barboza <dbarboza@ventanamicro.com>, Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
[RFC PATCH v5 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
Posted by Max Chou 2 months ago
In the vector unit-stride load/store helper functions, the vext_ldst_us
and vext_ldst_whole functions account for most of the execution time.
Inlining these functions avoids the function call overhead and improves
the performance of the helper functions.

Signed-off-by: Max Chou <max.chou@sifive.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 2e675b4220c..95394c425ed 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -150,18 +150,20 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
                                    uint32_t idx, void *vd, uintptr_t retaddr);
 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
 
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)                         \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
-                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
-{                                                                       \
-    ETYPE *cur = vd + byte_off;                                         \
-    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);                   \
-}                                                                       \
-                                                                        \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
-{                                                                       \
-    ETYPE val = LDSUF##_p(host);                                        \
-    *(ETYPE *)(vd + byte_off) = val;                                    \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)                 \
+static inline QEMU_ALWAYS_INLINE                                \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr,               \
+                uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{                                                               \
+    ETYPE *cur = vd + byte_off;                                 \
+    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);           \
+}                                                               \
+                                                                \
+static inline QEMU_ALWAYS_INLINE                                \
+void NAME##_host(void *vd, uint32_t byte_off, void *host)       \
+{                                                               \
+    ETYPE val = LDSUF##_p(host);                                \
+    *(ETYPE *)(vd + byte_off) = val;                            \
 }
 
 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
@@ -169,18 +171,20 @@ GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
 
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)                         \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
-                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
-{                                                                       \
-    ETYPE data = *(ETYPE *)(vd + byte_off);                             \
-    cpu_##STSUF##_data_ra(env, addr, data, retaddr);                    \
-}                                                                       \
-                                                                        \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
-{                                                                       \
-    ETYPE val = *(ETYPE *)(vd + byte_off);                              \
-    STSUF##_p(host, val);                                               \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)                 \
+static inline QEMU_ALWAYS_INLINE                                \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr,               \
+                uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{                                                               \
+    ETYPE data = *(ETYPE *)(vd + byte_off);                     \
+    cpu_##STSUF##_data_ra(env, addr, data, retaddr);            \
+}                                                               \
+                                                                \
+static inline QEMU_ALWAYS_INLINE                                \
+void NAME##_host(void *vd, uint32_t byte_off, void *host)       \
+{                                                               \
+    ETYPE val = *(ETYPE *)(vd + byte_off);                      \
+    STSUF##_p(host, val);                                       \
 }
 
 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
@@ -366,7 +370,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
     }
 }
 
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
              vext_ldst_elem_fn_tlb *ldst_tlb,
              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
@@ -695,7 +699,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
 /*
  * load and store whole register instructions
  */
-static void
+static inline QEMU_ALWAYS_INLINE void
 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
                 vext_ldst_elem_fn_tlb *ldst_tlb,
                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
-- 
2.34.1
Re: [RFC PATCH v5 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
Posted by Richard Henderson 1 month, 3 weeks ago
On 7/17/24 23:39, Max Chou wrote:
> In the vector unit-stride load/store helper functions, the vext_ldst_us
> and vext_ldst_whole functions account for most of the execution time.
> Inlining these functions avoids the function call overhead and improves
> the performance of the helper functions.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
>   1 file changed, 30 insertions(+), 26 deletions(-)

You'll want to mark vext_page_ldst_us similarly.


r~

> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 2e675b4220c..95394c425ed 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -150,18 +150,20 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
>                                      uint32_t idx, void *vd, uintptr_t retaddr);
>   typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
>   
> -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)                         \
> -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
> -                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
> -{                                                                       \
> -    ETYPE *cur = vd + byte_off;                                         \
> -    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);                   \
> -}                                                                       \
> -                                                                        \
> -static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
> -{                                                                       \
> -    ETYPE val = LDSUF##_p(host);                                        \
> -    *(ETYPE *)(vd + byte_off) = val;                                    \
> +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)                 \
> +static inline QEMU_ALWAYS_INLINE                                \
> +void NAME##_tlb(CPURISCVState *env, abi_ptr addr,               \
> +                uint32_t byte_off, void *vd, uintptr_t retaddr) \
> +{                                                               \
> +    ETYPE *cur = vd + byte_off;                                 \
> +    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);           \
> +}                                                               \
> +                                                                \
> +static inline QEMU_ALWAYS_INLINE                                \
> +void NAME##_host(void *vd, uint32_t byte_off, void *host)       \
> +{                                                               \
> +    ETYPE val = LDSUF##_p(host);                                \
> +    *(ETYPE *)(vd + byte_off) = val;                            \
>   }
>   
>   GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
> @@ -169,18 +171,20 @@ GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
>   GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
>   GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
>   
> -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)                         \
> -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
> -                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
> -{                                                                       \
> -    ETYPE data = *(ETYPE *)(vd + byte_off);                             \
> -    cpu_##STSUF##_data_ra(env, addr, data, retaddr);                    \
> -}                                                                       \
> -                                                                        \
> -static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
> -{                                                                       \
> -    ETYPE val = *(ETYPE *)(vd + byte_off);                              \
> -    STSUF##_p(host, val);                                               \
> +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)                 \
> +static inline QEMU_ALWAYS_INLINE                                \
> +void NAME##_tlb(CPURISCVState *env, abi_ptr addr,               \
> +                uint32_t byte_off, void *vd, uintptr_t retaddr) \
> +{                                                               \
> +    ETYPE data = *(ETYPE *)(vd + byte_off);                     \
> +    cpu_##STSUF##_data_ra(env, addr, data, retaddr);            \
> +}                                                               \
> +                                                                \
> +static inline QEMU_ALWAYS_INLINE                                \
> +void NAME##_host(void *vd, uint32_t byte_off, void *host)       \
> +{                                                               \
> +    ETYPE val = *(ETYPE *)(vd + byte_off);                      \
> +    STSUF##_p(host, val);                                       \
>   }
>   
>   GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
> @@ -366,7 +370,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
>       }
>   }
>   
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                vext_ldst_elem_fn_tlb *ldst_tlb,
>                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
> @@ -695,7 +699,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
>   /*
>    * load and store whole register instructions
>    */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>   vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>                   vext_ldst_elem_fn_tlb *ldst_tlb,
>                   vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
Re: [RFC PATCH v5 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
Posted by Max Chou 1 month, 2 weeks ago
On 2024/7/25 2:05 PM, Richard Henderson wrote:
> On 7/17/24 23:39, Max Chou wrote:
>> In the vector unit-stride load/store helper functions, the vext_ldst_us
>> and vext_ldst_whole functions account for most of the execution time.
>> Inlining these functions avoids the function call overhead and improves
>> the performance of the helper functions.
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   target/riscv/vector_helper.c | 56 +++++++++++++++++++-----------------
>>   1 file changed, 30 insertions(+), 26 deletions(-)
>
> You'll want to mark vext_page_ldst_us similarly.
>
>
> r~
Yes, I'll mark vext_page_ldst_us the same way in v6.
Thanks.

Max.