In the vector unit-stride load/store helper functions, the vext_ldst_us
function accounts for most of the execution time. Inlining these
functions avoids the function call overhead and improves the helper
function performance.
Signed-off-by: Max Chou <max.chou@sifive.com>
---
target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index e8fbb921449..866f77d321d 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
uint32_t idx, void *vd, uintptr_t retaddr);
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE *cur = ((ETYPE *)vd + H(idx)); \
- *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
-} \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE *cur = ((ETYPE *)vd + H(idx)); \
+ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
+} \
GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE data = *((ETYPE *)vd + H(idx)); \
- cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE data = *((ETYPE *)vd + H(idx)); \
+ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
}
GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
@@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
*/
/* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
uintptr_t ra)
--
2.34.1
On 2/15/24 16:28, Max Chou wrote:
> In the vector unit-stride load/store helper functions. the vext_ldst_us
> function corresponding most of the execution time. Inline the functions
> can avoid the function call overhead to imperove the helper function
> performance.
>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---

The inline is a good idea but I think we can do better. I mentioned in
a thread last year [1] about the time we're spending in single byte
loads/stores, even for strided instructions.

E.g. in vext_ldst_stride():

    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }

We're doing single byte load/stores in ldst_elem() when, in this case,
we could do it in a whole block only once. ARM does something similar
in SVE.

I updated the gitlab bug https://gitlab.com/qemu-project/qemu/-/issues/2137
with this additional info too.

Thanks,

Daniel

[1] https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e21187e@ventanamicro.com/
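To make the block-wise idea concrete, a rough, untested sketch of a fast
path for the unmasked unit-stride case might look like the code below.
Everything here is an assumption for illustration: the helper name
vext_ldst_us_try_block() is made up, the mmu-index accessor name may
differ between QEMU versions, and the plain memcpy() is only correct on
little-endian hosts; masking, vstart handling and MMIO still need the
existing per-element slow path.

    /*
     * Hypothetical fast path: resolve the whole [base, base + len) range
     * to a host pointer once and copy it with a single memcpy(), instead
     * of going through ldst_elem() per element.  Returns true if it
     * handled the access, false if the caller must fall back to the
     * per-element loop.
     */
    static bool
    vext_ldst_us_try_block(void *vd, target_ulong base, CPURISCVState *env,
                           uint32_t log2_esz, uint32_t evl, uintptr_t ra)
    {
        uint32_t len = evl << log2_esz;
        /* accessor name is an assumption; newer trees use riscv_env_mmu_index() */
        int mmu_idx = cpu_mmu_index(env, false);
        void *host;

        /* Only attempt the fast path when the access fits in one page. */
        if (len == 0 || (base & ~TARGET_PAGE_MASK) + len > TARGET_PAGE_SIZE) {
            return false;
        }

        /*
         * probe_access() returns a host pointer for plain RAM; NULL (e.g.
         * MMIO or watchpoints) means we must take the per-element path.
         */
        host = probe_access(env, base, len, MMU_DATA_LOAD, mmu_idx, ra);
        if (host == NULL) {
            return false;
        }

        /* Whole-block copy; only valid on little-endian hosts, where the
         * H1/H2/H4/H8 element mappings are the identity. */
        memcpy(vd, host, len);
        return true;
    }

A caller such as vext_ldst_us() would try this first and only run its
existing per-element loop when the sketch returns false.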
Hi Daniel,

Thank you for the information and suggestion.
Yes, we can do it better if we load/store more bytes at a time.
I'll try to improve the RFC in this way.

Thanks,
Max

On 2024/2/16 5:11 AM, Daniel Henrique Barboza wrote:
> [...]
On 2/15/24 09:28, Max Chou wrote:
>   /* unmasked unit-stride load and store operation */
> -static void
> +static inline QEMU_ALWAYS_INLINE void
>  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>               vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
>               uintptr_t ra)

Yes, this is important so that ldst_elem itself can always be inlined.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
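As a self-contained illustration of this point (generic C outside the
QEMU tree, so it uses the raw GCC/Clang attribute rather than
QEMU_ALWAYS_INLINE, and all names below are made up): forcing the
generic loop to be inlined into each caller is what lets the compiler
see the function-pointer argument as a compile-time constant and inline
the per-element accessor as well.

    #include <stdint.h>

    typedef void elem_fn(void *vd, const uint8_t *mem, unsigned idx);

    /* Per-element accessor, analogous to lde_b() in vector_helper.c. */
    static inline __attribute__((always_inline))
    void load_elem_b(void *vd, const uint8_t *mem, unsigned idx)
    {
        ((int8_t *)vd)[idx] = (int8_t)mem[idx];
    }

    /* Generic loop taking the accessor as a function pointer.  If this
     * helper stayed out of line, the call through 'fn' would remain an
     * indirect call on every element. */
    static inline __attribute__((always_inline))
    void ldst_loop(void *vd, const uint8_t *mem, unsigned n, elem_fn *fn)
    {
        for (unsigned i = 0; i < n; i++) {
            fn(vd, mem, i);
        }
    }

    /* Because ldst_loop() is forced inline here, the compiler sees
     * fn == load_elem_b as a constant and can inline it into the loop,
     * removing both the indirect call and the per-element call overhead. */
    void load_bytes(void *vd, const uint8_t *mem, unsigned n)
    {
        ldst_loop(vd, mem, n, load_elem_b);
    }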