[v5] Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions

[RFC PATCH v5 4/5] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions

Posted by Max Chou 2 months ago

The vector unmasked unit-stride and whole register load/store
instructions will load/store continuous memory. If the endian of both
the host and guest architecture are the same, then we can group the
element load/store to load/store more data at a time.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 72 +++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 5343a08e6ad..2e675b4220c 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -188,6 +188,40 @@ GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
 
+static inline QEMU_ALWAYS_INLINE void
+vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
+                       void *vd, uint32_t evl, target_ulong addr,
+                       uint32_t reg_start, uintptr_t ra, uint32_t esz,
+                       bool is_load)
+{
+    uint32_t i;
+    for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
+        ldst_tlb(env, adjust_addr(env, addr), i * esz, vd, ra);
+    }
+}
+
+static inline QEMU_ALWAYS_INLINE void
+vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
+                        void *vd, uint32_t evl, uint32_t reg_start, void *host,
+                        uint32_t esz, bool is_load)
+{
+#if TARGET_BIG_ENDIAN != HOST_BIG_ENDIAN
+    for (; reg_start < evl; reg_start++, host += esz) {
+        uint32_t byte_off = reg_start * esz;
+        ldst_host(vd, byte_off, host);
+    }
+#else
+    uint32_t byte_offset = reg_start * esz;
+    uint32_t size = (evl - reg_start) * esz;
+
+    if (is_load) {
+        memcpy(vd + byte_offset, host, size);
+    } else {
+        memcpy(host, vd + byte_offset, size);
+    }
+#endif
+}
+
 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
                                    uint32_t desc, uint32_t nf,
                                    uint32_t esz, uint32_t max_elems)
@@ -299,24 +333,34 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
                                mmu_index, true, &host, ra);
 
     if (host && flags == 0) {
-        for (i = env->vstart; i < evl; ++i) {
-            k = 0;
-            while (k < nf) {
-                ldst_host(vd, (i + k * max_elems) << log2_esz, host);
-                host += esz;
-                k++;
+        if (nf == 1) {
+            vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, host,
+                                    esz, is_load);
+        } else {
+            for (i = env->vstart; i < evl; ++i) {
+                k = 0;
+                while (k < nf) {
+                    ldst_host(vd, (i + k * max_elems) << log2_esz, host);
+                    host += esz;
+                    k++;
+                }
             }
         }
         env->vstart += elems;
     } else {
-        /* load bytes from guest memory */
-        for (i = env->vstart; i < evl; env->vstart = ++i) {
-            k = 0;
-            while (k < nf) {
-                ldst_tlb(env, adjust_addr(env, addr),
-                         (i + k * max_elems) << log2_esz, vd, ra);
-                addr += esz;
-                k++;
+        if (nf == 1) {
+            vext_continus_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
+                                   ra, esz, is_load);
+        } else {
+            /* load bytes from guest memory */
+            for (i = env->vstart; i < evl; env->vstart = ++i) {
+                k = 0;
+                while (k < nf) {
+                    ldst_tlb(env, adjust_addr(env, addr),
+                            (i + k * max_elems) << log2_esz, vd, ra);
+                    addr += esz;
+                    k++;
+                }
             }
         }
     }
-- 
2.34.1

Re: [RFC PATCH v5 4/5] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions

Posted by Richard Henderson 1 month, 3 weeks ago

On 7/17/24 23:39, Max Chou wrote:
> +static inline QEMU_ALWAYS_INLINE void
> +vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
> +                        void *vd, uint32_t evl, uint32_t reg_start, void *host,
> +                        uint32_t esz, bool is_load)
> +{
> +#if TARGET_BIG_ENDIAN != HOST_BIG_ENDIAN
> +    for (; reg_start < evl; reg_start++, host += esz) {
> +        uint32_t byte_off = reg_start * esz;
> +        ldst_host(vd, byte_off, host);
> +    }
> +#else
> +    uint32_t byte_offset = reg_start * esz;
> +    uint32_t size = (evl - reg_start) * esz;
> +
> +    if (is_load) {
> +        memcpy(vd + byte_offset, host, size);
> +    } else {
> +        memcpy(host, vd + byte_offset, size);
> +    }
> +#endif

First, TARGET_BIG_ENDIAN is always false, so this reduces to HOST_BIG_ENDIAN.

Second, even if TARGET_BIG_ENDIAN were true, this optimization would be wrong, because of 
the element ordering given in vector_internals.h (i.e. H1 etc).

Third, this can be done with C if, instead of cpp ifdef, so that you always compile-test 
both sides.

Fourth... what are the atomicity guarantees of RVV?  I didn't immediately see anything in 
the RVV manual, which suggests that the atomicity is the same as individual integer loads 
of the same size.  Because there are no atomicity guarantees for memcpy, you can only use 
this for byte load/store.

For big-endian bytes, you can optimize this to 64-bit little-endian operations.
Compare arm gen_sve_ldr.

r~

Re: [RFC PATCH v5 4/5] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions

Posted by Max Chou 1 month, 2 weeks ago

On 2024/7/25 2:04 PM, Richard Henderson wrote:
> On 7/17/24 23:39, Max Chou wrote:
>> +static inline QEMU_ALWAYS_INLINE void
>> +vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host 
>> *ldst_host,
>> +                        void *vd, uint32_t evl, uint32_t reg_start, 
>> void *host,
>> +                        uint32_t esz, bool is_load)
>> +{
>> +#if TARGET_BIG_ENDIAN != HOST_BIG_ENDIAN
>> +    for (; reg_start < evl; reg_start++, host += esz) {
>> +        uint32_t byte_off = reg_start * esz;
>> +        ldst_host(vd, byte_off, host);
>> +    }
>> +#else
>> +    uint32_t byte_offset = reg_start * esz;
>> +    uint32_t size = (evl - reg_start) * esz;
>> +
>> +    if (is_load) {
>> +        memcpy(vd + byte_offset, host, size);
>> +    } else {
>> +        memcpy(host, vd + byte_offset, size);
>> +    }
>> +#endif
>
> First, TARGET_BIG_ENDIAN is always false, so this reduces to 
> HOST_BIG_ENDIAN.
>
> Second, even if TARGET_BIG_ENDIAN were true, this optimization would 
> be wrong, because of the element ordering given in vector_internals.h 
> (i.e. H1 etc).
Thanks for the suggestions.
I missed the element ordering here.
I'll fix this at v6.

>
> Third, this can be done with C if, instead of cpp ifdef, so that you 
> always compile-test both sides.
>
> Fourth... what are the atomicity guarantees of RVV?  I didn't 
> immediately see anything in the RVV manual, which suggests that the 
> atomicity is the same as individual integer loads of the same size.  
> Because there are no atomicity guarantees for memcpy, you can only use 
> this for byte load/store.
>
> For big-endian bytes, you can optimize this to 64-bit little-endian 
> operations.
> Compare arm gen_sve_ldr.
Thanks for the suggestion.
I'll check arm gen_sve_ldr.

>
>
> r~

[RFC PATCH v5 1/5] target/riscv: Set vdata.vm field for vector load/store whole register instructions
[RFC PATCH v5 2/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unmasked unit-stride load/store
[RFC PATCH v5 3/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride whole register load/store
[RFC PATCH v5 4/5] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions
[RFC PATCH v5 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance