[PATCH v6 5/7] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride fault-only-first load instructions

Max Chou posted 7 patches 2 months ago
[PATCH v6 5/7] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride fault-only-first load instructions
Posted by Max Chou 2 months ago
The unmasked unit-stride fault-only-first load instructions are similar
to the unmasked unit-stride load/store instructions, which are suitable
to be optimized by using a fast path with direct access to host RAM.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 98 ++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 27 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 824e6401736..59009a940ff 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -557,18 +557,18 @@ GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
  * unit-stride fault-only-fisrt load instructions
  */
 static inline void
-vext_ldff(void *vd, void *v0, target_ulong base,
-          CPURISCVState *env, uint32_t desc,
-          vext_ldst_elem_fn_tlb *ldst_elem,
-          uint32_t log2_esz, uintptr_t ra)
+vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
+          uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
+          vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
 {
     uint32_t i, k, vl = 0;
     uint32_t nf = vext_nf(desc);
     uint32_t vm = vext_vm(desc);
     uint32_t max_elems = vext_max_elems(desc, log2_esz);
     uint32_t esz = 1 << log2_esz;
+    uint32_t msize = nf * esz;
     uint32_t vma = vext_vma(desc);
-    target_ulong addr, offset, remain;
+    target_ulong addr, offset, remain, page_split, elems;
     int mmu_index = riscv_env_mmu_index(env, false);
 
     VSTART_CHECK_EARLY_EXIT(env);
@@ -617,19 +617,63 @@ ProbeSuccess:
     if (vl != 0) {
         env->vl = vl;
     }
-    for (i = env->vstart; i < env->vl; i++) {
-        k = 0;
-        while (k < nf) {
-            if (!vm && !vext_elem_mask(v0, i)) {
-                /* set masked-off elements to 1s */
-                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
-                                  (i + k * max_elems + 1) * esz);
-                k++;
-                continue;
+
+    if (env->vstart < env->vl) {
+        if (vm) {
+            /* Calculate the page range of first page */
+            addr = base + ((env->vstart * nf) << log2_esz);
+            page_split = -(addr | TARGET_PAGE_MASK);
+            /* Get number of elements */
+            elems = page_split / msize;
+            if (unlikely(env->vstart + elems >= env->vl)) {
+                elems = env->vl - env->vstart;
+            }
+
+            /* Load/store elements in the first page */
+            if (likely(elems)) {
+                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
+                                  log2_esz, true, mmu_index, ldst_tlb,
+                                  ldst_host, ra);
+            }
+
+            /* Load/store elements in the second page */
+            if (unlikely(env->vstart < env->vl)) {
+                /* Cross page element */
+                if (unlikely(page_split % msize)) {
+                    for (k = 0; k < nf; k++) {
+                        addr = base + ((env->vstart * nf + k) << log2_esz);
+                        ldst_tlb(env, adjust_addr(env, addr),
+                                 env->vstart + k * max_elems, vd, ra);
+                    }
+                    env->vstart++;
+                }
+
+                addr = base + ((env->vstart * nf) << log2_esz);
+                /* Get number of elements of second page */
+                elems = env->vl - env->vstart;
+
+                /* Load/store elements in the second page */
+                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
+                                  log2_esz, true, mmu_index, ldst_tlb,
+                                  ldst_host, ra);
+            }
+        } else {
+            for (i = env->vstart; i < env->vl; i++) {
+                k = 0;
+                while (k < nf) {
+                    if (!vext_elem_mask(v0, i)) {
+                        /* set masked-off elements to 1s */
+                        vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
+                                          (i + k * max_elems + 1) * esz);
+                        k++;
+                        continue;
+                    }
+                    addr = base + ((i * nf + k) << log2_esz);
+                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
+                             vd, ra);
+                    k++;
+                }
             }
-            addr = base + ((i * nf + k) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
-            k++;
         }
     }
     env->vstart = 0;
@@ -637,18 +681,18 @@ ProbeSuccess:
     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
 }
 
-#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
-void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
-                  CPURISCVState *env, uint32_t desc)      \
-{                                                         \
-    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
-              ctzl(sizeof(ETYPE)), GETPC());              \
+#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
+void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
+                  CPURISCVState *env, uint32_t desc)            \
+{                                                               \
+    vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
+              LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
 }
 
-GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb)
-GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb)
-GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb)
-GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
+GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
 
 #define DO_SWAP(N, M) (M)
 #define DO_AND(N, M)  (N & M)
-- 
2.34.1
Re: [PATCH v6 5/7] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride fault-only-first load instructions
Posted by Daniel Henrique Barboza 3 weeks, 3 days ago

On 9/18/24 2:14 PM, Max Chou wrote:
> The unmasked unit-stride fault-only-first load instructions are similar
> to the unmasked unit-stride load/store instructions that is suitable to
> be optimized by using a direct access to host ram fast path.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---


Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

>   target/riscv/vector_helper.c | 98 ++++++++++++++++++++++++++----------
>   1 file changed, 71 insertions(+), 27 deletions(-)
> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 824e6401736..59009a940ff 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -557,18 +557,18 @@ GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
>    * unit-stride fault-only-fisrt load instructions
>    */
>   static inline void
> -vext_ldff(void *vd, void *v0, target_ulong base,
> -          CPURISCVState *env, uint32_t desc,
> -          vext_ldst_elem_fn_tlb *ldst_elem,
> -          uint32_t log2_esz, uintptr_t ra)
> +vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
> +          uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
> +          vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
>   {
>       uint32_t i, k, vl = 0;
>       uint32_t nf = vext_nf(desc);
>       uint32_t vm = vext_vm(desc);
>       uint32_t max_elems = vext_max_elems(desc, log2_esz);
>       uint32_t esz = 1 << log2_esz;
> +    uint32_t msize = nf * esz;
>       uint32_t vma = vext_vma(desc);
> -    target_ulong addr, offset, remain;
> +    target_ulong addr, offset, remain, page_split, elems;
>       int mmu_index = riscv_env_mmu_index(env, false);
>   
>       VSTART_CHECK_EARLY_EXIT(env);
> @@ -617,19 +617,63 @@ ProbeSuccess:
>       if (vl != 0) {
>           env->vl = vl;
>       }
> -    for (i = env->vstart; i < env->vl; i++) {
> -        k = 0;
> -        while (k < nf) {
> -            if (!vm && !vext_elem_mask(v0, i)) {
> -                /* set masked-off elements to 1s */
> -                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
> -                                  (i + k * max_elems + 1) * esz);
> -                k++;
> -                continue;
> +
> +    if (env->vstart < env->vl) {
> +        if (vm) {
> +            /* Calculate the page range of first page */
> +            addr = base + ((env->vstart * nf) << log2_esz);
> +            page_split = -(addr | TARGET_PAGE_MASK);
> +            /* Get number of elements */
> +            elems = page_split / msize;
> +            if (unlikely(env->vstart + elems >= env->vl)) {
> +                elems = env->vl - env->vstart;
> +            }
> +
> +            /* Load/store elements in the first page */
> +            if (likely(elems)) {
> +                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
> +                                  log2_esz, true, mmu_index, ldst_tlb,
> +                                  ldst_host, ra);
> +            }
> +
> +            /* Load/store elements in the second page */
> +            if (unlikely(env->vstart < env->vl)) {
> +                /* Cross page element */
> +                if (unlikely(page_split % msize)) {
> +                    for (k = 0; k < nf; k++) {
> +                        addr = base + ((env->vstart * nf + k) << log2_esz);
> +                        ldst_tlb(env, adjust_addr(env, addr),
> +                                 env->vstart + k * max_elems, vd, ra);
> +                    }
> +                    env->vstart++;
> +                }
> +
> +                addr = base + ((env->vstart * nf) << log2_esz);
> +                /* Get number of elements of second page */
> +                elems = env->vl - env->vstart;
> +
> +                /* Load/store elements in the second page */
> +                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
> +                                  log2_esz, true, mmu_index, ldst_tlb,
> +                                  ldst_host, ra);
> +            }
> +        } else {
> +            for (i = env->vstart; i < env->vl; i++) {
> +                k = 0;
> +                while (k < nf) {
> +                    if (!vext_elem_mask(v0, i)) {
> +                        /* set masked-off elements to 1s */
> +                        vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
> +                                          (i + k * max_elems + 1) * esz);
> +                        k++;
> +                        continue;
> +                    }
> +                    addr = base + ((i * nf + k) << log2_esz);
> +                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
> +                             vd, ra);
> +                    k++;
> +                }
>               }
> -            addr = base + ((i * nf + k) << log2_esz);
> -            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
> -            k++;
>           }
>       }
>       env->vstart = 0;
> @@ -637,18 +681,18 @@ ProbeSuccess:
>       vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
>   }
>   
> -#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
> -void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
> -                  CPURISCVState *env, uint32_t desc)      \
> -{                                                         \
> -    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
> -              ctzl(sizeof(ETYPE)), GETPC());              \
> +#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
> +void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
> +                  CPURISCVState *env, uint32_t desc)            \
> +{                                                               \
> +    vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
> +              LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
>   }
>   
> -GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb)
> -GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb)
> -GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb)
> -GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
> +GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
> +GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
> +GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
> +GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
>   
>   #define DO_SWAP(N, M) (M)
>   #define DO_AND(N, M)  (N & M)