[v6] Generate strided vector loads/stores with tcg nodes

[PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Posted by Chao Liu 5 months, 1 week ago

From: Chao Liu <chao.liu@yeah.net>

This commit improves the performance of QEMU when emulating strided vector
loads and stores by substituting the call for the helper function with the
generation of equivalent TCG operations.

PS:

An implementation is permitted to cause an illegal instruction if vstart
is not 0 and it is set to a value that can not be produced implicitly by
the implementation, but memory accesses will generally always need to
deal with page faults.

So, if a strided vector memory access instruction has non-zero vstart,
check it through vlse/vsse helpers function.

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Chao Liu <chao.liu@zevorn.cn>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 target/riscv/insn_trans/trans_rvv.c.inc | 321 +++++++++++++++++++++---
 1 file changed, 283 insertions(+), 38 deletions(-)

diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index 71f98fb350..1980476c83 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -864,32 +864,298 @@ GEN_VEXT_TRANS(vlm_v, MO_8, vlm_v, ld_us_mask_op, ld_us_mask_check)
 GEN_VEXT_TRANS(vsm_v, MO_8, vsm_v, st_us_mask_op, st_us_mask_check)
 
 /*
- *** stride load and store
+ * MAXSZ returns the maximum vector size can be operated in bytes,
+ * which is used in GVEC IR when vl_eq_vlmax flag is set to true
+ * to accelerate vector operation.
  */
+static inline uint32_t MAXSZ(DisasContext *s)
+{
+    int max_sz = s->cfg_ptr->vlenb << 3;
+    return max_sz >> (3 - s->lmul);
+}
+
+static inline uint32_t get_log2(uint32_t a)
+{
+    assert(is_power_of_2(a));
+    return ctz32(a);
+}
+
+typedef void gen_tl_ldst(TCGv, TCGv_ptr, tcg_target_long);
+
+/*
+ * Simulate the strided load/store main loop:
+ *
+ * for (i = env->vstart; i < env->vl; env->vstart = ++i) {
+ *     k = 0;
+ *     while (k < nf) {
+ *         if (!vm && !vext_elem_mask(v0, i)) {
+ *             vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
+ *                               (i + k * max_elems + 1) * esz);
+ *             k++;
+ *             continue;
+ *         }
+ *         target_ulong addr = base + stride * i + (k << log2_esz);
+ *         ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+ *         k++;
+ *     }
+ * }
+ */
+static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1,
+                                      uint32_t rs2, uint32_t vm, uint32_t nf,
+                                      gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn,
+                                      bool is_load)
+{
+    TCGv addr = tcg_temp_new();
+    TCGv base = get_gpr(s, rs1, EXT_NONE);
+    TCGv stride = get_gpr(s, rs2, EXT_NONE);
+
+    TCGv i = tcg_temp_new();
+    TCGv i_esz = tcg_temp_new();
+    TCGv k = tcg_temp_new();
+    TCGv k_esz = tcg_temp_new();
+    TCGv k_max = tcg_temp_new();
+    TCGv mask = tcg_temp_new();
+    TCGv mask_offs = tcg_temp_new();
+    TCGv mask_offs_64 = tcg_temp_new();
+    TCGv mask_elem = tcg_temp_new();
+    TCGv mask_offs_rem = tcg_temp_new();
+    TCGv vreg = tcg_temp_new();
+    TCGv dest_offs = tcg_temp_new();
+    TCGv stride_offs = tcg_temp_new();
+
+    uint32_t max_elems = MAXSZ(s) >> s->sew;
+
+    TCGLabel *start = gen_new_label();
+    TCGLabel *end = gen_new_label();
+    TCGLabel *start_k = gen_new_label();
+    TCGLabel *inc_k = gen_new_label();
+    TCGLabel *end_k = gen_new_label();
+
+    MemOp atomicity = MO_ATOM_NONE;
+    if (s->sew == 0) {
+        atomicity = MO_ATOM_NONE;
+    } else {
+        atomicity = MO_ATOM_IFALIGN_PAIR;
+    }
+
+    tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0));
+
+    /* Start of outer loop. */
+    tcg_gen_mov_tl(i, cpu_vstart);
+    gen_set_label(start);
+    tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end);
+    tcg_gen_shli_tl(i_esz, i, s->sew);
+    /* Start of inner loop. */
+    tcg_gen_movi_tl(k, 0);
+    gen_set_label(start_k);
+    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k);
+    /*
+     * If we are in mask agnostic regime and the operation is not unmasked we
+     * set the inactive elements to 1.
+     */
+    if (!vm && s->vma) {
+        TCGLabel *active_element = gen_new_label();
+        /* (i + k * max_elems) * esz */
+        tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew));
+        tcg_gen_add_tl(mask_offs, mask_offs, i_esz);
+
+        /*
+         * Check whether the i bit of the mask is 0 or 1.
+         *
+         * static inline int vext_elem_mask(void *v0, int index)
+         * {
+         *     int idx = index / 64;
+         *     int pos = index  % 64;
+         *     return (((uint64_t *)v0)[idx] >> pos) & 1;
+         * }
+         */
+        tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
+        tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
+        tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
+        tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8));
+        tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem);
+        tcg_gen_andi_tl(mask_elem, mask_elem, 1);
+        tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0),
+                          active_element);
+        /*
+         * Set masked-off elements in the destination vector register to 1s.
+         * Store instructions simply skip this bit as memory ops access memory
+         * only for active elements.
+         */
+        if (is_load) {
+            tcg_gen_shli_tl(mask_offs, mask_offs, s->sew);
+            tcg_gen_add_tl(mask_offs, mask_offs, dest);
+            st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0);
+        }
+        tcg_gen_br(inc_k);
+        gen_set_label(active_element);
+    }
+    /*
+     * The element is active, calculate the address with stride:
+     * target_ulong addr = base + stride * i + (k << log2_esz);
+     */
+    tcg_gen_mul_tl(stride_offs, stride, i);
+    tcg_gen_shli_tl(k_esz, k, s->sew);
+    tcg_gen_add_tl(stride_offs, stride_offs, k_esz);
+    tcg_gen_add_tl(addr, base, stride_offs);
+    /* Calculate the offset in the dst/src vector register. */
+    tcg_gen_shli_tl(k_max, k, get_log2(max_elems));
+    tcg_gen_add_tl(dest_offs, i, k_max);
+    tcg_gen_shli_tl(dest_offs, dest_offs, s->sew);
+    tcg_gen_add_tl(dest_offs, dest_offs, dest);
+    if (is_load) {
+        tcg_gen_qemu_ld_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity);
+        st_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0);
+    } else {
+        ld_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0);
+        tcg_gen_qemu_st_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity);
+    }
+    /*
+     * We don't execute the load/store above if the element was inactive.
+     * We jump instead directly to incrementing k and continuing the loop.
+     */
+    if (!vm && s->vma) {
+        gen_set_label(inc_k);
+    }
+    tcg_gen_addi_tl(k, k, 1);
+    tcg_gen_br(start_k);
+    /* End of the inner loop. */
+    gen_set_label(end_k);
+
+    tcg_gen_addi_tl(i, i, 1);
+    tcg_gen_mov_tl(cpu_vstart, i);
+    tcg_gen_br(start);
+
+    /* End of the outer loop. */
+    gen_set_label(end);
+
+    return;
+}
+
+/*
+ * Set the tail bytes of the strided loads/stores to 1:
+ *
+ * for (k = 0; k < nf; ++k) {
+ *     cnt = (k * max_elems + vl) * esz;
+ *     tot = (k * max_elems + max_elems) * esz;
+ *     for (i = cnt; i < tot; i += esz) {
+ *         store_1s(-1, vd[vl+i]);
+ *     }
+ * }
+ */
+static void gen_ldst_stride_tail_loop(DisasContext *s, TCGv dest, uint32_t nf,
+                                      gen_tl_ldst *st_fn)
+{
+    TCGv i = tcg_temp_new();
+    TCGv k = tcg_temp_new();
+    TCGv tail_cnt = tcg_temp_new();
+    TCGv tail_tot = tcg_temp_new();
+    TCGv tail_addr = tcg_temp_new();
+
+    TCGLabel *start = gen_new_label();
+    TCGLabel *end = gen_new_label();
+    TCGLabel *start_i = gen_new_label();
+    TCGLabel *end_i = gen_new_label();
+
+    uint32_t max_elems_b = MAXSZ(s);
+    uint32_t esz = 1 << s->sew;
+
+    /* Start of the outer loop. */
+    tcg_gen_movi_tl(k, 0);
+    tcg_gen_shli_tl(tail_cnt, cpu_vl, s->sew);
+    tcg_gen_movi_tl(tail_tot, max_elems_b);
+    tcg_gen_add_tl(tail_addr, dest, tail_cnt);
+    gen_set_label(start);
+    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end);
+    /* Start of the inner loop. */
+    tcg_gen_mov_tl(i, tail_cnt);
+    gen_set_label(start_i);
+    tcg_gen_brcond_tl(TCG_COND_GE, i, tail_tot, end_i);
+    /* store_1s(-1, vd[vl+i]); */
+    st_fn(tcg_constant_tl(-1), (TCGv_ptr)tail_addr, 0);
+    tcg_gen_addi_tl(tail_addr, tail_addr, esz);
+    tcg_gen_addi_tl(i, i, esz);
+    tcg_gen_br(start_i);
+    /* End of the inner loop. */
+    gen_set_label(end_i);
+    /* Update the counts */
+    tcg_gen_addi_tl(tail_cnt, tail_cnt, max_elems_b);
+    tcg_gen_addi_tl(tail_tot, tail_cnt, max_elems_b);
+    tcg_gen_addi_tl(k, k, 1);
+    tcg_gen_br(start);
+    /* End of the outer loop. */
+    gen_set_label(end);
+
+    return;
+}
+
 typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
                                     TCGv, TCGv_env, TCGv_i32);
 
 static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
                               uint32_t data, gen_helper_ldst_stride *fn,
-                              DisasContext *s)
+                              DisasContext *s, bool is_load)
 {
-    TCGv_ptr dest, mask;
-    TCGv base, stride;
-    TCGv_i32 desc;
+    if (!s->vstart_eq_zero) {
+        TCGv_ptr dest, mask;
+        TCGv base, stride;
+        TCGv_i32 desc;
 
-    dest = tcg_temp_new_ptr();
-    mask = tcg_temp_new_ptr();
-    base = get_gpr(s, rs1, EXT_NONE);
-    stride = get_gpr(s, rs2, EXT_NONE);
-    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
-                                      s->cfg_ptr->vlenb, data));
+        dest = tcg_temp_new_ptr();
+        mask = tcg_temp_new_ptr();
+        base = get_gpr(s, rs1, EXT_NONE);
+        stride = get_gpr(s, rs2, EXT_NONE);
+        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+                                          s->cfg_ptr->vlenb, data));
 
-    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
-    tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
+        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+        tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
+        mark_vs_dirty(s);
+        fn(dest, mask, base, stride, tcg_env, desc);
+        return true;
+    }
+
+    TCGv dest = tcg_temp_new();
+
+    uint32_t nf = FIELD_EX32(data, VDATA, NF);
+    uint32_t vm = FIELD_EX32(data, VDATA, VM);
+
+    /* Destination register and mask register */
+    tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
+
+    /*
+     * Select the appropriate load/tore to retrieve data from the vector
+     * register given a specific sew.
+     */
+    static gen_tl_ldst * const ld_fns[4] = {
+        tcg_gen_ld8u_tl, tcg_gen_ld16u_tl,
+        tcg_gen_ld32u_tl, tcg_gen_ld_tl
+    };
+
+    static gen_tl_ldst * const st_fns[4] = {
+        tcg_gen_st8_tl, tcg_gen_st16_tl,
+        tcg_gen_st32_tl, tcg_gen_st_tl
+    };
+
+    gen_tl_ldst *ld_fn = ld_fns[s->sew];
+    gen_tl_ldst *st_fn = st_fns[s->sew];
+
+    if (ld_fn == NULL || st_fn == NULL) {
+        return false;
+    }
 
     mark_vs_dirty(s);
 
-    fn(dest, mask, base, stride, tcg_env, desc);
+    gen_ldst_stride_main_loop(s, dest, rs1, rs2, vm, nf, ld_fn, st_fn, is_load);
+
+    tcg_gen_movi_tl(cpu_vstart, 0);
+
+    /*
+     * Set the tail bytes to 1 if tail agnostic:
+     */
+    if (s->vta != 0 && is_load) {
+        gen_ldst_stride_tail_loop(s, dest, nf, st_fn);
+    }
 
     finalize_rvv_inst(s);
     return true;
@@ -899,7 +1165,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
 {
     uint32_t data = 0;
     gen_helper_ldst_stride *fn;
-    static gen_helper_ldst_stride * const fns[4] = {
+    static gen_helper_ldst_stride *const fns[4] = {
         gen_helper_vlse8_v, gen_helper_vlse16_v,
         gen_helper_vlse32_v, gen_helper_vlse64_v
     };
@@ -915,7 +1181,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     data = FIELD_DP32(data, VDATA, VTA, s->vta);
     data = FIELD_DP32(data, VDATA, VMA, s->vma);
-    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
+    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
 }
 
 static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
@@ -933,23 +1199,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check)
 static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
 {
     uint32_t data = 0;
-    gen_helper_ldst_stride *fn;
-    static gen_helper_ldst_stride * const fns[4] = {
-        /* masked stride store */
-        gen_helper_vsse8_v,  gen_helper_vsse16_v,
-        gen_helper_vsse32_v,  gen_helper_vsse64_v
-    };
 
     uint8_t emul = vext_get_emul(s, eew);
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, emul);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
-    fn = fns[eew];
-    if (fn == NULL) {
-        return false;
-    }
 
-    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
+    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, NULL, s, false);
 }
 
 static bool st_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
@@ -1300,17 +1556,6 @@ GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
  *** Vector Integer Arithmetic Instructions
  */
 
-/*
- * MAXSZ returns the maximum vector size can be operated in bytes,
- * which is used in GVEC IR when vl_eq_vlmax flag is set to true
- * to accelerate vector operation.
- */
-static inline uint32_t MAXSZ(DisasContext *s)
-{
-    int max_sz = s->cfg_ptr->vlenb * 8;
-    return max_sz >> (3 - s->lmul);
-}
-
 static bool opivv_check(DisasContext *s, arg_rmrr *a)
 {
     return require_rvv(s) &&
-- 
2.50.1

Re: [PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Posted by Nicholas Piggin 5 months, 1 week ago

On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
> From: Chao Liu <chao.liu@yeah.net>

> +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1,
> +                                      uint32_t rs2, uint32_t vm, uint32_t nf,
> +                                      gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn,
> +                                      bool is_load)
> +{
> +    TCGv addr = tcg_temp_new();
> +    TCGv base = get_gpr(s, rs1, EXT_NONE);
> +    TCGv stride = get_gpr(s, rs2, EXT_NONE);
> +
> +    TCGv i = tcg_temp_new();
> +    TCGv i_esz = tcg_temp_new();
> +    TCGv k = tcg_temp_new();
> +    TCGv k_esz = tcg_temp_new();
> +    TCGv k_max = tcg_temp_new();
> +    TCGv mask = tcg_temp_new();
> +    TCGv mask_offs = tcg_temp_new();
> +    TCGv mask_offs_64 = tcg_temp_new();
> +    TCGv mask_elem = tcg_temp_new();
> +    TCGv mask_offs_rem = tcg_temp_new();
> +    TCGv vreg = tcg_temp_new();
> +    TCGv dest_offs = tcg_temp_new();
> +    TCGv stride_offs = tcg_temp_new();
> +
> +    uint32_t max_elems = MAXSZ(s) >> s->sew;
> +
> +    TCGLabel *start = gen_new_label();
> +    TCGLabel *end = gen_new_label();
> +    TCGLabel *start_k = gen_new_label();
> +    TCGLabel *inc_k = gen_new_label();
> +    TCGLabel *end_k = gen_new_label();
> +
> +    MemOp atomicity = MO_ATOM_NONE;
> +    if (s->sew == 0) {
> +        atomicity = MO_ATOM_NONE;
> +    } else {
> +        atomicity = MO_ATOM_IFALIGN_PAIR;
> +    }
> +
> +    tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0));
> +
> +    /* Start of outer loop. */
> +    tcg_gen_mov_tl(i, cpu_vstart);
> +    gen_set_label(start);
> +    tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end);
> +    tcg_gen_shli_tl(i_esz, i, s->sew);
> +    /* Start of inner loop. */
> +    tcg_gen_movi_tl(k, 0);
> +    gen_set_label(start_k);
> +    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k);
> +    /*
> +     * If we are in mask agnostic regime and the operation is not unmasked we
> +     * set the inactive elements to 1.
> +     */
> +    if (!vm && s->vma) {
> +        TCGLabel *active_element = gen_new_label();
> +        /* (i + k * max_elems) * esz */
> +        tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew));
> +        tcg_gen_add_tl(mask_offs, mask_offs, i_esz);
> +
> +        /*
> +         * Check whether the i bit of the mask is 0 or 1.
> +         *
> +         * static inline int vext_elem_mask(void *v0, int index)
> +         * {
> +         *     int idx = index / 64;
> +         *     int pos = index  % 64;
> +         *     return (((uint64_t *)v0)[idx] >> pos) & 1;
> +         * }
> +         */
> +        tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
> +        tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
> +        tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
> +        tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8));

Could this be faster with tcg_gen_andi_tl(mask_offs_rem, mask_offs, 7)?

> +        tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem);
> +        tcg_gen_andi_tl(mask_elem, mask_elem, 1);
> +        tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0),
> +                          active_element);
> +        /*
> +         * Set masked-off elements in the destination vector register to 1s.
> +         * Store instructions simply skip this bit as memory ops access memory
> +         * only for active elements.
> +         */
> +        if (is_load) {
> +            tcg_gen_shli_tl(mask_offs, mask_offs, s->sew);
> +            tcg_gen_add_tl(mask_offs, mask_offs, dest);
> +            st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0);

One more small thing. This is implementing load and store instructions,
yet there are these ld_fn and st_fn, which makes it a bit confusing.
Calling them something like read_vreg / write_vreg might help make them
a bit clearer.

I also suggest try to split the at generation of access for the vector
data and mask registers to see if you can split them into helper
functions. That could make this function smaller and a bit nicer to
review.

Thanks,
Nick

Re: [PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Posted by Chao Liu 5 months, 1 week ago

On 2025/9/4 13:05, Nicholas Piggin wrote:
> On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
>> From: Chao Liu <chao.liu@yeah.net>
> 
>> +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1,
>> +                                      uint32_t rs2, uint32_t vm, uint32_t nf,
>> +                                      gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn,
>> +                                      bool is_load)
>> +{
>> +    TCGv addr = tcg_temp_new();
>> +    TCGv base = get_gpr(s, rs1, EXT_NONE);
>> +    TCGv stride = get_gpr(s, rs2, EXT_NONE);
>> +
>> +    TCGv i = tcg_temp_new();
>> +    TCGv i_esz = tcg_temp_new();
>> +    TCGv k = tcg_temp_new();
>> +    TCGv k_esz = tcg_temp_new();
>> +    TCGv k_max = tcg_temp_new();
>> +    TCGv mask = tcg_temp_new();
>> +    TCGv mask_offs = tcg_temp_new();
>> +    TCGv mask_offs_64 = tcg_temp_new();
>> +    TCGv mask_elem = tcg_temp_new();
>> +    TCGv mask_offs_rem = tcg_temp_new();
>> +    TCGv vreg = tcg_temp_new();
>> +    TCGv dest_offs = tcg_temp_new();
>> +    TCGv stride_offs = tcg_temp_new();
>> +
>> +    uint32_t max_elems = MAXSZ(s) >> s->sew;
>> +
>> +    TCGLabel *start = gen_new_label();
>> +    TCGLabel *end = gen_new_label();
>> +    TCGLabel *start_k = gen_new_label();
>> +    TCGLabel *inc_k = gen_new_label();
>> +    TCGLabel *end_k = gen_new_label();
>> +
>> +    MemOp atomicity = MO_ATOM_NONE;
>> +    if (s->sew == 0) {
>> +        atomicity = MO_ATOM_NONE;
>> +    } else {
>> +        atomicity = MO_ATOM_IFALIGN_PAIR;
>> +    }
>> +
>> +    tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0));
>> +
>> +    /* Start of outer loop. */
>> +    tcg_gen_mov_tl(i, cpu_vstart);
>> +    gen_set_label(start);
>> +    tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end);
>> +    tcg_gen_shli_tl(i_esz, i, s->sew);
>> +    /* Start of inner loop. */
>> +    tcg_gen_movi_tl(k, 0);
>> +    gen_set_label(start_k);
>> +    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k);
>> +    /*
>> +     * If we are in mask agnostic regime and the operation is not unmasked we
>> +     * set the inactive elements to 1.
>> +     */
>> +    if (!vm && s->vma) {
>> +        TCGLabel *active_element = gen_new_label();
>> +        /* (i + k * max_elems) * esz */
>> +        tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew));
>> +        tcg_gen_add_tl(mask_offs, mask_offs, i_esz);
>> +
>> +        /*
>> +         * Check whether the i bit of the mask is 0 or 1.
>> +         *
>> +         * static inline int vext_elem_mask(void *v0, int index)
>> +         * {
>> +         *     int idx = index / 64;
>> +         *     int pos = index  % 64;
>> +         *     return (((uint64_t *)v0)[idx] >> pos) & 1;
>> +         * }
>> +         */
>> +        tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
>> +        tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
>> +        tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
>> +        tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8));
> 
> Could this be faster with tcg_gen_andi_tl(mask_offs_rem, mask_offs, 7)?
> 

This is a good catch! I'll include this change in the next patch version.


>> +        tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem);
>> +        tcg_gen_andi_tl(mask_elem, mask_elem, 1);
>> +        tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0),
>> +                          active_element);
>> +        /*
>> +         * Set masked-off elements in the destination vector register to 1s.
>> +         * Store instructions simply skip this bit as memory ops access memory
>> +         * only for active elements.
>> +         */
>> +        if (is_load) {
>> +            tcg_gen_shli_tl(mask_offs, mask_offs, s->sew);
>> +            tcg_gen_add_tl(mask_offs, mask_offs, dest);
>> +            st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0);
> 
> One more small thing. This is implementing load and store instructions,
> yet there are these ld_fn and st_fn, which makes it a bit confusing.
> Calling them something like read_vreg / write_vreg might help make them
> a bit clearer.
> 

I think the current names are okay? After splitting the vector and mask access
code into smaller functions, it's already much easier to read. Let's review the
next patch version first, then decide if we need to change the names.

> I also suggest try to split the at generation of access for the vector
> data and mask registers to see if you can split them into helper
> functions. That could make this function smaller and a bit nicer to
> review.
> 

Get, I'll split the check for vext_elem_mask() and the Load/Store of the vector
register into separate sub-functions to make review easier.

Thanks,
Chao

Re: [PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Posted by Nicholas Piggin 5 months, 1 week ago

On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
> From: Chao Liu <chao.liu@yeah.net>
> 
> This commit improves the performance of QEMU when emulating strided vector
> loads and stores by substituting the call for the helper function with the
> generation of equivalent TCG operations.
> 
> PS:
> 
> An implementation is permitted to cause an illegal instruction if vstart
> is not 0 and it is set to a value that can not be produced implicitly by
> the implementation, but memory accesses will generally always need to
> deal with page faults.
> 
> So, if a strided vector memory access instruction has non-zero vstart,
> check it through vlse/vsse helpers function.

[...]

>  typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
>                                      TCGv, TCGv_env, TCGv_i32);
>  
>  static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
>                                uint32_t data, gen_helper_ldst_stride *fn,
> -                              DisasContext *s)
> +                              DisasContext *s, bool is_load)
>  {
> -    TCGv_ptr dest, mask;
> -    TCGv base, stride;
> -    TCGv_i32 desc;
> +    if (!s->vstart_eq_zero) {
> +        TCGv_ptr dest, mask;
> +        TCGv base, stride;
> +        TCGv_i32 desc;
>  
> -    dest = tcg_temp_new_ptr();
> -    mask = tcg_temp_new_ptr();
> -    base = get_gpr(s, rs1, EXT_NONE);
> -    stride = get_gpr(s, rs2, EXT_NONE);
> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> -                                      s->cfg_ptr->vlenb, data));
> +        dest = tcg_temp_new_ptr();
> +        mask = tcg_temp_new_ptr();
> +        base = get_gpr(s, rs1, EXT_NONE);
> +        stride = get_gpr(s, rs2, EXT_NONE);
> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> +                                          s->cfg_ptr->vlenb, data));
>  
> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> -    tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +        tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
> +        mark_vs_dirty(s);
> +        fn(dest, mask, base, stride, tcg_env, desc);
> +        return true;

Most of the lines changed here should just be indenting the
existing code into the 'if' branch. So maybe to split the patch
up a little and make less churn, you could do patch 1 that moves
this code into a function like gen_call_helper_ldst_stride().

Then after patch 2 it would be

    if (!s->vstart_eq_zero) {
        /* vstart != 0 helper slowpath */
        gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, is, is_load);
        return true;
    }

    [...]

> +    }
> +
> +    TCGv dest = tcg_temp_new();
> +
> +    uint32_t nf = FIELD_EX32(data, VDATA, NF);
> +    uint32_t vm = FIELD_EX32(data, VDATA, VM);
> +
> +    /* Destination register and mask register */
> +    tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
> +
> +    /*
> +     * Select the appropriate load/tore to retrieve data from the vector
                                      ^^^^ store

[...]

> @@ -899,7 +1165,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>  {
>      uint32_t data = 0;
>      gen_helper_ldst_stride *fn;
> -    static gen_helper_ldst_stride * const fns[4] = {
> +    static gen_helper_ldst_stride *const fns[4] = {
>          gen_helper_vlse8_v, gen_helper_vlse16_v,
>          gen_helper_vlse32_v, gen_helper_vlse64_v
>      };

This probably comes from my patch, just remove the hunk to
reduce patch size. Ditto for any other stray "cleanups".

> @@ -915,7 +1181,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>      data = FIELD_DP32(data, VDATA, NF, a->nf);
>      data = FIELD_DP32(data, VDATA, VTA, s->vta);
>      data = FIELD_DP32(data, VDATA, VMA, s->vma);
> -    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
> +    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
>  }
>  
>  static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
> @@ -933,23 +1199,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check)
>  static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>  {
>      uint32_t data = 0;
> -    gen_helper_ldst_stride *fn;
> -    static gen_helper_ldst_stride * const fns[4] = {
> -        /* masked stride store */
> -        gen_helper_vsse8_v,  gen_helper_vsse16_v,
> -        gen_helper_vsse32_v,  gen_helper_vsse64_v
> -    };

I gave you an old patch without the stores done, sorry. You
just need to pass the store helper fn through here similarly
as for loads (i.e., this hunk should just be the one liner
change to add extra 'is_load=false' argument to the
ldst_stride_trans() call.).

Thanks,
Nick

Re: [PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Posted by Chao Liu 5 months, 1 week ago

On 2025/9/4 12:37, Nicholas Piggin wrote:
> On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
>> From: Chao Liu <chao.liu@yeah.net>
>>
>> This commit improves the performance of QEMU when emulating strided vector
>> loads and stores by substituting the call for the helper function with the
>> generation of equivalent TCG operations.
>>
>> PS:
>>
>> An implementation is permitted to cause an illegal instruction if vstart
>> is not 0 and it is set to a value that can not be produced implicitly by
>> the implementation, but memory accesses will generally always need to
>> deal with page faults.
>>
>> So, if a strided vector memory access instruction has non-zero vstart,
>> check it through vlse/vsse helpers function.
> 
> [...]
> 
>>  typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
>>                                      TCGv, TCGv_env, TCGv_i32);
>>  
>>  static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
>>                                uint32_t data, gen_helper_ldst_stride *fn,
>> -                              DisasContext *s)
>> +                              DisasContext *s, bool is_load)
>>  {
>> -    TCGv_ptr dest, mask;
>> -    TCGv base, stride;
>> -    TCGv_i32 desc;
>> +    if (!s->vstart_eq_zero) {
>> +        TCGv_ptr dest, mask;
>> +        TCGv base, stride;
>> +        TCGv_i32 desc;
>>  
>> -    dest = tcg_temp_new_ptr();
>> -    mask = tcg_temp_new_ptr();
>> -    base = get_gpr(s, rs1, EXT_NONE);
>> -    stride = get_gpr(s, rs2, EXT_NONE);
>> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
>> -                                      s->cfg_ptr->vlenb, data));
>> +        dest = tcg_temp_new_ptr();
>> +        mask = tcg_temp_new_ptr();
>> +        base = get_gpr(s, rs1, EXT_NONE);
>> +        stride = get_gpr(s, rs2, EXT_NONE);
>> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
>> +                                          s->cfg_ptr->vlenb, data));
>>  
>> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
>> -    tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
>> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
>> +        tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
>> +        mark_vs_dirty(s);
>> +        fn(dest, mask, base, stride, tcg_env, desc);
>> +        return true;
> 
> Most of the lines changed here should just be indenting the
> existing code into the 'if' branch. So maybe to split the patch
> up a little and make less churn, you could do patch 1 that moves
> this code into a function like gen_call_helper_ldst_stride().
> 
> Then after patch 2 it would be
> 
>     if (!s->vstart_eq_zero) {
>         /* vstart != 0 helper slowpath */
>         gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, is, is_load);
>         return true;
>     }
> 
>     [...]
> 

Good idea. I've given it a try, and splitting out the
gen_call_helper_ldst_stride() into a separate patch is also very convenient for
review.

I'll include this change in the next version of the patch.

>> +    }
>> +
>> +    TCGv dest = tcg_temp_new();
>> +
>> +    uint32_t nf = FIELD_EX32(data, VDATA, NF);
>> +    uint32_t vm = FIELD_EX32(data, VDATA, VM);
>> +
>> +    /* Destination register and mask register */
>> +    tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
>> +
>> +    /*
>> +     * Select the appropriate load/tore to retrieve data from the vector
>                                       ^^^^ store
> 

get~

> [...]
> 
>> @@ -899,7 +1165,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>  {
>>      uint32_t data = 0;
>>      gen_helper_ldst_stride *fn;
>> -    static gen_helper_ldst_stride * const fns[4] = {
>> +    static gen_helper_ldst_stride *const fns[4] = {
>>          gen_helper_vlse8_v, gen_helper_vlse16_v,
>>          gen_helper_vlse32_v, gen_helper_vlse64_v
>>      };
> 
> This probably comes from my patch, just remove the hunk to
> reduce patch size. Ditto for any other stray "cleanups".
> 
>> @@ -915,7 +1181,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>      data = FIELD_DP32(data, VDATA, NF, a->nf);
>>      data = FIELD_DP32(data, VDATA, VTA, s->vta);
>>      data = FIELD_DP32(data, VDATA, VMA, s->vma);
>> -    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
>> +    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
>>  }
>>  
>>  static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
>> @@ -933,23 +1199,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check)
>>  static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>  {
>>      uint32_t data = 0;
>> -    gen_helper_ldst_stride *fn;
>> -    static gen_helper_ldst_stride * const fns[4] = {
>> -        /* masked stride store */
>> -        gen_helper_vsse8_v,  gen_helper_vsse16_v,
>> -        gen_helper_vsse32_v,  gen_helper_vsse64_v
>> -    };
> 
> I gave you an old patch without the stores done, sorry. You
> just need to pass the store helper fn through here similarly
> as for loads (i.e., this hunk should just be the one liner
> change to add extra 'is_load=false' argument to the
> ldst_stride_trans() call.).
> 
> Thanks,
> Nick

get~

[PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.
[PATCH v6 2/2] tests/tcg/riscv64: Add test for vlsseg8e32 instruction