[PATCH v7 1/2] target/riscv: Use tcg nodes for strided vector ld/st generation

Chao Liu posted 2 patches 2 days, 5 hours ago
Maintainers: Palmer Dabbelt <palmer@dabbelt.com>, Alistair Francis <alistair.francis@wdc.com>, Weiwei Li <liwei1518@gmail.com>, Daniel Henrique Barboza <dbarboza@ventanamicro.com>, Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
[PATCH v7 1/2] target/riscv: Use tcg nodes for strided vector ld/st generation
Posted by Chao Liu 2 days, 5 hours ago
This commit improves the performance of QEMU when emulating strided vector
loads and stores by replacing the call to the helper function with the
generation of equivalent TCG operations.

PS:

An implementation is permitted to raise an illegal-instruction exception if
vstart is non-zero and holds a value the implementation could never produce
implicitly, but memory accesses generally still have to deal with page
faults, which can leave vstart non-zero.

So, if a strided vector memory access instruction has a non-zero vstart,
handle it through the vlse/vsse helper functions.

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Chao Liu <chao.liu@zevorn.cn>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 target/riscv/insn_trans/trans_rvv.c.inc | 348 ++++++++++++++++++++++--
 1 file changed, 331 insertions(+), 17 deletions(-)

diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index 71f98fb350..f9be96ac93 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -863,15 +863,285 @@ static bool st_us_mask_check(DisasContext *s, arg_vsm_v *a, uint8_t eew)
 GEN_VEXT_TRANS(vlm_v, MO_8, vlm_v, ld_us_mask_op, ld_us_mask_check)
 GEN_VEXT_TRANS(vsm_v, MO_8, vsm_v, st_us_mask_op, st_us_mask_check)
 
+/*
+ * MAXSZ returns the maximum vector size can be operated in bytes,
+ * which is used in GVEC IR when vl_eq_vlmax flag is set to true
+ * to accelerate vector operation.
+ */
+static inline uint32_t MAXSZ(DisasContext *s)
+{
+    int max_sz = s->cfg_ptr->vlenb << 3;
+    return max_sz >> (3 - s->lmul);
+}
+
+static inline uint32_t get_log2(uint32_t a)
+{
+    assert(is_power_of_2(a));
+    return ctz32(a);
+}
+
+typedef void gen_tl_ldst(TCGv, TCGv_ptr, tcg_target_long);
+
+static void gen_ldst_vreg(DisasContext *s, TCGv vreg, TCGv dest_offs, TCGv addr,
+                          gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn, bool is_load)
+{
+    MemOp atomicity = MO_ATOM_NONE;
+    if (s->sew == 0) {
+        atomicity = MO_ATOM_NONE;
+    } else {
+        atomicity = MO_ATOM_IFALIGN_PAIR;
+    }
+
+    if (is_load) {
+        tcg_gen_qemu_ld_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity);
+        st_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0);
+    } else {
+        ld_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0);
+        tcg_gen_qemu_st_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity);
+    }
+}
+
+/*
+ * Check whether the i bit of the mask is 0 or 1.
+ *
+ * static inline int vext_elem_mask(void *v0, int index)
+ * {
+ *     int idx = index / 64;
+ *     int pos = index % 64;
+ *     return (((uint64_t *)v0)[idx] >> pos) & 1;
+ * }
+ *
+ * And
+ *
+ * if (vext_elem_mask(v0, i) != 0) {
+ *     goto label;
+ * }
+ */
+static void gen_check_vext_elem_mask(TCGLabel *label, TCGv mask, TCGv mask_offs)
+{
+    TCGv mask_offs_64 = tcg_temp_new();
+    TCGv mask_offs_rem = tcg_temp_new();
+    TCGv mask_elem = tcg_temp_new();
+
+    tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
+    tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
+    tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
+    tcg_gen_andi_tl(mask_offs_rem, mask_offs, 7);
+    tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem);
+    tcg_gen_andi_tl(mask_elem, mask_elem, 1);
+    tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), label);
+}
+
+static void gen_vext_set_elems_1s(TCGv dest, TCGv mask_offs, int sew,
+                                  gen_tl_ldst *st_fn, bool is_load)
+{
+    if (is_load) {
+        tcg_gen_shli_tl(mask_offs, mask_offs, sew);
+        tcg_gen_add_tl(mask_offs, mask_offs, dest);
+        st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0);
+    }
+}
+
+/*
+ * Simulate the strided load/store main loop:
+ *
+ * for (i = env->vstart; i < env->vl; env->vstart = ++i) {
+ *     k = 0;
+ *     while (k < nf) {
+ *         if (!vm && !vext_elem_mask(v0, i)) {
+ *             vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
+ *                               (i + k * max_elems + 1) * esz);
+ *             k++;
+ *             continue;
+ *         }
+ *         target_ulong addr = base + stride * i + (k << log2_esz);
+ *         ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+ *         k++;
+ *     }
+ * }
+ */
+static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1,
+                                      uint32_t rs2, uint32_t vm, uint32_t nf,
+                                      gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn,
+                                      bool is_load)
+{
+    TCGv addr = tcg_temp_new();
+    TCGv base = get_gpr(s, rs1, EXT_NONE);
+    TCGv stride = get_gpr(s, rs2, EXT_NONE);
+
+    TCGv i = tcg_temp_new();
+    TCGv i_esz = tcg_temp_new();
+    TCGv k = tcg_temp_new();
+    TCGv k_esz = tcg_temp_new();
+    TCGv k_max = tcg_temp_new();
+    TCGv mask = tcg_temp_new();
+    TCGv mask_offs = tcg_temp_new();
+    TCGv vreg = tcg_temp_new();
+    TCGv dest_offs = tcg_temp_new();
+    TCGv stride_offs = tcg_temp_new();
+
+    uint32_t max_elems = MAXSZ(s) >> s->sew;
+
+    TCGLabel *start = gen_new_label();
+    TCGLabel *end = gen_new_label();
+    TCGLabel *start_k = gen_new_label();
+    TCGLabel *inc_k = gen_new_label();
+    TCGLabel *end_k = gen_new_label();
+
+    tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0));
+
+    /* Start of outer loop. */
+    tcg_gen_mov_tl(i, cpu_vstart);
+    gen_set_label(start);
+    tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end);
+    tcg_gen_shli_tl(i_esz, i, s->sew);
+
+    /* Start of inner loop. */
+    tcg_gen_movi_tl(k, 0);
+    gen_set_label(start_k);
+    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k);
+
+    /*
+     * If we are in mask agnostic regime and the operation is not unmasked we
+     * set the inactive elements to 1.
+     */
+    if (!vm && s->vma) {
+        TCGLabel *active_element = gen_new_label();
+        /* (i + k * max_elems) * esz */
+        tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew));
+        tcg_gen_add_tl(mask_offs, mask_offs, i_esz);
+
+        /*
+         * Check whether the i bit of the mask is 0 or 1.
+         * If it is 0, set masked-off elements;
+         * otherwise, directly load/store the vector register.
+         */
+        gen_check_vext_elem_mask(active_element, mask, mask_offs);
+
+        /*
+         * Set masked-off elements in the destination vector register to 1s.
+         * Store instructions simply skip this bit as memory ops access memory
+         * only for active elements.
+         */
+        gen_vext_set_elems_1s(dest, mask_offs, s->sew, st_fn, is_load);
+
+        tcg_gen_br(inc_k);
+        gen_set_label(active_element);
+    }
+
+    /*
+     * The element is active, calculate the address with stride:
+     * target_ulong addr = base + stride * i + (k << log2_esz);
+     */
+    tcg_gen_mul_tl(stride_offs, stride, i);
+    tcg_gen_shli_tl(k_esz, k, s->sew);
+    tcg_gen_add_tl(stride_offs, stride_offs, k_esz);
+    tcg_gen_add_tl(addr, base, stride_offs);
+
+    /* Calculate the offset in the dst/src vector register. */
+    tcg_gen_shli_tl(k_max, k, get_log2(max_elems));
+    tcg_gen_add_tl(dest_offs, i, k_max);
+    tcg_gen_shli_tl(dest_offs, dest_offs, s->sew);
+    tcg_gen_add_tl(dest_offs, dest_offs, dest);
+
+    /* Load/Store vector register. */
+    gen_ldst_vreg(s, vreg, dest_offs, addr, ld_fn, st_fn, is_load);
+
+    /*
+     * We don't execute the load/store above if the element was inactive.
+     * We jump instead directly to incrementing k and continuing the loop.
+     */
+    if (!vm && s->vma) {
+        gen_set_label(inc_k);
+    }
+    tcg_gen_addi_tl(k, k, 1);
+    tcg_gen_br(start_k);
+
+    /* End of the inner loop. */
+    gen_set_label(end_k);
+
+    tcg_gen_addi_tl(i, i, 1);
+    tcg_gen_mov_tl(cpu_vstart, i);
+    tcg_gen_br(start);
+
+    /* End of the outer loop. */
+    gen_set_label(end);
+
+    return;
+}
+
+/*
+ * Set the tail bytes of the strided loads/stores to 1:
+ *
+ * for (k = 0; k < nf; ++k) {
+ *     cnt = (k * max_elems + vl) * esz;
+ *     tot = (k * max_elems + max_elems) * esz;
+ *     for (i = cnt; i < tot; i += esz) {
+ *         store_1s(-1, vd[vl+i]);
+ *     }
+ * }
+ */
+static void gen_ldst_stride_tail_loop(DisasContext *s, TCGv dest, uint32_t nf,
+                                      gen_tl_ldst *st_fn)
+{
+    TCGv i = tcg_temp_new();
+    TCGv k = tcg_temp_new();
+    TCGv tail_cnt = tcg_temp_new();
+    TCGv tail_tot = tcg_temp_new();
+    TCGv tail_addr = tcg_temp_new();
+
+    TCGLabel *start = gen_new_label();
+    TCGLabel *end = gen_new_label();
+    TCGLabel *start_i = gen_new_label();
+    TCGLabel *end_i = gen_new_label();
+
+    uint32_t max_elems_b = MAXSZ(s);
+    uint32_t esz = 1 << s->sew;
+
+    /* Start of the outer loop. */
+    tcg_gen_movi_tl(k, 0);
+    tcg_gen_shli_tl(tail_cnt, cpu_vl, s->sew);
+    tcg_gen_movi_tl(tail_tot, max_elems_b);
+    tcg_gen_add_tl(tail_addr, dest, tail_cnt);
+    gen_set_label(start);
+    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end);
+
+    /* Start of the inner loop. */
+    tcg_gen_mov_tl(i, tail_cnt);
+    gen_set_label(start_i);
+    tcg_gen_brcond_tl(TCG_COND_GE, i, tail_tot, end_i);
+
+    /* store_1s(-1, vd[vl+i]); */
+    st_fn(tcg_constant_tl(-1), (TCGv_ptr)tail_addr, 0);
+    tcg_gen_addi_tl(tail_addr, tail_addr, esz);
+    tcg_gen_addi_tl(i, i, esz);
+    tcg_gen_br(start_i);
+
+    /* End of the inner loop. */
+    gen_set_label(end_i);
+
+    /* Update the counts */
+    tcg_gen_addi_tl(tail_cnt, tail_cnt, max_elems_b);
+    tcg_gen_addi_tl(tail_tot, tail_cnt, max_elems_b);
+    tcg_gen_addi_tl(k, k, 1);
+    tcg_gen_br(start);
+
+    /* End of the outer loop. */
+    gen_set_label(end);
+
+    return;
+}
+
 /*
  *** stride load and store
  */
 typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
                                     TCGv, TCGv_env, TCGv_i32);
 
-static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
-                              uint32_t data, gen_helper_ldst_stride *fn,
-                              DisasContext *s)
+static
+bool gen_call_helper_ldst_stride(uint32_t vd, uint32_t rs1, uint32_t rs2,
+                                 uint32_t data, gen_helper_ldst_stride *fn,
+                                 DisasContext *s)
 {
     TCGv_ptr dest, mask;
     TCGv base, stride;
@@ -895,11 +1165,66 @@ static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
     return true;
 }
 
+static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
+                              uint32_t data, gen_helper_ldst_stride *fn,
+                              DisasContext *s, bool is_load)
+{
+    if (!s->vstart_eq_zero) {
+        /* vstart != 0 helper slowpath */
+        return gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, s);
+    }
+
+    TCGv dest = tcg_temp_new();
+
+    uint32_t nf = FIELD_EX32(data, VDATA, NF);
+    uint32_t vm = FIELD_EX32(data, VDATA, VM);
+
+    /* Destination register and mask register */
+    tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
+
+    /*
+     * Select the appropriate load/store to retrieve data from the vector
+     * register given a specific sew.
+     */
+    static gen_tl_ldst * const ld_fns[4] = {
+        tcg_gen_ld8u_tl, tcg_gen_ld16u_tl,
+        tcg_gen_ld32u_tl, tcg_gen_ld_tl
+    };
+
+    static gen_tl_ldst * const st_fns[4] = {
+        tcg_gen_st8_tl, tcg_gen_st16_tl,
+        tcg_gen_st32_tl, tcg_gen_st_tl
+    };
+
+    gen_tl_ldst *ld_fn = ld_fns[s->sew];
+    gen_tl_ldst *st_fn = st_fns[s->sew];
+
+    if (ld_fn == NULL || st_fn == NULL) {
+        return false;
+    }
+
+    mark_vs_dirty(s);
+
+    gen_ldst_stride_main_loop(s, dest, rs1, rs2, vm, nf, ld_fn, st_fn, is_load);
+
+    tcg_gen_movi_tl(cpu_vstart, 0);
+
+    /*
+     * Set the tail bytes to 1 if tail agnostic:
+     */
+    if (s->vta != 0 && is_load) {
+        gen_ldst_stride_tail_loop(s, dest, nf, st_fn);
+    }
+
+    finalize_rvv_inst(s);
+    return true;
+}
+
 static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
 {
     uint32_t data = 0;
     gen_helper_ldst_stride *fn;
-    static gen_helper_ldst_stride * const fns[4] = {
+    static gen_helper_ldst_stride *const fns[4] = {
         gen_helper_vlse8_v, gen_helper_vlse16_v,
         gen_helper_vlse32_v, gen_helper_vlse64_v
     };
@@ -915,7 +1240,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     data = FIELD_DP32(data, VDATA, VTA, s->vta);
     data = FIELD_DP32(data, VDATA, VMA, s->vma);
-    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
+    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
 }
 
 static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
@@ -949,7 +1274,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
         return false;
     }
 
-    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
+    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
 }
 
 static bool st_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
@@ -1300,17 +1625,6 @@ GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
  *** Vector Integer Arithmetic Instructions
  */
 
-/*
- * MAXSZ returns the maximum vector size can be operated in bytes,
- * which is used in GVEC IR when vl_eq_vlmax flag is set to true
- * to accelerate vector operation.
- */
-static inline uint32_t MAXSZ(DisasContext *s)
-{
-    int max_sz = s->cfg_ptr->vlenb * 8;
-    return max_sz >> (3 - s->lmul);
-}
-
 static bool opivv_check(DisasContext *s, arg_rmrr *a)
 {
     return require_rvv(s) &&
-- 
2.50.1
Re: [PATCH v7 1/2] target/riscv: Use tcg nodes for strided vector ld/st generation
Posted by Richard Henderson 2 days ago
On 9/4/25 11:13, Chao Liu wrote:
> +/*
> + * Check whether the i bit of the mask is 0 or 1.
> + *
> + * static inline int vext_elem_mask(void *v0, int index)
> + * {
> + *     int idx = index / 64;
> + *     int pos = index % 64;
> + *     return (((uint64_t *)v0)[idx] >> pos) & 1;
> + * }
> + *
> + * And
> + *
> + * if (vext_elem_mask(v0, i) != 0) {
> + *     goto label;
> + * }
> + */
> +static void gen_check_vext_elem_mask(TCGLabel *label, TCGv mask, TCGv mask_offs)
> +{
> +    TCGv mask_offs_64 = tcg_temp_new();
> +    TCGv mask_offs_rem = tcg_temp_new();
> +    TCGv mask_elem = tcg_temp_new();
> +
> +    tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
> +    tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
> +    tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);

Each and every time you cast a TCGv, you're doing something wrong.
There are a lot of them in this patch.

Your host pointer arithmetic should be using TCGv_ptr and tcg_gen_*_ptr().
This mask_elem should itself be TCGv_i64.

> +    tcg_gen_andi_tl(mask_elem, mask_elem, 1);
> +    tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), label);

This should be

     tcg_gen_brcond_i64(TCG_COND_TSTNE, mask_elem, tcg_constant_i64(1), label);


> +/*
> + * Simulate the strided load/store main loop:
> + *
> + * for (i = env->vstart; i < env->vl; env->vstart = ++i) {
> + *     k = 0;
> + *     while (k < nf) {
> + *         if (!vm && !vext_elem_mask(v0, i)) {
> + *             vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
> + *                               (i + k * max_elems + 1) * esz);
> + *             k++;
> + *             continue;
> + *         }
> + *         target_ulong addr = base + stride * i + (k << log2_esz);
> + *         ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
> + *         k++;
> + *     }
> + * }

The form of this loop causes you to do more reads for vext_elem_mask than necessary.

Better to test once outside of the loop over K:

     for (i in vl) {
         if (!vm && !vext_elem_mask(v0, i)) {
             for (k in nf) {
                 vd[i, k] = -1;
             }
         } else {
             for (k in nf) {
                 vd[i, k] = ld(addr);
             }
         }
     }

If vl_eq_vlmax, and VL is a multiple of 64, you can structure this loop like:

     i = 0;
     do {
         mask = v0[i / 64];
         do {
             if (mask & 1) {
                 ...
             }
             mask >>= 1;
         } while (++i & 63);
     } while (i < vl);

If VL is a smaller power of 2, you can load smaller units of mask to match.  Though beware 
of the big-endian host addressing fixup.

If VM, you should fuse I and K into a single loop over all elements.


r~
Re: [PATCH v7 1/2] target/riscv: Use tcg nodes for strided vector ld/st generation
Posted by Chao Liu an hour ago
On 9/4/2025 9:53 PM, Richard Henderson wrote:
> On 9/4/25 11:13, Chao Liu wrote:
>> +/*
>> + * Check whether the i bit of the mask is 0 or 1.
>> + *
>> + * static inline int vext_elem_mask(void *v0, int index)
>> + * {
>> + *     int idx = index / 64;
>> + *     int pos = index % 64;
>> + *     return (((uint64_t *)v0)[idx] >> pos) & 1;
>> + * }
>> + *
>> + * And
>> + *
>> + * if (vext_elem_mask(v0, i) != 0) {
>> + *     goto label;
>> + * }
>> + */
>> +static void gen_check_vext_elem_mask(TCGLabel *label, TCGv mask, TCGv mask_offs)
>> +{
>> +    TCGv mask_offs_64 = tcg_temp_new();
>> +    TCGv mask_offs_rem = tcg_temp_new();
>> +    TCGv mask_elem = tcg_temp_new();
>> +
>> +    tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
>> +    tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
>> +    tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
> 
> Each and every time you cast a TCGv, you're doing something wrong.
> There are a lot of them in this patch.
> 
> Your host pointer arithmetic should be using TCGv_ptr and tcg_gen_*_ptr().
> This mask_elem should itself be TCGv_i64.
> 

Thanks, I will include these changes in the next version. :)
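
Roughly along these lines (untested sketch; the tl/i64/ptr conversion helpers
I use below are what I assume is available, I still need to double-check them):

    static void gen_check_vext_elem_mask(TCGLabel *label, TCGv_ptr mask,
                                         TCGv mask_offs)
    {
        TCGv byte_ofs = tcg_temp_new();
        TCGv_i64 tmp = tcg_temp_new_i64();
        TCGv_i64 mask_elem = tcg_temp_new_i64();
        TCGv_ptr mask_word = tcg_temp_new_ptr();

        /* Load the 64-bit word starting at the byte that holds bit i. */
        tcg_gen_shri_tl(byte_ofs, mask_offs, 3);
        tcg_gen_extu_tl_i64(tmp, byte_ofs);
        tcg_gen_trunc_i64_ptr(mask_word, tmp);
        tcg_gen_add_ptr(mask_word, mask_word, mask);
        tcg_gen_ld_i64(mask_elem, mask_word, 0);

        /* Shift the bit down and branch to 'label' if it is set. */
        tcg_gen_andi_tl(byte_ofs, mask_offs, 7);
        tcg_gen_extu_tl_i64(tmp, byte_ofs);
        tcg_gen_shr_i64(mask_elem, mask_elem, tmp);
        tcg_gen_brcondi_i64(TCG_COND_TSTNE, mask_elem, 1, label);
    }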

>> +    tcg_gen_andi_tl(mask_elem, mask_elem, 1);
>> +    tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), label);
> 
> This should be
> 
>      tcg_gen_brcond_i64(TCG_COND_TSTNE, mask_elem, tcg_constant_i64(1), label);
> 
> 
>> +/*
>> + * Simulate the strided load/store main loop:
>> + *
>> + * for (i = env->vstart; i < env->vl; env->vstart = ++i) {
>> + *     k = 0;
>> + *     while (k < nf) {
>> + *         if (!vm && !vext_elem_mask(v0, i)) {
>> + *             vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
>> + *                               (i + k * max_elems + 1) * esz);
>> + *             k++;
>> + *             continue;
>> + *         }
>> + *         target_ulong addr = base + stride * i + (k << log2_esz);
>> + *         ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
>> + *         k++;
>> + *     }
>> + * }
> 
> The form of this loop causes you to do more reads for vext_elem_mask than necessary.
> 
> Better to test once outside of the loop over K:
> 
>      for (i in vl) {
>          if (!vm && !vext_elem_mask(v0, i)) {
>              for (k in nf) {
>                  vd[i, k] = -1;
>              }
>          } else {
>              for (k in nf) {
>                  vd[i, k] = ld(addr);
>              }
>          }
>      }
> 

Good idea, I've already made some attempts, and I will apply this suggestion in the next version.
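
The shape I'm converging on is roughly the following (helper-style C for
clarity; the TCG version would follow the same structure):

    for (i = env->vstart; i < env->vl; env->vstart = ++i) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* Inactive element: fill all nf segments with 1s (if vma). */
            for (k = 0; k < nf; k++) {
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
            }
        } else {
            /* Active element: one memory access per segment field. */
            for (k = 0; k < nf; k++) {
                target_ulong addr = base + stride * i + (k << log2_esz);
                ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            }
        }
    }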

> If vl_eq_vlmax, and VL is a multiple of 64, you can structure this loop like:
> 
>      i = 0;
>      do {
>          mask = v0[i / 64];
>          do {
>              if (mask & 1) {
>                  ...
>              }
>              mask >>= 1;
>          } while (++i & 63);
>      } while (i < vl);
> 
> If VL is a smaller power of 2, you can load smaller units of mask to match. 

Reducing the number of memory accesses is right; this is a great optimization point.

> Though beware of the big-endian host addressing fixup.

Got it.

> If VM, you should fuse I and K into a single loop over all elements.
> 

I have run some tests, for example:

     uint32_t total = env->vl * nf;
     for (uint32_t idx = 0; idx < total; idx++) {
         k = idx / env->vl;
         i = idx % env->vl;
         target_ulong addr = base + stride * i + (k << log2_esz);
         uint32_t elem_idx = i + k * max_elems;
         ldst_elem(env, adjust_addr(env, addr), elem_idx, vd, ra);
     }

If we implement it using TCG ops, would division and modulo operations be more efficient than adding an extra loop layer?

Or is there a better way to merge the loops for i and k into a single for loop?
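
E.g. keeping i and k as running counters would avoid the division entirely;
something like (untested):

    uint32_t i = 0, k = 0;
    for (uint32_t idx = 0; idx < total; idx++) {
        target_ulong addr = base + stride * i + (k << log2_esz);
        ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
        /* i is the fast-moving index; bump k whenever i wraps at vl. */
        if (++i == env->vl) {
            i = 0;
            k++;
        }
    }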

Also, I will run more tests to check that the changes above are correct and work well.


Thanks,
Chao