[PATCH v3 05/14] tcg/riscv: Implement vector load/store

Posted by LIU Zhiwei
From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>

Signed-off-by: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
Reviewed-by: Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
---
 tcg/riscv/tcg-target-con-set.h |   2 +
 tcg/riscv/tcg-target.c.inc     | 202 +++++++++++++++++++++++++++++++--
 2 files changed, 196 insertions(+), 8 deletions(-)

diff --git a/tcg/riscv/tcg-target-con-set.h b/tcg/riscv/tcg-target-con-set.h
index aac5ceee2b..d73a62b0f2 100644
--- a/tcg/riscv/tcg-target-con-set.h
+++ b/tcg/riscv/tcg-target-con-set.h
@@ -21,3 +21,5 @@ C_O1_I2(r, rZ, rZ)
 C_N1_I2(r, r, rM)
 C_O1_I4(r, r, rI, rM, rM)
 C_O2_I4(r, r, rZ, rZ, rM, rM)
+C_O0_I2(v, r)
+C_O1_I1(v, r)
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index df96d350a3..4b1079fc6f 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -174,6 +174,11 @@ static bool tcg_target_const_match(int64_t val, int ct,
 #define V_OPMVX (0x6 << 12)
 #define V_OPCFG (0x7 << 12)
 
+/* 0 <= NF <= 7 */
+#define V_NF(x) ((x) << 29)
+#define V_UNIT_STRIDE (0x0 << 20)
+#define V_UNIT_STRIDE_WHOLE_REG (0x8 << 20)
+
 typedef enum {
     VLMUL_M1 = 0, /* LMUL=1 */
     VLMUL_M2,     /* LMUL=2 */
@@ -285,6 +290,25 @@ typedef enum {
     OPC_VSETVLI  = 0x57 | V_OPCFG,
     OPC_VSETIVLI = 0xc0000057 | V_OPCFG,
     OPC_VSETVL   = 0x80000057 | V_OPCFG,
+
+    OPC_VLE8_V  = 0x7 | V_UNIT_STRIDE,
+    OPC_VLE16_V = 0x5007 | V_UNIT_STRIDE,
+    OPC_VLE32_V = 0x6007 | V_UNIT_STRIDE,
+    OPC_VLE64_V = 0x7007 | V_UNIT_STRIDE,
+    OPC_VSE8_V  = 0x27 | V_UNIT_STRIDE,
+    OPC_VSE16_V = 0x5027 | V_UNIT_STRIDE,
+    OPC_VSE32_V = 0x6027 | V_UNIT_STRIDE,
+    OPC_VSE64_V = 0x7027 | V_UNIT_STRIDE,
+
+    OPC_VL1RE64_V = 0x2007007 | V_UNIT_STRIDE_WHOLE_REG | V_NF(0),
+    OPC_VL2RE64_V = 0x2007007 | V_UNIT_STRIDE_WHOLE_REG | V_NF(1),
+    OPC_VL4RE64_V = 0x2007007 | V_UNIT_STRIDE_WHOLE_REG | V_NF(3),
+    OPC_VL8RE64_V = 0x2007007 | V_UNIT_STRIDE_WHOLE_REG | V_NF(7),
+
+    OPC_VS1R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(0),
+    OPC_VS2R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(1),
+    OPC_VS4R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(3),
+    OPC_VS8R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(7),
 } RISCVInsn;
 
 /*
@@ -646,6 +670,20 @@ static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
     }
 }
 
+static int riscv_set_vec_config_vl(TCGContext *s, TCGType type)
+{
+    int prev_vsew = s->riscv_host_vtype < 0 ? MO_8 :
+                    ((s->riscv_host_vtype >> 3) & 0x7);
+    tcg_target_set_vec_config(s, type, prev_vsew);
+    return prev_vsew;
+}
+
+static void riscv_set_vec_config_vl_vece(TCGContext *s, TCGType type,
+                                         unsigned vece)
+{
+    tcg_target_set_vec_config(s, type, vece);
+}
+
 /*
  * TCG intrinsics
  */
@@ -811,31 +849,52 @@ static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg)
     tcg_out_ext32s(s, ret, arg);
 }
 
-static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
-                         TCGReg addr, intptr_t offset)
+static intptr_t split_offset_scalar(TCGContext *s, TCGReg *addr,
+                                    intptr_t offset)
 {
     intptr_t imm12 = sextreg(offset, 0, 12);
 
     if (offset != imm12) {
         intptr_t diff = tcg_pcrel_diff(s, (void *)offset);
 
-        if (addr == TCG_REG_ZERO && diff == (int32_t)diff) {
+        if (*addr == TCG_REG_ZERO && diff == (int32_t)diff) {
             imm12 = sextreg(diff, 0, 12);
             tcg_out_opc_upper(s, OPC_AUIPC, TCG_REG_TMP2, diff - imm12);
         } else {
             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP2, offset - imm12);
-            if (addr != TCG_REG_ZERO) {
-                tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, addr);
+            if (*addr != TCG_REG_ZERO) {
+                tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, *addr);
             }
         }
-        addr = TCG_REG_TMP2;
+        *addr = TCG_REG_TMP2;
+    }
+    return imm12;
+}
+
+static void split_offset_vector(TCGContext *s, TCGReg *addr, intptr_t offset)
+{
+    if (offset != 0) {
+        if (offset == sextreg(offset, 0, 12)) {
+            tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, *addr, offset);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP0, offset);
+            tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP0, *addr);
+        }
+        *addr = TCG_REG_TMP0;
     }
+}
+
+static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
+                         TCGReg addr, intptr_t offset)
+{
+    intptr_t imm12;
 
     switch (opc) {
     case OPC_SB:
     case OPC_SH:
     case OPC_SW:
     case OPC_SD:
+        imm12 = split_offset_scalar(s, &addr, offset);
         tcg_out_opc_store(s, opc, addr, data, imm12);
         break;
     case OPC_LB:
@@ -845,8 +904,31 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
     case OPC_LW:
     case OPC_LWU:
     case OPC_LD:
+        imm12 = split_offset_scalar(s, &addr, offset);
         tcg_out_opc_imm(s, opc, data, addr, imm12);
         break;
+    case OPC_VSE8_V:
+    case OPC_VSE16_V:
+    case OPC_VSE32_V:
+    case OPC_VSE64_V:
+    case OPC_VS1R_V:
+    case OPC_VS2R_V:
+    case OPC_VS4R_V:
+    case OPC_VS8R_V:
+        split_offset_vector(s, &addr, offset);
+        tcg_out_opc_ldst_vec(s, opc, data, addr, true);
+        break;
+    case OPC_VLE8_V:
+    case OPC_VLE16_V:
+    case OPC_VLE32_V:
+    case OPC_VLE64_V:
+    case OPC_VL1RE64_V:
+    case OPC_VL2RE64_V:
+    case OPC_VL4RE64_V:
+    case OPC_VL8RE64_V:
+        split_offset_vector(s, &addr, offset);
+        tcg_out_opc_ldst_vec(s, opc, data, addr, true);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -855,14 +937,101 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
                        TCGReg arg1, intptr_t arg2)
 {
-    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
+    RISCVInsn insn;
+
+    if (type < TCG_TYPE_V64) {
+        insn = (type == TCG_TYPE_I32) ? OPC_LW : OPC_LD;
+    } else {
+        int nf = get_vec_type_bytes(type) / riscv_vlenb;
+
+        switch (nf) {
+        case 1:
+            insn = OPC_VL1RE64_V;
+            break;
+        case 2:
+            insn = OPC_VL2RE64_V;
+            break;
+        case 4:
+            insn = OPC_VL4RE64_V;
+            break;
+        case 8:
+            insn = OPC_VL8RE64_V;
+            break;
+        default:
+            {
+                int prev_vsew = riscv_set_vec_config_vl(s, type);
+
+                switch (prev_vsew) {
+                case MO_8:
+                    insn = OPC_VLE8_V;
+                    break;
+                case MO_16:
+                    insn = OPC_VLE16_V;
+                    break;
+                case MO_32:
+                    insn = OPC_VLE32_V;
+                    break;
+                case MO_64:
+                    insn = OPC_VLE64_V;
+                    break;
+                default:
+                    g_assert_not_reached();
+                }
+            }
+            break;
+        }
+    }
     tcg_out_ldst(s, insn, arg, arg1, arg2);
 }
 
 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                        TCGReg arg1, intptr_t arg2)
 {
-    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_SW : OPC_SD;
+    RISCVInsn insn;
+
+    if (type < TCG_TYPE_V64) {
+        insn = (type == TCG_TYPE_I32) ? OPC_SW : OPC_SD;
+        /* The store itself is emitted by the common tcg_out_ldst() below. */
+    } else {
+        int nf = get_vec_type_bytes(type) / riscv_vlenb;
+
+        switch (nf) {
+        case 1:
+            insn = OPC_VS1R_V;
+            break;
+        case 2:
+            insn = OPC_VS2R_V;
+            break;
+        case 4:
+            insn = OPC_VS4R_V;
+            break;
+        case 8:
+            insn = OPC_VS8R_V;
+            break;
+        default:
+            {
+                int prev_vsew = riscv_set_vec_config_vl(s, type);
+
+                switch (prev_vsew) {
+                case MO_8:
+                    insn = OPC_VSE8_V;
+                    break;
+                case MO_16:
+                    insn = OPC_VSE16_V;
+                    break;
+                case MO_32:
+                    insn = OPC_VSE32_V;
+                    break;
+                case MO_64:
+                    insn = OPC_VSE64_V;
+                    break;
+                default:
+                    g_assert_not_reached();
+                }
+            }
+            break;
+        }
+    }
     tcg_out_ldst(s, insn, arg, arg1, arg2);
 }
 
@@ -2057,7 +2226,20 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                            const TCGArg args[TCG_MAX_OP_ARGS],
                            const int const_args[TCG_MAX_OP_ARGS])
 {
+    TCGType type = vecl + TCG_TYPE_V64;
+    TCGArg a0, a1, a2;
+
+    a0 = args[0];
+    a1 = args[1];
+    a2 = args[2];
+
     switch (opc) {
+    case INDEX_op_ld_vec:
+        tcg_out_ld(s, type, a0, a1, a2);
+        break;
+    case INDEX_op_st_vec:
+        tcg_out_st(s, type, a0, a1, a2);
+        break;
     case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov.  */
     case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec.  */
     default:
@@ -2221,6 +2403,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_st_a64_i64:
         return C_O0_I2(rZ, r);
 
+    case INDEX_op_st_vec:
+        return C_O0_I2(v, r);
+    case INDEX_op_ld_vec:
+        return C_O1_I1(v, r);
     default:
         g_assert_not_reached();
     }
-- 
2.43.0
Re: [PATCH v3 05/14] tcg/riscv: Implement vector load/store
Posted by Richard Henderson
On 9/4/24 07:27, LIU Zhiwei wrote:
> @@ -811,31 +849,52 @@ static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg)
>       tcg_out_ext32s(s, ret, arg);
>   }
>   
> -static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
> -                         TCGReg addr, intptr_t offset)
> +static intptr_t split_offset_scalar(TCGContext *s, TCGReg *addr,
> +                                    intptr_t offset)
>   {
>       intptr_t imm12 = sextreg(offset, 0, 12);
>   
>       if (offset != imm12) {
>           intptr_t diff = tcg_pcrel_diff(s, (void *)offset);
>   
> -        if (addr == TCG_REG_ZERO && diff == (int32_t)diff) {
> +        if (*addr == TCG_REG_ZERO && diff == (int32_t)diff) {
>               imm12 = sextreg(diff, 0, 12);
>               tcg_out_opc_upper(s, OPC_AUIPC, TCG_REG_TMP2, diff - imm12);
>           } else {
>               tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP2, offset - imm12);
> -            if (addr != TCG_REG_ZERO) {
> -                tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, addr);
> +            if (*addr != TCG_REG_ZERO) {
> +                tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, *addr);
>               }
>           }
> -        addr = TCG_REG_TMP2;
> +        *addr = TCG_REG_TMP2;
> +    }
> +    return imm12;
> +}
> +
> +static void split_offset_vector(TCGContext *s, TCGReg *addr, intptr_t offset)
> +{
> +    if (offset != 0) {
> +        if (offset == sextreg(offset, 0, 12)) {
> +            tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, *addr, offset);
> +        } else {
> +            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP0, offset);
> +            tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP0, *addr);
> +        }
> +        *addr = TCG_REG_TMP0;
>       }
> +}
> +
> +static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
> +                         TCGReg addr, intptr_t offset)
> +{
> +    intptr_t imm12;
>   
>       switch (opc) {
>       case OPC_SB:
>       case OPC_SH:
>       case OPC_SW:
>       case OPC_SD:
> +        imm12 = split_offset_scalar(s, &addr, offset);
>           tcg_out_opc_store(s, opc, addr, data, imm12);
>           break;
>       case OPC_LB:
> @@ -845,8 +904,31 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
>       case OPC_LW:
>       case OPC_LWU:
>       case OPC_LD:
> +        imm12 = split_offset_scalar(s, &addr, offset);
>           tcg_out_opc_imm(s, opc, data, addr, imm12);
>           break;
> +    case OPC_VSE8_V:
> +    case OPC_VSE16_V:
> +    case OPC_VSE32_V:
> +    case OPC_VSE64_V:
> +    case OPC_VS1R_V:
> +    case OPC_VS2R_V:
> +    case OPC_VS4R_V:
> +    case OPC_VS8R_V:
> +        split_offset_vector(s, &addr, offset);
> +        tcg_out_opc_ldst_vec(s, opc, data, addr, true);
> +        break;
> +    case OPC_VLE8_V:
> +    case OPC_VLE16_V:
> +    case OPC_VLE32_V:
> +    case OPC_VLE64_V:
> +    case OPC_VL1RE64_V:
> +    case OPC_VL2RE64_V:
> +    case OPC_VL4RE64_V:
> +    case OPC_VL8RE64_V:
> +        split_offset_vector(s, &addr, offset);
> +        tcg_out_opc_ldst_vec(s, opc, data, addr, true);
> +        break;
>       default:
>           g_assert_not_reached();
>       }

This is more complicated than it needs to be, calling a combined function, then using a 
switch to separate, then calling separate functions.  Calling separate functions in the 
first place is simpler.  E.g.

static void tcg_out_vec_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
                              TCGReg addr, intptr_t offset)
{
     tcg_debug_assert(data >= TCG_REG_V0);
     tcg_debug_assert(addr < TCG_REG_V0);

     if (offset) {
         tcg_debug_assert(addr != TCG_REG_ZERO);
         if (offset == sextreg(offset, 0, 12)) {
             tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, addr, offset);
         } else {
             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP0, offset);
             tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP0, addr);
         }
         addr = TCG_REG_TMP0;
     }

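     /* vd/vs3 goes in bits 11:7, the base register rs1 in bits 19:15, and
        bit 25 is vm=1 (unmasked); the remaining fields (width, lumop/sumop,
        mop, mew, nf) are carried in opc. */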
     tcg_out32(s, opc | ((data & 0x1f) << 7) | (addr << 15) | (1 << 25));
}

>   static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
>                          TCGReg arg1, intptr_t arg2)
>   {
> -    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
> +    RISCVInsn insn;
> +
> +    if (type < TCG_TYPE_V64) {
> +        insn = (type == TCG_TYPE_I32) ? OPC_LW : OPC_LD;
> +    } else {
> +        int nf = get_vec_type_bytes(type) / riscv_vlenb;
> +
> +        switch (nf) {
> +        case 1:
> +            insn = OPC_VL1RE64_V;
> +            break;
> +        case 2:
> +            insn = OPC_VL2RE64_V;
> +            break;
> +        case 4:
> +            insn = OPC_VL4RE64_V;
> +            break;
> +        case 8:
> +            insn = OPC_VL8RE64_V;
> +            break;
> +        default:
> +            {
> +                int prev_vsew = riscv_set_vec_config_vl(s, type);
> +
> +                switch (prev_vsew) {
> +                case MO_8:
> +                    insn = OPC_VLE8_V;
> +                    break;
> +                case MO_16:
> +                    insn = OPC_VLE16_V;
> +                    break;
> +                case MO_32:
> +                    insn = OPC_VLE32_V;
> +                    break;
> +                case MO_64:
> +                    insn = OPC_VLE64_V;
> +                    break;
> +                default:
> +                    g_assert_not_reached();
> +                }
> +            }
> +            break;

This can be simplified:

     switch (type) {
     case TCG_TYPE_I32:
         tcg_out_ldst(s, OPC_LW, data, base, offset);
         break;
     case TCG_TYPE_I64:
         tcg_out_ldst(s, OPC_LD, data, base, offset);
         break;
     case TCG_TYPE_V64:
     case TCG_TYPE_V128:
     case TCG_TYPE_V256:
         if (type >= riscv_lg2_vlenb) {
             static const RISCVInsn whole_reg_ld[] = {
                 OPC_VL1RE64_V, OPC_VL2RE64_V, OPC_VL4RE64_V, OPC_VL8RE64_V
             };
             unsigned idx = type - riscv_lg2_vlenb;
             insn = whole_reg_ld[idx];
         } else {
             static const RISCVInsn unit_stride_ld[] = {
                 OPC_VLE8_V, OPC_VLE16_V, OPC_VLE32_V, OPC_VLE64_V
             };
             MemOp prev_vsew = set_vtype_len(s, type);
             insn = unit_stride_ld[prev_vsew];
         }
         tcg_out_vec_ldst(s, insn, data, base, offset);
         break;
     default:
         g_assert_not_reached();
     }

and similar for store.
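
Something like this, as a rough sketch under the same assumptions (the
set_vtype_len(), riscv_lg2_vlenb and tcg_out_vec_ldst() names from the load
sketch above; whole_reg_st/unit_stride_st are just illustrative table names):

     case TCG_TYPE_V64:
     case TCG_TYPE_V128:
     case TCG_TYPE_V256:
         if (type >= riscv_lg2_vlenb) {
             static const RISCVInsn whole_reg_st[] = {
                 OPC_VS1R_V, OPC_VS2R_V, OPC_VS4R_V, OPC_VS8R_V
             };
             unsigned idx = type - riscv_lg2_vlenb;
             insn = whole_reg_st[idx];
         } else {
             static const RISCVInsn unit_stride_st[] = {
                 OPC_VSE8_V, OPC_VSE16_V, OPC_VSE32_V, OPC_VSE64_V
             };
             MemOp prev_vsew = set_vtype_len(s, type);
             insn = unit_stride_st[prev_vsew];
         }
         tcg_out_vec_ldst(s, insn, data, base, offset);
         break;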


r~
Re: [PATCH v3 05/14] tcg/riscv: Implement vector load/store
Posted by LIU Zhiwei
On 2024/9/5 14:39, Richard Henderson wrote:
> [...]
> and similar for store.

Great. We will take this approach.

Zhiwei
