From: LIU Zhiwei <zhiwei_liu@c-sky.com>
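
This patch adds emulation of the vector integer add/subtract instructions:
single-width add/subtract (vadd, vsub, vrsub), widening add/subtract in both
the .vv/.vx and .wv/.wx forms (vwaddu, vwadd, vwsubu, vwsub), and the
add-with-carry/subtract-with-borrow group (vadc, vmadc, vsbc, vmsbc), which
takes its carry/borrow input from mask register v0 and, for vmadc/vmsbc,
writes the carry/borrow out as a mask. Decode entries, translation stubs and
helpers are provided for each variant. The carry/mask bit for element i lives
at bit i * MLEN of the mask register, with MLEN = SEW / LMUL. A few 64x64
multiply-high helpers are also introduced here, apparently for use by later
patches in the series.
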
Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
target/riscv/helper.h | 36 +
target/riscv/insn32.decode | 35 +
target/riscv/insn_trans/trans_rvv.inc.c | 49 +
target/riscv/vector_helper.c | 2335 +++++++++++++++++++++++++++++++
4 files changed, 2455 insertions(+)
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index c107925..31e20dc 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -121,6 +121,7 @@ DEF_HELPER_6(vector_vsuxb_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vsuxh_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vsuxw_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vsuxe_v, void, env, i32, i32, i32, i32, i32)
+
DEF_HELPER_6(vector_vamoswapw_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vamoswapd_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vamoaddw_v, void, env, i32, i32, i32, i32, i32)
@@ -139,5 +140,40 @@ DEF_HELPER_6(vector_vamominuw_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vamominud_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vamomaxuw_v, void, env, i32, i32, i32, i32, i32)
DEF_HELPER_6(vector_vamomaxud_v, void, env, i32, i32, i32, i32, i32)
+
+DEF_HELPER_4(vector_vadc_vvm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vadc_vxm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vadc_vim, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vmadc_vvm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vmadc_vxm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vmadc_vim, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vsbc_vvm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vsbc_vxm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vmsbc_vvm, void, env, i32, i32, i32)
+DEF_HELPER_4(vector_vmsbc_vxm, void, env, i32, i32, i32)
+DEF_HELPER_5(vector_vadd_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vadd_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vadd_vi, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vsub_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vsub_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vrsub_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vrsub_vi, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwaddu_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwaddu_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwadd_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwadd_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsubu_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsubu_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsub_vv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsub_vx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwaddu_wv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwaddu_wx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwadd_wv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwadd_wx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsubu_wv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsubu_wx, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsub_wv, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(vector_vwsub_wx, void, env, i32, i32, i32, i32)
+
DEF_HELPER_4(vector_vsetvli, void, env, i32, i32, i32)
DEF_HELPER_4(vector_vsetvl, void, env, i32, i32, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 48e7661..fc7e498 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -63,6 +63,7 @@
@r_rm ....... ..... ..... ... ..... ....... %rs2 %rs1 %rm %rd
@r2_rm ....... ..... ..... ... ..... ....... %rs1 %rm %rd
@r2 ....... ..... ..... ... ..... ....... %rs1 %rd
+@r_vm ...... vm:1 ..... ..... ... ..... ....... %rs2 %rs1 %rd
@r_wdvm ..... wd:1 vm:1 ..... ..... ... ..... ....... %rs2 %rs1 %rd
@r_nfvm nf:3 ... vm:1 ..... ..... ... ..... ....... %rs2 %rs1 %rd
@r2_nfvm nf:3 ... vm:1 ..... ..... ... ..... ....... %rs1 %rd
@@ -280,5 +281,39 @@ vamomaxuw_v 11100 . . ..... ..... 110 ..... 0101111 @r_wdvm
vamomaxud_v 11100 . . ..... ..... 111 ..... 0101111 @r_wdvm
#*** new major opcode OP-V ***
+vadd_vv 000000 . ..... ..... 000 ..... 1010111 @r_vm
+vadd_vx 000000 . ..... ..... 100 ..... 1010111 @r_vm
+vadd_vi 000000 . ..... ..... 011 ..... 1010111 @r_vm
+vsub_vv 000010 . ..... ..... 000 ..... 1010111 @r_vm
+vsub_vx 000010 . ..... ..... 100 ..... 1010111 @r_vm
+vrsub_vx 000011 . ..... ..... 100 ..... 1010111 @r_vm
+vrsub_vi 000011 . ..... ..... 011 ..... 1010111 @r_vm
+vwaddu_vv 110000 . ..... ..... 010 ..... 1010111 @r_vm
+vwaddu_vx 110000 . ..... ..... 110 ..... 1010111 @r_vm
+vwadd_vv 110001 . ..... ..... 010 ..... 1010111 @r_vm
+vwadd_vx 110001 . ..... ..... 110 ..... 1010111 @r_vm
+vwsubu_vv 110010 . ..... ..... 010 ..... 1010111 @r_vm
+vwsubu_vx 110010 . ..... ..... 110 ..... 1010111 @r_vm
+vwsub_vv 110011 . ..... ..... 010 ..... 1010111 @r_vm
+vwsub_vx 110011 . ..... ..... 110 ..... 1010111 @r_vm
+vwaddu_wv 110100 . ..... ..... 010 ..... 1010111 @r_vm
+vwaddu_wx 110100 . ..... ..... 110 ..... 1010111 @r_vm
+vwadd_wv 110101 . ..... ..... 010 ..... 1010111 @r_vm
+vwadd_wx 110101 . ..... ..... 110 ..... 1010111 @r_vm
+vwsubu_wv 110110 . ..... ..... 010 ..... 1010111 @r_vm
+vwsubu_wx 110110 . ..... ..... 110 ..... 1010111 @r_vm
+vwsub_wv 110111 . ..... ..... 010 ..... 1010111 @r_vm
+vwsub_wx 110111 . ..... ..... 110 ..... 1010111 @r_vm
+vadc_vvm 010000 1 ..... ..... 000 ..... 1010111 @r
+vadc_vxm 010000 1 ..... ..... 100 ..... 1010111 @r
+vadc_vim 010000 1 ..... ..... 011 ..... 1010111 @r
+vmadc_vvm 010001 1 ..... ..... 000 ..... 1010111 @r
+vmadc_vxm 010001 1 ..... ..... 100 ..... 1010111 @r
+vmadc_vim 010001 1 ..... ..... 011 ..... 1010111 @r
+vsbc_vvm 010010 1 ..... ..... 000 ..... 1010111 @r
+vsbc_vxm 010010 1 ..... ..... 100 ..... 1010111 @r
+vmsbc_vvm 010011 1 ..... ..... 000 ..... 1010111 @r
+vmsbc_vxm 010011 1 ..... ..... 100 ..... 1010111 @r
+
vsetvli 0 ........... ..... 111 ..... 1010111 @r2_zimm
vsetvl 1000000 ..... ..... 111 ..... 1010111 @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index 7bda378..a1c1960 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -77,6 +77,21 @@ static bool trans_##INSN(DisasContext *ctx, arg_##INSN * a) \
return true; \
}
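+/* Generate trans_<insn> for OP-V instructions that carry a vm (mask) field. */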
+#define GEN_VECTOR_R_VM(INSN) \
+static bool trans_##INSN(DisasContext *ctx, arg_##INSN * a) \
+{ \
+ TCGv_i32 s1 = tcg_const_i32(a->rs1); \
+ TCGv_i32 s2 = tcg_const_i32(a->rs2); \
+ TCGv_i32 d = tcg_const_i32(a->rd); \
+ TCGv_i32 vm = tcg_const_i32(a->vm); \
+ gen_helper_vector_##INSN(cpu_env, vm, s1, s2, d); \
+ tcg_temp_free_i32(s1); \
+ tcg_temp_free_i32(s2); \
+ tcg_temp_free_i32(d); \
+ tcg_temp_free_i32(vm); \
+ return true; \
+}
+
#define GEN_VECTOR_R2_ZIMM(INSN) \
static bool trans_##INSN(DisasContext *ctx, arg_##INSN * a) \
{ \
@@ -155,5 +170,39 @@ GEN_VECTOR_R_WDVM(vamominud_v)
GEN_VECTOR_R_WDVM(vamomaxuw_v)
GEN_VECTOR_R_WDVM(vamomaxud_v)
+GEN_VECTOR_R(vadc_vvm)
+GEN_VECTOR_R(vadc_vxm)
+GEN_VECTOR_R(vadc_vim)
+GEN_VECTOR_R(vmadc_vvm)
+GEN_VECTOR_R(vmadc_vxm)
+GEN_VECTOR_R(vmadc_vim)
+GEN_VECTOR_R(vsbc_vvm)
+GEN_VECTOR_R(vsbc_vxm)
+GEN_VECTOR_R(vmsbc_vvm)
+GEN_VECTOR_R(vmsbc_vxm)
+GEN_VECTOR_R_VM(vadd_vv)
+GEN_VECTOR_R_VM(vadd_vx)
+GEN_VECTOR_R_VM(vadd_vi)
+GEN_VECTOR_R_VM(vsub_vv)
+GEN_VECTOR_R_VM(vsub_vx)
+GEN_VECTOR_R_VM(vrsub_vx)
+GEN_VECTOR_R_VM(vrsub_vi)
+GEN_VECTOR_R_VM(vwaddu_vv)
+GEN_VECTOR_R_VM(vwaddu_vx)
+GEN_VECTOR_R_VM(vwadd_vv)
+GEN_VECTOR_R_VM(vwadd_vx)
+GEN_VECTOR_R_VM(vwsubu_vv)
+GEN_VECTOR_R_VM(vwsubu_vx)
+GEN_VECTOR_R_VM(vwsub_vv)
+GEN_VECTOR_R_VM(vwsub_vx)
+GEN_VECTOR_R_VM(vwaddu_wv)
+GEN_VECTOR_R_VM(vwaddu_wx)
+GEN_VECTOR_R_VM(vwadd_wv)
+GEN_VECTOR_R_VM(vwadd_wx)
+GEN_VECTOR_R_VM(vwsubu_wv)
+GEN_VECTOR_R_VM(vwsubu_wx)
+GEN_VECTOR_R_VM(vwsub_wv)
+GEN_VECTOR_R_VM(vwsub_wx)
+
GEN_VECTOR_R2_ZIMM(vsetvli)
GEN_VECTOR_R(vsetvl)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 9ebf70d..95336c9 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -24,12 +24,21 @@
#include <math.h>
#define VECTOR_HELPER(name) HELPER(glue(vector_, name))
+#define SIGNBIT8 (1 << 7)
+#define SIGNBIT16 (1 << 15)
+#define SIGNBIT32 (1 << 31)
+#define SIGNBIT64 ((uint64_t)1 << 63)
static int64_t sign_extend(int64_t a, int8_t width)
{
return a << (64 - width) >> (64 - width);
}
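+/* Sign-extend a GPR value to 64 bits (a no-op when target_ulong is 64-bit). */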
+static int64_t extend_gpr(target_ulong reg)
+{
+ return sign_extend(reg, sizeof(target_ulong) * 8);
+}
+
static target_ulong vector_get_index(CPURISCVState *env, int rs1, int rs2,
int index, int mem, int width, int nf)
{
@@ -118,6 +127,39 @@ static inline bool vector_overlap_vm_common(int lmul, int vm, int rd)
return false;
}
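+/* The destination may not be v0 when the operation is masked (vm == 0). */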
+static inline bool vector_overlap_vm_force(int vm, int rd)
+{
+ if (vm == 0 && rd == 0) {
+ return true;
+ }
+ return false;
+}
+
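+/* For carry/borrow ops, the destination may only overlap v0 when LMUL == 1. */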
+static inline bool vector_overlap_carry(int lmul, int rd)
+{
+ if (lmul > 1 && rd == 0) {
+ return true;
+ }
+ return false;
+}
+
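+/* Check whether register groups [rd, rd + dlen) and [rs, rs + slen) overlap. */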
+static inline bool vector_overlap_dstgp_srcgp(int rd, int dlen, int rs,
+ int slen)
+{
+ if ((rd >= rs && rd < rs + slen) || (rs >= rd && rs < rd + dlen)) {
+ return true;
+ }
+ return false;
+}
+
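+/* Compute byte index and bit position of mask element 'index' (MLEN = SEW / LMUL). */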
+static inline void vector_get_layout(CPURISCVState *env, int width, int lmul,
+ int index, int *idx, int *pos)
+{
+ int mlen = width / lmul;
+ *idx = (index * mlen) / 8;
+ *pos = (index * mlen) % 8;
+}
+
static bool vector_lmul_check_reg(CPURISCVState *env, uint32_t lmul,
uint32_t reg, bool widen)
{
@@ -185,6 +227,173 @@ static void vector_tail_segment(CPURISCVState *env, int vreg, int index,
}
}
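+/* Clear one tail element of SEW bits in vector register 'vreg'. */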
+static void vector_tail_common(CPURISCVState *env, int vreg, int index,
+ int width)
+{
+ switch (width) {
+ case 8:
+ env->vfp.vreg[vreg].u8[index] = 0;
+ break;
+ case 16:
+ env->vfp.vreg[vreg].u16[index] = 0;
+ break;
+ case 32:
+ env->vfp.vreg[vreg].u32[index] = 0;
+ break;
+ case 64:
+ env->vfp.vreg[vreg].u64[index] = 0;
+ break;
+ default:
+ helper_raise_exception(env, RISCV_EXCP_ILLEGAL_INST);
+ return;
+ }
+}
+
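+/* Clear one tail element of the double-SEW widened destination. */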
+static void vector_tail_widen(CPURISCVState *env, int vreg, int index,
+ int width)
+{
+ switch (width) {
+ case 8:
+ env->vfp.vreg[vreg].u16[index] = 0;
+ break;
+ case 16:
+ env->vfp.vreg[vreg].u32[index] = 0;
+ break;
+ case 32:
+ env->vfp.vreg[vreg].u64[index] = 0;
+ break;
+ default:
+ helper_raise_exception(env, RISCV_EXCP_ILLEGAL_INST);
+ return;
+ }
+}
+
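+/* Read the carry-in bit for element 'index' from mask register v0. */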
+static inline int vector_get_carry(CPURISCVState *env, int width, int lmul,
+ int index)
+{
+ int mlen = width / lmul;
+ int idx = (index * mlen) / 8;
+ int pos = (index * mlen) % 8;
+
+ return (env->vfp.vreg[0].u8[idx] >> pos) & 0x1;
+}
+
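+/* Write 'result' into the mask field of element 'index' in register 'reg'. */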
+static inline void vector_mask_result(CPURISCVState *env, uint32_t reg,
+ int width, int lmul, int index, uint32_t result)
+{
+ int mlen = width / lmul;
+ int idx = (index * mlen) / width;
+ int pos = (index * mlen) % width;
+ uint64_t mask = ~((((uint64_t)1 << mlen) - 1) << pos);
+
+ switch (width) {
+ case 8:
+ env->vfp.vreg[reg].u8[idx] = (env->vfp.vreg[reg].u8[idx] & mask)
+ | (result << pos);
+ break;
+ case 16:
+ env->vfp.vreg[reg].u16[idx] = (env->vfp.vreg[reg].u16[idx] & mask)
+ | (result << pos);
+ break;
+ case 32:
+ env->vfp.vreg[reg].u32[idx] = (env->vfp.vreg[reg].u32[idx] & mask)
+ | (result << pos);
+ break;
+ case 64:
+ env->vfp.vreg[reg].u64[idx] = (env->vfp.vreg[reg].u64[idx] & mask)
+ | ((uint64_t)result << pos);
+ break;
+ default:
+ helper_raise_exception(env, RISCV_EXCP_ILLEGAL_INST);
+ break;
+ }
+
+ return;
+}
+
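+/* Return the high 64 bits of the unsigned 128-bit product a * b. */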
+static inline uint64_t u64xu64_lh(uint64_t a, uint64_t b)
+{
+ uint64_t hi_64, carry;
+
+ /* first get the whole product in {hi_64, lo_64} */
+ uint64_t a_hi = a >> 32;
+ uint64_t a_lo = (uint32_t)a;
+ uint64_t b_hi = b >> 32;
+ uint64_t b_lo = (uint32_t)b;
+
+ /*
+ * a * b = ((a_hi << 32) + a_lo) * ((b_hi << 32) + b_lo)
+ *       = ((a_hi * b_hi) << 64) + ((a_hi * b_lo) << 32)
+ *         + ((a_lo * b_hi) << 32) + a_lo * b_lo
+ *       = {hi_64, lo_64}
+ * hi_64 = a_hi * b_hi + ((a_hi * b_lo) >> 32) + ((a_lo * b_hi) >> 32) + carry
+ * carry = ((uint64_t)(uint32_t)(a_hi * b_lo) +
+ *         (uint64_t)(uint32_t)(a_lo * b_hi) + ((a_lo * b_lo) >> 32)) >> 32
+ */
+
+ carry = ((uint64_t)(uint32_t)(a_hi * b_lo) +
+ (uint64_t)(uint32_t)(a_lo * b_hi) +
+ ((a_lo * b_lo) >> 32)) >> 32;
+
+ hi_64 = a_hi * b_hi +
+ ((a_hi * b_lo) >> 32) + ((a_lo * b_hi) >> 32) +
+ carry;
+
+ return hi_64;
+}
+
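+/* Return the high 64 bits of the 128-bit product of signed a and unsigned b. */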
+static inline int64_t s64xu64_lh(int64_t a, uint64_t b)
+{
+ uint64_t abs_a = a;
+ uint64_t lo_64, hi_64;
+
+ if (a < 0) {
+ abs_a = ~a + 1;
+ }
+ lo_64 = abs_a * b;
+ hi_64 = u64xu64_lh(abs_a, b);
+
+ /* the 128-bit product is negative only when the signed operand is negative */
+ if (a < 0) {
+ lo_64 = ~lo_64;
+ hi_64 = ~hi_64;
+ if (lo_64 == UINT64_MAX) {
+ lo_64 = 0;
+ hi_64 += 1;
+ } else {
+ lo_64 += 1;
+ }
+ }
+ return hi_64;
+}
+
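+/* Return the high 64 bits of the signed 128-bit product a * b. */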
+static inline int64_t s64xs64_lh(int64_t a, int64_t b)
+{
+ uint64_t abs_a = a, abs_b = b;
+ uint64_t lo_64, hi_64;
+
+ if (a < 0) {
+ abs_a = ~a + 1;
+ }
+ if (b < 0) {
+ abs_b = ~b + 1;
+ }
+ lo_64 = abs_a * abs_b;
+ hi_64 = u64xu64_lh(abs_a, abs_b);
+
+ if ((a ^ b) & SIGNBIT64) {
+ lo_64 = ~lo_64;
+ hi_64 = ~hi_64;
+ if (lo_64 == UINT64_MAX) {
+ lo_64 = 0;
+ hi_64 += 1;
+ } else {
+ lo_64 += 1;
+ }
+ }
+ return hi_64;
+}
+
void VECTOR_HELPER(vsetvl)(CPURISCVState *env, uint32_t rs1, uint32_t rs2,
uint32_t rd)
{
@@ -4796,3 +5005,2129 @@ void VECTOR_HELPER(vamomaxud_v)(CPURISCVState *env, uint32_t wd, uint32_t vm,
env->vfp.vstart = 0;
}
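+/* vadc.vvm: vd[i] = vs2[i] + vs1[i] + carry-in from v0 */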
+void VECTOR_HELPER(vadc_vvm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax, carry;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src1].u8[j]
+ + env->vfp.vreg[src2].u8[j] + carry;
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src1].u16[j]
+ + env->vfp.vreg[src2].u16[j] + carry;
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src1].u32[j]
+ + env->vfp.vreg[src2].u32[j] + carry;
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src1].u64[j]
+ + env->vfp.vreg[src2].u64[j] + carry;
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vadc_vxm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax, carry;
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u8[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u8[j] + carry;
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u16[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u16[j] + carry;
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u32[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u32[j] + carry;
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u64[j] = (uint64_t)extend_gpr(env->gpr[rs1])
+ + env->vfp.vreg[src2].u64[j] + carry;
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vadc_vim)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax, carry;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u8[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u8[j] + carry;
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u16[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u16[j] + carry;
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u32[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u32[j] + carry;
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u64[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u64[j] + carry;
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
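+/* vmadc.vvm: vd.mask[i] = carry out of (vs2[i] + vs1[i] + carry-in from v0) */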
+void VECTOR_HELPER(vmadc_vvm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, vlmax, carry;
+ uint64_t tmp;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs2, lmul)
+ || (rd == 0)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src1].u8[j]
+ + env->vfp.vreg[src2].u8[j] + carry;
+ tmp = tmp >> width;
+
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src1].u16[j]
+ + env->vfp.vreg[src2].u16[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)env->vfp.vreg[src1].u32[j]
+ + (uint64_t)env->vfp.vreg[src2].u32[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src1].u64[j]
+ + env->vfp.vreg[src2].u64[j] + carry;
+
+ if ((tmp < env->vfp.vreg[src1].u64[j] ||
+ tmp < env->vfp.vreg[src2].u64[j])
+ || (env->vfp.vreg[src1].u64[j] == UINT64_MAX &&
+ env->vfp.vreg[src2].u64[j] == UINT64_MAX)) {
+ tmp = 1;
+ } else {
+ tmp = 0;
+ }
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ if (width <= 64) {
+ vector_mask_result(env, rd, width, lmul, i, 0);
+ } else {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ }
+ }
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vmadc_vxm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, vlmax, carry;
+ uint64_t tmp, extend_rs1;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs2, lmul)
+ || (rd == 0)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint8_t)env->gpr[rs1]
+ + env->vfp.vreg[src2].u8[j] + carry;
+ tmp = tmp >> width;
+
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint16_t)env->gpr[rs1]
+ + env->vfp.vreg[src2].u16[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)((uint32_t)env->gpr[rs1])
+ + (uint64_t)env->vfp.vreg[src2].u32[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+
+ extend_rs1 = (uint64_t)extend_gpr(env->gpr[rs1]);
+ tmp = extend_rs1 + env->vfp.vreg[src2].u64[j] + carry;
+ if ((tmp < extend_rs1) ||
+ (carry && (env->vfp.vreg[src2].u64[j] == UINT64_MAX))) {
+ tmp = 1;
+ } else {
+ tmp = 0;
+ }
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ if (width <= 64) {
+ vector_mask_result(env, rd, width, lmul, i, 0);
+ } else {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vmadc_vim)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, vlmax, carry;
+ uint64_t tmp;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs2, lmul)
+ || (rd == 0)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint8_t)sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u8[j] + carry;
+ tmp = tmp >> width;
+
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint16_t)sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u16[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)((uint32_t)sign_extend(rs1, 5))
+ + (uint64_t)env->vfp.vreg[src2].u32[j] + carry;
+ tmp = tmp >> width;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].u64[j] + carry;
+
+ if ((tmp < (uint64_t)sign_extend(rs1, 5) ||
+ tmp < env->vfp.vreg[src2].u64[j])
+ || ((uint64_t)sign_extend(rs1, 5) == UINT64_MAX &&
+ env->vfp.vreg[src2].u64[j] == UINT64_MAX)) {
+ tmp = 1;
+ } else {
+ tmp = 0;
+ }
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ if (width <= 64) {
+ vector_mask_result(env, rd, width, lmul, i, 0);
+ } else {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
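+/* vsbc.vvm: vd[i] = vs2[i] - vs1[i] - borrow-in from v0 */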
+void VECTOR_HELPER(vsbc_vvm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax, carry;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src2].u8[j]
+ - env->vfp.vreg[src1].u8[j] - carry;
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src2].u16[j]
+ - env->vfp.vreg[src1].u16[j] - carry;
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src2].u32[j]
+ - env->vfp.vreg[src1].u32[j] - carry;
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src2].u64[j]
+ - env->vfp.vreg[src1].u64[j] - carry;
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vsbc_vxm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax, carry;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src2].u8[j]
+ - env->gpr[rs1] - carry;
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src2].u16[j]
+ - env->gpr[rs1] - carry;
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src2].u32[j]
+ - env->gpr[rs1] - carry;
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src2].u64[j]
+ - (uint64_t)extend_gpr(env->gpr[rs1]) - carry;
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
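+
+/* vmsbc.vvm: vd.mask[i] = borrow out of (vs2[i] - vs1[i] - borrow-in from v0) */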
+void VECTOR_HELPER(vmsbc_vvm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, vlmax, carry;
+ uint64_t tmp;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs2, lmul)
+ || (rd == 0)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src2].u8[j]
+ - env->vfp.vreg[src1].u8[j] - carry;
+ tmp = (tmp >> width) & 0x1;
+
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src2].u16[j]
+ - env->vfp.vreg[src1].u16[j] - carry;
+ tmp = (tmp >> width) & 0x1;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)env->vfp.vreg[src2].u32[j]
+ - (uint64_t)env->vfp.vreg[src1].u32[j] - carry;
+ tmp = (tmp >> width) & 0x1;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src2].u64[j]
+ - env->vfp.vreg[src1].u64[j] - carry;
+
+ if (((env->vfp.vreg[src1].u64[j] == UINT64_MAX) && carry) ||
+ env->vfp.vreg[src2].u64[j] <
+ (env->vfp.vreg[src1].u64[j] + carry)) {
+ tmp = 1;
+ } else {
+ tmp = 0;
+ }
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ if (width <= 64) {
+ vector_mask_result(env, rd, width, lmul, i, 0);
+ } else {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ }
+ }
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vmsbc_vxm)(CPURISCVState *env, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, vlmax, carry;
+ uint64_t tmp, extend_rs1;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_dstgp_srcgp(rd, 1, rs2, lmul)
+ || (rd == 0)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src2].u8[j]
+ - (uint8_t)env->gpr[rs1] - carry;
+ tmp = (tmp >> width) & 0x1;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 16:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = env->vfp.vreg[src2].u16[j]
+ - (uint16_t)env->gpr[rs1] - carry;
+ tmp = (tmp >> width) & 0x1;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 32:
+ carry = vector_get_carry(env, width, lmul, i);
+ tmp = (uint64_t)env->vfp.vreg[src2].u32[j]
+ - (uint64_t)((uint32_t)env->gpr[rs1]) - carry;
+ tmp = (tmp >> width) & 0x1;
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+ case 64:
+ carry = vector_get_carry(env, width, lmul, i);
+
+ extend_rs1 = (uint64_t)extend_gpr(env->gpr[rs1]);
+ tmp = env->vfp.vreg[src2].u64[j] - extend_rs1 - carry;
+
+ if ((tmp > env->vfp.vreg[src2].u64[j]) ||
+ ((extend_rs1 == UINT64_MAX) && carry)) {
+ tmp = 1;
+ } else {
+ tmp = 0;
+ }
+ vector_mask_result(env, rd, width, lmul, i, tmp);
+ break;
+
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ if (width <= 64) {
+ vector_mask_result(env, rd, width, lmul, i, 0);
+ } else {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
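+/* vadd.vv: vd[i] = vs1[i] + vs2[i] */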
+void VECTOR_HELPER(vadd_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src1].u8[j]
+ + env->vfp.vreg[src2].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src1].u16[j]
+ + env->vfp.vreg[src2].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src1].u32[j]
+ + env->vfp.vreg[src2].u32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src1].u64[j]
+ + env->vfp.vreg[src2].u64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vadd_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u8[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[j] = env->gpr[rs1]
+ + env->vfp.vreg[src2].u32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[j] =
+ (uint64_t)extend_gpr(env->gpr[rs1])
+ + env->vfp.vreg[src2].u64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vadd_vi)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s8[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].s8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].s16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].s32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[j] = sign_extend(rs1, 5)
+ + env->vfp.vreg[src2].s64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
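+/* vsub.vv: vd[i] = vs2[i] - vs1[i] */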
+void VECTOR_HELPER(vsub_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src2].u8[j]
+ - env->vfp.vreg[src1].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src2].u16[j]
+ - env->vfp.vreg[src1].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src2].u32[j]
+ - env->vfp.vreg[src1].u32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src2].u64[j]
+ - env->vfp.vreg[src1].u64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vsub_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src2].u8[j]
+ - env->gpr[rs1];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src2].u16[j]
+ - env->gpr[rs1];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src2].u32[j]
+ - env->gpr[rs1];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src2].u64[j]
+ - (uint64_t)extend_gpr(env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
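+/* vrsub.vx: vd[i] = x[rs1] - vs2[i] */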
+void VECTOR_HELPER(vrsub_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u8[j] = env->gpr[rs1]
+ - env->vfp.vreg[src2].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[j] = env->gpr[rs1]
+ - env->vfp.vreg[src2].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[j] = env->gpr[rs1]
+ - env->vfp.vreg[src2].u32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[j] =
+ (uint64_t)extend_gpr(env->gpr[rs1])
+ - env->vfp.vreg[src2].u64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vrsub_vi)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env) || vector_overlap_vm_common(lmul, vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, false);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / width));
+ j = i % (VLEN / width);
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s8[j] = sign_extend(rs1, 5)
+ - env->vfp.vreg[src2].s8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[j] = sign_extend(rs1, 5)
+ - env->vfp.vreg[src2].s16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[j] = sign_extend(rs1, 5)
+ - env->vfp.vreg[src2].s32[j];
+ }
+ break;
+ case 64:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[j] = sign_extend(rs1, 5)
+ - env->vfp.vreg[src2].s64[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_common(env, dest, j, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
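+/* vwaddu.vv: vd[i] (2*SEW) = zero-extend(vs1[i]) + zero-extend(vs2[i]) */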
+void VECTOR_HELPER(vwaddu_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src1].u8[j] +
+ (uint16_t)env->vfp.vreg[src2].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src1].u16[j] +
+ (uint32_t)env->vfp.vreg[src2].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src1].u32[j] +
+ (uint64_t)env->vfp.vreg[src2].u32[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwaddu_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u8[j] +
+ (uint16_t)((uint8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u16[j] +
+ (uint32_t)((uint16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u32[j] +
+ (uint64_t)((uint32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwadd_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)env->vfp.vreg[src1].s8[j] +
+ (int16_t)env->vfp.vreg[src2].s8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)env->vfp.vreg[src1].s16[j] +
+ (int32_t)env->vfp.vreg[src2].s16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)env->vfp.vreg[src1].s32[j] +
+ (int64_t)env->vfp.vreg[src2].s32[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwadd_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)((int8_t)env->vfp.vreg[src2].s8[j]) +
+ (int16_t)((int8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)((int16_t)env->vfp.vreg[src2].s16[j]) +
+ (int32_t)((int16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)((int32_t)env->vfp.vreg[src2].s32[j]) +
+ (int64_t)((int32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsubu_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u8[j] -
+ (uint16_t)env->vfp.vreg[src1].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u16[j] -
+ (uint32_t)env->vfp.vreg[src1].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u32[j] -
+ (uint64_t)env->vfp.vreg[src1].u32[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsubu_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u8[j] -
+ (uint16_t)((uint8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u16[j] -
+ (uint32_t)((uint16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u32[j] -
+ (uint64_t)((uint32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsub_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)env->vfp.vreg[src2].s8[j] -
+ (int16_t)env->vfp.vreg[src1].s8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)env->vfp.vreg[src2].s16[j] -
+ (int32_t)env->vfp.vreg[src1].s16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)env->vfp.vreg[src2].s32[j] -
+ (int64_t)env->vfp.vreg[src1].s32[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+void VECTOR_HELPER(vwsub_vx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs2, lmul)
+ ) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, false);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / width));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)((int8_t)env->vfp.vreg[src2].s8[j]) -
+ (int16_t)((int8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)((int16_t)env->vfp.vreg[src2].s16[j]) -
+ (int32_t)((int16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)((int32_t)env->vfp.vreg[src2].s32[j]) -
+ (int64_t)((int32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
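+/* vwaddu.wv: vd[i] (2*SEW) = vs2[i] (2*SEW) + zero-extend(vs1[i]) */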
+void VECTOR_HELPER(vwaddu_wv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src1].u8[j] +
+ (uint16_t)env->vfp.vreg[src2].u16[k];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src1].u16[j] +
+ (uint32_t)env->vfp.vreg[src2].u32[k];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src1].u32[j] +
+ (uint64_t)env->vfp.vreg[src2].u64[k];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwaddu_wx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u16[k] +
+ (uint16_t)((uint8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u32[k] +
+ (uint32_t)((uint16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u64[k] +
+ (uint64_t)((uint32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwadd_wv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)((int8_t)env->vfp.vreg[src1].s8[j]) +
+ (int16_t)env->vfp.vreg[src2].s16[k];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)((int16_t)env->vfp.vreg[src1].s16[j]) +
+ (int32_t)env->vfp.vreg[src2].s32[k];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)((int32_t)env->vfp.vreg[src1].s32[j]) +
+ (int64_t)env->vfp.vreg[src2].s64[k];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwadd_wx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)env->vfp.vreg[src2].s16[k] +
+ (int16_t)((int8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)env->vfp.vreg[src2].s32[k] +
+ (int32_t)((int16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)env->vfp.vreg[src2].s64[k] +
+ (int64_t)((int32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsubu_wv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u16[k] -
+ (uint16_t)env->vfp.vreg[src1].u8[j];
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u32[k] -
+ (uint32_t)env->vfp.vreg[src1].u16[j];
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u64[k] -
+ (uint64_t)env->vfp.vreg[src1].u32[j];
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsubu_wx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u16[k] =
+ (uint16_t)env->vfp.vreg[src2].u16[k] -
+ (uint16_t)((uint8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u32[k] =
+ (uint32_t)env->vfp.vreg[src2].u32[k] -
+ (uint32_t)((uint16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].u64[k] =
+ (uint64_t)env->vfp.vreg[src2].u64[k] -
+ (uint64_t)((uint32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsub_wv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, j, k, vl;
+ uint32_t lmul, width, src1, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)
+ || vector_overlap_dstgp_srcgp(rd, 2 * lmul, rs1, lmul)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs1, false);
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src1 = rs1 + (i / (VLEN / width));
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ j = i % (VLEN / width);
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)env->vfp.vreg[src2].s16[k] -
+ (int16_t)((int8_t)env->vfp.vreg[src1].s8[j]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)env->vfp.vreg[src2].s32[k] -
+ (int32_t)((int16_t)env->vfp.vreg[src1].s16[j]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)env->vfp.vreg[src2].s64[k] -
+ (int64_t)((int32_t)env->vfp.vreg[src1].s32[j]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
+
+void VECTOR_HELPER(vwsub_wx)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
+ uint32_t rs2, uint32_t rd)
+{
+ int i, k, vl;
+ uint32_t lmul, width, src2, dest, vlmax;
+
+ vl = env->vfp.vl;
+
+ lmul = vector_get_lmul(env);
+ width = vector_get_width(env);
+ vlmax = vector_get_vlmax(env);
+
+ if (vector_vtype_ill(env)
+ || vector_overlap_vm_force(vm, rd)) {
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ return;
+ }
+ vector_lmul_check_reg(env, lmul, rs2, true);
+ vector_lmul_check_reg(env, lmul, rd, true);
+
+ for (i = 0; i < vlmax; i++) {
+ src2 = rs2 + (i / (VLEN / (2 * width)));
+ dest = rd + (i / (VLEN / (2 * width)));
+ k = i % (VLEN / (2 * width));
+ if (i < env->vfp.vstart) {
+ continue;
+ } else if (i < vl) {
+ switch (width) {
+ case 8:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s16[k] =
+ (int16_t)env->vfp.vreg[src2].s16[k] -
+ (int16_t)((int8_t)env->gpr[rs1]);
+ }
+ break;
+ case 16:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s32[k] =
+ (int32_t)env->vfp.vreg[src2].s32[k] -
+ (int32_t)((int16_t)env->gpr[rs1]);
+ }
+ break;
+ case 32:
+ if (vector_elem_mask(env, vm, width, lmul, i)) {
+ env->vfp.vreg[dest].s64[k] =
+ (int64_t)env->vfp.vreg[src2].s64[k] -
+ (int64_t)((int32_t)env->gpr[rs1]);
+ }
+ break;
+ default:
+ riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
+ break;
+ }
+ } else {
+ vector_tail_widen(env, dest, k, width);
+ }
+ }
+ env->vfp.vstart = 0;
+}
--
2.7.4
On 9/11/19 2:25 AM, liuzhiwei wrote:
> #define VECTOR_HELPER(name) HELPER(glue(vector_, name))
> +#define SIGNBIT8 (1 << 7)
> +#define SIGNBIT16 (1 << 15)
> +#define SIGNBIT32 (1 << 31)
> +#define SIGNBIT64 ((uint64_t)1 << 63)
Make up your mind whether you want these as signed or unsigned values. Perhaps
just use or redefine INT<N>_MIN instead?
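For illustration, one reading of that suggestion is to keep the names but
define them from the standard limits, so the signedness is explicit (untested
sketch):

    #define SIGNBIT8   ((uint8_t)INT8_MIN)     /* 0x80 */
    #define SIGNBIT16  ((uint16_t)INT16_MIN)   /* 0x8000 */
    #define SIGNBIT32  ((uint32_t)INT32_MIN)   /* 0x80000000 */
    #define SIGNBIT64  ((uint64_t)INT64_MIN)   /* 0x8000000000000000 */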
> +static int64_t extend_gpr(target_ulong reg)
> +{
> + return sign_extend(reg, sizeof(target_ulong) * 8);
> +}
Note wrt usage:
+ extend_rs1 = (uint64_t)extend_gpr(env->gpr[rs1]);
This is equivalent to "extend_rs1 = (target_long)env->gpr[rs1]".
I don't see how this helper function is helping, really.
Also, pass gprs by value, not by index.
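E.g. the helper could receive the scalar operand itself; a rough sketch, where
the "tl" argument and the exact prototype are illustrative rather than taken
from this patch:

    DEF_HELPER_5(vector_vadd_vx, void, env, i32, tl, i32, i32)

    void VECTOR_HELPER(vadd_vx)(CPURISCVState *env, uint32_t vm,
                                target_ulong s1, uint32_t rs2, uint32_t rd)
    {
        /* use s1 directly; no env->gpr[rs1] lookup, no extend_gpr() */
    }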
> +static inline int vector_get_carry(CPURISCVState *env, int width, int lmul,
> + int index)
> +{
> + int mlen = width / lmul;
> + int idx = (index * mlen) / 8;
> + int pos = (index * mlen) % 8;
> +
> + return (env->vfp.vreg[0].u8[idx] >> pos) & 0x1;
> +}
Any reason not to re-use vector_elem_mask?
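Assuming vector_elem_mask() with vm == 0 reads the bit from v0 -- which is
exactly what the carry is -- this whole accessor could collapse to (untested):

    carry = vector_elem_mask(env, 0, width, lmul, i);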
> +static inline uint64_t u64xu64_lh(uint64_t a, uint64_t b)
> +{
> + uint64_t hi_64, carry;
> +
> + /* first get the whole product in {hi_64, lo_64} */
> + uint64_t a_hi = a >> 32;
> + uint64_t a_lo = (uint32_t)a;
> + uint64_t b_hi = b >> 32;
> + uint64_t b_lo = (uint32_t)b;
> +
> + /*
> + * a * b = (a_hi << 32 + a_lo) * (b_hi << 32 + b_lo)
> + * = (a_hi * b_hi) << 64 + (a_hi * b_lo) << 32 +
> + * (a_lo * b_hi) << 32 + a_lo * b_lo
> + * = {hi_64, lo_64}
> + * hi_64 = ((a_hi * b_lo) << 32 + (a_lo * b_hi) << 32 + (a_lo * b_lo)) >> 64
> + * = (a_hi * b_lo) >> 32 + (a_lo * b_hi) >> 32 + carry
> + * carry = ((uint64_t)(uint32_t)(a_hi * b_lo) +
> + * (uint64_t)(uint32_t)(a_lo * b_hi) + (a_lo * b_lo) >> 32) >> 32
> + */
> +
> + carry = ((uint64_t)(uint32_t)(a_hi * b_lo) +
> + (uint64_t)(uint32_t)(a_lo * b_hi) +
> + ((a_lo * b_lo) >> 32)) >> 32;
> +
> + hi_64 = a_hi * b_hi +
> + ((a_hi * b_lo) >> 32) + ((a_lo * b_hi) >> 32) +
> + carry;
> +
> + return hi_64;
> +}
Use mulu64().
> +static inline int64_t s64xu64_lh(int64_t a, uint64_t b)
> +{
> + uint64_t abs_a = a;
> + uint64_t lo_64, hi_64;
> +
> + if (a < 0) {
> + abs_a = ~a + 1;
abs_a = -a
> +static inline int64_t s64xs64_lh(int64_t a, int64_t b)
> +{
> + uint64_t abs_a = a, abs_b = b;
> + uint64_t lo_64, hi_64;
> +
> + if (a < 0) {
> + abs_a = ~a + 1;
> + }
> + if (b < 0) {
> + abs_b = ~b + 1;
> + }
> + lo_64 = abs_a * abs_b;
> + hi_64 = u64xu64_lh(abs_a, abs_b);
> +
> + if ((a ^ b) & SIGNBIT64) {
> + lo_64 = ~lo_64;
> + hi_64 = ~hi_64;
> + if (lo_64 == UINT64_MAX) {
> + lo_64 = 0;
> + hi_64 += 1;
> + } else {
> + lo_64 += 1;
> + }
> + }
> + return hi_64;
> +}
Use muls64().
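For instance, both high-half helpers above could collapse onto the existing
routines from "qemu/host-utils.h" (untested sketch):

    #include "qemu/host-utils.h"

    static inline uint64_t u64xu64_lh(uint64_t a, uint64_t b)
    {
        uint64_t lo, hi;
        mulu64(&lo, &hi, a, b);      /* unsigned 64x64 -> 128 */
        return hi;
    }

    static inline int64_t s64xs64_lh(int64_t a, int64_t b)
    {
        uint64_t lo, hi;
        muls64(&lo, &hi, a, b);      /* signed 64x64 -> 128 */
        return (int64_t)hi;
    }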
> +void VECTOR_HELPER(vadc_vvm)(CPURISCVState *env, uint32_t rs1,
> + uint32_t rs2, uint32_t rd)
> +{
> + int i, j, vl;
> + uint32_t lmul, width, src1, src2, dest, vlmax, carry;
> +
> + vl = env->vfp.vl;
> + lmul = vector_get_lmul(env);
> + width = vector_get_width(env);
> + vlmax = vector_get_vlmax(env);
> +
> + if (vector_vtype_ill(env) || vector_overlap_carry(lmul, rd)) {
> + riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
> + return;
> + }
> + vector_lmul_check_reg(env, lmul, rs1, false);
> + vector_lmul_check_reg(env, lmul, rs2, false);
> + vector_lmul_check_reg(env, lmul, rd, false);
> +
> + for (i = 0; i < vlmax; i++) {
> + src1 = rs1 + (i / (VLEN / width));
> + src2 = rs2 + (i / (VLEN / width));
> + dest = rd + (i / (VLEN / width));
> + j = i % (VLEN / width);
> + if (i < env->vfp.vstart) {
> + continue;
Again, hoist.
> + } else if (i < vl) {
I would think this too could be moved into the loop condition.
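In other words, something along the lines of (sketch):

    for (i = env->vfp.vstart; i < vl; i++) {
        /* per-element work only; no vstart/vl tests in the body */
    }
    /* handle the tail elements [vl, vlmax) separately, see below */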
> + switch (width) {
> + case 8:
> + carry = vector_get_carry(env, width, lmul, i);
> + env->vfp.vreg[dest].u8[j] = env->vfp.vreg[src1].u8[j]
> + + env->vfp.vreg[src2].u8[j] + carry;
> + break;
> + case 16:
> + carry = vector_get_carry(env, width, lmul, i);
> + env->vfp.vreg[dest].u16[j] = env->vfp.vreg[src1].u16[j]
> + + env->vfp.vreg[src2].u16[j] + carry;
> + break;
> + case 32:
> + carry = vector_get_carry(env, width, lmul, i);
> + env->vfp.vreg[dest].u32[j] = env->vfp.vreg[src1].u32[j]
> + + env->vfp.vreg[src2].u32[j] + carry;
> + break;
> + case 64:
> + carry = vector_get_carry(env, width, lmul, i);
> + env->vfp.vreg[dest].u64[j] = env->vfp.vreg[src1].u64[j]
> + + env->vfp.vreg[src2].u64[j] + carry;
> + break;
> + default:
> + riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
> + break;
> + }
> + } else {
> + vector_tail_common(env, dest, j, width);
Better to do this tail clearing in a loop of its own, after the main element
loop; that would devolve to a memset on a little-endian host.
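I.e. something like (sketch, reusing the patch's own helpers):

    for (i = vl; i < vlmax; i++) {
        dest = rd + (i / (VLEN / width));
        j = i % (VLEN / width);
        vector_tail_common(env, dest, j, width);
    }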
> + }
> + }
> + env->vfp.vstart = 0;
> +}
> +void VECTOR_HELPER(vadc_vxm)(CPURISCVState *env, uint32_t rs1,
> + uint32_t rs2, uint32_t rd)
> +{
Watch the spacing between functions.
Pass gpr rs1 by value.
> +void VECTOR_HELPER(vadc_vim)(CPURISCVState *env, uint32_t rs1,
> + uint32_t rs2, uint32_t rd)
> +{
...
> + env->vfp.vreg[dest].u8[j] = sign_extend(rs1, 5)
Pass the immediate as a sign-extended immediate to begin with, not as an
unsigned 5-bit field.
All of the rest of the helpers are about the same.
Consider creating a helper function that contains the basic outline of the
vector processing, and takes a (set of) function pointers that perform the
operation. With optimization, compiler inlining should produce the same code
as you have here without having to replicate quite so much code for each
helper. You can also fix a bug in the basic outline in one place instead of
hundreds.
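As a rough, untested illustration of that shape -- every name here (opivv_fn,
vector_vv_common, do_vadd_8) is made up for the example:

    typedef void opivv_fn(CPURISCVState *env, uint32_t d, uint32_t s1,
                          uint32_t s2, int j);

    static void vector_vv_common(CPURISCVState *env, uint32_t vm, uint32_t rs1,
                                 uint32_t rs2, uint32_t rd,
                                 opivv_fn * const fns[4], uintptr_t ra)
    {
        uint32_t lmul = vector_get_lmul(env);
        uint32_t width = vector_get_width(env);
        uint32_t vlmax = vector_get_vlmax(env);
        uint32_t vl = env->vfp.vl;
        opivv_fn *fn = fns[ctz32(width) - 3];   /* 8/16/32/64 -> 0/1/2/3 */
        int i;

        if (vector_vtype_ill(env) || fn == NULL) {
            riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, ra);
            return;
        }
        vector_lmul_check_reg(env, lmul, rs1, false);
        vector_lmul_check_reg(env, lmul, rs2, false);
        vector_lmul_check_reg(env, lmul, rd, false);

        for (i = env->vfp.vstart; i < vl; i++) {
            if (vector_elem_mask(env, vm, width, lmul, i)) {
                fn(env, rd + i / (VLEN / width), rs1 + i / (VLEN / width),
                   rs2 + i / (VLEN / width), i % (VLEN / width));
            }
        }
        for (i = vl; i < vlmax; i++) {
            vector_tail_common(env, rd + i / (VLEN / width),
                               i % (VLEN / width), width);
        }
        env->vfp.vstart = 0;
    }

    static void do_vadd_8(CPURISCVState *env, uint32_t d, uint32_t s1,
                          uint32_t s2, int j)
    {
        env->vfp.vreg[d].s8[j] = env->vfp.vreg[s1].s8[j] +
                                 env->vfp.vreg[s2].s8[j];
    }
    /* the 16/32/64-bit callbacks follow the same pattern */

    void VECTOR_HELPER(vadd_vv)(CPURISCVState *env, uint32_t vm, uint32_t rs1,
                                uint32_t rs2, uint32_t rd)
    {
        static opivv_fn * const fns[4] = {
            do_vadd_8, NULL, NULL, NULL,   /* wider variants omitted here */
        };
        vector_vv_common(env, vm, rs1, rs2, rd, fns, GETPC());
    }

Each instruction helper then shrinks to a table plus a one-line call, and a
fix to the outline only has to be made once.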
r~
On 9/12/19 11:27 AM, Richard Henderson wrote:
>> +void VECTOR_HELPER(vadc_vxm)(CPURISCVState *env, uint32_t rs1,
>> + uint32_t rs2, uint32_t rd)
>> +{
>
> Watch the spacing between functions.
> Pass gpr rs1 by value.
>
>> +void VECTOR_HELPER(vadc_vim)(CPURISCVState *env, uint32_t rs1,
>> + uint32_t rs2, uint32_t rd)
>> +{
> ...
>> + env->vfp.vreg[dest].u8[j] = sign_extend(rs1, 5)
>
> Pass the immediate as a sign-extended immediate to begin with, not as an
> unsigned 5-bit field.
Oh, and of course *_vxm and *_vim should be identical, because in both cases
there is a single scalar parameter. In the first case the scalar is passed by
value from the gpr; in the second case the scalar is the sign-extended constant.
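So the pair could share one body, e.g. (sketch; the helper name and the
translate-time details are only illustrative):

    /* one helper, scalar passed by value */
    void VECTOR_HELPER(vadc_vsm)(CPURISCVState *env, target_ulong scalar,
                                 uint32_t rs2, uint32_t rd)
    {
        /* the element loop of vadc_vxm/vadc_vim, using 'scalar' directly */
    }

    /* at translate time:
     *   vadc.vxm -> pass the value of gpr[rs1]
     *   vadc.vim -> pass sextract32(insn_imm, 0, 5), i.e. the already
     *               sign-extended 5-bit constant
     */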
r~