[v5] Add LoongArch LASX instructions

[PATCH RESEND v5 52/57] target/loongarch: Implement xvreplve xvinsve0 xvpickve

Posted by Song Gao 2 years, 5 months ago

This patch includes:
- XVREPLVE.{B/H/W/D};
- XVREPL128VEI.{B/H/W/D};
- XVREPLVE0.{B/H/W/D/Q};
- XVINSVE0.{W/D};
- XVPICKVE.{W/D};
- XVBSLL.V, XVBSRL.V.

Signed-off-by: Song Gao <gaosong@loongson.cn>
---
 target/loongarch/helper.h                   |   5 +
 target/loongarch/insns.decode               |  25 ++
 target/loongarch/disas.c                    |  29 +++
 target/loongarch/vec_helper.c               |  28 +++
 target/loongarch/insn_trans/trans_vec.c.inc | 255 +++++++++++++++-----
 5 files changed, 287 insertions(+), 55 deletions(-)

diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index 85233586e3..fb489dda2d 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -668,6 +668,11 @@ DEF_HELPER_4(vsetallnez_h, void, env, i32, i32, i32)
 DEF_HELPER_4(vsetallnez_w, void, env, i32, i32, i32)
 DEF_HELPER_4(vsetallnez_d, void, env, i32, i32, i32)
 
+DEF_HELPER_FLAGS_4(xvinsve0_w, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(xvinsve0_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(xvpickve_w, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(xvpickve_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
 DEF_HELPER_FLAGS_4(vpackev_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(vpackev_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(vpackev_w, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index bb3bb447ae..74383ba3bc 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -1987,3 +1987,28 @@ xvreplgr2vr_b    0111 01101001 11110 00000 ..... .....    @vr
 xvreplgr2vr_h    0111 01101001 11110 00001 ..... .....    @vr
 xvreplgr2vr_w    0111 01101001 11110 00010 ..... .....    @vr
 xvreplgr2vr_d    0111 01101001 11110 00011 ..... .....    @vr
+
+xvreplve_b       0111 01010010 00100 ..... ..... .....    @vvr
+xvreplve_h       0111 01010010 00101 ..... ..... .....    @vvr
+xvreplve_w       0111 01010010 00110 ..... ..... .....    @vvr
+xvreplve_d       0111 01010010 00111 ..... ..... .....    @vvr
+
+xvrepl128vei_b   0111 01101111 01111 0 .... ..... .....   @vv_ui4
+xvrepl128vei_h   0111 01101111 01111 10 ... ..... .....   @vv_ui3
+xvrepl128vei_w   0111 01101111 01111 110 .. ..... .....   @vv_ui2
+xvrepl128vei_d   0111 01101111 01111 1110 . ..... .....   @vv_ui1
+
+xvreplve0_b      0111 01110000 01110 00000 ..... .....    @vv
+xvreplve0_h      0111 01110000 01111 00000 ..... .....    @vv
+xvreplve0_w      0111 01110000 01111 10000 ..... .....    @vv
+xvreplve0_d      0111 01110000 01111 11000 ..... .....    @vv
+xvreplve0_q      0111 01110000 01111 11100 ..... .....    @vv
+
+xvinsve0_w       0111 01101111 11111 10 ... ..... .....   @vv_ui3
+xvinsve0_d       0111 01101111 11111 110 .. ..... .....   @vv_ui2
+
+xvpickve_w       0111 01110000 00111 10 ... ..... .....   @vv_ui3
+xvpickve_d       0111 01110000 00111 110 .. ..... .....   @vv_ui2
+
+xvbsll_v         0111 01101000 11100 ..... ..... .....    @vv_ui5
+xvbsrl_v         0111 01101000 11101 ..... ..... .....    @vv_ui5
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 04f9f9fa4b..d091402db6 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -1748,6 +1748,11 @@ static void output_rv_i_x(DisasContext *ctx, arg_rv_i *a, const char *mnemonic)
     output(ctx, mnemonic, "r%d, x%d, 0x%x", a->rd, a->vj, a->imm);
 }
 
+static void output_vvr_x(DisasContext *ctx, arg_vvr *a, const char *mnemonic)
+{
+    output(ctx, mnemonic, "x%d, x%d, r%d", a->vd, a->vj, a->rk);
+}
+
 INSN_LASX(xvadd_b,           vvv)
 INSN_LASX(xvadd_h,           vvv)
 INSN_LASX(xvadd_w,           vvv)
@@ -2518,3 +2523,27 @@ INSN_LASX(xvreplgr2vr_b,     vr)
 INSN_LASX(xvreplgr2vr_h,     vr)
 INSN_LASX(xvreplgr2vr_w,     vr)
 INSN_LASX(xvreplgr2vr_d,     vr)
+
+INSN_LASX(xvreplve_b,        vvr)
+INSN_LASX(xvreplve_h,        vvr)
+INSN_LASX(xvreplve_w,        vvr)
+INSN_LASX(xvreplve_d,        vvr)
+INSN_LASX(xvrepl128vei_b,    vv_i)
+INSN_LASX(xvrepl128vei_h,    vv_i)
+INSN_LASX(xvrepl128vei_w,    vv_i)
+INSN_LASX(xvrepl128vei_d,    vv_i)
+
+INSN_LASX(xvreplve0_b,       vv)
+INSN_LASX(xvreplve0_h,       vv)
+INSN_LASX(xvreplve0_w,       vv)
+INSN_LASX(xvreplve0_d,       vv)
+INSN_LASX(xvreplve0_q,       vv)
+
+INSN_LASX(xvinsve0_w,        vv_i)
+INSN_LASX(xvinsve0_d,        vv_i)
+
+INSN_LASX(xvpickve_w,        vv_i)
+INSN_LASX(xvpickve_d,        vv_i)
+
+INSN_LASX(xvbsll_v,          vv_i)
+INSN_LASX(xvbsrl_v,          vv_i)
diff --git a/target/loongarch/vec_helper.c b/target/loongarch/vec_helper.c
index 2f9acd4364..6832189151 100644
--- a/target/loongarch/vec_helper.c
+++ b/target/loongarch/vec_helper.c
@@ -3209,6 +3209,34 @@ SETALLNEZ(vsetallnez_h, MO_16)
 SETALLNEZ(vsetallnez_w, MO_32)
 SETALLNEZ(vsetallnez_d, MO_64)
 
+#define XVINSVE0(NAME, E, MASK)                                    \
+void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
+{                                                                  \
+    VReg *Vd = (VReg *)vd;                                         \
+    VReg *Vj = (VReg *)vj;                                         \
+    Vd->E(imm & MASK) = Vj->E(0);                                  \
+}
+
+XVINSVE0(xvinsve0_w, W, 0x7)
+XVINSVE0(xvinsve0_d, D, 0x3)
+
+#define XVPICKVE(NAME, E, BIT, MASK)                               \
+void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
+{                                                                  \
+    int i;                                                         \
+    VReg *Vd = (VReg *)vd;                                         \
+    VReg *Vj = (VReg *)vj;                                         \
+    int oprsz = simd_oprsz(desc);                                  \
+                                                                   \
+    Vd->E(0) = Vj->E(imm & MASK);                                  \
+    for (i = 1; i < oprsz / (BIT / 8); i++) {                      \
+        Vd->E(i) = 0;                                              \
+    }                                                              \
+}
+
+XVPICKVE(xvpickve_w, W, 32, 0x7)
+XVPICKVE(xvpickve_d, D, 64, 0x3)
+
 #define VPACKEV(NAME, BIT, E)                                  \
 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
 {                                                              \
diff --git a/target/loongarch/insn_trans/trans_vec.c.inc b/target/loongarch/insn_trans/trans_vec.c.inc
index bf44a4d1fc..70babae2c2 100644
--- a/target/loongarch/insn_trans/trans_vec.c.inc
+++ b/target/loongarch/insn_trans/trans_vec.c.inc
@@ -5419,112 +5419,257 @@ static bool trans_vreplvei_d(DisasContext *ctx, arg_vv_i *a)
     return true;
 }
 
-static bool gen_vreplve(DisasContext *ctx, arg_vvr *a, int vece, int bit,
-                        void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
+static bool gen_vreplve_vl(DisasContext *ctx, arg_vvr *a,
+                           uint32_t oprsz, int vece, int bit,
+                           void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
 {
     TCGv_i64 t0 = tcg_temp_new_i64();
     TCGv_ptr t1 = tcg_temp_new_ptr();
     TCGv_i64 t2 = tcg_temp_new_i64();
 
-    if (!avail_LSX(ctx)) {
-        return false;
-    }
-
-    if (!check_vec(ctx, 16)) {
-        return true;
-    }
-
-    tcg_gen_andi_i64(t0, gpr_src(ctx, a->rk, EXT_NONE), (LSX_LEN/bit) -1);
+    tcg_gen_andi_i64(t0, gpr_src(ctx, a->rk, EXT_NONE), (LSX_LEN / bit) - 1);
     tcg_gen_shli_i64(t0, t0, vece);
     if (HOST_BIG_ENDIAN) {
-        tcg_gen_xori_i64(t0, t0, vece << ((LSX_LEN/bit) -1));
+        tcg_gen_xori_i64(t0, t0, vece << ((LSX_LEN / bit) -1));
     }
 
     tcg_gen_trunc_i64_ptr(t1, t0);
     tcg_gen_add_ptr(t1, t1, cpu_env);
     func(t2, t1, vec_full_offset(a->vj));
-    tcg_gen_gvec_dup_i64(vece, vec_full_offset(a->vd), 16, ctx->vl/8, t2);
+    tcg_gen_gvec_dup_i64(vece, vec_full_offset(a->vd), 16, 16, t2);
+    if (oprsz == 32) {
+        func(t2, t1,  offsetof(CPULoongArchState, fpr[a->vj].vreg.Q(1)));
+        tcg_gen_gvec_dup_i64(vece,
+                             offsetof(CPULoongArchState, fpr[a->vd].vreg.Q(1)),
+                             16, 16, t2);
+    }
 
     return true;
 }
 
+static bool gen_vreplve(DisasContext *ctx, arg_vvr *a, int vece, int bit,
+                        void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
+{
+    if (!check_vec(ctx, 16)) {
+        return true;
+    }
+
+    return gen_vreplve_vl(ctx, a, 16, vece, bit, func);
+}
+
+static bool gen_xvreplve(DisasContext *ctx, arg_vvr *a, int vece, int bit,
+                         void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
+{
+    if (!check_vec(ctx, 32)) {
+        return true;
+    }
+
+    return gen_vreplve_vl(ctx, a, 32, vece, bit, func);
+}
+
 TRANS(vreplve_b, LSX, gen_vreplve, MO_8,  8, tcg_gen_ld8u_i64)
 TRANS(vreplve_h, LSX, gen_vreplve, MO_16, 16, tcg_gen_ld16u_i64)
 TRANS(vreplve_w, LSX, gen_vreplve, MO_32, 32, tcg_gen_ld32u_i64)
 TRANS(vreplve_d, LSX, gen_vreplve, MO_64, 64, tcg_gen_ld_i64)
+TRANS(xvreplve_b, LASX, gen_xvreplve, MO_8,  8, tcg_gen_ld8u_i64)
+TRANS(xvreplve_h, LASX, gen_xvreplve, MO_16, 16, tcg_gen_ld16u_i64)
+TRANS(xvreplve_w, LASX, gen_xvreplve, MO_32, 32, tcg_gen_ld32u_i64)
+TRANS(xvreplve_d, LASX, gen_xvreplve, MO_64, 64, tcg_gen_ld_i64)
 
-static bool trans_vbsll_v(DisasContext *ctx, arg_vv_i *a)
+static bool trans_xvrepl128vei_b(DisasContext *ctx, arg_vv_i * a)
 {
-    int ofs;
-    TCGv_i64 desthigh, destlow, high, low;
-
-    if (!avail_LSX(ctx)) {
+    if (!avail_LASX(ctx)) {
         return false;
     }
 
-    if (!check_vec(ctx, 16)) {
+    if (!check_vec(ctx, 32)) {
         return true;
     }
 
-    desthigh = tcg_temp_new_i64();
-    destlow = tcg_temp_new_i64();
-    high = tcg_temp_new_i64();
-    low = tcg_temp_new_i64();
-
-    get_vreg64(low, a->vj, 0);
+    tcg_gen_gvec_dup_mem(MO_8,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.B(0)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.B((a->imm))),
+                         16, 16);
+    tcg_gen_gvec_dup_mem(MO_8,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.B(16)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.B((a->imm + 16))),
+                         16, 16);
+    return true;
+}
 
-    ofs = ((a->imm) & 0xf) * 8;
-    if (ofs < 64) {
-        get_vreg64(high, a->vj, 1);
-        tcg_gen_extract2_i64(desthigh, low, high, 64 - ofs);
-        tcg_gen_shli_i64(destlow, low, ofs);
-    } else {
-        tcg_gen_shli_i64(desthigh, low, ofs - 64);
-        destlow = tcg_constant_i64(0);
+static bool trans_xvrepl128vei_h(DisasContext *ctx, arg_vv_i *a)
+{
+    if (!avail_LASX(ctx)) {
+        return false;
     }
 
-    set_vreg64(desthigh, a->vd, 1);
-    set_vreg64(destlow, a->vd, 0);
+    if (!check_vec(ctx, 32)) {
+        return true;
+    }
 
+    tcg_gen_gvec_dup_mem(MO_16,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.H(0)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.H((a->imm))),
+                         16, 16);
+    tcg_gen_gvec_dup_mem(MO_16,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.H(8)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.H((a->imm + 8))),
+                         16, 16);
     return true;
 }
 
-static bool trans_vbsrl_v(DisasContext *ctx, arg_vv_i *a)
+static bool trans_xvrepl128vei_w(DisasContext *ctx, arg_vv_i *a)
 {
-    TCGv_i64 desthigh, destlow, high, low;
-    int ofs;
+    if (!avail_LASX(ctx)) {
+        return false;
+    }
 
-    if (!avail_LSX(ctx)) {
+    if (!check_vec(ctx, 32)) {
+        return true;
+    }
+
+    tcg_gen_gvec_dup_mem(MO_32,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.W(0)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.W((a->imm))),
+                         16, 16);
+    tcg_gen_gvec_dup_mem(MO_32,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.W(4)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.W((a->imm + 4))),
+                         16, 16);
+    return true;
+}
+
+static bool trans_xvrepl128vei_d(DisasContext *ctx, arg_vv_i *a)
+{
+    if (!avail_LASX(ctx)) {
         return false;
     }
 
-    if (!check_vec(ctx, 16)) {
+    if (!check_vec(ctx, 32)) {
         return true;
     }
 
-    desthigh = tcg_temp_new_i64();
-    destlow = tcg_temp_new_i64();
-    high = tcg_temp_new_i64();
-    low = tcg_temp_new_i64();
+    tcg_gen_gvec_dup_mem(MO_64,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.D(0)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.D((a->imm))),
+                         16, 16);
+    tcg_gen_gvec_dup_mem(MO_64,
+                         offsetof(CPULoongArchState, fpr[a->vd].vreg.D(2)),
+                         offsetof(CPULoongArchState,
+                                  fpr[a->vj].vreg.D((a->imm + 2))),
+                         16, 16);
+    return true;
+}
+
+#define XVREPLVE0(NAME, MOP)                                                  \
+static bool trans_## NAME(DisasContext *ctx, arg_vv * a)                      \
+{                                                                             \
+    if (!avail_LASX(ctx)) {                                                   \
+        return false;                                                         \
+    }                                                                         \
+                                                                              \
+    if (!check_vec(ctx, 32)) {                                                \
+        return true;                                                          \
+    }                                                                         \
+                                                                              \
+    tcg_gen_gvec_dup_mem(MOP, vec_full_offset(a->vd), vec_full_offset(a->vj), \
+                         32, 32);                                             \
+    return true;                                                              \
+}
 
-    get_vreg64(high, a->vj, 1);
+XVREPLVE0(xvreplve0_b, MO_8)
+XVREPLVE0(xvreplve0_h, MO_16)
+XVREPLVE0(xvreplve0_w, MO_32)
+XVREPLVE0(xvreplve0_d, MO_64)
+XVREPLVE0(xvreplve0_q, MO_128)
 
-    ofs = ((a->imm) & 0xf) * 8;
-    if (ofs < 64) {
-        get_vreg64(low, a->vj, 0);
-        tcg_gen_extract2_i64(destlow, low, high, ofs);
-        tcg_gen_shri_i64(desthigh, high, ofs);
-    } else {
-        tcg_gen_shri_i64(destlow, high, ofs - 64);
-        desthigh = tcg_constant_i64(0);
+TRANS(xvinsve0_w, LASX, gen_xx_i, gen_helper_xvinsve0_w)
+TRANS(xvinsve0_d, LASX, gen_xx_i, gen_helper_xvinsve0_d)
+
+TRANS(xvpickve_w, LASX, gen_xx_i, gen_helper_xvpickve_w)
+TRANS(xvpickve_d, LASX, gen_xx_i, gen_helper_xvpickve_d)
+
+static bool do_vbsll_v(DisasContext *ctx, arg_vv_i *a, uint32_t oprsz)
+{
+    int i, max, ofs;
+    TCGv_i64 desthigh[2], destlow[2], high[2], low[2];
+
+    if (!check_vec(ctx, oprsz)) {
+        return true;
+    }
+
+    max = (oprsz == 16) ? 1 : 2;
+
+    for (i = 0; i < max; i++) {
+        desthigh[i] = tcg_temp_new_i64();
+        destlow[i] = tcg_temp_new_i64();
+        high[i] = tcg_temp_new_i64();
+        low[i] = tcg_temp_new_i64();
+
+        get_vreg64(low[i], a->vj, 2 * i);
+
+        ofs = ((a->imm) & 0xf) * 8;
+        if (ofs < 64) {
+            get_vreg64(high[i], a->vj, 2 * i + 1);
+            tcg_gen_extract2_i64(desthigh[i], low[i], high[i], 64 - ofs);
+            tcg_gen_shli_i64(destlow[i], low[i], ofs);
+        } else {
+            tcg_gen_shli_i64(desthigh[i], low[i], ofs - 64);
+            destlow[i] = tcg_constant_i64(0);
+        }
+        set_vreg64(desthigh[i], a->vd, 2 * i + 1);
+        set_vreg64(destlow[i], a->vd, 2 * i);
     }
 
-    set_vreg64(desthigh, a->vd, 1);
-    set_vreg64(destlow, a->vd, 0);
+    return true;
+}
+
+static bool do_vbsrl_v(DisasContext *ctx, arg_vv_i *a, uint32_t oprsz)
+{
+    int ofs, i, max;
+    TCGv_i64 desthigh[2], destlow[2], high[2], low[2];
+
+    if (!check_vec(ctx, 32)) {
+        return true;
+    }
+
+    max = (oprsz == 16) ? 1 : 2;
+
+    for (i = 0; i < max; i++) {
+        desthigh[i] = tcg_temp_new_i64();
+        destlow[i] = tcg_temp_new_i64();
+        high[i] = tcg_temp_new_i64();
+        low[i] = tcg_temp_new_i64();
+        get_vreg64(high[i], a->vj, 2 * i + 1);
+
+        ofs = ((a->imm) & 0xf) * 8;
+        if (ofs < 64) {
+            get_vreg64(low[i], a->vj, 2 * i);
+            tcg_gen_extract2_i64(destlow[i], low[i], high[i], ofs);
+            tcg_gen_shri_i64(desthigh[i], high[i], ofs);
+        } else {
+            tcg_gen_shri_i64(destlow[i], high[i], ofs - 64);
+            desthigh[i] = tcg_constant_i64(0);
+        }
+        set_vreg64(desthigh[i], a->vd, 2 * i + 1);
+        set_vreg64(destlow[i], a->vd, 2 * i);
+    }
 
     return true;
 }
 
+TRANS(vbsll_v, LSX, do_vbsll_v, 16)
+TRANS(vbsrl_v, LSX, do_vbsrl_v, 16)
+TRANS(xvbsll_v, LASX, do_vbsll_v, 32)
+TRANS(xvbsrl_v, LASX, do_vbsrl_v, 32)
+
 TRANS(vpackev_b, LSX, gen_vvv, gen_helper_vpackev_b)
 TRANS(vpackev_h, LSX, gen_vvv, gen_helper_vpackev_h)
 TRANS(vpackev_w, LSX, gen_vvv, gen_helper_vpackev_w)
-- 
2.39.1

Re: [PATCH RESEND v5 52/57] target/loongarch: Implement xvreplve xvinsve0 xvpickve

Posted by Richard Henderson 2 years, 5 months ago

On 9/7/23 01:31, Song Gao wrote:
> +static bool gen_vreplve_vl(DisasContext *ctx, arg_vvr *a,
> +                           uint32_t oprsz, int vece, int bit,
> +                           void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
>   {
>       TCGv_i64 t0 = tcg_temp_new_i64();
>       TCGv_ptr t1 = tcg_temp_new_ptr();
>       TCGv_i64 t2 = tcg_temp_new_i64();
>   
> +    tcg_gen_andi_i64(t0, gpr_src(ctx, a->rk, EXT_NONE), (LSX_LEN / bit) - 1);
>       tcg_gen_shli_i64(t0, t0, vece);
>       if (HOST_BIG_ENDIAN) {
> +        tcg_gen_xori_i64(t0, t0, vece << ((LSX_LEN / bit) -1));
>       }
>   
>       tcg_gen_trunc_i64_ptr(t1, t0);
>       tcg_gen_add_ptr(t1, t1, cpu_env);
>       func(t2, t1, vec_full_offset(a->vj));
> +    tcg_gen_gvec_dup_i64(vece, vec_full_offset(a->vd), 16, 16, t2);
> +    if (oprsz == 32) {
> +        func(t2, t1,  offsetof(CPULoongArchState, fpr[a->vj].vreg.Q(1)));
> +        tcg_gen_gvec_dup_i64(vece,
> +                             offsetof(CPULoongArchState, fpr[a->vd].vreg.Q(1)),
> +                             16, 16, t2);
> +    }

This would be clearer as a loop:

     for (i = 0; i < oprsz; i += 16) {
         func(t2, t1, i);
         tcg_gen_gvec_dup_i64(vece, vec_full_offset(a->vd) + i, 16, 16, t2);
     }


> +static bool trans_xvrepl128vei_b(DisasContext *ctx, arg_vv_i * a)
>  {
> +    if (!avail_LASX(ctx)) {
>          return false;
>      }
>  
> +    if (!check_vec(ctx, 32)) {
>          return true;
>      }
>  
> +    tcg_gen_gvec_dup_mem(MO_8,
> +                         offsetof(CPULoongArchState, fpr[a->vd].vreg.B(0)),
> +                         offsetof(CPULoongArchState,
> +                                  fpr[a->vj].vreg.B((a->imm))),
> +                         16, 16);
> +    tcg_gen_gvec_dup_mem(MO_8,
> +                         offsetof(CPULoongArchState, fpr[a->vd].vreg.B(16)),
> +                         offsetof(CPULoongArchState,
> +                                  fpr[a->vj].vreg.B((a->imm + 16))),
> +                         16, 16);
> +    return true;
> +}

Again, a loop.  Also, I think you can easily merge all 4 of these functions using VECE.

> +#define XVREPLVE0(NAME, MOP)                                                  \
> +static bool trans_## NAME(DisasContext *ctx, arg_vv * a)                      \
> +{                                                                             \
> +    if (!avail_LASX(ctx)) {                                                   \
> +        return false;                                                         \
> +    }                                                                         \
> +                                                                              \
> +    if (!check_vec(ctx, 32)) {                                                \
> +        return true;                                                          \
> +    }                                                                         \
> +                                                                              \
> +    tcg_gen_gvec_dup_mem(MOP, vec_full_offset(a->vd), vec_full_offset(a->vj), \
> +                         32, 32);                                             \
> +    return true;                                                              \
> +}
>  
> +XVREPLVE0(xvreplve0_b, MO_8)
> +XVREPLVE0(xvreplve0_h, MO_16)
> +XVREPLVE0(xvreplve0_w, MO_32)
> +XVREPLVE0(xvreplve0_d, MO_64)
> +XVREPLVE0(xvreplve0_q, MO_128)

Should use a helper function and TRANS().

> +static bool do_vbsrl_v(DisasContext *ctx, arg_vv_i *a, uint32_t oprsz)
> +{
> +    int ofs, i, max;
> +    TCGv_i64 desthigh[2], destlow[2], high[2], low[2];
> +
> +    if (!check_vec(ctx, 32)) {
> +        return true;
> +    }
> +
> +    max = (oprsz == 16) ? 1 : 2;
> +
> +    for (i = 0; i < max; i++) {
> +        desthigh[i] = tcg_temp_new_i64();
> +        destlow[i] = tcg_temp_new_i64();
> +        high[i] = tcg_temp_new_i64();
> +        low[i] = tcg_temp_new_i64();
> +        get_vreg64(high[i], a->vj, 2 * i + 1);
> +
> +        ofs = ((a->imm) & 0xf) * 8;
> +        if (ofs < 64) {
> +            get_vreg64(low[i], a->vj, 2 * i);
> +            tcg_gen_extract2_i64(destlow[i], low[i], high[i], ofs);
> +            tcg_gen_shri_i64(desthigh[i], high[i], ofs);
> +        } else {
> +            tcg_gen_shri_i64(destlow[i], high[i], ofs - 64);
> +            desthigh[i] = tcg_constant_i64(0);
> +        }
> +        set_vreg64(desthigh[i], a->vd, 2 * i + 1);
> +        set_vreg64(destlow[i], a->vd, 2 * i);
> +    }
>  
>      return true;
>  }

Why are you using arrays?  They don't seem required.
This would seem clearer as

     for (i = 0; i < oprsz / 16; i++) {
         TCGv desthi = tcg_temp_new_i64();
         ...
     }


r~