[PATCH 02/11] tcg/loongarch64: Lower basic tcg vec ops to LSX

Posted by Jiajie Chen 1 year, 3 months ago
LSX support on the host CPU is detected via hwcap.

Lower the following ops to LSX:

- dup_vec
- dupi_vec
- dupm_vec
- ld_vec
- st_vec

Signed-off-by: Jiajie Chen <c@jia.je>
---
 tcg/loongarch64/tcg-target-con-set.h |   2 +
 tcg/loongarch64/tcg-target-con-str.h |   1 +
 tcg/loongarch64/tcg-target.c.inc     | 223 ++++++++++++++++++++++++++-
 tcg/loongarch64/tcg-target.h         |  37 ++++-
 tcg/loongarch64/tcg-target.opc.h     |  12 ++
 5 files changed, 273 insertions(+), 2 deletions(-)
 create mode 100644 tcg/loongarch64/tcg-target.opc.h

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index c2bde44613..37b3f80bf9 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -17,7 +17,9 @@
 C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
+C_O0_I2(w, r)
 C_O1_I1(r, r)
+C_O1_I1(w, r)
 C_O1_I2(r, r, rC)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
diff --git a/tcg/loongarch64/tcg-target-con-str.h b/tcg/loongarch64/tcg-target-con-str.h
index 6e9ccca3ad..81b8d40278 100644
--- a/tcg/loongarch64/tcg-target-con-str.h
+++ b/tcg/loongarch64/tcg-target-con-str.h
@@ -14,6 +14,7 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
+REGS('w', ALL_VECTOR_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index baf5fc3819..0f9427572c 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -32,6 +32,8 @@
 #include "../tcg-ldst.c.inc"
 #include <asm/hwcap.h>
 
+bool use_lsx_instructions;
+
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "zero",
@@ -65,7 +67,39 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "s5",
     "s6",
     "s7",
-    "s8"
+    "s8",
+    "vr0",
+    "vr1",
+    "vr2",
+    "vr3",
+    "vr4",
+    "vr5",
+    "vr6",
+    "vr7",
+    "vr8",
+    "vr9",
+    "vr10",
+    "vr11",
+    "vr12",
+    "vr13",
+    "vr14",
+    "vr15",
+    "vr16",
+    "vr17",
+    "vr18",
+    "vr19",
+    "vr20",
+    "vr21",
+    "vr22",
+    "vr23",
+    "vr24",
+    "vr25",
+    "vr26",
+    "vr27",
+    "vr28",
+    "vr29",
+    "vr30",
+    "vr31",
 };
 #endif
 
@@ -102,6 +136,15 @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_A2,
     TCG_REG_A1,
     TCG_REG_A0,
+
+    /* Vector registers */
+    TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+    TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+    TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
+    TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+    /* V24 - V31 are caller-saved, and skipped.  */
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -135,6 +178,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 #define TCG_CT_CONST_WSZ   0x2000
 
 #define ALL_GENERAL_REGS   MAKE_64BIT_MASK(0, 32)
+#define ALL_VECTOR_REGS    MAKE_64BIT_MASK(32, 32)
 
 static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
 {
@@ -1486,6 +1530,159 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+                            TCGReg rd, TCGReg rs)
+{
+    switch (vece) {
+    case MO_8:
+        tcg_out_opc_vreplgr2vr_b(s, rd, rs);
+        break;
+    case MO_16:
+        tcg_out_opc_vreplgr2vr_h(s, rd, rs);
+        break;
+    case MO_32:
+        tcg_out_opc_vreplgr2vr_w(s, rd, rs);
+        break;
+    case MO_64:
+        tcg_out_opc_vreplgr2vr_d(s, rd, rs);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg r, TCGReg base, intptr_t offset)
+{
+    /* Handle imm overflow and division (vldrepl.d imm is divided by 8) */
+    if (offset < -0x800 || offset > 0x7ff || \
+        (offset & ((1 << vece) - 1)) != 0) {
+        tcg_out_addi(s, TCG_TYPE_I64, TCG_REG_TMP0, base, offset);
+        base = TCG_REG_TMP0;
+        offset = 0;
+    }
+    offset >>= vece;
+
+    switch (vece) {
+    case MO_8:
+        tcg_out_opc_vldrepl_b(s, r, base, offset);
+        break;
+    case MO_16:
+        tcg_out_opc_vldrepl_h(s, r, base, offset);
+        break;
+    case MO_32:
+        tcg_out_opc_vldrepl_w(s, r, base, offset);
+        break;
+    case MO_64:
+        tcg_out_opc_vldrepl_d(s, r, base, offset);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, int64_t v64)
+{
+    /* Try vldi if imm can fit */
+    if (vece <= MO_32 && (-0x200 <= v64 && v64 <= 0x1FF)) {
+        uint32_t imm = (vece << 10) | ((uint32_t)v64 & 0x3FF);
+        tcg_out_opc_vldi(s, rd, imm);
+        return;
+    }
+
+    /* Fallback to vreplgr2vr */
+    tcg_out_movi(s, type, TCG_REG_TMP0, v64);
+    switch (vece) {
+    case MO_8:
+        tcg_out_opc_vreplgr2vr_b(s, rd, TCG_REG_TMP0);
+        break;
+    case MO_16:
+        tcg_out_opc_vreplgr2vr_h(s, rd, TCG_REG_TMP0);
+        break;
+    case MO_32:
+        tcg_out_opc_vreplgr2vr_w(s, rd, TCG_REG_TMP0);
+        break;
+    case MO_64:
+        tcg_out_opc_vreplgr2vr_d(s, rd, TCG_REG_TMP0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+                           unsigned vecl, unsigned vece,
+                           const TCGArg args[TCG_MAX_OP_ARGS],
+                           const int const_args[TCG_MAX_OP_ARGS])
+{
+    TCGType type = vecl + TCG_TYPE_V64;
+    TCGArg a0, a1, a2;
+    TCGReg base;
+    TCGReg temp = TCG_REG_TMP0;
+    int32_t offset;
+
+    a0 = args[0];
+    a1 = args[1];
+    a2 = args[2];
+
+    /* Currently only supports V128 */
+    tcg_debug_assert(type == TCG_TYPE_V128);
+
+    switch (opc) {
+    case INDEX_op_st_vec:
+        /* Try to fit vst imm */
+        if (-0x800 <= a2 && a2 <= 0x7ff) {
+            base = a1;
+            offset = a2;
+        } else {
+            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
+            base = temp;
+            offset = 0;
+        }
+        tcg_out_opc_vst(s, a0, base, offset);
+        break;
+    case INDEX_op_ld_vec:
+        /* Try to fit vld imm */
+        if (-0x800 <= a2 && a2 <= 0x7ff) {
+            base = a1;
+            offset = a2;
+        } else {
+            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
+            base = temp;
+            offset = 0;
+        }
+        tcg_out_opc_vld(s, a0, base, offset);
+        break;
+    case INDEX_op_dupm_vec:
+        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{
+    switch (opc) {
+    case INDEX_op_ld_vec:
+    case INDEX_op_st_vec:
+    case INDEX_op_dup_vec:
+    case INDEX_op_dupm_vec:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+                       TCGArg a0, ...)
+{
+    g_assert_not_reached();
+}
+
 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 {
     switch (op) {
@@ -1627,6 +1824,14 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_movcond_i64:
         return C_O1_I4(r, rZ, rJ, rZ, rZ);
 
+    case INDEX_op_ld_vec:
+    case INDEX_op_dupm_vec:
+    case INDEX_op_dup_vec:
+        return C_O1_I1(w, r);
+
+    case INDEX_op_st_vec:
+        return C_O0_I2(w, r);
+
     default:
         g_assert_not_reached();
     }
@@ -1708,6 +1913,10 @@ static void tcg_target_init(TCGContext *s)
         exit(EXIT_FAILURE);
     }
 
+    if (hwcap & HWCAP_LOONGARCH_LSX) {
+        use_lsx_instructions = 1;
+    }
+
     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
     tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
 
@@ -1723,6 +1932,18 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S8);
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S9);
 
+    if (use_lsx_instructions) {
+        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V24);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V25);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V26);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V27);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V28);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V29);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V30);
+        tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V31);
+    }
+
     s->reserved_regs = 0;
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_ZERO);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index 26f1aab780..be9343ded9 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -30,7 +30,7 @@
 #define LOONGARCH_TCG_TARGET_H
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
-#define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_NB_REGS 64
 
 #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 
@@ -68,6 +68,15 @@ typedef enum {
     TCG_REG_S7,
     TCG_REG_S8,
 
+    TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+    TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+    TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
+    TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+    TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+    TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
+
     /* aliases */
     TCG_AREG0    = TCG_REG_S0,
     TCG_REG_TMP0 = TCG_REG_T8,
@@ -75,6 +84,8 @@ typedef enum {
     TCG_REG_TMP2 = TCG_REG_T6,
 } TCGReg;
 
+extern bool use_lsx_instructions;
+
 /* used for function call generation */
 #define TCG_REG_CALL_STACK              TCG_REG_SP
 #define TCG_TARGET_STACK_ALIGN          16
@@ -159,6 +170,30 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i64        1
 #define TCG_TARGET_HAS_qemu_ldst_i128   0
 
+#define TCG_TARGET_HAS_v64              0
+#define TCG_TARGET_HAS_v128             use_lsx_instructions
+#define TCG_TARGET_HAS_v256             0
+
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_abs_vec          0
+#define TCG_TARGET_HAS_andc_vec         0
+#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
+#define TCG_TARGET_HAS_mul_vec          0
+#define TCG_TARGET_HAS_shi_vec          0
+#define TCG_TARGET_HAS_shs_vec          0
+#define TCG_TARGET_HAS_shv_vec          0
+#define TCG_TARGET_HAS_roti_vec         0
+#define TCG_TARGET_HAS_rots_vec         0
+#define TCG_TARGET_HAS_rotv_vec         0
+#define TCG_TARGET_HAS_sat_vec          0
+#define TCG_TARGET_HAS_minmax_vec       0
+#define TCG_TARGET_HAS_bitsel_vec       0
+#define TCG_TARGET_HAS_cmpsel_vec       0
+
 #define TCG_TARGET_DEFAULT_MO (0)
 
 #define TCG_TARGET_NEED_LDST_LABELS
diff --git a/tcg/loongarch64/tcg-target.opc.h b/tcg/loongarch64/tcg-target.opc.h
new file mode 100644
index 0000000000..fd1a40b7fd
--- /dev/null
+++ b/tcg/loongarch64/tcg-target.opc.h
@@ -0,0 +1,12 @@
+/*
+ * Copyright (c) 2023 Jiajie Chen
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.
+ *
+ * See the COPYING file in the top-level directory for details.
+ *
+ * Target-specific opcodes for host vector expansion.  These will be
+ * emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
+ * consider these to be UNSPEC with names.
+ */
-- 
2.42.0
Re: [PATCH 02/11] tcg/loongarch64: Lower basic tcg vec ops to LSX
Posted by Richard Henderson 1 year, 3 months ago
On 8/28/23 08:19, Jiajie Chen wrote:
> +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
> +                             TCGReg rd, int64_t v64)
> +{
> +    /* Try vldi if imm can fit */
> +    if (vece <= MO_32 && (-0x200 <= v64 && v64 <= 0x1FF)) {
> +        uint32_t imm = (vece << 10) | ((uint32_t)v64 & 0x3FF);
> +        tcg_out_opc_vldi(s, rd, imm);
> +        return;
> +    }

v64 has the value replicated across 64 bits.
In order to do the comparison above, you'll want

     int64_t vale = sextract64(v64, 0, 8 << vece);
     if (-0x200 <= vale && vale <= 0x1ff)
         ...

Since the only documentation for LSX is qemu's own translator code, why are you testing 
vece <= MO_32?  MO_64 should be available as well?  Or is there a bug in trans_vldi()?

It might be nice to leave a to-do for vldi imm bit 12 set, for the patterns expanded by 
vldi_get_value().  In particular, mode == 9 is likely to be useful, and modes {1,2,3,5} 
are easy to test for.
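
For example, the mode 9 test could look something like this (untested sketch;
this assumes vldi_get_value()'s mode 9 expands each of the eight data bits to
a full 0x00/0xff byte, and the helper name is invented here):

    /* Sketch: match the vldi "mode 9" pattern, where each of the 8
       data bits becomes a 0x00 or 0xff byte of the 64-bit value.  */
    static bool vldi_mode9_imm(int64_t v64, uint32_t *imm)
    {
        uint8_t bits = 0;

        for (int i = 0; i < 8; i++) {
            uint8_t byte = v64 >> (i * 8);
            if (byte == 0xff) {
                bits |= 1u << i;
            } else if (byte != 0) {
                return false;
            }
        }
        /* Bit 12 selects the pattern modes; mode in [11:8], data in [7:0]. */
        *imm = (1 << 12) | (9 << 8) | bits;
        return true;
    }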


> +
> +    /* Fallback to vreplgr2vr */
> +    tcg_out_movi(s, type, TCG_REG_TMP0, v64);

type is a vector type; you can't use it here.
Correct would be TCG_TYPE_I64.

Better to load vale instead, since that will take fewer insns in tcg_out_movi.
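
With both of those fixed, the function might look like this (untested sketch):

    static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                                 TCGReg rd, int64_t v64)
    {
        /* Compare only one element's worth of bits, sign-extended.  */
        int64_t vale = sextract64(v64, 0, 8 << vece);

        if (-0x200 <= vale && vale <= 0x1ff) {
            /* With the element test above, MO_64 is usable here too.  */
            tcg_out_opc_vldi(s, rd, (vece << 10) | ((uint32_t)vale & 0x3ff));
            return;
        }

        /* Load vale, not v64: fewer insns in tcg_out_movi.  */
        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, vale);
        switch (vece) {
        case MO_8:
            tcg_out_opc_vreplgr2vr_b(s, rd, TCG_REG_TMP0);
            break;
        case MO_16:
            tcg_out_opc_vreplgr2vr_h(s, rd, TCG_REG_TMP0);
            break;
        case MO_32:
            tcg_out_opc_vreplgr2vr_w(s, rd, TCG_REG_TMP0);
            break;
        case MO_64:
            tcg_out_opc_vreplgr2vr_d(s, rd, TCG_REG_TMP0);
            break;
        default:
            g_assert_not_reached();
        }
    }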


> +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
> +                           unsigned vecl, unsigned vece,
> +                           const TCGArg args[TCG_MAX_OP_ARGS],
> +                           const int const_args[TCG_MAX_OP_ARGS])
> +{
> +    TCGType type = vecl + TCG_TYPE_V64;
> +    TCGArg a0, a1, a2;
> +    TCGReg base;
> +    TCGReg temp = TCG_REG_TMP0;
> +    int32_t offset;
> +
> +    a0 = args[0];
> +    a1 = args[1];
> +    a2 = args[2];
> +
> +    /* Currently only supports V128 */
> +    tcg_debug_assert(type == TCG_TYPE_V128);
> +
> +    switch (opc) {
> +    case INDEX_op_st_vec:
> +        /* Try to fit vst imm */
> +        if (-0x800 <= a2 && a2 <= 0x7ff) {
> +            base = a1;
> +            offset = a2;
> +        } else {
> +            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
> +            base = temp;
> +            offset = 0;
> +        }
> +        tcg_out_opc_vst(s, a0, base, offset);
> +        break;
> +    case INDEX_op_ld_vec:
> +        /* Try to fit vld imm */
> +        if (-0x800 <= a2 && a2 <= 0x7ff) {
> +            base = a1;
> +            offset = a2;
> +        } else {
> +            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
> +            base = temp;
> +            offset = 0;
> +        }
> +        tcg_out_opc_vld(s, a0, base, offset);

tcg_out_addi has a hole in bits [15:12], and can take an extra insn if those bits are set.
Better to load the offset with tcg_out_movi and then use VLDX/VSTX instead of VLD/VST.
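
I.e. something like (untested; assumes vldx/vstx emitters are generated
alongside the other LSX insns):

    case INDEX_op_ld_vec:
        if (-0x800 <= a2 && a2 <= 0x7ff) {
            tcg_out_opc_vld(s, a0, a1, a2);
        } else {
            /* Materialize the full offset and use the reg+reg form,
               avoiding the [15:12] hole in tcg_out_addi.  */
            tcg_out_movi(s, TCG_TYPE_I64, temp, a2);
            tcg_out_opc_vldx(s, a0, a1, temp);
        }
        break;
    /* INDEX_op_st_vec is the same with vst/vstx.  */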

> @@ -159,6 +170,30 @@ typedef enum {
>   #define TCG_TARGET_HAS_mulsh_i64        1
>   #define TCG_TARGET_HAS_qemu_ldst_i128   0
>   
> +#define TCG_TARGET_HAS_v64              0
> +#define TCG_TARGET_HAS_v128             use_lsx_instructions
> +#define TCG_TARGET_HAS_v256             0

Perhaps reserve for a follow-up, but TCG_TARGET_HAS_v64 can easily be supported using the 
same instructions.

The only difference is load/store, where you could use FLD.D/FST.D to load the lower 
64-bits of the fp/vector register, or VLDREPL.D to load and initialize all bits and 
VSTELM.D to store the lower 64-bits.

I tend to think the float insns are more flexible, having a larger displacement, and the 
availability of FLDX/FSTX as well.


r~
Re: [PATCH 02/11] tcg/loongarch64: Lower basic tcg vec ops to LSX
Posted by Jiajie Chen 1 year, 3 months ago
There seems to be some problem with the email server, so I am trying another
email address to send this reply.


On 2023/8/29 00:57, Richard Henderson wrote:
> On 8/28/23 08:19, Jiajie Chen wrote:
>> +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned 
>> vece,
>> +                             TCGReg rd, int64_t v64)
>> +{
>> +    /* Try vldi if imm can fit */
>> +    if (vece <= MO_32 && (-0x200 <= v64 && v64 <= 0x1FF)) {
>> +        uint32_t imm = (vece << 10) | ((uint32_t)v64 & 0x3FF);
>> +        tcg_out_opc_vldi(s, rd, imm);
>> +        return;
>> +    }
>
> v64 has the value replicated across 64 bits.
> In order to do the comparison above, you'll want
>
>     int64_t vale = sextract64(v64, 0, 8 << vece);
>     if (-0x200 <= vale && vale <= 0x1ff)
>         ...
>
> Since the only documentation for LSX is qemu's own translator code, 
> why are you testing vece <= MO_32?  MO_64 should be available as 
> well?  Or is there a bug in trans_vldi()?


Sorry, my mistake. I was mixing up MO_64 with bit 12 in the vldi imm.


>
> It might be nice to leave a to-do for vldi imm bit 12 set, for the 
> patterns expanded by vldi_get_value().  In particular, mode == 9 is 
> likely to be useful, and modes {1,2,3,5} are easy to test for.
>

Sure, I was thinking about the complexity of pattern matching on those 
modes, and decided to skip the hard part in the first patch series.


>
>> +
>> +    /* Fallback to vreplgr2vr */
>> +    tcg_out_movi(s, type, TCG_REG_TMP0, v64);
>
> type is a vector type; you can't use it here.
> Correct would be TCG_TYPE_I64.
>
> Better to load vale instead, since that will take fewer insns in 
> tcg_out_movi.


Sure.


>
>
>> +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>> +                           unsigned vecl, unsigned vece,
>> +                           const TCGArg args[TCG_MAX_OP_ARGS],
>> +                           const int const_args[TCG_MAX_OP_ARGS])
>> +{
>> +    TCGType type = vecl + TCG_TYPE_V64;
>> +    TCGArg a0, a1, a2;
>> +    TCGReg base;
>> +    TCGReg temp = TCG_REG_TMP0;
>> +    int32_t offset;
>> +
>> +    a0 = args[0];
>> +    a1 = args[1];
>> +    a2 = args[2];
>> +
>> +    /* Currently only supports V128 */
>> +    tcg_debug_assert(type == TCG_TYPE_V128);
>> +
>> +    switch (opc) {
>> +    case INDEX_op_st_vec:
>> +        /* Try to fit vst imm */
>> +        if (-0x800 <= a2 && a2 <= 0x7ff) {
>> +            base = a1;
>> +            offset = a2;
>> +        } else {
>> +            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
>> +            base = temp;
>> +            offset = 0;
>> +        }
>> +        tcg_out_opc_vst(s, a0, base, offset);
>> +        break;
>> +    case INDEX_op_ld_vec:
>> +        /* Try to fit vld imm */
>> +        if (-0x800 <= a2 && a2 <= 0x7ff) {
>> +            base = a1;
>> +            offset = a2;
>> +        } else {
>> +            tcg_out_addi(s, TCG_TYPE_I64, temp, a1, a2);
>> +            base = temp;
>> +            offset = 0;
>> +        }
>> +        tcg_out_opc_vld(s, a0, base, offset);
>
> tcg_out_addi has a hole in bits [15:12], and can take an extra insn if 
> those bits are set.  Better to load the offset with tcg_out_movi and 
> then use VLDX/VSTX instead of VLD/VST.


Sure.


>
>> @@ -159,6 +170,30 @@ typedef enum {
>>   #define TCG_TARGET_HAS_mulsh_i64        1
>>   #define TCG_TARGET_HAS_qemu_ldst_i128   0
>>   +#define TCG_TARGET_HAS_v64              0
>> +#define TCG_TARGET_HAS_v128             use_lsx_instructions
>> +#define TCG_TARGET_HAS_v256             0
>
> Perhaps reserve for a follow-up, but TCG_TARGET_HAS_v64 can easily be 
> supported using the same instructions.
>
> The only difference is load/store, where you could use FLD.D/FST.D to 
> load the lower 64-bits of the fp/vector register, or VLDREPL.D to load 
> and initialize all bits and VSTELM.D to store the lower 64-bits.
>
> I tend to think the float insns are more flexible, having a larger 
> displacement, and the availability of FLDX/FSTX as well.


Sure.


>
>
> r~