From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
In RISC-V, vector operations require initial configuration using
the vset{i}vl{i} instruction.
This instruction:
1. Sets the vector length (vl), counted in elements
2. Configures the vtype register, which includes:
   SEW (Selected Element Width)
   LMUL (vector register group multiplier)
   Other vector operation parameters (tail/mask agnostic policies)
This configuration is crucial for defining subsequent vector
operation behavior. To avoid redundant configuration, emission
of vset{i}vl{i} is managed dynamically:
1. Reconfiguration is necessary when the SEW or the vector
   register group width (LMUL) changes.
2. The vset{i}vl{i} instruction is omitted when the configuration
   remains unchanged.
This optimization is only effective within a single TB.
Each TB must reconfigure at its start, because the vl/vtype
state on entry cannot be known at translation time.
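
As a worked illustration, assuming VLEN = 128: a TCG_TYPE_V128
operand of 32-bit elements gives oprsz = 16 bytes,
avl = 16 / 4 = 4 elements, vsew = 2 (e32) and LMUL = m1, so the
first vector op in a TB emits

    vsetivli zero, 4, e32, m1, ta, ma   # vtype = 1<<7 | 1<<6 | 2<<3 | 0 = 0xd0

and subsequent ops that keep the same vtype emit no vset at all.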
Signed-off-by: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
Signed-off-by: Weiwei Li <liwei1518@gmail.com>
Reviewed-by: Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
---
tcg/riscv/tcg-target.c.inc | 104 +++++++++++++++++++++++++++++++++++++
1 file changed, 104 insertions(+)
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 5ef1538aed..49d01b8775 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -119,6 +119,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
#define GET_VREG_SET(vlen) (vlen == 64 ? ALL_QVECTOR_REG_GROUPS : \
(vlen == 128 ? ALL_DVECTOR_REG_GROUPS : \
ALL_VECTOR_REGS))
+#define riscv_vlenb (riscv_vlen / 8)
#define sextreg sextract64
@@ -168,6 +169,18 @@ static bool tcg_target_const_match(int64_t val, int ct,
* RISC-V Base ISA opcodes (IM)
*/
+#define V_OPIVV (0x0 << 12)
+#define V_OPFVV (0x1 << 12)
+#define V_OPMVV (0x2 << 12)
+#define V_OPIVI (0x3 << 12)
+#define V_OPIVX (0x4 << 12)
+#define V_OPFVF (0x5 << 12)
+#define V_OPMVX (0x6 << 12)
+#define V_OPCFG (0x7 << 12)
+
+#define V_SUMOP (0x0 << 20)
+#define V_LUMOP (0x0 << 20)
+
typedef enum {
OPC_ADD = 0x33,
OPC_ADDI = 0x13,
@@ -263,6 +276,11 @@ typedef enum {
/* Zicond: integer conditional operations */
OPC_CZERO_EQZ = 0x0e005033,
OPC_CZERO_NEZ = 0x0e007033,
+
+ /* V: Vector extension 1.0 */
+ OPC_VSETVLI = 0x57 | V_OPCFG,
+ OPC_VSETIVLI = 0xc0000057 | V_OPCFG,
+ OPC_VSETVL = 0x80000057 | V_OPCFG,
} RISCVInsn;
/*
@@ -355,6 +373,35 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg rd, uint32_t imm)
return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm);
}
+typedef enum {
+ VTA_TU = 0,
+ VTA_TA,
+} RISCVVta;
+
+typedef enum {
+ VMA_MU = 0,
+ VMA_MA,
+} RISCVVma;
+
+typedef enum {
+ VLMUL_M1 = 0, /* LMUL=1 */
+ VLMUL_M2, /* LMUL=2 */
+ VLMUL_M4, /* LMUL=4 */
+ VLMUL_M8, /* LMUL=8 */
+ VLMUL_RESERVED,
+ VLMUL_MF8, /* LMUL=1/8 */
+ VLMUL_MF4, /* LMUL=1/4 */
+ VLMUL_MF2, /* LMUL=1/2 */
+} RISCVVlmul;
+#define LMUL_MAX 8
+
+static int32_t encode_vtypei(RISCVVta vta, RISCVVma vma,
+ unsigned vsew, RISCVVlmul vlmul)
+{
+ return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 |
+ (vlmul & 0x7);
+}
+
/*
* RISC-V instruction emitters
*/
@@ -484,6 +531,12 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc,
tcg_out32(s, encode_r(opc, rd, (imm & 0x1f), vs2) | (vm << 25));
}
+static void tcg_out_opc_vec_config(TCGContext *s, RISCVInsn opc,
+ TCGReg rd, uint32_t avl, int32_t vtypei)
+{
+ tcg_out32(s, encode_i(opc, rd, avl, vtypei));
+}
+
/* vm=0 (vm = false) means vector masking ENABLED. */
#define tcg_out_opc_vv(s, opc, vd, vs2, vs1, vm) \
tcg_out_opc_reg_vec(s, opc, vd, vs1, vs2, vm);
@@ -498,12 +551,62 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc,
#define tcg_out_opc_vi(s, opc, vd, vs2, imm, vm) \
tcg_out_opc_reg_vec_i(s, opc, vd, imm, vs2, vm);
+#define tcg_out_opc_vconfig(s, opc, rd, avl, vtypei) \
+ tcg_out_opc_vec_config(s, opc, rd, avl, vtypei);
+
/*
* Only unit-stride addressing implemented; may extend in future.
*/
#define tcg_out_opc_ldst_vec(s, opc, vs3_vd, rs1, vm) \
tcg_out_opc_reg_vec(s, opc, vs3_vd, rs1, 0, vm);
+static void tcg_out_vsetvl(TCGContext *s, uint32_t avl, int vtypei)
+{
+ if (avl < 32) {
+ tcg_out_opc_vconfig(s, OPC_VSETIVLI, TCG_REG_ZERO, avl, vtypei);
+ } else {
+ tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl);
+ tcg_out_opc_vconfig(s, OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0, vtypei);
+ }
+}
+
+/*
+ * TODO: If the vtype value is not supported by the implementation,
+ * then the vill bit is set in vtype, the remaining bits in
+ * vtype are set to zero, and the vl register is also set to zero
+ */
+
+static __thread int prev_vtypei;
+
+#define get_vlmax(vsew) (riscv_vlen / (8 << vsew) * (LMUL_MAX))
+#define get_vec_type_bytes(type) (type >= TCG_TYPE_V64 ? \
+ (8 << (type - TCG_TYPE_V64)) : 0)
+#define calc_vlmul(oprsz) (ctzl(oprsz / riscv_vlenb))
+
+static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
+ unsigned vece)
+{
+ unsigned vsew, oprsz, avl;
+ int vtypei;
+ RISCVVlmul vlmul;
+
+ vsew = vece;
+ oprsz = get_vec_type_bytes(type);
+ avl = oprsz / (1 << vece);
+ vlmul = oprsz > riscv_vlenb ?
+ calc_vlmul(oprsz) : VLMUL_M1;
+ vtypei = encode_vtypei(VTA_TA, VMA_MA, vsew, vlmul);
+
+ tcg_debug_assert(avl <= get_vlmax(vsew));
+ tcg_debug_assert(vlmul <= VLMUL_RESERVED);
+ tcg_debug_assert(vsew <= MO_64);
+
+ if (vtypei != prev_vtypei) {
+ prev_vtypei = vtypei;
+ tcg_out_vsetvl(s, avl, vtypei);
+ }
+}
+
/*
* TCG intrinsics
*/
@@ -2152,6 +2255,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
static void tcg_out_tb_start(TCGContext *s)
{
+ prev_vtypei = -1;
/* nothing to do */
}
--
2.43.0
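
For concreteness, assuming a host with VLEN = 128 (riscv_vlenb = 16)
and 32-bit elements, the selection in tcg_target_set_vec_config above
works out to:

    type            oprsz  avl  vlmul  emitted configuration
    TCG_TYPE_V64      8     2    m1    vsetivli zero, 2, e32, m1, ta, ma
    TCG_TYPE_V128    16     4    m1    vsetivli zero, 4, e32, m1, ta, ma
    TCG_TYPE_V256    32     8    m2    vsetivli zero, 8, e32, m2, ta, ma

where the V256 case uses vlmul = ctzl(32 / 16) = 1, i.e. VLMUL_M2.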
On 8/30/24 16:15, LIU Zhiwei wrote:
> From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
[...]
> +typedef enum {
> +    VTA_TU = 0,
> +    VTA_TA,
> +} RISCVVta;
> +
> +typedef enum {
> +    VMA_MU = 0,
> +    VMA_MA,
> +} RISCVVma;

Do these really need enumerators, or would 'bool' be sufficient?

> +static int32_t encode_vtypei(RISCVVta vta, RISCVVma vma,
> +                             unsigned vsew, RISCVVlmul vlmul)
> +{
> +    return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 |
> +           (vlmul & 0x7);
> +}

s/vtypei/vtype/g?  vtype is only immediate in specific contexts, and
you'll match the manual better if you talk about vtype the CSR rather
than the vset*vli argument.

Assert values in range rather than masking.

Use MemOp vsew, since you're using MO_64, etc.
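
For illustration, those suggestions could land as something like the
following sketch (not the actual follow-up code):

    static int32_t encode_vtype(bool vta, bool vma,
                                MemOp vsew, RISCVVlmul vlmul)
    {
        /* Assert in range rather than silently masking. */
        tcg_debug_assert(vsew <= MO_64);
        tcg_debug_assert(vlmul <= VLMUL_MF2 && vlmul != VLMUL_RESERVED);
        return vma << 7 | vta << 6 | vsew << 3 | vlmul;
    }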
> +#define tcg_out_opc_vconfig(s, opc, rd, avl, vtypei) \
> +    tcg_out_opc_vec_config(s, opc, rd, avl, vtypei);

Why the extra define?

> +/*
> + * TODO: If the vtype value is not supported by the implementation,
> + * then the vill bit is set in vtype, the remaining bits in
> + * vtype are set to zero, and the vl register is also set to zero
> + */

Why is this a TODO?  Are you suggesting that we might need to probe
*all* of the cases at startup?

> +static __thread int prev_vtypei;

I think we should put this into TCGContext.  We don't currently have
any host-specific values there, but there's no reason we can't have any.

> +#define get_vlmax(vsew) (riscv_vlen / (8 << vsew) * (LMUL_MAX))

Given that we know that LMUL_MAX is 8, doesn't this cancel out?

> +#define get_vec_type_bytes(type) (type >= TCG_TYPE_V64 ? \
> +                                  (8 << (type - TCG_TYPE_V64)) : 0)

Again, assert not produce nonsense results.  And this doesn't need
hiding in a macro.

> +#define calc_vlmul(oprsz) (ctzl(oprsz / riscv_vlenb))

I think it's clearer to do this inline, where we can see that
oprsz > vlenb.

> +static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
> +                                      unsigned vece)
> +{
> +    unsigned vsew, oprsz, avl;
> +    int vtypei;
> +    RISCVVlmul vlmul;
> +
> +    vsew = vece;

You can just name the argument vsew...

> +    oprsz = get_vec_type_bytes(type);
> +    avl = oprsz / (1 << vece);
> +    vlmul = oprsz > riscv_vlenb ?
> +            calc_vlmul(oprsz) : VLMUL_M1;

I guess it is always the case that full register operations are
preferred over fractional?

> +    vtypei = encode_vtypei(VTA_TA, VMA_MA, vsew, vlmul);
> +
> +    tcg_debug_assert(avl <= get_vlmax(vsew));
> +    tcg_debug_assert(vlmul <= VLMUL_RESERVED);
> +    tcg_debug_assert(vsew <= MO_64);

These asserts should be moved higher, above their first uses.


r~
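
Taken together, a revised version along those lines might read as the
sketch below (under the review's assumptions; the TCGContext field
name is invented here for illustration):

    static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
                                          MemOp vsew)
    {
        unsigned oprsz, avl;
        RISCVVlmul vlmul;
        int vtype;

        /* Assert up front, before first use, rather than masking. */
        tcg_debug_assert(vsew <= MO_64);
        tcg_debug_assert(type >= TCG_TYPE_V64 && type <= TCG_TYPE_V256);

        oprsz = 8 << (type - TCG_TYPE_V64);
        avl = oprsz >> vsew;

        /* Whole registers preferred; LMUL > 1 only when oprsz > vlenb. */
        vlmul = oprsz <= riscv_vlenb ? VLMUL_M1 : ctzl(oprsz / riscv_vlenb);

        /* get_vlmax with LMUL_MAX == 8 cancels to riscv_vlen >> vsew. */
        tcg_debug_assert(avl <= riscv_vlen >> vsew);

        vtype = encode_vtype(true, true, vsew, vlmul);
        if (vtype != s->riscv_cur_vtype) {  /* hypothetical field */
            s->riscv_cur_vtype = vtype;
            tcg_out_vsetvl(s, avl, vtype);
        }
    }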