[Qemu-devel] [PATCH v11 02/20] tcg: Add types and basic operations for host vectors

Posted by Richard Henderson 8 years ago
Nothing uses or enables them yet.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 Makefile.target  |   4 +-
 tcg/tcg-op.h     |  27 +++++
 tcg/tcg-opc.h    |  25 +++++
 tcg/tcg.h        |  56 +++++++++++
 tcg/tcg-op-vec.c | 292 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c        |  96 +++++++++++++++++-
 tcg/README       |  49 ++++++++++
 7 files changed, 543 insertions(+), 6 deletions(-)
 create mode 100644 tcg/tcg-op-vec.c

diff --git a/Makefile.target b/Makefile.target
index f9a9da7e7c..7f30a1e725 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -93,8 +93,8 @@ all: $(PROGS) stap
 # cpu emulator library
 obj-y += exec.o
 obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
-obj-$(CONFIG_TCG) += tcg/tcg-common.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o
+obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index ca07b32b65..0c02d86b8b 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -35,6 +35,10 @@ void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
 void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
 void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
 
+void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
+void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
+void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
+
 static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
 {
     tcg_gen_op1(opc, tcgv_i32_arg(a1));
@@ -903,6 +907,27 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
+
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -1001,6 +1026,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64
 #define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64
 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64
+#define tcg_gen_dup_tl_vec  tcg_gen_dup_i64_vec
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -1098,6 +1124,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32
 #define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32
 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32
+#define tcg_gen_dup_tl_vec  tcg_gen_dup_i32_vec
 #endif
 
 #if UINTPTR_MAX == UINT32_MAX
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 956fb1e9f3..b851ad4bca 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -204,8 +204,33 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
+/* Host vector support.  */
+
+#define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
+
+DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
+DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
+
+DEF(dup_vec, 1, 1, 0, IMPLVEC)
+DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
+
+DEF(ld_vec, 1, 1, 1, IMPLVEC)
+DEF(st_vec, 0, 2, 1, IMPLVEC)
+
+DEF(add_vec, 1, 2, 0, IMPLVEC)
+DEF(sub_vec, 1, 2, 0, IMPLVEC)
+DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+
+DEF(and_vec, 1, 2, 0, IMPLVEC)
+DEF(or_vec, 1, 2, 0, IMPLVEC)
+DEF(xor_vec, 1, 2, 0, IMPLVEC)
+DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
+DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
 #undef IMPL64
+#undef IMPLVEC
 #undef DEF
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 2ce497cebf..dce483b0ee 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -170,6 +170,27 @@ typedef uint64_t TCGRegSet;
 # error "Missing unsigned widening multiply"
 #endif
 
+#if !defined(TCG_TARGET_HAS_v64) \
+    && !defined(TCG_TARGET_HAS_v128) \
+    && !defined(TCG_TARGET_HAS_v256)
+#define TCG_TARGET_MAYBE_vec            0
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_andc_vec         0
+#define TCG_TARGET_HAS_orc_vec          0
+#else
+#define TCG_TARGET_MAYBE_vec            1
+#endif
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64              0
+#endif
+#ifndef TCG_TARGET_HAS_v128
+#define TCG_TARGET_HAS_v128             0
+#endif
+#ifndef TCG_TARGET_HAS_v256
+#define TCG_TARGET_HAS_v256             0
+#endif
+
 #ifndef TARGET_INSN_START_EXTRA_WORDS
 # define TARGET_INSN_START_WORDS 1
 #else
@@ -246,6 +267,11 @@ typedef struct TCGPool {
 typedef enum TCGType {
     TCG_TYPE_I32,
     TCG_TYPE_I64,
+
+    TCG_TYPE_V64,
+    TCG_TYPE_V128,
+    TCG_TYPE_V256,
+
     TCG_TYPE_COUNT, /* number of different types */
 
     /* An alias for the size of the host register.  */
@@ -396,6 +422,8 @@ typedef tcg_target_ulong TCGArg;
     * TCGv_i32 : 32 bit integer type
     * TCGv_i64 : 64 bit integer type
     * TCGv_ptr : a host pointer type
+    * TCGv_vec : a host vector type; the exact size is not exposed
+                 to the CPU front-end code.
     * TCGv : an integer type the same size as target_ulong
              (an alias for either TCGv_i32 or TCGv_i64)
    The compiler's type checking will complain if you mix them
@@ -418,6 +446,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_vec_d *TCGv_vec;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -589,6 +618,9 @@ typedef struct TCGOp {
 #define TCGOP_CALLI(X)    (X)->param1
 #define TCGOP_CALLO(X)    (X)->param2
 
+#define TCGOP_VECL(X)     (X)->param1
+#define TCGOP_VECE(X)     (X)->param2
+
 /* Make sure operands fit in the bitfields above.  */
 QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
 
@@ -726,6 +758,11 @@ static inline TCGTemp *tcgv_ptr_temp(TCGv_ptr v)
     return tcgv_i32_temp((TCGv_i32)v);
 }
 
+static inline TCGTemp *tcgv_vec_temp(TCGv_vec v)
+{
+    return tcgv_i32_temp((TCGv_i32)v);
+}
+
 static inline TCGArg tcgv_i32_arg(TCGv_i32 v)
 {
     return temp_arg(tcgv_i32_temp(v));
@@ -741,6 +778,11 @@ static inline TCGArg tcgv_ptr_arg(TCGv_ptr v)
     return temp_arg(tcgv_ptr_temp(v));
 }
 
+static inline TCGArg tcgv_vec_arg(TCGv_vec v)
+{
+    return temp_arg(tcgv_vec_temp(v));
+}
+
 static inline TCGv_i32 temp_tcgv_i32(TCGTemp *t)
 {
     (void)temp_idx(t); /* trigger embedded assert */
@@ -757,6 +799,11 @@ static inline TCGv_ptr temp_tcgv_ptr(TCGTemp *t)
     return (TCGv_ptr)temp_tcgv_i32(t);
 }
 
+static inline TCGv_vec temp_tcgv_vec(TCGTemp *t)
+{
+    return (TCGv_vec)temp_tcgv_i32(t);
+}
+
 #if TCG_TARGET_REG_BITS == 32
 static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
 {
@@ -832,9 +879,12 @@ TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
                                               const char *name)
@@ -916,6 +966,8 @@ enum {
     /* Instruction is optional and not implemented by the host, or insn
        is generic and should not be implemened by the host.  */
     TCG_OPF_NOT_PRESENT  = 0x10,
+    /* Instruction operands are vectors.  */
+    TCG_OPF_VECTOR       = 0x20,
 };
 
 typedef struct TCGOpDef {
@@ -981,6 +1033,10 @@ TCGv_i32 tcg_const_i32(int32_t val);
 TCGv_i64 tcg_const_i64(int64_t val);
 TCGv_i32 tcg_const_local_i32(int32_t val);
 TCGv_i64 tcg_const_local_i64(int64_t val);
+TCGv_vec tcg_const_zeros_vec(TCGType);
+TCGv_vec tcg_const_ones_vec(TCGType);
+TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
+TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
 
 TCGLabel *gen_new_label(void);
 
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
new file mode 100644
index 0000000000..9e4678878b
--- /dev/null
+++ b/tcg/tcg-op-vec.c
@@ -0,0 +1,292 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2018 Linaro, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "tcg.h"
+#include "tcg-op.h"
+#include "tcg-mo.h"
+
+/* Reduce the number of ifdefs below.  This assumes that all uses of
+   TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+   the compiler can eliminate.  */
+#if TCG_TARGET_REG_BITS == 64
+extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
+extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
+#define TCGV_LOW  TCGV_LOW_link_error
+#define TCGV_HIGH TCGV_HIGH_link_error
+#endif
+
+void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a)
+{
+    TCGOp *op = tcg_emit_op(opc);
+    TCGOP_VECL(op) = type - TCG_TYPE_V64;
+    TCGOP_VECE(op) = vece;
+    op->args[0] = r;
+    op->args[1] = a;
+}
+
+void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece,
+               TCGArg r, TCGArg a, TCGArg b)
+{
+    TCGOp *op = tcg_emit_op(opc);
+    TCGOP_VECL(op) = type - TCG_TYPE_V64;
+    TCGOP_VECE(op) = vece;
+    op->args[0] = r;
+    op->args[1] = a;
+    op->args[2] = b;
+}
+
+void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece,
+               TCGArg r, TCGArg a, TCGArg b, TCGArg c)
+{
+    TCGOp *op = tcg_emit_op(opc);
+    TCGOP_VECL(op) = type - TCG_TYPE_V64;
+    TCGOP_VECE(op) = vece;
+    op->args[0] = r;
+    op->args[1] = a;
+    op->args[2] = b;
+    op->args[3] = c;
+}
+
+static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+    TCGTemp *rt = tcgv_vec_temp(r);
+    TCGTemp *at = tcgv_vec_temp(a);
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
+}
+
+static void vec_gen_op3(TCGOpcode opc, unsigned vece,
+                        TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    TCGTemp *rt = tcgv_vec_temp(r);
+    TCGTemp *at = tcgv_vec_temp(a);
+    TCGTemp *bt = tcgv_vec_temp(b);
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    tcg_debug_assert(bt->base_type == type);
+    vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
+}
+
+void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (r != a) {
+        vec_gen_op2(INDEX_op_mov_vec, 0, r, a);
+    }
+}
+
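+/* The element size matching the host register width; constants
+   narrower than this are replicated up to it by the expanders below.  */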
+#define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
+
+static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+{
+    TCGTemp *rt = tcgv_vec_temp(r);
+    vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
+}
+
+TCGv_vec tcg_const_zeros_vec(TCGType type)
+{
+    TCGv_vec ret = tcg_temp_new_vec(type);
+    tcg_gen_dupi_vec(ret, MO_REG, 0);
+    return ret;
+}
+
+TCGv_vec tcg_const_ones_vec(TCGType type)
+{
+    TCGv_vec ret = tcg_temp_new_vec(type);
+    tcg_gen_dupi_vec(ret, MO_REG, -1);
+    return ret;
+}
+
+TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec m)
+{
+    TCGTemp *t = tcgv_vec_temp(m);
+    return tcg_const_zeros_vec(t->base_type);
+}
+
+TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
+{
+    TCGTemp *t = tcgv_vec_temp(m);
+    return tcg_const_ones_vec(t->base_type);
+}
+
+void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
+{
+    if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
+        tcg_gen_dupi_vec(r, MO_32, a);
+    } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
+        tcg_gen_dupi_vec(r, MO_64, a);
+    } else {
+        TCGv_i64 c = tcg_const_i64(a);
+        tcg_gen_dup_i64_vec(MO_64, r, c);
+        tcg_temp_free_i64(c);
+    }
+}
+
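+/* For the narrower duplicates, ((TCGArg)-1 / M) is a word with a 1 in
+   each element position (e.g. 0x01010101... for M == 0xff), so the
+   multiply below replicates the element across the host register.  */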
+void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
+{
+    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffffffffu) * a);
+}
+
+void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
+{
+    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffff) * (a & 0xffff));
+}
+
+void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
+{
+    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xff) * (a & 0xff));
+}
+
+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
+{
+    TCGArg ri = tcgv_vec_arg(r);
+    TCGTemp *rt = arg_temp(ri);
+    TCGType type = rt->base_type;
+
+    if (TCG_TARGET_REG_BITS == 64) {
+        TCGArg ai = tcgv_i64_arg(a);
+        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+    } else if (vece == MO_64) {
+        TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
+        TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
+        vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
+    } else {
+        TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
+        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+    }
+}
+
+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a)
+{
+    TCGArg ri = tcgv_vec_arg(r);
+    TCGArg ai = tcgv_i32_arg(a);
+    TCGTemp *rt = arg_temp(ri);
+    TCGType type = rt->base_type;
+
+    vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
+}
+
+static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    TCGArg ri = tcgv_vec_arg(r);
+    TCGArg bi = tcgv_ptr_arg(b);
+    TCGTemp *rt = arg_temp(ri);
+    TCGType type = rt->base_type;
+
+    vec_gen_3(opc, type, 0, ri, bi, o);
+}
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    vec_gen_ldst(INDEX_op_ld_vec, r, b, o);
+}
+
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    vec_gen_ldst(INDEX_op_st_vec, r, b, o);
+}
+
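+/* Store only the low part of the vector, as selected by LOW_TYPE,
+   which must be a vector type no wider than that of R itself.  */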
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType low_type)
+{
+    TCGArg ri = tcgv_vec_arg(r);
+    TCGArg bi = tcgv_ptr_arg(b);
+    TCGTemp *rt = arg_temp(ri);
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(low_type >= TCG_TYPE_V64);
+    tcg_debug_assert(low_type <= type);
+    vec_gen_3(INDEX_op_st_vec, low_type, 0, ri, bi, o);
+}
+
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    vec_gen_op3(INDEX_op_add_vec, vece, r, a, b);
+}
+
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    vec_gen_op3(INDEX_op_sub_vec, vece, r, a, b);
+}
+
+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    vec_gen_op3(INDEX_op_and_vec, 0, r, a, b);
+}
+
+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    vec_gen_op3(INDEX_op_or_vec, 0, r, a, b);
+}
+
+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    vec_gen_op3(INDEX_op_xor_vec, 0, r, a, b);
+}
+
+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_andc_vec) {
+        vec_gen_op3(INDEX_op_andc_vec, 0, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(0, t, b);
+        tcg_gen_and_vec(0, r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_orc_vec) {
+        vec_gen_op3(INDEX_op_orc_vec, 0, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(0, t, b);
+        tcg_gen_or_vec(0, r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_not_vec) {
+        vec_gen_op2(INDEX_op_not_vec, 0, r, a);
+    } else {
+        TCGv_vec t = tcg_const_ones_vec_matching(r);
+        tcg_gen_xor_vec(0, r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        vec_gen_op2(INDEX_op_neg_vec, vece, r, a);
+    } else {
+        TCGv_vec t = tcg_const_zeros_vec_matching(r);
+        tcg_gen_sub_vec(vece, r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 93caa0be93..42f0acdf8e 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -106,6 +106,18 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
                          TCGReg ret, tcg_target_long arg);
 static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                        const int *const_args);
+#if TCG_TARGET_MAYBE_vec
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
+                           unsigned vece, const TCGArg *args,
+                           const int *const_args);
+#else
+static inline void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
+                                  unsigned vece, const TCGArg *args,
+                                  const int *const_args)
+{
+    g_assert_not_reached();
+}
+#endif
 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
                        intptr_t arg2);
 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -146,8 +158,7 @@ struct tcg_region_state {
 };
 
 static struct tcg_region_state region;
-
-static TCGRegSet tcg_target_available_regs[2];
+static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
 static TCGRegSet tcg_target_call_clobber_regs;
 
 #if TCG_TARGET_INSN_UNIT_SIZE == 1
@@ -1026,6 +1037,41 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
     return temp_tcgv_i64(t);
 }
 
+TCGv_vec tcg_temp_new_vec(TCGType type)
+{
+    TCGTemp *t;
+
+#ifdef CONFIG_DEBUG_TCG
+    switch (type) {
+    case TCG_TYPE_V64:
+        assert(TCG_TARGET_HAS_v64);
+        break;
+    case TCG_TYPE_V128:
+        assert(TCG_TARGET_HAS_v128);
+        break;
+    case TCG_TYPE_V256:
+        assert(TCG_TARGET_HAS_v256);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+#endif
+
+    t = tcg_temp_new_internal(type, 0);
+    return temp_tcgv_vec(t);
+}
+
+/* Create a new temp of the same type as an existing temp.  */
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
+{
+    TCGTemp *t = tcgv_vec_temp(match);
+
+    tcg_debug_assert(t->temp_allocated != 0);
+
+    t = tcg_temp_new_internal(t->base_type, 0);
+    return temp_tcgv_vec(t);
+}
+
 static void tcg_temp_free_internal(TCGTemp *ts)
 {
     TCGContext *s = tcg_ctx;
@@ -1057,6 +1103,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
     tcg_temp_free_internal(tcgv_i64_temp(arg));
 }
 
+void tcg_temp_free_vec(TCGv_vec arg)
+{
+    tcg_temp_free_internal(tcgv_vec_temp(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
@@ -1114,6 +1165,9 @@ int tcg_check_temp_count(void)
    Test the runtime variable that controls each opcode.  */
 bool tcg_op_supported(TCGOpcode op)
 {
+    const bool have_vec
+        = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
+
     switch (op) {
     case INDEX_op_discard:
     case INDEX_op_set_label:
@@ -1327,6 +1381,28 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_mulsh_i64:
         return TCG_TARGET_HAS_mulsh_i64;
 
+    case INDEX_op_mov_vec:
+    case INDEX_op_dup_vec:
+    case INDEX_op_dupi_vec:
+    case INDEX_op_ld_vec:
+    case INDEX_op_st_vec:
+    case INDEX_op_add_vec:
+    case INDEX_op_sub_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+        return have_vec;
+    case INDEX_op_dup2_vec:
+        return have_vec && TCG_TARGET_REG_BITS == 32;
+    case INDEX_op_not_vec:
+        return have_vec && TCG_TARGET_HAS_not_vec;
+    case INDEX_op_neg_vec:
+        return have_vec && TCG_TARGET_HAS_neg_vec;
+    case INDEX_op_andc_vec:
+        return have_vec && TCG_TARGET_HAS_andc_vec;
+    case INDEX_op_orc_vec:
+        return have_vec && TCG_TARGET_HAS_orc_vec;
+
     case NB_OPS:
         break;
     }
@@ -1661,6 +1737,11 @@ void tcg_dump_ops(TCGContext *s)
             nb_iargs = def->nb_iargs;
             nb_cargs = def->nb_cargs;
 
+            if (def->flags & TCG_OPF_VECTOR) {
+                col += qemu_log("v%d,e%d,", 64 << TCGOP_VECL(op),
+                                8 << TCGOP_VECE(op));
+            }
+
             k = 0;
             for (i = 0; i < nb_oargs; i++) {
                 if (k != 0) {
@@ -2890,8 +2971,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
     }
 
     /* emit instruction */
-    tcg_out_op(s, op->opc, new_args, const_args);
-    
+    if (def->flags & TCG_OPF_VECTOR) {
+        tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op),
+                       new_args, const_args);
+    } else {
+        tcg_out_op(s, op->opc, new_args, const_args);
+    }
+
     /* move the outputs in the correct register if needed */
     for(i = 0; i < nb_oargs; i++) {
         ts = arg_temp(op->args[i]);
@@ -3239,10 +3325,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         switch (opc) {
         case INDEX_op_mov_i32:
         case INDEX_op_mov_i64:
+        case INDEX_op_mov_vec:
             tcg_reg_alloc_mov(s, op);
             break;
         case INDEX_op_movi_i32:
         case INDEX_op_movi_i64:
+        case INDEX_op_dupi_vec:
             tcg_reg_alloc_movi(s, op);
             break;
         case INDEX_op_insn_start:
diff --git a/tcg/README b/tcg/README
index 03bfb6acd4..f4695307bd 100644
--- a/tcg/README
+++ b/tcg/README
@@ -503,6 +503,55 @@ of the memory access.
 For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
 64-bit memory access specified in flags.
 
+********* Host vector operations
+
+All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE.
+The former specifies the length of the vector in log2 64-bit units; the
+latter specifies the length of the element (if applicable) in log2 8-bit units.
+E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.
+
+* mov_vec   v0, v1
+* ld_vec    v0, t1
+* st_vec    v0, t1
+
+  Move, load and store.
+
+* dup_vec  v0, r1
+
+  Duplicate the low N bits of R1 into VECL/VECE copies across V0.
+
+* dupi_vec v0, c
+
+  Similarly, for a constant.
+  Smaller values will be replicated to host register size by the expanders.
+
+* dup2_vec v0, r1, r2
+
+  Duplicate r2:r1 into VECL/64 copies across V0.  This opcode is
+  only present for 32-bit hosts.
+
+* add_vec   v0, v1, v2
+
+  v0 = v1 + v2, in elements across the vector.
+
+* sub_vec   v0, v1, v2
+
+  Similarly, v0 = v1 - v2.
+
+* neg_vec   v0, v1
+
+  Similarly, v0 = -v1.
+
+* and_vec   v0, v1, v2
+* or_vec    v0, v1, v2
+* xor_vec   v0, v1, v2
+* andc_vec  v0, v1, v2
+* orc_vec   v0, v1, v2
+* not_vec   v0, v1
+
+  Similarly, logical operations with and without complement.
+  Note that VECE is unused.
+
 *********
 
 Note 1: Some shortcuts are defined when the last operand is known to be
-- 
2.14.3

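As a minimal sketch of how a front end might drive this interface once a
backend enables it (the function name and the guest register offsets are
assumptions for illustration only, not part of this patch):

    /* Sketch: expand a guest "add 16 x i8" instruction, assuming the
     * guest's 128-bit registers live in CPUArchState at the given
     * offsets from env.  */
    static void gen_gvec_add8(TCGv_ptr env, int dofs, int aofs, int bofs)
    {
        TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
        TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);

        tcg_gen_ld_vec(t0, env, aofs);      /* load first source */
        tcg_gen_ld_vec(t1, env, bofs);      /* load second source */
        tcg_gen_add_vec(MO_8, t0, t0, t1);  /* element-wise byte add */
        tcg_gen_st_vec(t0, env, dofs);      /* store the result */

        tcg_temp_free_vec(t0);
        tcg_temp_free_vec(t1);
    }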

Re: [Qemu-devel] [PATCH v11 02/20] tcg: Add types and basic operations for host vectors
Posted by Alex Bennée 8 years ago
Richard Henderson <richard.henderson@linaro.org> writes:

> Nothing uses or enables them yet.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée