Series comparison

-[PULL 0/3] tcg patch queue
+[PULL 0/5] tcg patch queue
-The following changes since commit e18e5501d8ac692d32657a3e1ef545b14e72b730:
+The following changes since commit 40c67636f67c2a89745f2e698522fe917326a952:
-  Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200210' into staging (2020-02-10 18:09:14 +0000)
+  Merge remote-tracking branch 'remotes/kraxel/tags/usb-20200317-pull-request' into staging (2020-03-17 14:00:56 +0000)
 are available in the Git repository at:
-  https://github.com/rth7680/qemu.git tags/pull-tcg-20200212
+  https://github.com/rth7680/qemu.git tags/pull-tcg-20200317
-for you to fetch changes up to 2445971604c1cfd3ec484457159f4ac300fb04d2:
+for you to fetch changes up to 0270bd503e3699b7202200a2d693ad1feb57473f:
-  tcg: Add tcg_gen_gvec_5_ptr (2020-02-12 14:58:36 -0800)
+  tcg: Remove tcg-runtime-gvec.c DO_CMP0 (2020-03-17 08:41:07 -0700)
 ----------------------------------------------------------------
-Fix breakpoint invalidation.
+Fix tcg/i386 bug vs sari_vec.
-Add support for tcg helpers with 7 arguments.
+Fix tcg-runtime-gvec.c vs i386 without avx.
 Add support for gvec helpers with 5 arguments.
 ----------------------------------------------------------------
-Max Filippov (1):
+Richard Henderson (5):
-      exec: flush CPU TB cache in breakpoint_invalidate
+      tcg/i386: Bound shift count expanding sari_vec
       tcg: Remove CONFIG_VECTOR16
       tcg: Tidy tcg-runtime-gvec.c types
       tcg: Tidy tcg-runtime-gvec.c DUP*
       tcg: Remove tcg-runtime-gvec.c DO_CMP0
-Richard Henderson (1):
+ configure                    |  56 --------
-      tcg: Add tcg_gen_gvec_5_ptr
+ accel/tcg/tcg-runtime-gvec.c | 298 +++++++++++++++++--------------------------
  tcg/i386/tcg-target.inc.c    |   9 +-
  3 files changed, 122 insertions(+), 241 deletions(-)
-Taylor Simpson (1):
-      tcg: Add support for a helper with 7 arguments
- include/exec/helper-gen.h   | 13 +++++++++++++
- include/exec/helper-head.h  |  2 ++
- include/exec/helper-proto.h |  6 ++++++
- include/exec/helper-tcg.h   |  7 +++++++
- include/tcg/tcg-op-gvec.h   |  7 +++++++
- exec.c                      | 15 +++++++--------
- tcg/tcg-op-gvec.c           | 32 ++++++++++++++++++++++++++++++++
- 7 files changed, 74 insertions(+), 8 deletions(-)

-New patch
+[PULL 1/5] tcg/i386: Bound shift count expanding sari_vec
+A given RISU testcase for SVE can produce
+tcg-op-vec.c:511: do_shifti: Assertion `i >= 0 && i < (8 << vece)' failed.
+because expand_vec_sari gave a shift count of 32 to a MO_32
+vector shift.
+In 44f1441dbe1, we changed from direct expansion of vector opcodes
+to re-use of the tcg expanders.  So while the comment correctly notes
+that the hw will handle such a shift count, we now have to take our
+own sanity checks into account.  Which is easy in this particular case.
+Fixes: 44f1441dbe1
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/i386/tcg-target.inc.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/i386/tcg-target.inc.c
++++ b/tcg/i386/tcg-target.inc.c
+@@ -XXX,XX +XXX,XX @@ static void expand_vec_sari(TCGType type, unsigned vece,
+     case MO_64:
+         if (imm <= 32) {
+-            /* We can emulate a small sign extend by performing an arithmetic
++            /*
++             * We can emulate a small sign extend by performing an arithmetic
+              * 32-bit shift and overwriting the high half of a 64-bit logical
+-             * shift (note that the ISA says shift of 32 is valid).
++             * shift.  Note that the ISA says shift of 32 is valid, but TCG
++             * does not, so we have to bound the smaller shift -- we get the
++             * same result in the high half either way.
+              */
+             t1 = tcg_temp_new_vec(type);
+-            tcg_gen_sari_vec(MO_32, t1, v1, imm);
++            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
+             tcg_gen_shri_vec(MO_64, v0, v1, imm);
+             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
+                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
+--
+2.20.1

-[PULL 2/3] tcg: Add support for a helper with 7 arguments
+[PULL 2/5] tcg: Remove CONFIG_VECTOR16
-From: Taylor Simpson <tsimpson@quicinc.com>
+The comment in tcg-runtime-gvec.c about CONFIG_VECTOR16 says that
 tcg-op-gvec.c has eliminated size 8 vectors, and only passes on
 multiples of 16.  This may have been true of the first few operations,
 but is not true of all operations.
-Currently, helpers can only take up to 6 arguments.  This patch adds the
+In particular, multiply, shift by scalar, and compare of 8- and 16-bit
-capability for up to 7 arguments.  I have tested it with the Hexagon port
+elements are not expanded inline if host vector operations are not
-that I am preparing for submission.
+supported.
-Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
+For an x86_64 host that does not support AVX, this means that we will
-Message-Id: <1580942510-2820-1-git-send-email-tsimpson@quicinc.com>
+fall back to the helper, which will attempt to use SSE instructions,
 which will SEGV on an invalid 8-byte aligned memory operation.
 This patch simply removes the CONFIG_VECTOR16 code and configuration
 without further simplification.
 Buglink: https://bugs.launchpad.net/bugs/1863508
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-gen.h   | 13 +++++++++++++
+ configure                    | 56 ------------------------------------
- include/exec/helper-head.h  |  2 ++
+ accel/tcg/tcg-runtime-gvec.c | 35 +---------------------
- include/exec/helper-proto.h |  6 ++++++
+ 2 files changed, 1 insertion(+), 90 deletions(-)
  include/exec/helper-tcg.h   |  7 +++++++
  4 files changed, 28 insertions(+)
-diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
+diff --git a/configure b/configure
 index XXXXXXX..XXXXXXX 100755
 --- a/configure
 +++ b/configure
@@ -XXX,XX +XXX,XX @@ if  test "$plugins" = "yes" &&
        "for this purpose. You can't build with --static."
  fi
 -########################################
 -# See if 16-byte vector operations are supported.
 -# Even without a vector unit the compiler may expand these.
 -# There is a bug in old GCC for PPC that crashes here.
 -# Unfortunately it's the system compiler for Centos 7.
 -
 -cat > $TMPC << EOF
 -typedef unsigned char U1 __attribute__((vector_size(16)));
 -typedef unsigned short U2 __attribute__((vector_size(16)));
 -typedef unsigned int U4 __attribute__((vector_size(16)));
 -typedef unsigned long long U8 __attribute__((vector_size(16)));
 -typedef signed char S1 __attribute__((vector_size(16)));
 -typedef signed short S2 __attribute__((vector_size(16)));
 -typedef signed int S4 __attribute__((vector_size(16)));
 -typedef signed long long S8 __attribute__((vector_size(16)));
 -static U1 a1, b1;
 -static U2 a2, b2;
 -static U4 a4, b4;
 -static U8 a8, b8;
 -static S1 c1;
 -static S2 c2;
 -static S4 c4;
 -static S8 c8;
 -static int i;
 -void helper(void *d, void *a, int shift, int i);
 -void helper(void *d, void *a, int shift, int i)
 -{
 -  *(U1 *)(d + i) = *(U1 *)(a + i) << shift;
 -  *(U2 *)(d + i) = *(U2 *)(a + i) << shift;
 -  *(U4 *)(d + i) = *(U4 *)(a + i) << shift;
 -  *(U8 *)(d + i) = *(U8 *)(a + i) << shift;
 -}
 -int main(void)
 -{
 -  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
 -  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
 -  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
 -  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
 -  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
 -  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
 -  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
 -  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
 -  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
 -  return 0;
 -}
 -EOF
 -
 -vector16=no
 -if compile_prog "" "" ; then
 -  vector16=yes
 -fi
 -
  ########################################
  # See if __attribute__((alias)) is supported.
  # This false for Xcode 9, but has been remedied for Xcode 10.
@@ -XXX,XX +XXX,XX @@ if test "$atomic64" = "yes" ; then
    echo "CONFIG_ATOMIC64=y" >> $config_host_mak
  fi
 -if test "$vector16" = "yes" ; then
 -  echo "CONFIG_VECTOR16=y" >> $config_host_mak
 -fi
 -
  if test "$attralias" = "yes" ; then
    echo "CONFIG_ATTRIBUTE_ALIAS=y" >> $config_host_mak
  fi
 diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-gen.h
+--- a/accel/tcg/tcg-runtime-gvec.c
-+++ b/include/exec/helper-gen.h
++++ b/accel/tcg/tcg-runtime-gvec.c
-@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+@@ -XXX,XX +XXX,XX @@
-   tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
+ #include "tcg/tcg-gvec-desc.h"
 -/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 - * them via GCC's generic vector extension.  This turns out to be simpler and
 - * more reliable than getting the compiler to autovectorize.
 - *
 - * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 - * are multiples of 16.
 - *
 - * When the compiler does not support all of the operations we require, the
 - * loops are written so that we can always fall back on the base types.
 - */
 -#ifdef CONFIG_VECTOR16
 -typedef uint8_t vec8 __attribute__((vector_size(16)));
 -typedef uint16_t vec16 __attribute__((vector_size(16)));
 -typedef uint32_t vec32 __attribute__((vector_size(16)));
 -typedef uint64_t vec64 __attribute__((vector_size(16)));
 -
 -typedef int8_t svec8 __attribute__((vector_size(16)));
 -typedef int16_t svec16 __attribute__((vector_size(16)));
 -typedef int32_t svec32 __attribute__((vector_size(16)));
 -typedef int64_t svec64 __attribute__((vector_size(16)));
 -
 -#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
 -#define DUP8(X)   { X, X, X, X, X, X, X, X }
 -#define DUP4(X)   { X, X, X, X }
 -#define DUP2(X)   { X, X }
 -#else
  typedef uint8_t vec8;
  typedef uint16_t vec16;
  typedef uint32_t vec32;
@@ -XXX,XX +XXX,XX @@ typedef int64_t svec64;
  #define DUP8(X)   X
  #define DUP4(X)   X
  #define DUP2(X)   X
 -#endif /* CONFIG_VECTOR16 */
  static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
  {
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
      clear_high(d, oprsz, desc);
  }
-+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
+-/* If vectors are enabled, the compiler fills in -1 for true.
-+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+-   Otherwise, we must take care of this by hand.  */
-+    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+-#ifdef CONFIG_VECTOR16
-+    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
+-# define DO_CMP0(X)  X
-+    dh_arg_decl(t7, 7))                                                 \
+-#else
-+{                                                                       \
+-# define DO_CMP0(X)  -(X)
-+  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
+-#endif
-+                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
++#define DO_CMP0(X)  -(X)
-+                     dh_arg(t7, 7) };                                   \
-+  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
+ #define DO_CMP1(NAME, TYPE, OP)                                            \
-+}
+ void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "trace/generated-helpers-wrappers.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #undef GEN_HELPER
  #endif /* HELPER_GEN_H */
 diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-head.h
 +++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
      DEF_HELPER_FLAGS_5(name, 0, ret, t1, t2, t3, t4, t5)
  #define DEF_HELPER_6(name, ret, t1, t2, t3, t4, t5, t6) \
      DEF_HELPER_FLAGS_6(name, 0, ret, t1, t2, t3, t4, t5, t6)
 +#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
 +    DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
  /* MAX_OPC_PARAM_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
 diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-proto.h
 +++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                              dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
 +#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
 +dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 +                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
 +                            dh_ctype(t7));
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #endif /* HELPER_PROTO_H */
 diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-tcg.h
 +++ b/include/exec/helper-tcg.h
@@ -XXX,XX +XXX,XX @@
      | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
      | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) },
 +#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
 +  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
 +    .sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
 +    | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
 +    | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) | dh_sizemask(t7, 7) },
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #endif /* HELPER_TCG_H */
 --
 2.20.1

-New patch
+[PULL 3/5] tcg: Tidy tcg-runtime-gvec.c types
+Partial cleanup from the CONFIG_VECTOR16 removal.
+Replace the vec* types with their scalar expansions.
+Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ accel/tcg/tcg-runtime-gvec.c | 270 +++++++++++++++++------------------
+ 1 file changed, 130 insertions(+), 140 deletions(-)
+diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/tcg-runtime-gvec.c
++++ b/accel/tcg/tcg-runtime-gvec.c
+@@ -XXX,XX +XXX,XX @@
+ #include "tcg/tcg-gvec-desc.h"
+-typedef uint8_t vec8;
+-typedef uint16_t vec16;
+-typedef uint32_t vec32;
+-typedef uint64_t vec64;
+-
+-typedef int8_t svec8;
+-typedef int16_t svec16;
+-typedef int32_t svec32;
+-typedef int64_t svec64;
+-
+ #define DUP16(X)  X
+ #define DUP8(X)   X
+ #define DUP4(X)   X
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + *(uint32_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec8 vecb = (vec8)DUP16(b);
++    uint8_t vecb = (uint8_t)DUP16(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec16 vecb = (vec16)DUP8(b);
++    uint16_t vecb = (uint16_t)DUP8(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec32 vecb = (vec32)DUP4(b);
++    uint32_t vecb = (uint32_t)DUP4(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - *(uint32_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+ void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec8 vecb = (vec8)DUP16(b);
++    uint8_t vecb = (uint8_t)DUP16(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec16 vecb = (vec16)DUP8(b);
++    uint16_t vecb = (uint16_t)DUP8(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec32 vecb = (vec32)DUP4(b);
++    uint32_t vecb = (uint32_t)DUP4(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * *(uint8_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * *(uint16_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * *(uint32_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
+ void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec8 vecb = (vec8)DUP16(b);
++    uint8_t vecb = (uint8_t)DUP16(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec16 vecb = (vec16)DUP8(b);
++    uint16_t vecb = (uint16_t)DUP8(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec32 vecb = (vec32)DUP4(b);
++    uint32_t vecb = (uint32_t)DUP4(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = -*(uint8_t *)(a + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = -*(uint16_t *)(a + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = -*(uint32_t *)(a + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = -*(uint64_t *)(a + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = ~*(uint64_t *)(a + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) &~ *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) |~ *(uint64_t *)(b + i);
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) & *(uint64_t *)(b + i));
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) | *(uint64_t *)(b + i));
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) ^ *(uint64_t *)(b + i));
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
+ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
+ void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+-    vec64 vecb = (vec64)DUP2(b);
++    uint64_t vecb = (uint64_t)DUP2(b);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | vecb;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) << shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) << shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) << shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+-        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
++        *(int8_t *)(d + i) = *(int8_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+-        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
++        *(int16_t *)(d + i) = *(int16_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+-        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
++        *(int32_t *)(d + i) = *(int32_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
+     int shift = simd_data(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(int64_t *)(d + i) = *(int64_t *)(a + i) >> shift;
+     }
+     clear_high(d, oprsz, desc);
+ }
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
+ }
+ #define DO_CMP2(SZ) \
+-    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
+-    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
+-    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
+-    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
+-    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
+-    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
++    DO_CMP1(gvec_eq##SZ, uint##SZ##_t, ==)    \
++    DO_CMP1(gvec_ne##SZ, uint##SZ##_t, !=)    \
++    DO_CMP1(gvec_lt##SZ, int##SZ##_t, <)      \
++    DO_CMP1(gvec_le##SZ, int##SZ##_t, <=)     \
++    DO_CMP1(gvec_ltu##SZ, uint##SZ##_t, <)    \
++    DO_CMP1(gvec_leu##SZ, uint##SZ##_t, <=)
+ DO_CMP2(8)
+ DO_CMP2(16)
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
+     intptr_t oprsz = simd_oprsz(desc);
+     intptr_t i;
+-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+-        vec64 aa = *(vec64 *)(a + i);
+-        vec64 bb = *(vec64 *)(b + i);
+-        vec64 cc = *(vec64 *)(c + i);
+-        *(vec64 *)(d + i) = (bb & aa) | (cc & ~aa);
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        uint64_t aa = *(uint64_t *)(a + i);
++        uint64_t bb = *(uint64_t *)(b + i);
++        uint64_t cc = *(uint64_t *)(c + i);
++        *(uint64_t *)(d + i) = (bb & aa) | (cc & ~aa);
+     }
+     clear_high(d, oprsz, desc);
+ }
+--
+.20.1

-[PULL 3/3] tcg: Add tcg_gen_gvec_5_ptr
+[PULL 4/5] tcg: Tidy tcg-runtime-gvec.c DUP*
-Extend the vector generator infrastructure to handle
+Partial cleanup from the CONFIG_VECTOR16 removal.
-vector arguments.
+Replace the DUP* expansions with the scalar argument.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op-gvec.h |  7 +++++++
+ accel/tcg/tcg-runtime-gvec.c | 50 +++++++++++-------------------------
- tcg/tcg-op-gvec.c         | 32 ++++++++++++++++++++++++++++++++
+file changed, 15 insertions(+), 35 deletions(-)
-files changed, 39 insertions(+)
+diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
 diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op-gvec.h
+--- a/accel/tcg/tcg-runtime-gvec.c
-+++ b/include/tcg/tcg-op-gvec.h
++++ b/accel/tcg/tcg-runtime-gvec.c
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+@@ -XXX,XX +XXX,XX @@
-                         uint32_t maxsz, int32_t data,
+ #include "tcg/tcg-gvec-desc.h"
-                         gen_helper_gvec_4_ptr *fn);
-+typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+-#define DUP16(X)  X
-+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
+-#define DUP8(X)   X
-+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+-#define DUP4(X)   X
-+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+-#define DUP2(X)   X
-+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+-
-+                        gen_helper_gvec_5_ptr *fn);
+ static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
-+
+ {
- /* Expand a gvec operation.  Either inline or out-of-line depending on
+     intptr_t maxsz = simd_maxsz(desc);
-    the actual vector size and the operations supported by the host.  */
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
- typedef struct {
+ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
+ {
-index XXXXXXX..XXXXXXX 100644
+     intptr_t oprsz = simd_oprsz(desc);
---- a/tcg/tcg-op-gvec.c
+-    uint8_t vecb = (uint8_t)DUP16(b);
-+++ b/tcg/tcg-op-gvec.c
+     intptr_t i;
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-     tcg_temp_free_i32(desc);
+     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
- }
+-        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + vecb;
++        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + (uint8_t)b;
-+/* Generate a call to a gvec-style helper with five vector operands
+     }
-+   and an extra pointer operand.  */
+     clear_high(d, oprsz, desc);
-+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ }
-+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
-+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
-+                        gen_helper_gvec_5_ptr *fn)
+ {
-+{
+     intptr_t oprsz = simd_oprsz(desc);
-+    TCGv_ptr a0, a1, a2, a3, a4;
+-    uint16_t vecb = (uint16_t)DUP8(b);
-+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+     intptr_t i;
-+
-+    a0 = tcg_temp_new_ptr();
+     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
-+    a1 = tcg_temp_new_ptr();
+-        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + vecb;
-+    a2 = tcg_temp_new_ptr();
++        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + (uint16_t)b;
-+    a3 = tcg_temp_new_ptr();
+     }
-+    a4 = tcg_temp_new_ptr();
+     clear_high(d, oprsz, desc);
-+
+ }
-+    tcg_gen_addi_ptr(a0, cpu_env, dofs);
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
-+    tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
-+    tcg_gen_addi_ptr(a2, cpu_env, bofs);
+ {
-+    tcg_gen_addi_ptr(a3, cpu_env, cofs);
+     intptr_t oprsz = simd_oprsz(desc);
-+    tcg_gen_addi_ptr(a4, cpu_env, eofs);
+-    uint32_t vecb = (uint32_t)DUP4(b);
-+
+     intptr_t i;
-+    fn(a0, a1, a2, a3, a4, ptr, desc);
-+
+     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
-+    tcg_temp_free_ptr(a0);
+-        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + vecb;
-+    tcg_temp_free_ptr(a1);
++        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + (uint32_t)b;
-+    tcg_temp_free_ptr(a2);
+     }
-+    tcg_temp_free_ptr(a3);
+     clear_high(d, oprsz, desc);
-+    tcg_temp_free_ptr(a4);
+ }
-+    tcg_temp_free_i32(desc);
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
-+}
+ void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
-+
+ {
- /* Return true if we want to implement something of OPRSZ bytes
+     intptr_t oprsz = simd_oprsz(desc);
-    in units of LNSZ.  This limits the expansion of inline code.  */
+-    uint64_t vecb = (uint64_t)DUP2(b);
- static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
+     intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
  void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint8_t vecb = (uint8_t)DUP16(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 -        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - vecb;
 +        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - (uint8_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint16_t vecb = (uint16_t)DUP8(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 -        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - vecb;
 +        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - (uint16_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint32_t vecb = (uint32_t)DUP4(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 -        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - vecb;
 +        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - (uint32_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint64_t vecb = (uint64_t)DUP2(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
  void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint8_t vecb = (uint8_t)DUP16(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 -        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * vecb;
 +        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * (uint8_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint16_t vecb = (uint16_t)DUP8(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 -        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * vecb;
 +        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * (uint16_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint32_t vecb = (uint32_t)DUP4(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 -        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * vecb;
 +        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * (uint32_t)b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint64_t vecb = (uint64_t)DUP2(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
  void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint64_t vecb = (uint64_t)DUP2(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint64_t vecb = (uint64_t)DUP2(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ b;
      }
      clear_high(d, oprsz, desc);
  }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
  void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
  {
      intptr_t oprsz = simd_oprsz(desc);
 -    uint64_t vecb = (uint64_t)DUP2(b);
      intptr_t i;
      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 -        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | vecb;
 +        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | b;
      }
      clear_high(d, oprsz, desc);
  }
 --
 .20.1

-[PULL 1/3] exec: flush CPU TB cache in breakpoint_invalidate
+[PULL 5/5] tcg: Remove tcg-runtime-gvec.c DO_CMP0
-From: Max Filippov <jcmvbkbc@gmail.com>
+Partial cleanup from the CONFIG_VECTOR16 removal.
 Replace DO_CMP0 with its scalar expansion, a simple negation.
-When a breakpoint is inserted at location for which there's currently no
-virtual to physical translation no action is taken on CPU TB cache. If a
-TB for that virtual address already exists but is not visible ATM the
-breakpoint won't be hit next time an instruction at that address will be
-executed.
-Flush entire CPU TB cache in breakpoint_invalidate to force
-re-translation of all TBs for the breakpoint address.
-This change fixes the following scenario:
-- linux user application is running
-- a breakpoint is inserted from QEMU gdbstub for a user address that is
-  not currently present in the target CPU TLB
-- an instruction at that address is executed, but the external debugger
-  doesn't get control.
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
-Message-Id: <20191127220602.10827-2-jcmvbkbc@gmail.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- exec.c | 15 +++++++--------
+ accel/tcg/tcg-runtime-gvec.c | 5 +----
-file changed, 7 insertions(+), 8 deletions(-)
+file changed, 1 insertion(+), 4 deletions(-)
-diff --git a/exec.c b/exec.c
+diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
 index XXXXXXX..XXXXXXX 100644
---- a/exec.c
+--- a/accel/tcg/tcg-runtime-gvec.c
-+++ b/exec.c
++++ b/accel/tcg/tcg-runtime-gvec.c
-@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
+     clear_high(d, oprsz, desc);
  static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  {
 -    MemTxAttrs attrs;
 -    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
 -    int asidx = cpu_asidx_from_attrs(cpu, attrs);
 -    if (phys != -1) {
 -        /* Locks grabbed by tb_invalidate_phys_addr */
 -        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 -                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
 -    }
 +    /*
 +     * There may not be a virtual to physical translation for the pc
 +     * right now, but there may exist cached TB for this pc.
 +     * Flush the whole TB cache to force re-translation of such TBs.
 +     * This is heavyweight, but we're debugging anyway.
 +     */
 +    tb_flush(cpu);
  }
- #endif
 -#define DO_CMP0(X)  -(X)
 -
  #define DO_CMP1(NAME, TYPE, OP)                                            \
  void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
  {                                                                          \
      intptr_t oprsz = simd_oprsz(desc);                                     \
      intptr_t i;                                                            \
      for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
 -        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
 +        *(TYPE *)(d + i) = -(*(TYPE *)(a + i) OP *(TYPE *)(b + i));        \
      }                                                                      \
      clear_high(d, oprsz, desc);                                            \
  }
@@ -XXX,XX +XXX,XX @@ DO_CMP2(16)
  DO_CMP2(32)
  DO_CMP2(64)
 -#undef DO_CMP0
  #undef DO_CMP1
  #undef DO_CMP2
 --
 .20.1

The following changes since commit e18e5501d8ac692d32657a3e1ef545b14e72b730:

Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200210' into staging (2020-02-10 18:09:14 +0000)

are available in the Git repository at:

https://github.com/rth7680/qemu.git tags/pull-tcg-20200212

for you to fetch changes up to 2445971604c1cfd3ec484457159f4ac300fb04d2:

tcg: Add tcg_gen_gvec_5_ptr (2020-02-12 14:58:36 -0800)

----------------------------------------------------------------
Fix breakpoint invalidation.
Add support for tcg helpers with 7 arguments.
Add support for gvec helpers with 5 arguments.

----------------------------------------------------------------
Max Filippov (1):
      exec: flush CPU TB cache in breakpoint_invalidate

Richard Henderson (1):
      tcg: Add tcg_gen_gvec_5_ptr

Taylor Simpson (1):
      tcg: Add support for a helper with 7 arguments

From: Max Filippov <jcmvbkbc@gmail.com>

When a breakpoint is inserted at a location for which there is currently
no virtual-to-physical translation, no action is taken on the CPU TB
cache. If a TB for that virtual address already exists but is not
currently visible, the breakpoint won't be hit the next time an
instruction at that address is executed.

Flush entire CPU TB cache in breakpoint_invalidate to force
re-translation of all TBs for the breakpoint address.

This change fixes the following scenario:
- a Linux user application is running
- a breakpoint is inserted from QEMU gdbstub for a user address that is
  not currently present in the target CPU TLB
- an instruction at that address is executed, but the external debugger
  doesn't get control.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Message-Id: <20191127220602.10827-2-jcmvbkbc@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 exec.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
 
 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
-    MemTxAttrs attrs;
-    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
-    int asidx = cpu_asidx_from_attrs(cpu, attrs);
-    if (phys != -1) {
-        /* Locks grabbed by tb_invalidate_phys_addr */
-        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
-                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
-    }
+    /*
+     * There may not be a virtual to physical translation for the pc
+     * right now, but there may exist cached TB for this pc.
+     * Flush the whole TB cache to force re-translation of such TBs.
+     * This is heavyweight, but we're debugging anyway.
+     */
+    tb_flush(cpu);
 }
 #endif
 
-- 
2.20.1

From: Taylor Simpson <tsimpson@quicinc.com>

Currently, helpers can only take up to 6 arguments.  This patch adds the
capability for up to 7 arguments.  I have tested it with the Hexagon port
that I am preparing for submission.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <1580942510-2820-1-git-send-email-tsimpson@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-gen.h   | 13 +++++++++++++
 include/exec/helper-head.h  |  2 ++
 include/exec/helper-proto.h |  6 ++++++
 include/exec/helper-tcg.h   |  7 +++++++
 4 files changed, 28 insertions(+)

diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-gen.h
+++ b/include/exec/helper-gen.h
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
   tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
 }
 
+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
+    dh_arg_decl(t7, 7))                                                 \
+{                                                                       \
+  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
+                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
+                     dh_arg(t7, 7) };                                   \
+  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
+}
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "trace/generated-helpers-wrappers.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 #undef GEN_HELPER
 
 #endif /* HELPER_GEN_H */
diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
     DEF_HELPER_FLAGS_5(name, 0, ret, t1, t2, t3, t4, t5)
 #define DEF_HELPER_6(name, ret, t1, t2, t3, t4, t5, t6) \
     DEF_HELPER_FLAGS_6(name, 0, ret, t1, t2, t3, t4, t5, t6)
+#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
+    DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
 
 /* MAX_OPC_PARAM_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
 
diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                             dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
 
+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
+                            dh_ctype(t7));
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 
 #endif /* HELPER_PROTO_H */
diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-tcg.h
+++ b/include/exec/helper-tcg.h
@@ -XXX,XX +XXX,XX @@
     | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
     | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) },
 
+#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
+  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
+    .sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
+    | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
+    | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) | dh_sizemask(t7, 7) },
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 
 #endif /* HELPER_TCG_H */
-- 
2.20.1

Extend the vector generator infrastructure to handle
5 vector arguments.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  7 +++++++
 tcg/tcg-op-gvec.c         | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t maxsz, int32_t data,
                         gen_helper_gvec_4_ptr *fn);
 
+typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_5_ptr *fn);
+
 /* Expand a gvec operation.  Either inline or out-of-line depending on
    the actual vector size and the operations supported by the host.  */
 typedef struct {
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_i32(desc);
 }
 
+/* Generate a call to a gvec-style helper with five vector operands
+   and an extra pointer operand.  */
+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_5_ptr *fn)
+{
+    TCGv_ptr a0, a1, a2, a3, a4;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+    a2 = tcg_temp_new_ptr();
+    a3 = tcg_temp_new_ptr();
+    a4 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, cpu_env, dofs);
+    tcg_gen_addi_ptr(a1, cpu_env, aofs);
+    tcg_gen_addi_ptr(a2, cpu_env, bofs);
+    tcg_gen_addi_ptr(a3, cpu_env, cofs);
+    tcg_gen_addi_ptr(a4, cpu_env, eofs);
+
+    fn(a0, a1, a2, a3, a4, ptr, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_ptr(a2);
+    tcg_temp_free_ptr(a3);
+    tcg_temp_free_ptr(a4);
+    tcg_temp_free_i32(desc);
+}
+
 /* Return true if we want to implement something of OPRSZ bytes
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
-- 
2.20.1

The following changes since commit 40c67636f67c2a89745f2e698522fe917326a952:

Merge remote-tracking branch 'remotes/kraxel/tags/usb-20200317-pull-request' into staging (2020-03-17 14:00:56 +0000)

are available in the Git repository at:

https://github.com/rth7680/qemu.git tags/pull-tcg-20200317

for you to fetch changes up to 0270bd503e3699b7202200a2d693ad1feb57473f:

tcg: Remove tcg-runtime-gvec.c DO_CMP0 (2020-03-17 08:41:07 -0700)

----------------------------------------------------------------
Fix tcg/i386 bug vs sari_vec.
Fix tcg-runtime-gvec.c vs i386 without avx.

----------------------------------------------------------------
Richard Henderson (5):
      tcg/i386: Bound shift count expanding sari_vec
      tcg: Remove CONFIG_VECTOR16
      tcg: Tidy tcg-runtime-gvec.c types
      tcg: Tidy tcg-runtime-gvec.c DUP*
      tcg: Remove tcg-runtime-gvec.c DO_CMP0

configure                    |  56 --------
 accel/tcg/tcg-runtime-gvec.c | 298 +++++++++++++++++--------------------------
 tcg/i386/tcg-target.inc.c    |   9 +-
 3 files changed, 122 insertions(+), 241 deletions(-)

A given RISU testcase for SVE can produce

tcg-op-vec.c:511: do_shifti: Assertion `i >= 0 && i < (8 << vece)' failed.

because expand_vec_sari gave a shift count of 32 to a MO_32
vector shift.

In 44f1441dbe1, we changed from direct expansion of vector opcodes
to re-use of the tcg expanders.  So while the comment correctly notes
that the hardware will handle such a shift count, we now have to take
our own sanity checks into account, which is easy in this particular
case: bound the 32-bit shift count to 31.

Fixes: 44f1441dbe1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.inc.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@ static void expand_vec_sari(TCGType type, unsigned vece,
 
     case MO_64:
         if (imm <= 32) {
-            /* We can emulate a small sign extend by performing an arithmetic
+            /*
+             * We can emulate a small sign extend by performing an arithmetic
              * 32-bit shift and overwriting the high half of a 64-bit logical
-             * shift (note that the ISA says shift of 32 is valid).
+             * shift.  Note that the ISA says shift of 32 is valid, but TCG
+             * does not, so we have to bound the smaller shift -- we get the
+             * same result in the high half either way.
              */
             t1 = tcg_temp_new_vec(type);
-            tcg_gen_sari_vec(MO_32, t1, v1, imm);
+            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
             tcg_gen_shri_vec(MO_64, v0, v1, imm);
             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
-- 
2.20.1

The comment in tcg-runtime-gvec.c about CONFIG_VECTOR16 says that
tcg-op-gvec.c has eliminated size 8 vectors, and only passes along
sizes that are multiples of 16.  This may have been true of the first
few operations, but is not true of all operations.

In particular, multiply, shift by scalar, and compare of 8- and 16-bit
elements are not expanded inline if host vector operations are not
supported.

For an x86_64 host that does not support AVX, this means that we will
fall back to the helper, which will attempt to use SSE instructions;
these SEGV when given a memory operand that is only 8-byte aligned.

This patch simply removes the CONFIG_VECTOR16 code and configuration
without further simplification.

Buglink: https://bugs.launchpad.net/bugs/1863508
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure                    | 56 ------------------------------------
 accel/tcg/tcg-runtime-gvec.c | 35 +---------------------
 2 files changed, 1 insertion(+), 90 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if  test "$plugins" = "yes" &&
       "for this purpose. You can't build with --static."
 fi
 
-########################################
-# See if 16-byte vector operations are supported.
-# Even without a vector unit the compiler may expand these.
-# There is a bug in old GCC for PPC that crashes here.
-# Unfortunately it's the system compiler for Centos 7.
-
-cat > $TMPC << EOF
-typedef unsigned char U1 __attribute__((vector_size(16)));
-typedef unsigned short U2 __attribute__((vector_size(16)));
-typedef unsigned int U4 __attribute__((vector_size(16)));
-typedef unsigned long long U8 __attribute__((vector_size(16)));
-typedef signed char S1 __attribute__((vector_size(16)));
-typedef signed short S2 __attribute__((vector_size(16)));
-typedef signed int S4 __attribute__((vector_size(16)));
-typedef signed long long S8 __attribute__((vector_size(16)));
-static U1 a1, b1;
-static U2 a2, b2;
-static U4 a4, b4;
-static U8 a8, b8;
-static S1 c1;
-static S2 c2;
-static S4 c4;
-static S8 c8;
-static int i;
-void helper(void *d, void *a, int shift, int i);
-void helper(void *d, void *a, int shift, int i)
-{
-  *(U1 *)(d + i) = *(U1 *)(a + i) << shift;
-  *(U2 *)(d + i) = *(U2 *)(a + i) << shift;
-  *(U4 *)(d + i) = *(U4 *)(a + i) << shift;
-  *(U8 *)(d + i) = *(U8 *)(a + i) << shift;
-}
-int main(void)
-{
-  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
-  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
-  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
-  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
-  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
-  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
-  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
-  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
-  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
-  return 0;
-}
-EOF
-
-vector16=no
-if compile_prog "" "" ; then
-  vector16=yes
-fi
-
 ########################################
 # See if __attribute__((alias)) is supported.
 # This false for Xcode 9, but has been remedied for Xcode 10.
@@ -XXX,XX +XXX,XX @@ if test "$atomic64" = "yes" ; then
   echo "CONFIG_ATOMIC64=y" >> $config_host_mak
 fi
 
-if test "$vector16" = "yes" ; then
-  echo "CONFIG_VECTOR16=y" >> $config_host_mak
-fi
-
 if test "$attralias" = "yes" ; then
   echo "CONFIG_ATTRIBUTE_ALIAS=y" >> $config_host_mak
 fi
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-gvec-desc.h"
 
 
-/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
- * them via GCC's generic vector extension.  This turns out to be simpler and
- * more reliable than getting the compiler to autovectorize.
- *
- * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
- * are multiples of 16.
- *
- * When the compiler does not support all of the operations we require, the
- * loops are written so that we can always fall back on the base types.
- */
-#ifdef CONFIG_VECTOR16
-typedef uint8_t vec8 __attribute__((vector_size(16)));
-typedef uint16_t vec16 __attribute__((vector_size(16)));
-typedef uint32_t vec32 __attribute__((vector_size(16)));
-typedef uint64_t vec64 __attribute__((vector_size(16)));
-
-typedef int8_t svec8 __attribute__((vector_size(16)));
-typedef int16_t svec16 __attribute__((vector_size(16)));
-typedef int32_t svec32 __attribute__((vector_size(16)));
-typedef int64_t svec64 __attribute__((vector_size(16)));
-
-#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
-#define DUP8(X)   { X, X, X, X, X, X, X, X }
-#define DUP4(X)   { X, X, X, X }
-#define DUP2(X)   { X, X }
-#else
 typedef uint8_t vec8;
 typedef uint16_t vec16;
 typedef uint32_t vec32;
@@ -XXX,XX +XXX,XX @@ typedef int64_t svec64;
 #define DUP8(X)   X
 #define DUP4(X)   X
 #define DUP2(X)   X
-#endif /* CONFIG_VECTOR16 */
 
 static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
 {
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }
 
-/* If vectors are enabled, the compiler fills in -1 for true.
-   Otherwise, we must take care of this by hand.  */
-#ifdef CONFIG_VECTOR16
-# define DO_CMP0(X)  X
-#else
-# define DO_CMP0(X)  -(X)
-#endif
+#define DO_CMP0(X)  -(X)
 
 #define DO_CMP1(NAME, TYPE, OP)                                            \
 void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
-- 
2.20.1

Partial cleanup from the CONFIG_VECTOR16 removal.
Replace the vec* types with their scalar expansions.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime-gvec.c | 270 +++++++++++++++++------------------
 1 file changed, 130 insertions(+), 140 deletions(-)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-gvec-desc.h"
 
 
-typedef uint8_t vec8;
-typedef uint16_t vec16;
-typedef uint32_t vec32;
-typedef uint64_t vec64;
-
-typedef int8_t svec8;
-typedef int16_t svec16;
-typedef int32_t svec32;
-typedef int64_t svec64;
-
 #define DUP16(X)  X
 #define DUP8(X)   X
 #define DUP4(X)   X
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + *(uint32_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec8 vecb = (vec8)DUP16(b);
+    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec16 vecb = (vec16)DUP8(b);
+    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec32 vecb = (vec32)DUP4(b);
+    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - *(uint32_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec8 vecb = (vec8)DUP16(b);
+    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec16 vecb = (vec16)DUP8(b);
+    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec32 vecb = (vec32)DUP4(b);
+    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * *(uint8_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * *(uint16_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * *(uint32_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec8 vecb = (vec8)DUP16(b);
+    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec16 vecb = (vec16)DUP8(b);
+    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec32 vecb = (vec32)DUP4(b);
+    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = -*(uint8_t *)(a + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = -*(uint16_t *)(a + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = -*(uint32_t *)(a + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = -*(uint64_t *)(a + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = ~*(uint64_t *)(a + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) &~ *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) |~ *(uint64_t *)(b + i);
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) & *(uint64_t *)(b + i));
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) | *(uint64_t *)(b + i));
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = ~(*(uint64_t *)(a + i) ^ *(uint64_t *)(b + i));
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    vec64 vecb = (vec64)DUP2(b);
+    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | vecb;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) << shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) << shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) << shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec8)) {
-        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+        *(int8_t *)(d + i) = *(int8_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec16)) {
-        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+        *(int16_t *)(d + i) = *(int16_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec32)) {
-        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+        *(int32_t *)(d + i) = *(int32_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
     int shift = simd_data(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(int64_t *)(d + i) = *(int64_t *)(a + i) >> shift;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
 }
 
 #define DO_CMP2(SZ) \
-    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
-    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
-    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
-    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
-    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
-    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
+    DO_CMP1(gvec_eq##SZ, uint##SZ##_t, ==)    \
+    DO_CMP1(gvec_ne##SZ, uint##SZ##_t, !=)    \
+    DO_CMP1(gvec_lt##SZ, int##SZ##_t, <)      \
+    DO_CMP1(gvec_le##SZ, int##SZ##_t, <=)     \
+    DO_CMP1(gvec_ltu##SZ, uint##SZ##_t, <)    \
+    DO_CMP1(gvec_leu##SZ, uint##SZ##_t, <=)
 
 DO_CMP2(8)
 DO_CMP2(16)
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
     intptr_t oprsz = simd_oprsz(desc);
     intptr_t i;
 
-    for (i = 0; i < oprsz; i += sizeof(vec64)) {
-        vec64 aa = *(vec64 *)(a + i);
-        vec64 bb = *(vec64 *)(b + i);
-        vec64 cc = *(vec64 *)(c + i);
-        *(vec64 *)(d + i) = (bb & aa) | (cc & ~aa);
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        uint64_t aa = *(uint64_t *)(a + i);
+        uint64_t bb = *(uint64_t *)(b + i);
+        uint64_t cc = *(uint64_t *)(c + i);
+        *(uint64_t *)(d + i) = (bb & aa) | (cc & ~aa);
     }
     clear_high(d, oprsz, desc);
 }
-- 
2.20.1

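Partial cleanup from the CONFIG_VECTOR16 removal.
With the DUP* macros reduced to identity expansions, remove them along
with the vecb temporaries, and use the scalar argument b directly, cast
to the element type where that is narrower than 64 bits.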

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime-gvec.c | 50 +++++++++++-------------------------
 1 file changed, 15 insertions(+), 35 deletions(-)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-gvec-desc.h"
 
 
-#define DUP16(X)  X
-#define DUP8(X)   X
-#define DUP4(X)   X
-#define DUP2(X)   X
-
 static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
 {
     intptr_t maxsz = simd_maxsz(desc);
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
-        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + vecb;
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) + (uint8_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
-        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + vecb;
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) + (uint16_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
-        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + vecb;
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) + (uint32_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
-        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - vecb;
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) - (uint8_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
-        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - vecb;
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) - (uint16_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
-        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - vecb;
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) - (uint32_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) - b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint8_t vecb = (uint8_t)DUP16(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
-        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * vecb;
+        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) * (uint8_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint16_t vecb = (uint16_t)DUP8(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
-        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * vecb;
+        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) * (uint16_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint32_t vecb = (uint32_t)DUP4(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
-        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * vecb;
+        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) * (uint32_t)b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) * b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
 void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) ^ b;
     }
     clear_high(d, oprsz, desc);
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
-    uint64_t vecb = (uint64_t)DUP2(b);
     intptr_t i;
 
     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | vecb;
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) | b;
     }
     clear_high(d, oprsz, desc);
 }
-- 
2.20.1

Partial cleanup from the CONFIG_VECTOR16 removal.
Replace DO_CMP0 with its scalar expansion, a simple negation.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime-gvec.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }
 
-#define DO_CMP0(X)  -(X)
-
 #define DO_CMP1(NAME, TYPE, OP)                                            \
 void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
 {                                                                          \
     intptr_t oprsz = simd_oprsz(desc);                                     \
     intptr_t i;                                                            \
     for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
-        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
+        *(TYPE *)(d + i) = -(*(TYPE *)(a + i) OP *(TYPE *)(b + i));        \
     }                                                                      \
     clear_high(d, oprsz, desc);                                            \
 }
@@ -XXX,XX +XXX,XX @@ DO_CMP2(16)
 DO_CMP2(32)
 DO_CMP2(64)
 
-#undef DO_CMP0
 #undef DO_CMP1
 #undef DO_CMP2
 
-- 
2.20.1