Series comparison

-[PULL for-5.0 0/1] tcg patch queue
+[PULL 00/15] tcg patch queue
-The following changes since commit 17e1e49814096a3daaa8e5a73acd56a0f30bdc18:
+The following changes since commit 1cbd2d914939ee6028e9688d4ba859a528c28405:
-  Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging (2020-04-09 19:00:41 +0100)
+  Merge remote-tracking branch 'remotes/jasowang/tags/net-pull-request' into staging (2021-06-04 13:38:49 +0100)
 are available in the Git repository at:
-  https://github.com/rth7680/qemu.git tags/pull-tcg-20200412
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20210604
-for you to fetch changes up to a4e57084c16d5b0eff3651693fba04f26b30b551:
+for you to fetch changes up to 0006039e29b9e6118beab300146f7c4931f7a217:
-  tcg/mips: mips sync* encode error (2020-04-12 14:07:07 -0700)
+  tcg/arm: Implement TCG_TARGET_HAS_rotv_vec (2021-06-04 11:50:11 -0700)
 ----------------------------------------------------------------
-Fix tcg/mips barrier encoding
+Host vector support for arm neon.
 ----------------------------------------------------------------
-lixinyu (1):
+Richard Henderson (15):
-      tcg/mips: mips sync* encode error
+      tcg: Change parameters for tcg_target_const_match
       tcg/arm: Add host vector framework
       tcg/arm: Implement tcg_out_ld/st for vector types
       tcg/arm: Implement tcg_out_mov for vector types
       tcg/arm: Implement tcg_out_dup*_vec
       tcg/arm: Implement minimal vector operations
       tcg/arm: Implement andc, orc, abs, neg, not vector operations
       tcg/arm: Implement TCG_TARGET_HAS_shi_vec
       tcg/arm: Implement TCG_TARGET_HAS_mul_vec
       tcg/arm: Implement TCG_TARGET_HAS_sat_vec
       tcg/arm: Implement TCG_TARGET_HAS_minmax_vec
       tcg/arm: Implement TCG_TARGET_HAS_bitsel_vec
       tcg/arm: Implement TCG_TARGET_HAS_shv_vec
       tcg/arm: Implement TCG_TARGET_HAS_roti_vec
       tcg/arm: Implement TCG_TARGET_HAS_rotv_vec
- tcg/mips/tcg-target.inc.c | 10 +++++-----
+ tcg/arm/tcg-target-con-set.h |  10 +
-file changed, 5 insertions(+), 5 deletions(-)
+ tcg/arm/tcg-target-con-str.h |   3 +
  tcg/arm/tcg-target.h         |  52 ++-
  tcg/arm/tcg-target.opc.h     |  16 +
  tcg/tcg.c                    |   5 +-
  tcg/aarch64/tcg-target.c.inc |   5 +-
  tcg/arm/tcg-target.c.inc     | 956 +++++++++++++++++++++++++++++++++++++++++--
  tcg/i386/tcg-target.c.inc    |   4 +-
  tcg/mips/tcg-target.c.inc    |   5 +-
  tcg/ppc/tcg-target.c.inc     |   4 +-
  tcg/riscv/tcg-target.c.inc   |   4 +-
  tcg/s390/tcg-target.c.inc    |   5 +-
  tcg/sparc/tcg-target.c.inc   |   5 +-
  tcg/tci/tcg-target.c.inc     |   6 +-
 files changed, 1001 insertions(+), 79 deletions(-)
  create mode 100644 tcg/arm/tcg-target.opc.h

-New patch
+[PULL 01/15] tcg: Change parameters for tcg_target_const_match
+Change the return value to bool, because that's what it should
 have been from the start.  Pass the ct mask instead of the whole
 TCGArgConstraint, as that's the only part that's relevant.
 Change the value argument to int64_t.  We will need the extra
 width for 32-bit hosts wanting to match vector constants.
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/tcg.c                    | 5 ++---
  tcg/aarch64/tcg-target.c.inc | 5 +----
  tcg/arm/tcg-target.c.inc     | 5 +----
  tcg/i386/tcg-target.c.inc    | 4 +---
  tcg/mips/tcg-target.c.inc    | 5 +----
  tcg/ppc/tcg-target.c.inc     | 4 +---
  tcg/riscv/tcg-target.c.inc   | 4 +---
  tcg/s390/tcg-target.c.inc    | 5 +----
  tcg/sparc/tcg-target.c.inc   | 5 +----
  tcg/tci/tcg-target.c.inc     | 6 ++----
 files changed, 12 insertions(+), 36 deletions(-)
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
  static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                          TCGReg base, intptr_t ofs);
  static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target);
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct);
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
  #ifdef TCG_TARGET_NEED_LDST_LABELS
  static int tcg_out_ldst_finalize(TCGContext *s);
  #endif
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          ts = arg_temp(arg);
          if (ts->val_type == TEMP_VAL_CONST
 -            && tcg_target_const_match(ts->val, ts->type, arg_ct)) {
 +            && tcg_target_const_match(ts->val, ts->type, arg_ct->ct)) {
              /* constant is OK for instruction */
              const_args[i] = 1;
              new_args[i] = ts->val;
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
      }
  }
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
 -
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm)
   * mov operand2:     values represented with x << (2 * y), x < 0x100
   * add, sub, eor...: ditto
   */
 -static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                         const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct;
 -    ct = arg_ct->ct;
      if (ct & TCG_CT_CONST) {
          return 1;
      } else if ((ct & TCG_CT_CONST_ARM) && check_fit_imm(val)) {
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
  }
  /* test if a constant matches the constraint */
 -static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                         const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline bool is_p2m1(tcg_target_long val)
  }
  /* test if a constant matches the constraint */
 -static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                         const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct;
 -    ct = arg_ct->ct;
      if (ct & TCG_CT_CONST) {
          return 1;
      } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
 diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
  }
  /* test if a constant matches the constraint */
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.c.inc
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
  }
  /* test if a constant matches the constraint */
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390/tcg-target.c.inc
 +++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
  }
  /* Test if a constant matches the constraint. */
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
 -
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.c.inc
 +++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
  }
  /* test if a constant matches the constraint */
 -static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                         const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    int ct = arg_ct->ct;
 -
      if (ct & TCG_CT_CONST) {
          return 1;
      }
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
  }
  /* Test if a constant matches the constraint. */
 -static int tcg_target_const_match(tcg_target_long val, TCGType type,
 -                                  const TCGArgConstraint *arg_ct)
 +static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
  {
 -    /* No need to return 0 or 1, 0 or != 0 is good enough. */
 -    return arg_ct->ct & TCG_CT_CONST;
 +    return ct & TCG_CT_CONST;
  }
  static void tcg_target_init(TCGContext *s)
 --
 .25.1

-New patch
+[PULL 02/15] tcg/arm: Add host vector framework
+Add registers and function stubs.  The functionality
 is disabled via use_neon_instructions defined to 0.
 We must still include results for the mandatory opcodes in
 tcg_target_op_def, as all opcodes are checked during tcg init.
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/arm/tcg-target-con-set.h |   4 ++
  tcg/arm/tcg-target-con-str.h |   1 +
  tcg/arm/tcg-target.h         |  48 ++++++++++++--
  tcg/arm/tcg-target.opc.h     |  12 ++++
  tcg/arm/tcg-target.c.inc     | 117 +++++++++++++++++++++++++++++------
 files changed, 158 insertions(+), 24 deletions(-)
  create mode 100644 tcg/arm/tcg-target.opc.h
 diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target-con-set.h
 +++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I1(r)
  C_O0_I2(r, r)
  C_O0_I2(r, rIN)
  C_O0_I2(s, s)
 +C_O0_I2(w, r)
  C_O0_I3(s, s, s)
  C_O0_I4(r, r, rI, rI)
  C_O0_I4(s, s, s, s)
  C_O1_I1(r, l)
  C_O1_I1(r, r)
 +C_O1_I1(w, r)
 +C_O1_I1(w, wr)
  C_O1_I2(r, 0, rZ)
  C_O1_I2(r, l, l)
  C_O1_I2(r, r, r)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIK)
  C_O1_I2(r, r, rIN)
  C_O1_I2(r, r, ri)
  C_O1_I2(r, rZ, rZ)
 +C_O1_I2(w, w, w)
  C_O1_I4(r, r, r, rI, rI)
  C_O1_I4(r, r, rIN, rIK, 0)
  C_O2_I1(r, r, l)
 diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target-con-str.h
 +++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@
  REGS('r', ALL_GENERAL_REGS)
  REGS('l', ALL_QLOAD_REGS)
  REGS('s', ALL_QSTORE_REGS)
 +REGS('w', ALL_VECTOR_REGS)
  /*
   * Define constraint letters for constants:
 diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.h
 +++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
      TCG_REG_R13,
      TCG_REG_R14,
      TCG_REG_PC,
 +
 +    TCG_REG_Q0,
 +    TCG_REG_Q1,
 +    TCG_REG_Q2,
 +    TCG_REG_Q3,
 +    TCG_REG_Q4,
 +    TCG_REG_Q5,
 +    TCG_REG_Q6,
 +    TCG_REG_Q7,
 +    TCG_REG_Q8,
 +    TCG_REG_Q9,
 +    TCG_REG_Q10,
 +    TCG_REG_Q11,
 +    TCG_REG_Q12,
 +    TCG_REG_Q13,
 +    TCG_REG_Q14,
 +    TCG_REG_Q15,
 +
 +    TCG_AREG0 = TCG_REG_R6,
 +    TCG_REG_CALL_STACK = TCG_REG_R13,
  } TCGReg;
 -#define TCG_TARGET_NB_REGS 16
 +#define TCG_TARGET_NB_REGS 32
  #ifdef __ARM_ARCH_EXT_IDIV__
  #define use_idiv_instructions  1
  #else
  extern bool use_idiv_instructions;
  #endif
 -
 +#define use_neon_instructions  0
  /* used for function call generation */
 -#define TCG_REG_CALL_STACK        TCG_REG_R13
  #define TCG_TARGET_STACK_ALIGN        8
  #define TCG_TARGET_CALL_ALIGN_ARGS    1
  #define TCG_TARGET_CALL_STACK_OFFSET    0
@@ -XXX,XX +XXX,XX @@ extern bool use_idiv_instructions;
  #define TCG_TARGET_HAS_direct_jump      0
  #define TCG_TARGET_HAS_qemu_st8_i32     0
 -enum {
 -    TCG_AREG0 = TCG_REG_R6,
 -};
 +#define TCG_TARGET_HAS_v64              use_neon_instructions
 +#define TCG_TARGET_HAS_v128             use_neon_instructions
 +#define TCG_TARGET_HAS_v256             0
 +
 +#define TCG_TARGET_HAS_andc_vec         0
 +#define TCG_TARGET_HAS_orc_vec          0
 +#define TCG_TARGET_HAS_not_vec          0
 +#define TCG_TARGET_HAS_neg_vec          0
 +#define TCG_TARGET_HAS_abs_vec          0
 +#define TCG_TARGET_HAS_roti_vec         0
 +#define TCG_TARGET_HAS_rots_vec         0
 +#define TCG_TARGET_HAS_rotv_vec         0
 +#define TCG_TARGET_HAS_shi_vec          0
 +#define TCG_TARGET_HAS_shs_vec          0
 +#define TCG_TARGET_HAS_shv_vec          0
 +#define TCG_TARGET_HAS_mul_vec          0
 +#define TCG_TARGET_HAS_sat_vec          0
 +#define TCG_TARGET_HAS_minmax_vec       0
 +#define TCG_TARGET_HAS_bitsel_vec       0
 +#define TCG_TARGET_HAS_cmpsel_vec       0
  #define TCG_TARGET_DEFAULT_MO (0)
  #define TCG_TARGET_HAS_MEMORY_BSWAP     1
 diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/arm/tcg-target.opc.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Copyright (c) 2019 Linaro
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or
 + * (at your option) any later version.
 + *
 + * See the COPYING file in the top-level directory for details.
 + *
 + * Target-specific opcodes for host vector expansion.  These will be
 + * emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
 + * consider these to be UNSPEC with names.
 + */
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool use_idiv_instructions;
  #ifdef CONFIG_DEBUG_TCG
  static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 -    "%r0",
 -    "%r1",
 -    "%r2",
 -    "%r3",
 -    "%r4",
 -    "%r5",
 -    "%r6",
 -    "%r7",
 -    "%r8",
 -    "%r9",
 -    "%r10",
 -    "%r11",
 -    "%r12",
 -    "%r13",
 -    "%r14",
 -    "%pc",
 +    "%r0",  "%r1",  "%r2",  "%r3",  "%r4",  "%r5",  "%r6",  "%r7",
 +    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%sp",  "%r14", "%pc",
 +    "%q0",  "%q1",  "%q2",  "%q3",  "%q4",  "%q5",  "%q6",  "%q7",
 +    "%q8",  "%q9",  "%q10", "%q11", "%q12", "%q13", "%q14", "%q15",
  };
  #endif
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
      TCG_REG_R3,
      TCG_REG_R12,
      TCG_REG_R14,
 +
 +    TCG_REG_Q0,
 +    TCG_REG_Q1,
 +    TCG_REG_Q2,
 +    TCG_REG_Q3,
 +    /* Q4 - Q7 are call-saved, and skipped. */
 +    TCG_REG_Q8,
 +    TCG_REG_Q9,
 +    TCG_REG_Q10,
 +    TCG_REG_Q11,
 +    TCG_REG_Q12,
 +    TCG_REG_Q13,
 +    TCG_REG_Q14,
 +    TCG_REG_Q15,
  };
  static const int tcg_target_call_iarg_regs[4] = {
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[2] = {
  };
  #define TCG_REG_TMP  TCG_REG_R12
 +#define TCG_VEC_TMP  TCG_REG_Q15
  enum arm_cond_code_e {
      COND_EQ = 0x0,
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
  #define TCG_CT_CONST_ZERO 0x800
  #define ALL_GENERAL_REGS  0xffffu
 +#define ALL_VECTOR_REGS   0xffff0000u
  /*
   * r0-r2 will be overwritten when reading the tlb entry (softmmu only)
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_qemu_st_i64:
          return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
 +    case INDEX_op_st_vec:
 +        return C_O0_I2(w, r);
 +    case INDEX_op_ld_vec:
 +    case INDEX_op_dupm_vec:
 +        return C_O1_I1(w, r);
 +    case INDEX_op_dup_vec:
 +        return C_O1_I1(w, wr);
 +    case INDEX_op_dup2_vec:
 +    case INDEX_op_add_vec:
 +    case INDEX_op_sub_vec:
 +    case INDEX_op_xor_vec:
 +    case INDEX_op_or_vec:
 +    case INDEX_op_and_vec:
 +    case INDEX_op_cmp_vec:
 +        return C_O1_I2(w, w, w);
 +
      default:
          g_assert_not_reached();
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
  {
      /* Only probe for the platform and capabilities if we havn't already
         determined maximum values at compile time.  */
 -#ifndef use_idiv_instructions
 +#if !defined(use_idiv_instructions) || !defined(use_neon_instructions)
      {
          unsigned long hwcap = qemu_getauxval(AT_HWCAP);
 +#ifndef use_idiv_instructions
          use_idiv_instructions = (hwcap & HWCAP_ARM_IDIVA) != 0;
 +#endif
 +#ifndef use_neon_instructions
 +        use_neon_instructions = (hwcap & HWCAP_ARM_NEON) != 0;
 +#endif
      }
  #endif
 +
      if (__ARM_ARCH < 7) {
          const char *pl = (const char *)qemu_getauxval(AT_PLATFORM);
          if (pl != NULL && pl[0] == 'v' && pl[1] >= '4' && pl[1] <= '9') {
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
          }
      }
 -    tcg_target_available_regs[TCG_TYPE_I32] = 0xffff;
 +    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
      tcg_target_call_clobber_regs = 0;
      tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R0);
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
      tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R12);
      tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R14);
 +    if (use_neon_instructions) {
 +        tcg_target_available_regs[TCG_TYPE_V64]  = ALL_VECTOR_REGS;
 +        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
 +
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q0);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q1);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q2);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q3);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q8);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q9);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q10);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q11);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q12);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q13);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q14);
 +        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q15);
 +    }
 +
      s->reserved_regs = 0;
      tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
      tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
      tcg_regset_set_reg(s->reserved_regs, TCG_REG_PC);
 +    tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
  }
  static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
      tcg_out_movi32(s, COND_AL, ret, arg);
  }
 +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 +                            TCGReg rd, TCGReg rs)
 +{
 +    g_assert_not_reached();
 +}
 +
 +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 +                             TCGReg rd, TCGReg base, intptr_t offset)
 +{
 +    g_assert_not_reached();
 +}
 +
 +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 +                             TCGReg rd, int64_t v64)
 +{
 +    g_assert_not_reached();
 +}
 +
 +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 +                           unsigned vecl, unsigned vece,
 +                           const TCGArg *args, const int *const_args)
 +{
 +    g_assert_not_reached();
 +}
 +
 +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 +{
 +    return 0;
 +}
 +
 +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
 +                       TCGArg a0, ...)
 +{
 +    g_assert_not_reached();
 +}
 +
  static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
  {
      int i;
 --
 .25.1

-New patch
+[PULL 03/15] tcg/arm: Implement tcg_out_ld/st for vector types
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 70 ++++++++++++++++++++++++++++++++++++----
+file changed, 64 insertions(+), 6 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_NOP_v6k   = 0xe320f000,
+     /* Otherwise the assembler uses mov r0,r0 */
+     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
++
++    INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
++    INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
+ } ARMInsn;
+ #define INSN_NOP   (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4)
+@@ -XXX,XX +XXX,XX @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args,
+     }
+ }
++/*
++ * Note that TCGReg references Q-registers.
++ * D-regno = 2 * Q-regno, so shift left by 1 while inserting.
++ */
++static uint32_t encode_vd(TCGReg rd)
++{
++    tcg_debug_assert(rd >= TCG_REG_Q0);
++    return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13);
++}
++
++static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
++                          TCGReg rd, TCGReg rn, int offset)
++{
++    if (offset != 0) {
++        if (check_fit_imm(offset) || check_fit_imm(-offset)) {
++            tcg_out_dat_rIN(s, COND_AL, ARITH_ADD, ARITH_SUB,
++                            TCG_REG_TMP, rn, offset, true);
++        } else {
++            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, offset);
++            tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
++                            TCG_REG_TMP, TCG_REG_TMP, rn, 0);
++        }
++        rn = TCG_REG_TMP;
++    }
++    tcg_out32(s, insn | (rn << 16) | encode_vd(rd) | 0xf);
++}
++
+ #ifdef CONFIG_SOFTMMU
+ #include "../tcg-ldst.c.inc"
+@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
+     tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
+ }
+-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
+-                              TCGReg arg1, intptr_t arg2)
++static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
++                       TCGReg arg1, intptr_t arg2)
+ {
+-    tcg_out_ld32u(s, COND_AL, arg, arg1, arg2);
++    switch (type) {
++    case TCG_TYPE_I32:
++        tcg_out_ld32u(s, COND_AL, arg, arg1, arg2);
++        return;
++    case TCG_TYPE_V64:
++        /* regs 1; size 8; align 8 */
++        tcg_out_vldst(s, INSN_VLD1 | 0x7d0, arg, arg1, arg2);
++        return;
++    case TCG_TYPE_V128:
++        /* regs 2; size 8; align 16 */
++        tcg_out_vldst(s, INSN_VLD1 | 0xae0, arg, arg1, arg2);
++        return;
++    default:
++        g_assert_not_reached();
++    }
+ }
+-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+-                              TCGReg arg1, intptr_t arg2)
++static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
++                       TCGReg arg1, intptr_t arg2)
+ {
+-    tcg_out_st32(s, COND_AL, arg, arg1, arg2);
++    switch (type) {
++    case TCG_TYPE_I32:
++        tcg_out_st32(s, COND_AL, arg, arg1, arg2);
++        return;
++    case TCG_TYPE_V64:
++        /* regs 1; size 8; align 8 */
++        tcg_out_vldst(s, INSN_VST1 | 0x7d0, arg, arg1, arg2);
++        return;
++    case TCG_TYPE_V128:
++        /* regs 2; size 8; align 16 */
++        tcg_out_vldst(s, INSN_VST1 | 0xae0, arg, arg1, arg2);
++        return;
++    default:
++        g_assert_not_reached();
++    }
+ }
+ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
+--
+.25.1

-New patch
+[PULL 04/15] tcg/arm: Implement tcg_out_mov for vector types
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 52 +++++++++++++++++++++++++++++++++++-----
+file changed, 46 insertions(+), 6 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     /* Otherwise the assembler uses mov r0,r0 */
+     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
++    INSN_VORR      = 0xf2200110,
++
+     INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
+     INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
+ } ARMInsn;
+@@ -XXX,XX +XXX,XX @@ static uint32_t encode_vd(TCGReg rd)
+     return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13);
+ }
++static uint32_t encode_vn(TCGReg rn)
++{
++    tcg_debug_assert(rn >= TCG_REG_Q0);
++    return (extract32(rn, 3, 1) << 7) | (extract32(rn, 0, 3) << 17);
++}
++
++static uint32_t encode_vm(TCGReg rm)
++{
++    tcg_debug_assert(rm >= TCG_REG_Q0);
++    return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1);
++}
++
++static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
++                          TCGReg d, TCGReg n, TCGReg m)
++{
++    tcg_out32(s, insn | (vece << 20) | (q << 6) |
++              encode_vd(d) | encode_vn(n) | encode_vm(m));
++}
++
+ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
+                           TCGReg rd, TCGReg rn, int offset)
+ {
+@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
+     return false;
+ }
+-static inline bool tcg_out_mov(TCGContext *s, TCGType type,
+-                               TCGReg ret, TCGReg arg)
++static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+ {
+-    tcg_out_mov_reg(s, COND_AL, ret, arg);
+-    return true;
++    if (ret == arg) {
++        return true;
++    }
++    switch (type) {
++    case TCG_TYPE_I32:
++        if (ret < TCG_REG_Q0 && arg < TCG_REG_Q0) {
++            tcg_out_mov_reg(s, COND_AL, ret, arg);
++            return true;
++        }
++        return false;
++
++    case TCG_TYPE_V64:
++    case TCG_TYPE_V128:
++        /* "VMOV D,N" is an alias for "VORR D,N,N". */
++        tcg_out_vreg3(s, INSN_VORR, type - TCG_TYPE_V64, 0, ret, arg, arg);
++        return true;
++
++    default:
++        g_assert_not_reached();
++    }
+ }
+-static inline void tcg_out_movi(TCGContext *s, TCGType type,
+-                                TCGReg ret, tcg_target_long arg)
++static void tcg_out_movi(TCGContext *s, TCGType type,
++                         TCGReg ret, tcg_target_long arg)
+ {
++    tcg_debug_assert(type == TCG_TYPE_I32);
++    tcg_debug_assert(ret < TCG_REG_Q0);
+     tcg_out_movi32(s, COND_AL, ret, arg);
+ }
+--
+.25.1

-New patch
+[PULL 05/15] tcg/arm: Implement tcg_out_dup*_vec
+Most of dupi is copied from tcg/aarch64, which has the same
+encoding for AdvSimdExpandImm.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 283 +++++++++++++++++++++++++++++++++++++--
+file changed, 275 insertions(+), 8 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VORR      = 0xf2200110,
++    INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
++    INSN_VDUP_S    = 0xf3b00c00,  /* VDUP (scalar) */
++    INSN_VLDR_D    = 0xed100b00,  /* VLDR.64 */
+     INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
++    INSN_VLD1R     = 0xf4a00c00,  /* VLD1 (single element to all lanes) */
+     INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
++    INSN_VMOVI     = 0xf2800010,  /* VMOV (immediate) */
+ } ARMInsn;
+ #define INSN_NOP   (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4)
+@@ -XXX,XX +XXX,XX @@ static const uint8_t tcg_cond_to_arm_cond[] = {
+     [TCG_COND_GTU] = COND_HI,
+ };
++static int encode_imm(uint32_t imm);
++
++/* TCG private relocation type: add with pc+imm8 */
++#define R_ARM_PC8  11
++
++/* TCG private relocation type: vldr with imm8 << 2 */
++#define R_ARM_PC11 12
++
+ static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+ {
+     const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
+@@ -XXX,XX +XXX,XX @@ static bool reloc_pc13(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+     return false;
+ }
++static bool reloc_pc11(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
++{
++    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
++    ptrdiff_t offset = (tcg_ptr_byte_diff(target, src_rx) - 8) / 4;
++
++    if (offset >= -0xff && offset <= 0xff) {
++        tcg_insn_unit insn = *src_rw;
++        bool u = (offset >= 0);
++        if (!u) {
++            offset = -offset;
++        }
++        insn = deposit32(insn, 23, 1, u);
++        insn = deposit32(insn, 0, 8, offset);
++        *src_rw = insn;
++        return true;
++    }
++    return false;
++}
++
++static bool reloc_pc8(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
++{
++    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
++    ptrdiff_t offset = tcg_ptr_byte_diff(target, src_rx) - 8;
++    int rot = encode_imm(offset);
++
++    if (rot >= 0) {
++        *src_rw = deposit32(*src_rw, 0, 12, rol32(offset, rot) | (rot << 7));
++        return true;
++    }
++    return false;
++}
++
+ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
+                         intptr_t value, intptr_t addend)
+ {
+     tcg_debug_assert(addend == 0);
+-
+-    if (type == R_ARM_PC24) {
++    switch (type) {
++    case R_ARM_PC24:
+         return reloc_pc24(code_ptr, (const tcg_insn_unit *)value);
+-    } else if (type == R_ARM_PC13) {
++    case R_ARM_PC13:
+         return reloc_pc13(code_ptr, (const tcg_insn_unit *)value);
+-    } else {
++    case R_ARM_PC11:
++        return reloc_pc11(code_ptr, (const tcg_insn_unit *)value);
++    case R_ARM_PC8:
++        return reloc_pc8(code_ptr, (const tcg_insn_unit *)value);
++    default:
+         g_assert_not_reached();
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static inline uint32_t rotl(uint32_t val, int n)
+ /* ARM immediates for ALU instructions are made of an unsigned 8-bit
+    right-rotated by an even amount between 0 and 30. */
+-static inline int encode_imm(uint32_t imm)
++static int encode_imm(uint32_t imm)
+ {
+     int shift;
+@@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm)
+     return encode_imm(imm) >= 0;
+ }
++/* Return true if v16 is a valid 16-bit shifted immediate.  */
++static bool is_shimm16(uint16_t v16, int *cmode, int *imm8)
++{
++    if (v16 == (v16 & 0xff)) {
++        *cmode = 0x8;
++        *imm8 = v16 & 0xff;
++        return true;
++    } else if (v16 == (v16 & 0xff00)) {
++        *cmode = 0xa;
++        *imm8 = v16 >> 8;
++        return true;
++    }
++    return false;
++}
++
++/* Return true if v32 is a valid 32-bit shifted immediate.  */
++static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
++{
++    if (v32 == (v32 & 0xff)) {
++        *cmode = 0x0;
++        *imm8 = v32 & 0xff;
++        return true;
++    } else if (v32 == (v32 & 0xff00)) {
++        *cmode = 0x2;
++        *imm8 = (v32 >> 8) & 0xff;
++        return true;
++    } else if (v32 == (v32 & 0xff0000)) {
++        *cmode = 0x4;
++        *imm8 = (v32 >> 16) & 0xff;
++        return true;
++    } else if (v32 == (v32 & 0xff000000)) {
++        *cmode = 0x6;
++        *imm8 = v32 >> 24;
++        return true;
++    }
++    return false;
++}
++
++/* Return true if v32 is a valid 32-bit shifting ones immediate.  */
++static bool is_soimm32(uint32_t v32, int *cmode, int *imm8)
++{
++    if ((v32 & 0xffff00ff) == 0xff) {
++        *cmode = 0xc;
++        *imm8 = (v32 >> 8) & 0xff;
++        return true;
++    } else if ((v32 & 0xff00ffff) == 0xffff) {
++        *cmode = 0xd;
++        *imm8 = (v32 >> 16) & 0xff;
++        return true;
++    }
++    return false;
++}
++
++/*
++ * Return non-zero if v32 can be formed by MOVI+ORR.
++ * Place the parameters for MOVI in (cmode, imm8).
++ * Return the cmode for ORR; the imm8 can be had via extraction from v32.
++ */
++static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
++{
++    int i;
++
++    for (i = 6; i > 0; i -= 2) {
++        /* Mask out one byte we can add with ORR.  */
++        uint32_t tmp = v32 & ~(0xffu << (i * 4));
++        if (is_shimm32(tmp, cmode, imm8) ||
++            is_soimm32(tmp, cmode, imm8)) {
++            break;
++        }
++    }
++    return i;
++}
++
+ /* Test if a constant matches the constraint.
+  * TODO: define constraints for:
+  *
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
+               encode_vd(d) | encode_vn(n) | encode_vm(m));
+ }
++static void tcg_out_vmovi(TCGContext *s, TCGReg rd,
++                          int q, int op, int cmode, uint8_t imm8)
++{
++    tcg_out32(s, INSN_VMOVI | encode_vd(rd) | (q << 6) | (op << 5)
++              | (cmode << 8) | extract32(imm8, 0, 4)
++              | (extract32(imm8, 4, 3) << 16)
++              | (extract32(imm8, 7, 1) << 24));
++}
++
+ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
+                           TCGReg rd, TCGReg rn, int offset)
+ {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
+     tcg_out_movi32(s, COND_AL, ret, arg);
+ }
++/* Type is always V128, with I64 elements.  */
++static void tcg_out_dup2_vec(TCGContext *s, TCGReg rd, TCGReg rl, TCGReg rh)
++{
++    /* Move high element into place first. */
++    /* VMOV Dd+1, Ds */
++    tcg_out_vreg3(s, INSN_VORR | (1 << 12), 0, 0, rd, rh, rh);
++    /* Move low element into place; tcg_out_mov will check for nop. */
++    tcg_out_mov(s, TCG_TYPE_V64, rd, rl);
++}
++
+ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, TCGReg rs)
+ {
+-    g_assert_not_reached();
++    int q = type - TCG_TYPE_V64;
++
++    if (vece == MO_64) {
++        if (type == TCG_TYPE_V128) {
++            tcg_out_dup2_vec(s, rd, rs, rs);
++        } else {
++            tcg_out_mov(s, TCG_TYPE_V64, rd, rs);
++        }
++    } else if (rs < TCG_REG_Q0) {
++        int b = (vece == MO_8);
++        int e = (vece == MO_16);
++        tcg_out32(s, INSN_VDUP_G | (b << 22) | (q << 21) | (e << 5) |
++                  encode_vn(rd) | (rs << 12));
++    } else {
++        int imm4 = 1 << vece;
++        tcg_out32(s, INSN_VDUP_S | (imm4 << 16) | (q << 6) |
++                  encode_vd(rd) | encode_vm(rs));
++    }
++    return true;
+ }
+ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+                              TCGReg rd, TCGReg base, intptr_t offset)
+ {
+-    g_assert_not_reached();
++    if (vece == MO_64) {
++        tcg_out_ld(s, TCG_TYPE_V64, rd, base, offset);
++        if (type == TCG_TYPE_V128) {
++            tcg_out_dup2_vec(s, rd, rd, rd);
++        }
++    } else {
++        int q = type - TCG_TYPE_V64;
++        tcg_out_vldst(s, INSN_VLD1R | (vece << 6) | (q << 5),
++                      rd, base, offset);
++    }
++    return true;
+ }
+ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                              TCGReg rd, int64_t v64)
+ {
+-    g_assert_not_reached();
++    int q = type - TCG_TYPE_V64;
++    int cmode, imm8, i;
++
++    /* Test all bytes equal first.  */
++    if (vece == MO_8) {
++        tcg_out_vmovi(s, rd, q, 0, 0xe, v64);
++        return;
++    }
++
++    /*
++     * Test all bytes 0x00 or 0xff second.  This can match cases that
++     * might otherwise take 2 or 3 insns for MO_16 or MO_32 below.
++     */
++    for (i = imm8 = 0; i < 8; i++) {
++        uint8_t byte = v64 >> (i * 8);
++        if (byte == 0xff) {
++            imm8 |= 1 << i;
++        } else if (byte != 0) {
++            goto fail_bytes;
++        }
++    }
++    tcg_out_vmovi(s, rd, q, 1, 0xe, imm8);
++    return;
++ fail_bytes:
++
++    /*
++     * Tests for various replications.  For each element width, if we
++     * cannot find an expansion there's no point checking a larger
++     * width because we already know by replication it cannot match.
++     */
++    if (vece == MO_16) {
++        uint16_t v16 = v64;
++
++        if (is_shimm16(v16, &cmode, &imm8)) {
++            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
++            return;
++        }
++        if (is_shimm16(~v16, &cmode, &imm8)) {
++            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
++            return;
++        }
++
++        /*
++         * Otherwise, all remaining constants can be loaded in two insns:
++         * rd = v16 & 0xff, rd |= v16 & 0xff00.
++         */
++        tcg_out_vmovi(s, rd, q, 0, 0x8, v16 & 0xff);
++        tcg_out_vmovi(s, rd, q, 0, 0xb, v16 >> 8);   /* VORRI */
++        return;
++    }
++
++    if (vece == MO_32) {
++        uint32_t v32 = v64;
++
++        if (is_shimm32(v32, &cmode, &imm8) ||
++            is_soimm32(v32, &cmode, &imm8)) {
++            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
++            return;
++        }
++        if (is_shimm32(~v32, &cmode, &imm8) ||
++            is_soimm32(~v32, &cmode, &imm8)) {
++            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
++            return;
++        }
++
++        /*
++         * Restrict the set of constants to those we can load with
++         * two instructions.  Others we load from the pool.
++         */
++        i = is_shimm32_pair(v32, &cmode, &imm8);
++        if (i) {
++            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
++            tcg_out_vmovi(s, rd, q, 0, i | 1, extract32(v32, i * 4, 8));
++            return;
++        }
++        i = is_shimm32_pair(~v32, &cmode, &imm8);
++        if (i) {
++            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
++            tcg_out_vmovi(s, rd, q, 1, i | 1, extract32(~v32, i * 4, 8));
++            return;
++        }
++    }
++
++    /*
++     * As a last resort, load from the constant pool.
++     */
++    if (!q || vece == MO_64) {
++        new_pool_l2(s, R_ARM_PC11, s->code_ptr, 0, v64, v64 >> 32);
++        /* VLDR Dd, [pc + offset] */
++        tcg_out32(s, INSN_VLDR_D | encode_vd(rd) | (0xf << 16));
++        if (q) {
++            tcg_out_dup2_vec(s, rd, rd, rd);
++        }
++    } else {
++        new_pool_label(s, (uint32_t)v64, R_ARM_PC8, s->code_ptr, 0);
++        /* add tmp, pc, offset */
++        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_TMP, TCG_REG_PC, 0);
++        tcg_out_dupm_vec(s, type, MO_32, rd, TCG_REG_TMP, 0);
++    }
+ }
+ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+--
+.25.1

-New patch
+[PULL 06/15] tcg/arm: Implement minimal vector operations
+Implementing dup2, add, sub, and, or, xor as the minimal set.
+This allows us to actually enable neon in the header file.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target-con-set.h |   3 +
+ tcg/arm/tcg-target-con-str.h |   2 +
+ tcg/arm/tcg-target.h         |   6 +-
+ tcg/arm/tcg-target.c.inc     | 201 +++++++++++++++++++++++++++++++++--
+files changed, 204 insertions(+), 8 deletions(-)
+diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-set.h
++++ b/tcg/arm/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIN)
+ C_O1_I2(r, r, ri)
+ C_O1_I2(r, rZ, rZ)
+ C_O1_I2(w, w, w)
++C_O1_I2(w, w, wO)
++C_O1_I2(w, w, wV)
++C_O1_I2(w, w, wZ)
+ C_O1_I4(r, r, r, rI, rI)
+ C_O1_I4(r, r, rIN, rIK, 0)
+ C_O2_I1(r, r, l)
+diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-str.h
++++ b/tcg/arm/tcg-target-con-str.h
+@@ -XXX,XX +XXX,XX @@ REGS('w', ALL_VECTOR_REGS)
+ CONST('I', TCG_CT_CONST_ARM)
+ CONST('K', TCG_CT_CONST_INV)
+ CONST('N', TCG_CT_CONST_NEG)
++CONST('O', TCG_CT_CONST_ORRI)
++CONST('V', TCG_CT_CONST_ANDI)
+ CONST('Z', TCG_CT_CONST_ZERO)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+ #else
+ extern bool use_idiv_instructions;
+ #endif
+-#define use_neon_instructions  0
++#ifdef __ARM_NEON__
++#define use_neon_instructions  1
++#else
++extern bool use_neon_instructions;
++#endif
+ /* used for function call generation */
+ #define TCG_TARGET_STACK_ALIGN        8
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ int arm_arch = __ARM_ARCH;
+ #ifndef use_idiv_instructions
+ bool use_idiv_instructions;
+ #endif
++#ifndef use_neon_instructions
++bool use_neon_instructions;
++#endif
+ /* ??? Ought to think about changing CONFIG_SOFTMMU to always defined.  */
+ #ifdef CONFIG_SOFTMMU
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     /* Otherwise the assembler uses mov r0,r0 */
+     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
++    INSN_VADD      = 0xf2000800,
++    INSN_VAND      = 0xf2000110,
++    INSN_VEOR      = 0xf3000110,
+     INSN_VORR      = 0xf2200110,
++    INSN_VSUB      = 0xf3000800,
++
++    INSN_VMVN      = 0xf3b00580,
++
++    INSN_VCEQ0     = 0xf3b10100,
++    INSN_VCGT0     = 0xf3b10000,
++    INSN_VCGE0     = 0xf3b10080,
++    INSN_VCLE0     = 0xf3b10180,
++    INSN_VCLT0     = 0xf3b10200,
++
++    INSN_VCEQ      = 0xf3000810,
++    INSN_VCGE      = 0xf2000310,
++    INSN_VCGT      = 0xf2000300,
++    INSN_VCGE_U    = 0xf3000310,
++    INSN_VCGT_U    = 0xf3000300,
++
++    INSN_VTST      = 0xf2000810,
+     INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
+     INSN_VDUP_S    = 0xf3b00c00,  /* VDUP (scalar) */
+@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
+ #define TCG_CT_CONST_INV  0x200
+ #define TCG_CT_CONST_NEG  0x400
+ #define TCG_CT_CONST_ZERO 0x800
++#define TCG_CT_CONST_ORRI 0x1000
++#define TCG_CT_CONST_ANDI 0x2000
+ #define ALL_GENERAL_REGS  0xffffu
+ #define ALL_VECTOR_REGS   0xffff0000u
+@@ -XXX,XX +XXX,XX @@ static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
+     return i;
+ }
++/* Return true if v32 is a valid 16-bit or 32-bit shifted immediate.  */
++static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
++{
++    if (v32 == deposit32(v32, 16, 16, v32)) {
++        return is_shimm16(v32, cmode, imm8);
++    } else {
++        return is_shimm32(v32, cmode, imm8);
++    }
++}
++
+ /* Test if a constant matches the constraint.
+  * TODO: define constraints for:
+  *
+@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
+         return 1;
+     } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
+         return 1;
+-    } else {
+-        return 0;
+     }
++
++    switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) {
++    case 0:
++        break;
++    case TCG_CT_CONST_ANDI:
++        val = ~val;
++        /* fallthru */
++    case TCG_CT_CONST_ORRI:
++        if (val == deposit64(val, 32, 32, val)) {
++            int cmode, imm8;
++            return is_shimm1632(val, &cmode, &imm8);
++        }
++        break;
++    default:
++        /* Both bits should not be set for the same insn.  */
++        g_assert_not_reached();
++    }
++
++    return 0;
+ }
+ static inline void tcg_out_b(TCGContext *s, int cond, int32_t offset)
+@@ -XXX,XX +XXX,XX @@ static uint32_t encode_vm(TCGReg rm)
+     return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1);
+ }
++static void tcg_out_vreg2(TCGContext *s, ARMInsn insn, int q, int vece,
++                          TCGReg d, TCGReg m)
++{
++    tcg_out32(s, insn | (vece << 18) | (q << 6) |
++              encode_vd(d) | encode_vm(m));
++}
++
+ static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
+                           TCGReg d, TCGReg n, TCGReg m)
+ {
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_add_vec:
+     case INDEX_op_sub_vec:
+     case INDEX_op_xor_vec:
+-    case INDEX_op_or_vec:
+-    case INDEX_op_and_vec:
+-    case INDEX_op_cmp_vec:
+         return C_O1_I2(w, w, w);
++    case INDEX_op_or_vec:
++        return C_O1_I2(w, w, wO);
++    case INDEX_op_and_vec:
++        return C_O1_I2(w, w, wV);
++    case INDEX_op_cmp_vec:
++        return C_O1_I2(w, w, wZ);
+     default:
+         g_assert_not_reached();
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+     }
+ }
++static const ARMInsn vec_cmp_insn[16] = {
++    [TCG_COND_EQ] = INSN_VCEQ,
++    [TCG_COND_GT] = INSN_VCGT,
++    [TCG_COND_GE] = INSN_VCGE,
++    [TCG_COND_GTU] = INSN_VCGT_U,
++    [TCG_COND_GEU] = INSN_VCGE_U,
++};
++
++static const ARMInsn vec_cmp0_insn[16] = {
++    [TCG_COND_EQ] = INSN_VCEQ0,
++    [TCG_COND_GT] = INSN_VCGT0,
++    [TCG_COND_GE] = INSN_VCGE0,
++    [TCG_COND_LT] = INSN_VCLT0,
++    [TCG_COND_LE] = INSN_VCLE0,
++};
++
+ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+                            unsigned vecl, unsigned vece,
+                            const TCGArg *args, const int *const_args)
+ {
+-    g_assert_not_reached();
++    TCGType type = vecl + TCG_TYPE_V64;
++    unsigned q = vecl;
++    TCGArg a0, a1, a2;
++    int cmode, imm8;
++
++    a0 = args[0];
++    a1 = args[1];
++    a2 = args[2];
++
++    switch (opc) {
++    case INDEX_op_ld_vec:
++        tcg_out_ld(s, type, a0, a1, a2);
++        return;
++    case INDEX_op_st_vec:
++        tcg_out_st(s, type, a0, a1, a2);
++        return;
++    case INDEX_op_dupm_vec:
++        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
++        return;
++    case INDEX_op_dup2_vec:
++        tcg_out_dup2_vec(s, a0, a1, a2);
++        return;
++    case INDEX_op_add_vec:
++        tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_sub_vec:
++        tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_xor_vec:
++        tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
++        return;
++
++    case INDEX_op_and_vec:
++        if (const_args[2]) {
++            is_shimm1632(~a2, &cmode, &imm8);
++            if (a0 == a1) {
++                tcg_out_vmovi(s, a0, q, 1, cmode | 1, imm8); /* VBICI */
++                return;
++            }
++            tcg_out_vmovi(s, a0, q, 1, cmode, imm8); /* VMVNI */
++            a2 = a0;
++        }
++        tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2);
++        return;
++
++    case INDEX_op_or_vec:
++        if (const_args[2]) {
++            is_shimm1632(a2, &cmode, &imm8);
++            if (a0 == a1) {
++                tcg_out_vmovi(s, a0, q, 0, cmode | 1, imm8); /* VORRI */
++                return;
++            }
++            tcg_out_vmovi(s, a0, q, 0, cmode, imm8); /* VMOVI */
++            a2 = a0;
++        }
++        tcg_out_vreg3(s, INSN_VORR, q, 0, a0, a1, a2);
++        return;
++
++    case INDEX_op_cmp_vec:
++        {
++            TCGCond cond = args[3];
++
++            if (cond == TCG_COND_NE) {
++                if (const_args[2]) {
++                    tcg_out_vreg3(s, INSN_VTST, q, vece, a0, a1, a1);
++                } else {
++                    tcg_out_vreg3(s, INSN_VCEQ, q, vece, a0, a1, a2);
++                    tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a0);
++                }
++            } else {
++                ARMInsn insn;
++
++                if (const_args[2]) {
++                    insn = vec_cmp0_insn[cond];
++                    if (insn) {
++                        tcg_out_vreg2(s, insn, q, vece, a0, a1);
++                        return;
++                    }
++                    tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
++                    a2 = TCG_VEC_TMP;
++                }
++                insn = vec_cmp_insn[cond];
++                if (insn == 0) {
++                    TCGArg t;
++                    t = a1, a1 = a2, a2 = t;
++                    cond = tcg_swap_cond(cond);
++                    insn = vec_cmp_insn[cond];
++                    tcg_debug_assert(insn != 0);
++                }
++                tcg_out_vreg3(s, insn, q, vece, a0, a1, a2);
++            }
++        }
++        return;
++
++    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
++    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
++    default:
++        g_assert_not_reached();
++    }
+ }
+ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+ {
+-    return 0;
++    switch (opc) {
++    case INDEX_op_add_vec:
++    case INDEX_op_sub_vec:
++    case INDEX_op_and_vec:
++    case INDEX_op_or_vec:
++    case INDEX_op_xor_vec:
++        return 1;
++    case INDEX_op_cmp_vec:
++        return vece < MO_64;
++    default:
++        return 0;
++    }
+ }
+ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+--
+.25.1

-[PULL for-5.0 1/1] tcg/mips: mips sync* encode error
+[PULL 07/15] tcg/arm: Implement andc, orc, abs, neg, not vector operations
-From: lixinyu <precinct@mail.ustc.edu.cn>
+These logical and arithmetic operations are optional, but are
 trivial to accomplish with the existing infrastructure.
-OPC_SYNC_WMB, OPC_SYNC_MB, OPC_SYNC_ACQUIRE, OPC_SYNC_RELEASE and
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 OPC_SYNC_RMB have wrong encode. According to the mips manual,
 their encode should be 'OPC_SYNC | 0x?? << 6' rather than
 'OPC_SYNC | 0x?? << 5'. Wrong encode can lead to illegal instruction
 errors. These instructions often appear with multi-threaded
 simulation.
 Fixes: 6f0b99104a3 ("tcg/mips: Add support for fence")
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Reviewed-by: Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: lixinyu <precinct@mail.ustc.edu.cn>
 Message-Id: <20200411124612.12560-1-precinct@mail.ustc.edu.cn>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/mips/tcg-target.inc.c | 10 +++++-----
+ tcg/arm/tcg-target-con-set.h |  1 +
-file changed, 5 insertions(+), 5 deletions(-)
+ tcg/arm/tcg-target.h         | 10 +++++-----
  tcg/arm/tcg-target.c.inc     | 38 ++++++++++++++++++++++++++++++++++++
 files changed, 44 insertions(+), 5 deletions(-)
-diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
+diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/mips/tcg-target.inc.c
+--- a/tcg/arm/tcg-target-con-set.h
-+++ b/tcg/mips/tcg-target.inc.c
++++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I4(s, s, s, s)
  C_O1_I1(r, l)
  C_O1_I1(r, r)
  C_O1_I1(w, r)
 +C_O1_I1(w, w)
  C_O1_I1(w, wr)
  C_O1_I2(r, 0, rZ)
  C_O1_I2(r, l, l)
 diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.h
 +++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
  #define TCG_TARGET_HAS_v128             use_neon_instructions
  #define TCG_TARGET_HAS_v256             0
 -#define TCG_TARGET_HAS_andc_vec         0
 -#define TCG_TARGET_HAS_orc_vec          0
 -#define TCG_TARGET_HAS_not_vec          0
 -#define TCG_TARGET_HAS_neg_vec          0
 -#define TCG_TARGET_HAS_abs_vec          0
 +#define TCG_TARGET_HAS_andc_vec         1
 +#define TCG_TARGET_HAS_orc_vec          1
 +#define TCG_TARGET_HAS_not_vec          1
 +#define TCG_TARGET_HAS_neg_vec          1
 +#define TCG_TARGET_HAS_abs_vec          1
  #define TCG_TARGET_HAS_roti_vec         0
  #define TCG_TARGET_HAS_rots_vec         0
  #define TCG_TARGET_HAS_rotv_vec         0
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
 @@ -XXX,XX +XXX,XX @@ typedef enum {
-     /* MIPS r6 introduced names for weaker variants of SYNC.  These are
+     INSN_VADD      = 0xf2000800,
-        backward compatible to previous architecture revisions.  */
+     INSN_VAND      = 0xf2000110,
--    OPC_SYNC_WMB     = OPC_SYNC | 0x04 << 5,
++    INSN_VBIC      = 0xf2100110,
--    OPC_SYNC_MB      = OPC_SYNC | 0x10 << 5,
+     INSN_VEOR      = 0xf3000110,
--    OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 5,
++    INSN_VORN      = 0xf2300110,
--    OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 5,
+     INSN_VORR      = 0xf2200110,
--    OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 5,
+     INSN_VSUB      = 0xf3000800,
-+    OPC_SYNC_WMB     = OPC_SYNC | 0x04 << 6,
-+    OPC_SYNC_MB      = OPC_SYNC | 0x10 << 6,
++    INSN_VABS      = 0xf3b10300,
-+    OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 6,
+     INSN_VMVN      = 0xf3b00580,
-+    OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 6,
++    INSN_VNEG      = 0xf3b10380,
-+    OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 6,
+     INSN_VCEQ0     = 0xf3b10100,
-     /* Aliases for convenience.  */
+     INSN_VCGT0     = 0xf3b10000,
-     ALIAS_PADD     = sizeof(void *) == 4 ? OPC_ADDU : OPC_DADDU,
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
          return C_O1_I1(w, r);
      case INDEX_op_dup_vec:
          return C_O1_I1(w, wr);
 +    case INDEX_op_abs_vec:
 +    case INDEX_op_neg_vec:
 +    case INDEX_op_not_vec:
 +        return C_O1_I1(w, w);
      case INDEX_op_dup2_vec:
      case INDEX_op_add_vec:
      case INDEX_op_sub_vec:
      case INDEX_op_xor_vec:
          return C_O1_I2(w, w, w);
      case INDEX_op_or_vec:
 +    case INDEX_op_andc_vec:
          return C_O1_I2(w, w, wO);
      case INDEX_op_and_vec:
 +    case INDEX_op_orc_vec:
          return C_O1_I2(w, w, wV);
      case INDEX_op_cmp_vec:
          return C_O1_I2(w, w, wZ);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_dup2_vec:
          tcg_out_dup2_vec(s, a0, a1, a2);
          return;
 +    case INDEX_op_abs_vec:
 +        tcg_out_vreg2(s, INSN_VABS, q, vece, a0, a1);
 +        return;
 +    case INDEX_op_neg_vec:
 +        tcg_out_vreg2(s, INSN_VNEG, q, vece, a0, a1);
 +        return;
 +    case INDEX_op_not_vec:
 +        tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a1);
 +        return;
      case INDEX_op_add_vec:
          tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2);
          return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
          return;
 +    case INDEX_op_andc_vec:
 +        if (!const_args[2]) {
 +            tcg_out_vreg3(s, INSN_VBIC, q, 0, a0, a1, a2);
 +            return;
 +        }
 +        a2 = ~a2;
 +        /* fall through */
      case INDEX_op_and_vec:
          if (const_args[2]) {
              is_shimm1632(~a2, &cmode, &imm8);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2);
          return;
 +    case INDEX_op_orc_vec:
 +        if (!const_args[2]) {
 +            tcg_out_vreg3(s, INSN_VORN, q, 0, a0, a1, a2);
 +            return;
 +        }
 +        a2 = ~a2;
 +        /* fall through */
      case INDEX_op_or_vec:
          if (const_args[2]) {
              is_shimm1632(a2, &cmode, &imm8);
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
      case INDEX_op_add_vec:
      case INDEX_op_sub_vec:
      case INDEX_op_and_vec:
 +    case INDEX_op_andc_vec:
      case INDEX_op_or_vec:
 +    case INDEX_op_orc_vec:
      case INDEX_op_xor_vec:
 +    case INDEX_op_not_vec:
          return 1;
 +    case INDEX_op_abs_vec:
      case INDEX_op_cmp_vec:
 +    case INDEX_op_neg_vec:
          return vece < MO_64;
      default:
          return 0;
 --
-.20.1
+.25.1

-New patch
+[PULL 08/15] tcg/arm: Implement TCG_TARGET_HAS_shi_vec
+This consists of the three immediate shifts: shli, shri, sari.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |  2 +-
+ tcg/arm/tcg-target.c.inc | 27 +++++++++++++++++++++++++++
+files changed, 28 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_roti_vec         0
+ #define TCG_TARGET_HAS_rots_vec         0
+ #define TCG_TARGET_HAS_rotv_vec         0
+-#define TCG_TARGET_HAS_shi_vec          0
++#define TCG_TARGET_HAS_shi_vec          1
+ #define TCG_TARGET_HAS_shs_vec          0
+ #define TCG_TARGET_HAS_shv_vec          0
+ #define TCG_TARGET_HAS_mul_vec          0
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VCGE_U    = 0xf3000310,
+     INSN_VCGT_U    = 0xf3000300,
++    INSN_VSHLI     = 0xf2800510,  /* VSHL (immediate) */
++    INSN_VSARI     = 0xf2800010,  /* VSHR.S */
++    INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
++
+     INSN_VTST      = 0xf2000810,
+     INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vmovi(TCGContext *s, TCGReg rd,
+               | (extract32(imm8, 7, 1) << 24));
+ }
++static void tcg_out_vshifti(TCGContext *s, ARMInsn insn, int q,
++                            TCGReg rd, TCGReg rm, int l_imm6)
++{
++    tcg_out32(s, insn | (q << 6) | encode_vd(rd) | encode_vm(rm) |
++              (extract32(l_imm6, 6, 1) << 7) |
++              (extract32(l_imm6, 0, 6) << 16));
++}
++
+ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
+                           TCGReg rd, TCGReg rn, int offset)
+ {
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_abs_vec:
+     case INDEX_op_neg_vec:
+     case INDEX_op_not_vec:
++    case INDEX_op_shli_vec:
++    case INDEX_op_shri_vec:
++    case INDEX_op_sari_vec:
+         return C_O1_I1(w, w);
+     case INDEX_op_dup2_vec:
+     case INDEX_op_add_vec:
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_xor_vec:
+         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
+         return;
++    case INDEX_op_shli_vec:
++        tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece));
++        return;
++    case INDEX_op_shri_vec:
++        tcg_out_vshifti(s, INSN_VSHRI, q, a0, a1, (16 << vece) - a2);
++        return;
++    case INDEX_op_sari_vec:
++        tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2);
++        return;
+     case INDEX_op_andc_vec:
+         if (!const_args[2]) {
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_orc_vec:
+     case INDEX_op_xor_vec:
+     case INDEX_op_not_vec:
++    case INDEX_op_shli_vec:
++    case INDEX_op_shri_vec:
++    case INDEX_op_sari_vec:
+         return 1;
+     case INDEX_op_abs_vec:
+     case INDEX_op_cmp_vec:
+--
+.25.1

-New patch
+[PULL 09/15] tcg/arm: Implement TCG_TARGET_HAS_mul_vec
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     | 2 +-
+ tcg/arm/tcg-target.c.inc | 6 ++++++
+files changed, 7 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_shi_vec          1
+ #define TCG_TARGET_HAS_shs_vec          0
+ #define TCG_TARGET_HAS_shv_vec          0
+-#define TCG_TARGET_HAS_mul_vec          0
++#define TCG_TARGET_HAS_mul_vec          1
+ #define TCG_TARGET_HAS_sat_vec          0
+ #define TCG_TARGET_HAS_minmax_vec       0
+ #define TCG_TARGET_HAS_bitsel_vec       0
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VORN      = 0xf2300110,
+     INSN_VORR      = 0xf2200110,
+     INSN_VSUB      = 0xf3000800,
++    INSN_VMUL      = 0xf2000910,
+     INSN_VABS      = 0xf3b10300,
+     INSN_VMVN      = 0xf3b00580,
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+         return C_O1_I1(w, w);
+     case INDEX_op_dup2_vec:
+     case INDEX_op_add_vec:
++    case INDEX_op_mul_vec:
+     case INDEX_op_sub_vec:
+     case INDEX_op_xor_vec:
+         return C_O1_I2(w, w, w);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_add_vec:
+         tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2);
+         return;
++    case INDEX_op_mul_vec:
++        tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2);
++        return;
+     case INDEX_op_sub_vec:
+         tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
+         return;
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+         return 1;
+     case INDEX_op_abs_vec:
+     case INDEX_op_cmp_vec:
++    case INDEX_op_mul_vec:
+     case INDEX_op_neg_vec:
+         return vece < MO_64;
+     default:
+--
+.25.1

-New patch
+[PULL 10/15] tcg/arm: Implement TCG_TARGET_HAS_sat_vec
+This is saturating add and subtract, signed and unsigned.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |  2 +-
+ tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++
+files changed, 25 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_shs_vec          0
+ #define TCG_TARGET_HAS_shv_vec          0
+ #define TCG_TARGET_HAS_mul_vec          1
+-#define TCG_TARGET_HAS_sat_vec          0
++#define TCG_TARGET_HAS_sat_vec          1
+ #define TCG_TARGET_HAS_minmax_vec       0
+ #define TCG_TARGET_HAS_bitsel_vec       0
+ #define TCG_TARGET_HAS_cmpsel_vec       0
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VORR      = 0xf2200110,
+     INSN_VSUB      = 0xf3000800,
+     INSN_VMUL      = 0xf2000910,
++    INSN_VQADD     = 0xf2000010,
++    INSN_VQADD_U   = 0xf3000010,
++    INSN_VQSUB     = 0xf2000210,
++    INSN_VQSUB_U   = 0xf3000210,
+     INSN_VABS      = 0xf3b10300,
+     INSN_VMVN      = 0xf3b00580,
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_dup2_vec:
+     case INDEX_op_add_vec:
+     case INDEX_op_mul_vec:
++    case INDEX_op_ssadd_vec:
++    case INDEX_op_sssub_vec:
+     case INDEX_op_sub_vec:
++    case INDEX_op_usadd_vec:
++    case INDEX_op_ussub_vec:
+     case INDEX_op_xor_vec:
+         return C_O1_I2(w, w, w);
+     case INDEX_op_or_vec:
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_sub_vec:
+         tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
+         return;
++    case INDEX_op_ssadd_vec:
++        tcg_out_vreg3(s, INSN_VQADD, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_sssub_vec:
++        tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_usadd_vec:
++        tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_ussub_vec:
++        tcg_out_vreg3(s, INSN_VQSUB_U, q, vece, a0, a1, a2);
++        return;
+     case INDEX_op_xor_vec:
+         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
+         return;
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_shli_vec:
+     case INDEX_op_shri_vec:
+     case INDEX_op_sari_vec:
++    case INDEX_op_ssadd_vec:
++    case INDEX_op_sssub_vec:
++    case INDEX_op_usadd_vec:
++    case INDEX_op_ussub_vec:
+         return 1;
+     case INDEX_op_abs_vec:
+     case INDEX_op_cmp_vec:
+--
+.25.1

-New patch
+[PULL 11/15] tcg/arm: Implement TCG_TARGET_HAS_minmax_vec
+This is minimum and maximum, signed and unsigned.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |  2 +-
+ tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++
+files changed, 25 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_shv_vec          0
+ #define TCG_TARGET_HAS_mul_vec          1
+ #define TCG_TARGET_HAS_sat_vec          1
+-#define TCG_TARGET_HAS_minmax_vec       0
++#define TCG_TARGET_HAS_minmax_vec       1
+ #define TCG_TARGET_HAS_bitsel_vec       0
+ #define TCG_TARGET_HAS_cmpsel_vec       0
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VQADD_U   = 0xf3000010,
+     INSN_VQSUB     = 0xf2000210,
+     INSN_VQSUB_U   = 0xf3000210,
++    INSN_VMAX      = 0xf2000600,
++    INSN_VMAX_U    = 0xf3000600,
++    INSN_VMIN      = 0xf2000610,
++    INSN_VMIN_U    = 0xf3000610,
+     INSN_VABS      = 0xf3b10300,
+     INSN_VMVN      = 0xf3b00580,
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_dup2_vec:
+     case INDEX_op_add_vec:
+     case INDEX_op_mul_vec:
++    case INDEX_op_smax_vec:
++    case INDEX_op_smin_vec:
+     case INDEX_op_ssadd_vec:
+     case INDEX_op_sssub_vec:
+     case INDEX_op_sub_vec:
++    case INDEX_op_umax_vec:
++    case INDEX_op_umin_vec:
+     case INDEX_op_usadd_vec:
+     case INDEX_op_ussub_vec:
+     case INDEX_op_xor_vec:
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_mul_vec:
+         tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2);
+         return;
++    case INDEX_op_smax_vec:
++        tcg_out_vreg3(s, INSN_VMAX, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_smin_vec:
++        tcg_out_vreg3(s, INSN_VMIN, q, vece, a0, a1, a2);
++        return;
+     case INDEX_op_sub_vec:
+         tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
+         return;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_sssub_vec:
+         tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2);
+         return;
++    case INDEX_op_umax_vec:
++        tcg_out_vreg3(s, INSN_VMAX_U, q, vece, a0, a1, a2);
++        return;
++    case INDEX_op_umin_vec:
++        tcg_out_vreg3(s, INSN_VMIN_U, q, vece, a0, a1, a2);
++        return;
+     case INDEX_op_usadd_vec:
+         tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2);
+         return;
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_cmp_vec:
+     case INDEX_op_mul_vec:
+     case INDEX_op_neg_vec:
++    case INDEX_op_smax_vec:
++    case INDEX_op_smin_vec:
++    case INDEX_op_umax_vec:
++    case INDEX_op_umin_vec:
+         return vece < MO_64;
+     default:
+         return 0;
+--
+.25.1

-New patch
+[PULL 12/15] tcg/arm: Implement TCG_TARGET_HAS_bitsel_vec
+NEON has 3 instructions implementing this 4 argument operation,
+with each insn overlapping a different logical input onto the
+destination register.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target-con-set.h |  1 +
+ tcg/arm/tcg-target.h         |  2 +-
+ tcg/arm/tcg-target.c.inc     | 22 ++++++++++++++++++++--
+files changed, 22 insertions(+), 3 deletions(-)
+diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-set.h
++++ b/tcg/arm/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, w)
+ C_O1_I2(w, w, wO)
+ C_O1_I2(w, w, wV)
+ C_O1_I2(w, w, wZ)
++C_O1_I3(w, w, w, w)
+ C_O1_I4(r, r, r, rI, rI)
+ C_O1_I4(r, r, rIN, rIK, 0)
+ C_O2_I1(r, r, l)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_mul_vec          1
+ #define TCG_TARGET_HAS_sat_vec          1
+ #define TCG_TARGET_HAS_minmax_vec       1
+-#define TCG_TARGET_HAS_bitsel_vec       0
++#define TCG_TARGET_HAS_bitsel_vec       1
+ #define TCG_TARGET_HAS_cmpsel_vec       0
+ #define TCG_TARGET_DEFAULT_MO (0)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VSARI     = 0xf2800010,  /* VSHR.S */
+     INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
++    INSN_VBSL      = 0xf3100110,
++    INSN_VBIT      = 0xf3200110,
++    INSN_VBIF      = 0xf3300110,
++
+     INSN_VTST      = 0xf2000810,
+     INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+         return C_O1_I2(w, w, wV);
+     case INDEX_op_cmp_vec:
+         return C_O1_I2(w, w, wZ);
+-
++    case INDEX_op_bitsel_vec:
++        return C_O1_I3(w, w, w, w);
+     default:
+         g_assert_not_reached();
+     }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+ {
+     TCGType type = vecl + TCG_TYPE_V64;
+     unsigned q = vecl;
+-    TCGArg a0, a1, a2;
++    TCGArg a0, a1, a2, a3;
+     int cmode, imm8;
+     a0 = args[0];
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+         }
+         return;
++    case INDEX_op_bitsel_vec:
++        a3 = args[3];
++        if (a0 == a3) {
++            tcg_out_vreg3(s, INSN_VBIT, q, 0, a0, a2, a1);
++        } else if (a0 == a2) {
++            tcg_out_vreg3(s, INSN_VBIF, q, 0, a0, a3, a1);
++        } else {
++            tcg_out_mov(s, type, a0, a1);
++            tcg_out_vreg3(s, INSN_VBSL, q, 0, a0, a2, a3);
++        }
++        return;
++
+     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
+     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
+     default:
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_sssub_vec:
+     case INDEX_op_usadd_vec:
+     case INDEX_op_ussub_vec:
++    case INDEX_op_bitsel_vec:
+         return 1;
+     case INDEX_op_abs_vec:
+     case INDEX_op_cmp_vec:
+--
+.25.1

-New patch
+[PULL 13/15] tcg/arm: Implement TCG_TARGET_HAS_shv_vec
+The three vector shift by vector operations are all implemented via
+expansion.  Therefore do not actually set TCG_TARGET_HAS_shv_vec,
+as none of shlv_vec, shrv_vec, sarv_vec may actually appear in the
+instruction stream, and therefore also do not appear in tcg_target_op_def.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.opc.h |  3 ++
+ tcg/arm/tcg-target.c.inc | 61 +++++++++++++++++++++++++++++++++++++++-
+files changed, 63 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.opc.h
++++ b/tcg/arm/tcg-target.opc.h
+@@ -XXX,XX +XXX,XX @@
+  * emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
+  * consider these to be UNSPEC with names.
+  */
++
++DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC)
++DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VSHLI     = 0xf2800510,  /* VSHL (immediate) */
+     INSN_VSARI     = 0xf2800010,  /* VSHR.S */
+     INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
++    INSN_VSHL_S    = 0xf2000400,  /* VSHL.S (register) */
++    INSN_VSHL_U    = 0xf3000400,  /* VSHL.U (register) */
+     INSN_VBSL      = 0xf3100110,
+     INSN_VBIT      = 0xf3200110,
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_usadd_vec:
+     case INDEX_op_ussub_vec:
+     case INDEX_op_xor_vec:
++    case INDEX_op_arm_sshl_vec:
++    case INDEX_op_arm_ushl_vec:
+         return C_O1_I2(w, w, w);
+     case INDEX_op_or_vec:
+     case INDEX_op_andc_vec:
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_xor_vec:
+         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
+         return;
++    case INDEX_op_arm_sshl_vec:
++        /*
++         * Note that Vm is the data and Vn is the shift count,
++         * therefore the arguments appear reversed.
++         */
++        tcg_out_vreg3(s, INSN_VSHL_S, q, vece, a0, a2, a1);
++        return;
++    case INDEX_op_arm_ushl_vec:
++        /* See above. */
++        tcg_out_vreg3(s, INSN_VSHL_U, q, vece, a0, a2, a1);
++        return;
+     case INDEX_op_shli_vec:
+         tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece));
+         return;
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_umax_vec:
+     case INDEX_op_umin_vec:
+         return vece < MO_64;
++    case INDEX_op_shlv_vec:
++    case INDEX_op_shrv_vec:
++    case INDEX_op_sarv_vec:
++        return -1;
+     default:
+         return 0;
+     }
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+                        TCGArg a0, ...)
+ {
+-    g_assert_not_reached();
++    va_list va;
++    TCGv_vec v0, v1, v2, t1;
++    TCGArg a2;
++
++    va_start(va, a0);
++    v0 = temp_tcgv_vec(arg_temp(a0));
++    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
++    a2 = va_arg(va, TCGArg);
++    va_end(va);
++
++    switch (opc) {
++    case INDEX_op_shlv_vec:
++        /*
++         * Merely propagate shlv_vec to arm_ushl_vec.
++         * In this way we don't set TCG_TARGET_HAS_shv_vec
++         * because everything is done via expansion.
++         */
++        v2 = temp_tcgv_vec(arg_temp(a2));
++        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
++        break;
++
++    case INDEX_op_shrv_vec:
++    case INDEX_op_sarv_vec:
++        /* Right shifts are negative left shifts for NEON.  */
++        v2 = temp_tcgv_vec(arg_temp(a2));
++        t1 = tcg_temp_new_vec(type);
++        tcg_gen_neg_vec(vece, t1, v2);
++        if (opc == INDEX_op_shrv_vec) {
++            opc = INDEX_op_arm_ushl_vec;
++        } else {
++            opc = INDEX_op_arm_sshl_vec;
++        }
++        vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
++        tcg_temp_free_vec(t1);
++        break;
++
++    default:
++        g_assert_not_reached();
++    }
+ }
+ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
+--
+.25.1

-New patch
+[PULL 14/15] tcg/arm: Implement TCG_TARGET_HAS_roti_vec
+Implement via expansion, so don't actually set TCG_TARGET_HAS_roti_vec.
+For NEON, this is shift-right followed by shift-left-and-insert.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target-con-set.h |  1 +
+ tcg/arm/tcg-target.opc.h     |  1 +
+ tcg/arm/tcg-target.c.inc     | 15 +++++++++++++++
+files changed, 17 insertions(+)
+diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-set.h
++++ b/tcg/arm/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIK)
+ C_O1_I2(r, r, rIN)
+ C_O1_I2(r, r, ri)
+ C_O1_I2(r, rZ, rZ)
++C_O1_I2(w, 0, w)
+ C_O1_I2(w, w, w)
+ C_O1_I2(w, w, wO)
+ C_O1_I2(w, w, wV)
+diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.opc.h
++++ b/tcg/arm/tcg-target.opc.h
+@@ -XXX,XX +XXX,XX @@
+  * consider these to be UNSPEC with names.
+  */
++DEF(arm_sli_vec, 1, 2, 1, IMPLVEC)
+ DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC)
+ DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     INSN_VSHLI     = 0xf2800510,  /* VSHL (immediate) */
+     INSN_VSARI     = 0xf2800010,  /* VSHR.S */
+     INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
++    INSN_VSLI      = 0xf3800510,
+     INSN_VSHL_S    = 0xf2000400,  /* VSHL.S (register) */
+     INSN_VSHL_U    = 0xf3000400,  /* VSHL.U (register) */
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_arm_sshl_vec:
+     case INDEX_op_arm_ushl_vec:
+         return C_O1_I2(w, w, w);
++    case INDEX_op_arm_sli_vec:
++        return C_O1_I2(w, 0, w);
+     case INDEX_op_or_vec:
+     case INDEX_op_andc_vec:
+         return C_O1_I2(w, w, wO);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_sari_vec:
+         tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2);
+         return;
++    case INDEX_op_arm_sli_vec:
++        tcg_out_vshifti(s, INSN_VSLI, q, a0, a2, args[3] + (8 << vece));
++        return;
+     case INDEX_op_andc_vec:
+         if (!const_args[2]) {
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_shlv_vec:
+     case INDEX_op_shrv_vec:
+     case INDEX_op_sarv_vec:
++    case INDEX_op_rotli_vec:
+         return -1;
+     default:
+         return 0;
+@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+         tcg_temp_free_vec(t1);
+         break;
++    case INDEX_op_rotli_vec:
++        t1 = tcg_temp_new_vec(type);
++        tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1));
++        vec_gen_4(INDEX_op_arm_sli_vec, type, vece,
++                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2);
++        tcg_temp_free_vec(t1);
++        break;
++
+     default:
+         g_assert_not_reached();
+     }
+--
+.25.1

-New patch
+[PULL 15/15] tcg/arm: Implement TCG_TARGET_HAS_rotv_vec
+Implement via expansion, so don't actually set TCG_TARGET_HAS_rotv_vec.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 35 ++++++++++++++++++++++++++++++++++-
+file changed, 34 insertions(+), 1 deletion(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+     case INDEX_op_shrv_vec:
+     case INDEX_op_sarv_vec:
+     case INDEX_op_rotli_vec:
++    case INDEX_op_rotlv_vec:
++    case INDEX_op_rotrv_vec:
+         return -1;
+     default:
+         return 0;
+@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+                        TCGArg a0, ...)
+ {
+     va_list va;
+-    TCGv_vec v0, v1, v2, t1;
++    TCGv_vec v0, v1, v2, t1, t2, c1;
+     TCGArg a2;
+     va_start(va, a0);
+@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+         tcg_temp_free_vec(t1);
+         break;
++    case INDEX_op_rotlv_vec:
++        v2 = temp_tcgv_vec(arg_temp(a2));
++        t1 = tcg_temp_new_vec(type);
++        c1 = tcg_constant_vec(type, vece, 8 << vece);
++        tcg_gen_sub_vec(vece, t1, v2, c1);
++        /* Right shifts are negative left shifts for NEON.  */
++        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
++        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
++        tcg_gen_or_vec(vece, v0, v0, t1);
++        tcg_temp_free_vec(t1);
++        break;
++
++    case INDEX_op_rotrv_vec:
++        v2 = temp_tcgv_vec(arg_temp(a2));
++        t1 = tcg_temp_new_vec(type);
++        t2 = tcg_temp_new_vec(type);
++        c1 = tcg_constant_vec(type, vece, 8 << vece);
++        tcg_gen_neg_vec(vece, t1, v2);
++        tcg_gen_sub_vec(vece, t2, c1, v2);
++        /* Right shifts are negative left shifts for NEON.  */
++        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
++        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t2),
++                  tcgv_vec_arg(v1), tcgv_vec_arg(t2));
++        tcg_gen_or_vec(vece, v0, t1, t2);
++        tcg_temp_free_vec(t1);
++        tcg_temp_free_vec(t2);
++        break;
++
+     default:
+         g_assert_not_reached();
+     }
+--
+.25.1

From: lixinyu <precinct@mail.ustc.edu.cn>

OPC_SYNC_WMB, OPC_SYNC_MB, OPC_SYNC_ACQUIRE, OPC_SYNC_RELEASE and
OPC_SYNC_RMB have the wrong encoding. According to the MIPS manual,
their encoding should be 'OPC_SYNC | 0x?? << 6' rather than
'OPC_SYNC | 0x?? << 5'. The wrong encoding can lead to illegal
instruction errors. These instructions often appear in multi-threaded
simulation.

Fixes: 6f0b99104a3 ("tcg/mips: Add support for fence")
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: lixinyu <precinct@mail.ustc.edu.cn>
Message-Id: <20200411124612.12560-1-precinct@mail.ustc.edu.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/mips/tcg-target.inc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -XXX,XX +XXX,XX @@ typedef enum {
 
     /* MIPS r6 introduced names for weaker variants of SYNC.  These are
        backward compatible to previous architecture revisions.  */
-    OPC_SYNC_WMB     = OPC_SYNC | 0x04 << 5,
-    OPC_SYNC_MB      = OPC_SYNC | 0x10 << 5,
-    OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 5,
-    OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 5,
-    OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 5,
+    OPC_SYNC_WMB     = OPC_SYNC | 0x04 << 6,
+    OPC_SYNC_MB      = OPC_SYNC | 0x10 << 6,
+    OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 6,
+    OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 6,
+    OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 6,
 
     /* Aliases for convenience.  */
     ALIAS_PADD     = sizeof(void *) == 4 ? OPC_ADDU : OPC_DADDU,
-- 
2.20.1

The following changes since commit 1cbd2d914939ee6028e9688d4ba859a528c28405:

Merge remote-tracking branch 'remotes/jasowang/tags/net-pull-request' into staging (2021-06-04 13:38:49 +0100)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20210604

for you to fetch changes up to 0006039e29b9e6118beab300146f7c4931f7a217:

tcg/arm: Implement TCG_TARGET_HAS_rotv_vec (2021-06-04 11:50:11 -0700)

----------------------------------------------------------------
Host vector support for arm neon.

----------------------------------------------------------------
Richard Henderson (15):
      tcg: Change parameters for tcg_target_const_match
      tcg/arm: Add host vector framework
      tcg/arm: Implement tcg_out_ld/st for vector types
      tcg/arm: Implement tcg_out_mov for vector types
      tcg/arm: Implement tcg_out_dup*_vec
      tcg/arm: Implement minimal vector operations
      tcg/arm: Implement andc, orc, abs, neg, not vector operations
      tcg/arm: Implement TCG_TARGET_HAS_shi_vec
      tcg/arm: Implement TCG_TARGET_HAS_mul_vec
      tcg/arm: Implement TCG_TARGET_HAS_sat_vec
      tcg/arm: Implement TCG_TARGET_HAS_minmax_vec
      tcg/arm: Implement TCG_TARGET_HAS_bitsel_vec
      tcg/arm: Implement TCG_TARGET_HAS_shv_vec
      tcg/arm: Implement TCG_TARGET_HAS_roti_vec
      tcg/arm: Implement TCG_TARGET_HAS_rotv_vec

Change the return value to bool, because that's what it should
have been from the start.  Pass the ct mask instead of the whole
TCGArgConstraint, as that's the only part that's relevant.

Change the value argument to int64_t.  We will need the extra
width for 32-bit hosts wanting to match vector constants.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                    | 5 ++---
 tcg/aarch64/tcg-target.c.inc | 5 +----
 tcg/arm/tcg-target.c.inc     | 5 +----
 tcg/i386/tcg-target.c.inc    | 4 +---
 tcg/mips/tcg-target.c.inc    | 5 +----
 tcg/ppc/tcg-target.c.inc     | 4 +---
 tcg/riscv/tcg-target.c.inc   | 4 +---
 tcg/s390/tcg-target.c.inc    | 5 +----
 tcg/sparc/tcg-target.c.inc   | 5 +----
 tcg/tci/tcg-target.c.inc     | 6 ++----
 10 files changed, 12 insertions(+), 36 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                         TCGReg base, intptr_t ofs);
 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target);
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct);
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
 #ifdef TCG_TARGET_NEED_LDST_LABELS
 static int tcg_out_ldst_finalize(TCGContext *s);
 #endif
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(arg);
 
         if (ts->val_type == TEMP_VAL_CONST
-            && tcg_target_const_match(ts->val, ts->type, arg_ct)) {
+            && tcg_target_const_match(ts->val, ts->type, arg_ct->ct)) {
             /* constant is OK for instruction */
             const_args[i] = 1;
             new_args[i] = ts->val;
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
     }
 }
 
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
-
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm)
  * mov operand2:     values represented with x << (2 * y), x < 0x100
  * add, sub, eor...: ditto
  */
-static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                         const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct;
-    ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
         return 1;
     } else if ((ct & TCG_CT_CONST_ARM) && check_fit_imm(val)) {
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 }
 
 /* test if a constant matches the constraint */
-static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                         const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline bool is_p2m1(tcg_target_long val)
 }
 
 /* test if a constant matches the constraint */
-static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                         const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct;
-    ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
         return 1;
     } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
 }
 
 /* test if a constant matches the constraint */
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
 }
 
 /* test if a constant matches the constraint */
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
 }
 
 /* Test if a constant matches the constraint. */
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
-
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
 }
 
 /* test if a constant matches the constraint */
-static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                         const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    int ct = arg_ct->ct;
-
     if (ct & TCG_CT_CONST) {
         return 1;
     }
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
 }
 
 /* Test if a constant matches the constraint. */
-static int tcg_target_const_match(tcg_target_long val, TCGType type,
-                                  const TCGArgConstraint *arg_ct)
+static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
-    /* No need to return 0 or 1, 0 or != 0 is good enough. */
-    return arg_ct->ct & TCG_CT_CONST;
+    return ct & TCG_CT_CONST;
 }
 
 static void tcg_target_init(TCGContext *s)
-- 
2.25.1

Add registers and function stubs.  The functionality
is disabled via use_neon_instructions defined to 0.

We must still include results for the mandatory opcodes in
tcg_target_op_def, as all opcodes are checked during tcg init.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |   4 ++
 tcg/arm/tcg-target-con-str.h |   1 +
 tcg/arm/tcg-target.h         |  48 ++++++++++++--
 tcg/arm/tcg-target.opc.h     |  12 ++++
 tcg/arm/tcg-target.c.inc     | 117 +++++++++++++++++++++++++++++------
 5 files changed, 158 insertions(+), 24 deletions(-)
 create mode 100644 tcg/arm/tcg-target.opc.h

diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I1(r)
 C_O0_I2(r, r)
 C_O0_I2(r, rIN)
 C_O0_I2(s, s)
+C_O0_I2(w, r)
 C_O0_I3(s, s, s)
 C_O0_I4(r, r, rI, rI)
 C_O0_I4(s, s, s, s)
 C_O1_I1(r, l)
 C_O1_I1(r, r)
+C_O1_I1(w, r)
+C_O1_I1(w, wr)
 C_O1_I2(r, 0, rZ)
 C_O1_I2(r, l, l)
 C_O1_I2(r, r, r)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIK)
 C_O1_I2(r, r, rIN)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, rZ, rZ)
+C_O1_I2(w, w, w)
 C_O1_I4(r, r, r, rI, rI)
 C_O1_I4(r, r, rIN, rIK, 0)
 C_O2_I1(r, r, l)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@
 REGS('r', ALL_GENERAL_REGS)
 REGS('l', ALL_QLOAD_REGS)
 REGS('s', ALL_QSTORE_REGS)
+REGS('w', ALL_VECTOR_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
     TCG_REG_R13,
     TCG_REG_R14,
     TCG_REG_PC,
+
+    TCG_REG_Q0,
+    TCG_REG_Q1,
+    TCG_REG_Q2,
+    TCG_REG_Q3,
+    TCG_REG_Q4,
+    TCG_REG_Q5,
+    TCG_REG_Q6,
+    TCG_REG_Q7,
+    TCG_REG_Q8,
+    TCG_REG_Q9,
+    TCG_REG_Q10,
+    TCG_REG_Q11,
+    TCG_REG_Q12,
+    TCG_REG_Q13,
+    TCG_REG_Q14,
+    TCG_REG_Q15,
+
+    TCG_AREG0 = TCG_REG_R6,
+    TCG_REG_CALL_STACK = TCG_REG_R13,
 } TCGReg;
 
-#define TCG_TARGET_NB_REGS 16
+#define TCG_TARGET_NB_REGS 32
 
 #ifdef __ARM_ARCH_EXT_IDIV__
 #define use_idiv_instructions  1
 #else
 extern bool use_idiv_instructions;
 #endif
-
+#define use_neon_instructions  0
 
 /* used for function call generation */
-#define TCG_REG_CALL_STACK		TCG_REG_R13
 #define TCG_TARGET_STACK_ALIGN		8
 #define TCG_TARGET_CALL_ALIGN_ARGS	1
 #define TCG_TARGET_CALL_STACK_OFFSET	0
@@ -XXX,XX +XXX,XX @@ extern bool use_idiv_instructions;
 #define TCG_TARGET_HAS_direct_jump      0
 #define TCG_TARGET_HAS_qemu_st8_i32     0
 
-enum {
-    TCG_AREG0 = TCG_REG_R6,
-};
+#define TCG_TARGET_HAS_v64              use_neon_instructions
+#define TCG_TARGET_HAS_v128             use_neon_instructions
+#define TCG_TARGET_HAS_v256             0
+
+#define TCG_TARGET_HAS_andc_vec         0
+#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_abs_vec          0
+#define TCG_TARGET_HAS_roti_vec         0
+#define TCG_TARGET_HAS_rots_vec         0
+#define TCG_TARGET_HAS_rotv_vec         0
+#define TCG_TARGET_HAS_shi_vec          0
+#define TCG_TARGET_HAS_shs_vec          0
+#define TCG_TARGET_HAS_shv_vec          0
+#define TCG_TARGET_HAS_mul_vec          0
+#define TCG_TARGET_HAS_sat_vec          0
+#define TCG_TARGET_HAS_minmax_vec       0
+#define TCG_TARGET_HAS_bitsel_vec       0
+#define TCG_TARGET_HAS_cmpsel_vec       0
 
 #define TCG_TARGET_DEFAULT_MO (0)
 #define TCG_TARGET_HAS_MEMORY_BSWAP     1
diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/arm/tcg-target.opc.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copyright (c) 2019 Linaro
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.
+ *
+ * See the COPYING file in the top-level directory for details.
+ *
+ * Target-specific opcodes for host vector expansion.  These will be
+ * emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
+ * consider these to be UNSPEC with names.
+ */
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool use_idiv_instructions;
 
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-    "%r0",
-    "%r1",
-    "%r2",
-    "%r3",
-    "%r4",
-    "%r5",
-    "%r6",
-    "%r7",
-    "%r8",
-    "%r9",
-    "%r10",
-    "%r11",
-    "%r12",
-    "%r13",
-    "%r14",
-    "%pc",
+    "%r0",  "%r1",  "%r2",  "%r3",  "%r4",  "%r5",  "%r6",  "%r7",
+    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%sp",  "%r14", "%pc",
+    "%q0",  "%q1",  "%q2",  "%q3",  "%q4",  "%q5",  "%q6",  "%q7",
+    "%q8",  "%q9",  "%q10", "%q11", "%q12", "%q13", "%q14", "%q15",
 };
 #endif
 
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R3,
     TCG_REG_R12,
     TCG_REG_R14,
+
+    TCG_REG_Q0,
+    TCG_REG_Q1,
+    TCG_REG_Q2,
+    TCG_REG_Q3,
+    /* Q4 - Q7 are call-saved, and skipped. */
+    TCG_REG_Q8,
+    TCG_REG_Q9,
+    TCG_REG_Q10,
+    TCG_REG_Q11,
+    TCG_REG_Q12,
+    TCG_REG_Q13,
+    TCG_REG_Q14,
+    TCG_REG_Q15,
 };
 
 static const int tcg_target_call_iarg_regs[4] = {
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[2] = {
 };
 
 #define TCG_REG_TMP  TCG_REG_R12
+#define TCG_VEC_TMP  TCG_REG_Q15
 
 enum arm_cond_code_e {
     COND_EQ = 0x0,
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 #define TCG_CT_CONST_ZERO 0x800
 
 #define ALL_GENERAL_REGS  0xffffu
+#define ALL_VECTOR_REGS   0xffff0000u
 
 /*
  * r0-r2 will be overwritten when reading the tlb entry (softmmu only)
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_st_i64:
         return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
 
+    case INDEX_op_st_vec:
+        return C_O0_I2(w, r);
+    case INDEX_op_ld_vec:
+    case INDEX_op_dupm_vec:
+        return C_O1_I1(w, r);
+    case INDEX_op_dup_vec:
+        return C_O1_I1(w, wr);
+    case INDEX_op_dup2_vec:
+    case INDEX_op_add_vec:
+    case INDEX_op_sub_vec:
+    case INDEX_op_xor_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_cmp_vec:
+        return C_O1_I2(w, w, w);
+
     default:
         g_assert_not_reached();
     }
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
 {
     /* Only probe for the platform and capabilities if we havn't already
        determined maximum values at compile time.  */
-#ifndef use_idiv_instructions
+#if !defined(use_idiv_instructions) || !defined(use_neon_instructions)
     {
         unsigned long hwcap = qemu_getauxval(AT_HWCAP);
+#ifndef use_idiv_instructions
         use_idiv_instructions = (hwcap & HWCAP_ARM_IDIVA) != 0;
+#endif
+#ifndef use_neon_instructions
+        use_neon_instructions = (hwcap & HWCAP_ARM_NEON) != 0;
+#endif
     }
 #endif
+
     if (__ARM_ARCH < 7) {
         const char *pl = (const char *)qemu_getauxval(AT_PLATFORM);
         if (pl != NULL && pl[0] == 'v' && pl[1] >= '4' && pl[1] <= '9') {
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
         }
     }
 
-    tcg_target_available_regs[TCG_TYPE_I32] = 0xffff;
+    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
 
     tcg_target_call_clobber_regs = 0;
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R0);
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R12);
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R14);
 
+    if (use_neon_instructions) {
+        tcg_target_available_regs[TCG_TYPE_V64]  = ALL_VECTOR_REGS;
+        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
+
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q0);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q1);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q2);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q3);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q8);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q9);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q10);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q11);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q12);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q13);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q14);
+        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_Q15);
+    }
+
     s->reserved_regs = 0;
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_PC);
+    tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
 }
 
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
     tcg_out_movi32(s, COND_AL, ret, arg);
 }
 
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+                            TCGReg rd, TCGReg rs)
+{
+    g_assert_not_reached();
+}
+
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, TCGReg base, intptr_t offset)
+{
+    g_assert_not_reached();
+}
+
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, int64_t v64)
+{
+    g_assert_not_reached();
+}
+
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+                           unsigned vecl, unsigned vece,
+                           const TCGArg *args, const int *const_args)
+{
+    g_assert_not_reached();
+}
+
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{
+    return 0;
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+                       TCGArg a0, ...)
+{
+    g_assert_not_reached();
+}
+
 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
 {
     int i;
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 70 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 6 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     INSN_NOP_v6k   = 0xe320f000,
     /* Otherwise the assembler uses mov r0,r0 */
     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
+
+    INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
+    INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
 } ARMInsn;
 
 #define INSN_NOP   (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4)
@@ -XXX,XX +XXX,XX @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args,
     }
 }
 
+/*
+ * Note that TCGReg references Q-registers.
+ * D-regno = 2 * Q-regno, so shift left by 1 while inserting.
+ */
+static uint32_t encode_vd(TCGReg rd)
+{
+    tcg_debug_assert(rd >= TCG_REG_Q0);
+    return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13);
+}
+
+static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
+                          TCGReg rd, TCGReg rn, int offset)
+{
+    if (offset != 0) {
+        if (check_fit_imm(offset) || check_fit_imm(-offset)) {
+            tcg_out_dat_rIN(s, COND_AL, ARITH_ADD, ARITH_SUB,
+                            TCG_REG_TMP, rn, offset, true);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, offset);
+            tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
+                            TCG_REG_TMP, TCG_REG_TMP, rn, 0);
+        }
+        rn = TCG_REG_TMP;
+    }
+    tcg_out32(s, insn | (rn << 16) | encode_vd(rd) | 0xf);
+}
+
 #ifdef CONFIG_SOFTMMU
 #include "../tcg-ldst.c.inc"
 
@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    tcg_out_ld32u(s, COND_AL, arg, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I32:
+        tcg_out_ld32u(s, COND_AL, arg, arg1, arg2);
+        return;
+    case TCG_TYPE_V64:
+        /* regs 1; size 8; align 8 */
+        tcg_out_vldst(s, INSN_VLD1 | 0x7d0, arg, arg1, arg2);
+        return;
+    case TCG_TYPE_V128:
+        /* regs 2; size 8; align 16 */
+        tcg_out_vldst(s, INSN_VLD1 | 0xae0, arg, arg1, arg2);
+        return;
+    default:
+        g_assert_not_reached();
+    }
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    tcg_out_st32(s, COND_AL, arg, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I32:
+        tcg_out_st32(s, COND_AL, arg, arg1, arg2);
+        return;
+    case TCG_TYPE_V64:
+        /* regs 1; size 8; align 8 */
+        tcg_out_vldst(s, INSN_VST1 | 0x7d0, arg, arg1, arg2);
+        return;
+    case TCG_TYPE_V128:
+        /* regs 2; size 8; align 16 */
+        tcg_out_vldst(s, INSN_VST1 | 0xae0, arg, arg1, arg2);
+        return;
+    default:
+        g_assert_not_reached();
+    }
 }
 
 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 52 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     /* Otherwise the assembler uses mov r0,r0 */
     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
 
+    INSN_VORR      = 0xf2200110,
+
     INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
     INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
 } ARMInsn;
@@ -XXX,XX +XXX,XX @@ static uint32_t encode_vd(TCGReg rd)
     return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13);
 }
 
+static uint32_t encode_vn(TCGReg rn)
+{
+    tcg_debug_assert(rn >= TCG_REG_Q0);
+    return (extract32(rn, 3, 1) << 7) | (extract32(rn, 0, 3) << 17);
+}
+
+static uint32_t encode_vm(TCGReg rm)
+{
+    tcg_debug_assert(rm >= TCG_REG_Q0);
+    return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1);
+}
+
+static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
+                          TCGReg d, TCGReg n, TCGReg m)
+{
+    tcg_out32(s, insn | (vece << 20) | (q << 6) |
+              encode_vd(d) | encode_vn(n) | encode_vm(m));
+}
+
 static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
                           TCGReg rd, TCGReg rn, int offset)
 {
@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
     return false;
 }
 
-static inline bool tcg_out_mov(TCGContext *s, TCGType type,
-                               TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
-    tcg_out_mov_reg(s, COND_AL, ret, arg);
-    return true;
+    if (ret == arg) {
+        return true;
+    }
+    switch (type) {
+    case TCG_TYPE_I32:
+        if (ret < TCG_REG_Q0 && arg < TCG_REG_Q0) {
+            tcg_out_mov_reg(s, COND_AL, ret, arg);
+            return true;
+        }
+        return false;
+
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+        /* "VMOV D,N" is an alias for "VORR D,N,N". */
+        tcg_out_vreg3(s, INSN_VORR, type - TCG_TYPE_V64, 0, ret, arg, arg);
+        return true;
+
+    default:
+        g_assert_not_reached();
+    }
 }
 
-static inline void tcg_out_movi(TCGContext *s, TCGType type,
-                                TCGReg ret, tcg_target_long arg)
+static void tcg_out_movi(TCGContext *s, TCGType type,
+                         TCGReg ret, tcg_target_long arg)
 {
+    tcg_debug_assert(type == TCG_TYPE_I32);
+    tcg_debug_assert(ret < TCG_REG_Q0);
     tcg_out_movi32(s, COND_AL, ret, arg);
 }
 
-- 
2.25.1

Most of dupi is copied from tcg/aarch64, which has the same
encoding for AdvSimdExpandImm.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 283 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 275 insertions(+), 8 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
 
     INSN_VORR      = 0xf2200110,
 
+    INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
+    INSN_VDUP_S    = 0xf3b00c00,  /* VDUP (scalar) */
+    INSN_VLDR_D    = 0xed100b00,  /* VLDR.64 */
     INSN_VLD1      = 0xf4200000,  /* VLD1 (multiple single elements) */
+    INSN_VLD1R     = 0xf4a00c00,  /* VLD1 (single element to all lanes) */
     INSN_VST1      = 0xf4000000,  /* VST1 (multiple single elements) */
+    INSN_VMOVI     = 0xf2800010,  /* VMOV (immediate) */
 } ARMInsn;
 
 #define INSN_NOP   (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4)
@@ -XXX,XX +XXX,XX @@ static const uint8_t tcg_cond_to_arm_cond[] = {
     [TCG_COND_GTU] = COND_HI,
 };
 
+static int encode_imm(uint32_t imm);
+
+/* TCG private relocation type: add with pc+imm8 */
+#define R_ARM_PC8  11
+
+/* TCG private relocation type: vldr with imm8 << 2 */
+#define R_ARM_PC11 12
+
 static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
 {
     const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
@@ -XXX,XX +XXX,XX @@ static bool reloc_pc13(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
     return false;
 }
 
+static bool reloc_pc11(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+{
+    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
+    ptrdiff_t offset = (tcg_ptr_byte_diff(target, src_rx) - 8) / 4;
+
+    if (offset >= -0xff && offset <= 0xff) {
+        tcg_insn_unit insn = *src_rw;
+        bool u = (offset >= 0);
+        if (!u) {
+            offset = -offset;
+        }
+        insn = deposit32(insn, 23, 1, u);
+        insn = deposit32(insn, 0, 8, offset);
+        *src_rw = insn;
+        return true;
+    }
+    return false;
+}
+
+static bool reloc_pc8(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+{
+    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
+    ptrdiff_t offset = tcg_ptr_byte_diff(target, src_rx) - 8;
+    int rot = encode_imm(offset);
+
+    if (rot >= 0) {
+        *src_rw = deposit32(*src_rw, 0, 12, rol32(offset, rot) | (rot << 7));
+        return true;
+    }
+    return false;
+}
+
 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
     tcg_debug_assert(addend == 0);
-
-    if (type == R_ARM_PC24) {
+    switch (type) {
+    case R_ARM_PC24:
         return reloc_pc24(code_ptr, (const tcg_insn_unit *)value);
-    } else if (type == R_ARM_PC13) {
+    case R_ARM_PC13:
         return reloc_pc13(code_ptr, (const tcg_insn_unit *)value);
-    } else {
+    case R_ARM_PC11:
+        return reloc_pc11(code_ptr, (const tcg_insn_unit *)value);
+    case R_ARM_PC8:
+        return reloc_pc8(code_ptr, (const tcg_insn_unit *)value);
+    default:
         g_assert_not_reached();
     }
 }
@@ -XXX,XX +XXX,XX @@ static inline uint32_t rotl(uint32_t val, int n)
 
 /* ARM immediates for ALU instructions are made of an unsigned 8-bit
    right-rotated by an even amount between 0 and 30. */
-static inline int encode_imm(uint32_t imm)
+static int encode_imm(uint32_t imm)
 {
     int shift;
 
@@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm)
     return encode_imm(imm) >= 0;
 }
 
+/* Return true if v16 is a valid 16-bit shifted immediate.  */
+static bool is_shimm16(uint16_t v16, int *cmode, int *imm8)
+{
+    if (v16 == (v16 & 0xff)) {
+        *cmode = 0x8;
+        *imm8 = v16 & 0xff;
+        return true;
+    } else if (v16 == (v16 & 0xff00)) {
+        *cmode = 0xa;
+        *imm8 = v16 >> 8;
+        return true;
+    }
+    return false;
+}
+
+/* Return true if v32 is a valid 32-bit shifted immediate.  */
+static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
+{
+    if (v32 == (v32 & 0xff)) {
+        *cmode = 0x0;
+        *imm8 = v32 & 0xff;
+        return true;
+    } else if (v32 == (v32 & 0xff00)) {
+        *cmode = 0x2;
+        *imm8 = (v32 >> 8) & 0xff;
+        return true;
+    } else if (v32 == (v32 & 0xff0000)) {
+        *cmode = 0x4;
+        *imm8 = (v32 >> 16) & 0xff;
+        return true;
+    } else if (v32 == (v32 & 0xff000000)) {
+        *cmode = 0x6;
+        *imm8 = v32 >> 24;
+        return true;
+    }
+    return false;
+}
+
+/* Return true if v32 is a valid 32-bit shifting ones immediate.  */
+static bool is_soimm32(uint32_t v32, int *cmode, int *imm8)
+{
+    if ((v32 & 0xffff00ff) == 0xff) {
+        *cmode = 0xc;
+        *imm8 = (v32 >> 8) & 0xff;
+        return true;
+    } else if ((v32 & 0xff00ffff) == 0xffff) {
+        *cmode = 0xd;
+        *imm8 = (v32 >> 16) & 0xff;
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Return non-zero if v32 can be formed by MOVI+ORR.
+ * Place the parameters for MOVI in (cmode, imm8).
+ * Return the cmode for ORR; the imm8 can be had via extraction from v32.
+ */
+static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
+{
+    int i;
+
+    for (i = 6; i > 0; i -= 2) {
+        /* Mask out one byte we can add with ORR.  */
+        uint32_t tmp = v32 & ~(0xffu << (i * 4));
+        if (is_shimm32(tmp, cmode, imm8) ||
+            is_soimm32(tmp, cmode, imm8)) {
+            break;
+        }
+    }
+    return i;
+}
+
 /* Test if a constant matches the constraint.
  * TODO: define constraints for:
  *
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
               encode_vd(d) | encode_vn(n) | encode_vm(m));
 }
 
+static void tcg_out_vmovi(TCGContext *s, TCGReg rd,
+                          int q, int op, int cmode, uint8_t imm8)
+{
+    tcg_out32(s, INSN_VMOVI | encode_vd(rd) | (q << 6) | (op << 5)
+              | (cmode << 8) | extract32(imm8, 0, 4)
+              | (extract32(imm8, 4, 3) << 16)
+              | (extract32(imm8, 7, 1) << 24));
+}
+
 static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
                           TCGReg rd, TCGReg rn, int offset)
 {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
     tcg_out_movi32(s, COND_AL, ret, arg);
 }
 
+/* Type is always V128, with I64 elements.  */
+static void tcg_out_dup2_vec(TCGContext *s, TCGReg rd, TCGReg rl, TCGReg rh)
+{
+    /* Move high element into place first. */
+    /* VMOV Dd+1, Ds */
+    tcg_out_vreg3(s, INSN_VORR | (1 << 12), 0, 0, rd, rh, rh);
+    /* Move low element into place; tcg_out_mov will check for nop. */
+    tcg_out_mov(s, TCG_TYPE_V64, rd, rl);
+}
+
 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg rd, TCGReg rs)
 {
-    g_assert_not_reached();
+    int q = type - TCG_TYPE_V64;
+
+    if (vece == MO_64) {
+        if (type == TCG_TYPE_V128) {
+            tcg_out_dup2_vec(s, rd, rs, rs);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_V64, rd, rs);
+        }
+    } else if (rs < TCG_REG_Q0) {
+        int b = (vece == MO_8);
+        int e = (vece == MO_16);
+        tcg_out32(s, INSN_VDUP_G | (b << 22) | (q << 21) | (e << 5) |
+                  encode_vn(rd) | (rs << 12));
+    } else {
+        int imm4 = 1 << vece;
+        tcg_out32(s, INSN_VDUP_S | (imm4 << 16) | (q << 6) |
+                  encode_vd(rd) | encode_vm(rs));
+    }
+    return true;
 }
 
 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                              TCGReg rd, TCGReg base, intptr_t offset)
 {
-    g_assert_not_reached();
+    if (vece == MO_64) {
+        tcg_out_ld(s, TCG_TYPE_V64, rd, base, offset);
+        if (type == TCG_TYPE_V128) {
+            tcg_out_dup2_vec(s, rd, rd, rd);
+        }
+    } else {
+        int q = type - TCG_TYPE_V64;
+        tcg_out_vldst(s, INSN_VLD1R | (vece << 6) | (q << 5),
+                      rd, base, offset);
+    }
+    return true;
 }
 
 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                              TCGReg rd, int64_t v64)
 {
-    g_assert_not_reached();
+    int q = type - TCG_TYPE_V64;
+    int cmode, imm8, i;
+
+    /* Test all bytes equal first.  */
+    if (vece == MO_8) {
+        tcg_out_vmovi(s, rd, q, 0, 0xe, v64);
+        return;
+    }
+
+    /*
+     * Test all bytes 0x00 or 0xff second.  This can match cases that
+     * might otherwise take 2 or 3 insns for MO_16 or MO_32 below.
+     */
+    for (i = imm8 = 0; i < 8; i++) {
+        uint8_t byte = v64 >> (i * 8);
+        if (byte == 0xff) {
+            imm8 |= 1 << i;
+        } else if (byte != 0) {
+            goto fail_bytes;
+        }
+    }
+    tcg_out_vmovi(s, rd, q, 1, 0xe, imm8);
+    return;
+ fail_bytes:
+
+    /*
+     * Tests for various replications.  For each element width, if we
+     * cannot find an expansion there's no point checking a larger
+     * width because we already know by replication it cannot match.
+     */
+    if (vece == MO_16) {
+        uint16_t v16 = v64;
+
+        if (is_shimm16(v16, &cmode, &imm8)) {
+            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
+            return;
+        }
+        if (is_shimm16(~v16, &cmode, &imm8)) {
+            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
+            return;
+        }
+
+        /*
+         * Otherwise, all remaining constants can be loaded in two insns:
+         * rd = v16 & 0xff, rd |= v16 & 0xff00.
+         */
+        tcg_out_vmovi(s, rd, q, 0, 0x8, v16 & 0xff);
+        tcg_out_vmovi(s, rd, q, 0, 0xb, v16 >> 8);   /* VORRI */
+        return;
+    }
+
+    if (vece == MO_32) {
+        uint32_t v32 = v64;
+
+        if (is_shimm32(v32, &cmode, &imm8) ||
+            is_soimm32(v32, &cmode, &imm8)) {
+            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
+            return;
+        }
+        if (is_shimm32(~v32, &cmode, &imm8) ||
+            is_soimm32(~v32, &cmode, &imm8)) {
+            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
+            return;
+        }
+
+        /*
+         * Restrict the set of constants to those we can load with
+         * two instructions.  Others we load from the pool.
+         */
+        i = is_shimm32_pair(v32, &cmode, &imm8);
+        if (i) {
+            tcg_out_vmovi(s, rd, q, 0, cmode, imm8);
+            tcg_out_vmovi(s, rd, q, 0, i | 1, extract32(v32, i * 4, 8));
+            return;
+        }
+        i = is_shimm32_pair(~v32, &cmode, &imm8);
+        if (i) {
+            tcg_out_vmovi(s, rd, q, 1, cmode, imm8);
+            tcg_out_vmovi(s, rd, q, 1, i | 1, extract32(~v32, i * 4, 8));
+            return;
+        }
+    }
+
+    /*
+     * As a last resort, load from the constant pool.
+     */
+    if (!q || vece == MO_64) {
+        new_pool_l2(s, R_ARM_PC11, s->code_ptr, 0, v64, v64 >> 32);
+        /* VLDR Dd, [pc + offset] */
+        tcg_out32(s, INSN_VLDR_D | encode_vd(rd) | (0xf << 16));
+        if (q) {
+            tcg_out_dup2_vec(s, rd, rd, rd);
+        }
+    } else {
+        new_pool_label(s, (uint32_t)v64, R_ARM_PC8, s->code_ptr, 0);
+        /* add tmp, pc, offset */
+        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_TMP, TCG_REG_PC, 0);
+        tcg_out_dupm_vec(s, type, MO_32, rd, TCG_REG_TMP, 0);
+    }
 }
 
 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
-- 
2.25.1

Implementing dup2, add, sub, and, or, xor as the minimal set.
This allows us to actually enable neon in the header file.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |   3 +
 tcg/arm/tcg-target-con-str.h |   2 +
 tcg/arm/tcg-target.h         |   6 +-
 tcg/arm/tcg-target.c.inc     | 201 +++++++++++++++++++++++++++++++++--
 4 files changed, 204 insertions(+), 8 deletions(-)

diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIN)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, rZ, rZ)
 C_O1_I2(w, w, w)
+C_O1_I2(w, w, wO)
+C_O1_I2(w, w, wV)
+C_O1_I2(w, w, wZ)
 C_O1_I4(r, r, r, rI, rI)
 C_O1_I4(r, r, rIN, rIK, 0)
 C_O2_I1(r, r, l)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@ REGS('w', ALL_VECTOR_REGS)
 CONST('I', TCG_CT_CONST_ARM)
 CONST('K', TCG_CT_CONST_INV)
 CONST('N', TCG_CT_CONST_NEG)
+CONST('O', TCG_CT_CONST_ORRI)
+CONST('V', TCG_CT_CONST_ANDI)
 CONST('Z', TCG_CT_CONST_ZERO)
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #else
 extern bool use_idiv_instructions;
 #endif
-#define use_neon_instructions  0
+#ifdef __ARM_NEON__
+#define use_neon_instructions  1
+#else
+extern bool use_neon_instructions;
+#endif
 
 /* used for function call generation */
 #define TCG_TARGET_STACK_ALIGN		8
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ int arm_arch = __ARM_ARCH;
 #ifndef use_idiv_instructions
 bool use_idiv_instructions;
 #endif
+#ifndef use_neon_instructions
+bool use_neon_instructions;
+#endif
 
 /* ??? Ought to think about changing CONFIG_SOFTMMU to always defined.  */
 #ifdef CONFIG_SOFTMMU
@@ -XXX,XX +XXX,XX @@ typedef enum {
     /* Otherwise the assembler uses mov r0,r0 */
     INSN_NOP_v4    = (COND_AL << 28) | ARITH_MOV,
 
+    INSN_VADD      = 0xf2000800,
+    INSN_VAND      = 0xf2000110,
+    INSN_VEOR      = 0xf3000110,
     INSN_VORR      = 0xf2200110,
+    INSN_VSUB      = 0xf3000800,
+
+    INSN_VMVN      = 0xf3b00580,
+
+    INSN_VCEQ0     = 0xf3b10100,
+    INSN_VCGT0     = 0xf3b10000,
+    INSN_VCGE0     = 0xf3b10080,
+    INSN_VCLE0     = 0xf3b10180,
+    INSN_VCLT0     = 0xf3b10200,
+
+    INSN_VCEQ      = 0xf3000810,
+    INSN_VCGE      = 0xf2000310,
+    INSN_VCGT      = 0xf2000300,
+    INSN_VCGE_U    = 0xf3000310,
+    INSN_VCGT_U    = 0xf3000300,
+
+    INSN_VTST      = 0xf2000810,
 
     INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
     INSN_VDUP_S    = 0xf3b00c00,  /* VDUP (scalar) */
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 #define TCG_CT_CONST_INV  0x200
 #define TCG_CT_CONST_NEG  0x400
 #define TCG_CT_CONST_ZERO 0x800
+#define TCG_CT_CONST_ORRI 0x1000
+#define TCG_CT_CONST_ANDI 0x2000
 
 #define ALL_GENERAL_REGS  0xffffu
 #define ALL_VECTOR_REGS   0xffff0000u
@@ -XXX,XX +XXX,XX @@ static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
     return i;
 }
 
+/* Return true if v32 is a valid 16-bit or 32-bit shifted immediate.  */
+static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
+{
+    if (v32 == deposit32(v32, 16, 16, v32)) {
+        return is_shimm16(v32, cmode, imm8);
+    } else {
+        return is_shimm32(v32, cmode, imm8);
+    }
+}
+
 /* Test if a constant matches the constraint.
  * TODO: define constraints for:
  *
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
         return 1;
     } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
         return 1;
-    } else {
-        return 0;
     }
+
+    switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) {
+    case 0:
+        break;
+    case TCG_CT_CONST_ANDI:
+        val = ~val;
+        /* fallthru */
+    case TCG_CT_CONST_ORRI:
+        if (val == deposit64(val, 32, 32, val)) {
+            int cmode, imm8;
+            return is_shimm1632(val, &cmode, &imm8);
+        }
+        break;
+    default:
+        /* Both bits should not be set for the same insn.  */
+        g_assert_not_reached();
+    }
+
+    return 0;
 }
 
 static inline void tcg_out_b(TCGContext *s, int cond, int32_t offset)
@@ -XXX,XX +XXX,XX @@ static uint32_t encode_vm(TCGReg rm)
     return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1);
 }
 
+static void tcg_out_vreg2(TCGContext *s, ARMInsn insn, int q, int vece,
+                          TCGReg d, TCGReg m)
+{
+    tcg_out32(s, insn | (vece << 18) | (q << 6) |
+              encode_vd(d) | encode_vm(m));
+}
+
 static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece,
                           TCGReg d, TCGReg n, TCGReg m)
 {
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_add_vec:
     case INDEX_op_sub_vec:
     case INDEX_op_xor_vec:
-    case INDEX_op_or_vec:
-    case INDEX_op_and_vec:
-    case INDEX_op_cmp_vec:
         return C_O1_I2(w, w, w);
+    case INDEX_op_or_vec:
+        return C_O1_I2(w, w, wO);
+    case INDEX_op_and_vec:
+        return C_O1_I2(w, w, wV);
+    case INDEX_op_cmp_vec:
+        return C_O1_I2(w, w, wZ);
 
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
     }
 }
 
+static const ARMInsn vec_cmp_insn[16] = {
+    [TCG_COND_EQ] = INSN_VCEQ,
+    [TCG_COND_GT] = INSN_VCGT,
+    [TCG_COND_GE] = INSN_VCGE,
+    [TCG_COND_GTU] = INSN_VCGT_U,
+    [TCG_COND_GEU] = INSN_VCGE_U,
+};
+
+static const ARMInsn vec_cmp0_insn[16] = {
+    [TCG_COND_EQ] = INSN_VCEQ0,
+    [TCG_COND_GT] = INSN_VCGT0,
+    [TCG_COND_GE] = INSN_VCGE0,
+    [TCG_COND_LT] = INSN_VCLT0,
+    [TCG_COND_LE] = INSN_VCLE0,
+};
+
 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                            unsigned vecl, unsigned vece,
                            const TCGArg *args, const int *const_args)
 {
-    g_assert_not_reached();
+    TCGType type = vecl + TCG_TYPE_V64;
+    unsigned q = vecl;
+    TCGArg a0, a1, a2;
+    int cmode, imm8;
+
+    a0 = args[0];
+    a1 = args[1];
+    a2 = args[2];
+
+    switch (opc) {
+    case INDEX_op_ld_vec:
+        tcg_out_ld(s, type, a0, a1, a2);
+        return;
+    case INDEX_op_st_vec:
+        tcg_out_st(s, type, a0, a1, a2);
+        return;
+    case INDEX_op_dupm_vec:
+        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
+        return;
+    case INDEX_op_dup2_vec:
+        tcg_out_dup2_vec(s, a0, a1, a2);
+        return;
+    case INDEX_op_add_vec:
+        tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_sub_vec:
+        tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_xor_vec:
+        tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
+        return;
+
+    case INDEX_op_and_vec:
+        if (const_args[2]) {
+            is_shimm1632(~a2, &cmode, &imm8);
+            if (a0 == a1) {
+                tcg_out_vmovi(s, a0, q, 1, cmode | 1, imm8); /* VBICI */
+                return;
+            }
+            tcg_out_vmovi(s, a0, q, 1, cmode, imm8); /* VMVNI */
+            a2 = a0;
+        }
+        tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2);
+        return;
+
+    case INDEX_op_or_vec:
+        if (const_args[2]) {
+            is_shimm1632(a2, &cmode, &imm8);
+            if (a0 == a1) {
+                tcg_out_vmovi(s, a0, q, 0, cmode | 1, imm8); /* VORRI */
+                return;
+            }
+            tcg_out_vmovi(s, a0, q, 0, cmode, imm8); /* VMOVI */
+            a2 = a0;
+        }
+        tcg_out_vreg3(s, INSN_VORR, q, 0, a0, a1, a2);
+        return;
+
+    case INDEX_op_cmp_vec:
+        {
+            TCGCond cond = args[3];
+
+            if (cond == TCG_COND_NE) {
+                if (const_args[2]) {
+                    tcg_out_vreg3(s, INSN_VTST, q, vece, a0, a1, a1);
+                } else {
+                    tcg_out_vreg3(s, INSN_VCEQ, q, vece, a0, a1, a2);
+                    tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a0);
+                }
+            } else {
+                ARMInsn insn;
+
+                if (const_args[2]) {
+                    insn = vec_cmp0_insn[cond];
+                    if (insn) {
+                        tcg_out_vreg2(s, insn, q, vece, a0, a1);
+                        return;
+                    }
+                    tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
+                    a2 = TCG_VEC_TMP;
+                }
+                insn = vec_cmp_insn[cond];
+                if (insn == 0) {
+                    TCGArg t;
+                    t = a1, a1 = a2, a2 = t;
+                    cond = tcg_swap_cond(cond);
+                    insn = vec_cmp_insn[cond];
+                    tcg_debug_assert(insn != 0);
+                }
+                tcg_out_vreg3(s, insn, q, vece, a0, a1, a2);
+            }
+        }
+        return;
+
+    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
+    default:
+        g_assert_not_reached();
+    }
 }
 
 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 {
-    return 0;
+    switch (opc) {
+    case INDEX_op_add_vec:
+    case INDEX_op_sub_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+        return 1;
+    case INDEX_op_cmp_vec:
+        return vece < MO_64;
+    default:
+        return 0;
+    }
 }
 
 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
-- 
2.25.1

These logical and arithmetic operations are optional, but are
trivial to accomplish with the existing infrastructure.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |  1 +
 tcg/arm/tcg-target.h         | 10 +++++-----
 tcg/arm/tcg-target.c.inc     | 38 ++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 5 deletions(-)

This consists of the three immediate shifts: shli, shri, sari.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |  2 +-
 tcg/arm/tcg-target.c.inc | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
 #define TCG_TARGET_HAS_roti_vec         0
 #define TCG_TARGET_HAS_rots_vec         0
 #define TCG_TARGET_HAS_rotv_vec         0
-#define TCG_TARGET_HAS_shi_vec          0
+#define TCG_TARGET_HAS_shi_vec          1
 #define TCG_TARGET_HAS_shs_vec          0
 #define TCG_TARGET_HAS_shv_vec          0
 #define TCG_TARGET_HAS_mul_vec          0
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     INSN_VCGE_U    = 0xf3000310,
     INSN_VCGT_U    = 0xf3000300,
 
+    INSN_VSHLI     = 0xf2800510,  /* VSHL (immediate) */
+    INSN_VSARI     = 0xf2800010,  /* VSHR.S */
+    INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
+
     INSN_VTST      = 0xf2000810,
 
     INSN_VDUP_G    = 0xee800b10,  /* VDUP (ARM core register) */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vmovi(TCGContext *s, TCGReg rd,
               | (extract32(imm8, 7, 1) << 24));
 }
 
+static void tcg_out_vshifti(TCGContext *s, ARMInsn insn, int q,
+                            TCGReg rd, TCGReg rm, int l_imm6)
+{
+    tcg_out32(s, insn | (q << 6) | encode_vd(rd) | encode_vm(rm) |
+              (extract32(l_imm6, 6, 1) << 7) |
+              (extract32(l_imm6, 0, 6) << 16));
+}
+
 static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
                           TCGReg rd, TCGReg rn, int offset)
 {
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_abs_vec:
     case INDEX_op_neg_vec:
     case INDEX_op_not_vec:
+    case INDEX_op_shli_vec:
+    case INDEX_op_shri_vec:
+    case INDEX_op_sari_vec:
         return C_O1_I1(w, w);
     case INDEX_op_dup2_vec:
     case INDEX_op_add_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_xor_vec:
         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
         return;
+    case INDEX_op_shli_vec:
+        tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece));
+        return;
+    case INDEX_op_shri_vec:
+        tcg_out_vshifti(s, INSN_VSHRI, q, a0, a1, (16 << vece) - a2);
+        return;
+    case INDEX_op_sari_vec:
+        tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2);
+        return;
 
     case INDEX_op_andc_vec:
         if (!const_args[2]) {
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_orc_vec:
     case INDEX_op_xor_vec:
     case INDEX_op_not_vec:
+    case INDEX_op_shli_vec:
+    case INDEX_op_shri_vec:
+    case INDEX_op_sari_vec:
         return 1;
     case INDEX_op_abs_vec:
     case INDEX_op_cmp_vec:
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     | 2 +-
 tcg/arm/tcg-target.c.inc | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

This is saturating add and subtract, signed and unsigned.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |  2 +-
 tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
 #define TCG_TARGET_HAS_shs_vec          0
 #define TCG_TARGET_HAS_shv_vec          0
 #define TCG_TARGET_HAS_mul_vec          1
-#define TCG_TARGET_HAS_sat_vec          0
+#define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       0
 #define TCG_TARGET_HAS_bitsel_vec       0
 #define TCG_TARGET_HAS_cmpsel_vec       0
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     INSN_VORR      = 0xf2200110,
     INSN_VSUB      = 0xf3000800,
     INSN_VMUL      = 0xf2000910,
+    INSN_VQADD     = 0xf2000010,
+    INSN_VQADD_U   = 0xf3000010,
+    INSN_VQSUB     = 0xf2000210,
+    INSN_VQSUB_U   = 0xf3000210,
 
     INSN_VABS      = 0xf3b10300,
     INSN_VMVN      = 0xf3b00580,
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_dup2_vec:
     case INDEX_op_add_vec:
     case INDEX_op_mul_vec:
+    case INDEX_op_ssadd_vec:
+    case INDEX_op_sssub_vec:
     case INDEX_op_sub_vec:
+    case INDEX_op_usadd_vec:
+    case INDEX_op_ussub_vec:
     case INDEX_op_xor_vec:
         return C_O1_I2(w, w, w);
     case INDEX_op_or_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_sub_vec:
         tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
         return;
+    case INDEX_op_ssadd_vec:
+        tcg_out_vreg3(s, INSN_VQADD, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_sssub_vec:
+        tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_usadd_vec:
+        tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_ussub_vec:
+        tcg_out_vreg3(s, INSN_VQSUB_U, q, vece, a0, a1, a2);
+        return;
     case INDEX_op_xor_vec:
         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
         return;
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
+    case INDEX_op_ssadd_vec:
+    case INDEX_op_sssub_vec:
+    case INDEX_op_usadd_vec:
+    case INDEX_op_ussub_vec:
         return 1;
     case INDEX_op_abs_vec:
     case INDEX_op_cmp_vec:
-- 
2.25.1

This is minimum and maximum, signed and unsigned.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |  2 +-
 tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
 #define TCG_TARGET_HAS_shv_vec          0
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
-#define TCG_TARGET_HAS_minmax_vec       0
+#define TCG_TARGET_HAS_minmax_vec       1
 #define TCG_TARGET_HAS_bitsel_vec       0
 #define TCG_TARGET_HAS_cmpsel_vec       0
 
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     INSN_VQADD_U   = 0xf3000010,
     INSN_VQSUB     = 0xf2000210,
     INSN_VQSUB_U   = 0xf3000210,
+    INSN_VMAX      = 0xf2000600,
+    INSN_VMAX_U    = 0xf3000600,
+    INSN_VMIN      = 0xf2000610,
+    INSN_VMIN_U    = 0xf3000610,
 
     INSN_VABS      = 0xf3b10300,
     INSN_VMVN      = 0xf3b00580,
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_dup2_vec:
     case INDEX_op_add_vec:
     case INDEX_op_mul_vec:
+    case INDEX_op_smax_vec:
+    case INDEX_op_smin_vec:
     case INDEX_op_ssadd_vec:
     case INDEX_op_sssub_vec:
     case INDEX_op_sub_vec:
+    case INDEX_op_umax_vec:
+    case INDEX_op_umin_vec:
     case INDEX_op_usadd_vec:
     case INDEX_op_ussub_vec:
     case INDEX_op_xor_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_mul_vec:
         tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2);
         return;
+    case INDEX_op_smax_vec:
+        tcg_out_vreg3(s, INSN_VMAX, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_smin_vec:
+        tcg_out_vreg3(s, INSN_VMIN, q, vece, a0, a1, a2);
+        return;
     case INDEX_op_sub_vec:
         tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2);
         return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_sssub_vec:
         tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2);
         return;
+    case INDEX_op_umax_vec:
+        tcg_out_vreg3(s, INSN_VMAX_U, q, vece, a0, a1, a2);
+        return;
+    case INDEX_op_umin_vec:
+        tcg_out_vreg3(s, INSN_VMIN_U, q, vece, a0, a1, a2);
+        return;
     case INDEX_op_usadd_vec:
         tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2);
         return;
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_cmp_vec:
     case INDEX_op_mul_vec:
     case INDEX_op_neg_vec:
+    case INDEX_op_smax_vec:
+    case INDEX_op_smin_vec:
+    case INDEX_op_umax_vec:
+    case INDEX_op_umin_vec:
         return vece < MO_64;
     default:
         return 0;
-- 
2.25.1

NEON has 3 instructions implementing the 4-argument bitsel operation,
with each insn overlapping a different logical input onto the
destination register.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |  1 +
 tcg/arm/tcg-target.h         |  2 +-
 tcg/arm/tcg-target.c.inc     | 22 ++++++++++++++++++++--
 3 files changed, 22 insertions(+), 3 deletions(-)

The three vector shift-by-vector operations are all implemented via
expansion.  Therefore do not set TCG_TARGET_HAS_shv_vec: none of
shlv_vec, shrv_vec, sarv_vec can appear in the instruction stream,
so they do not appear in tcg_target_op_def either.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.opc.h |  3 ++
 tcg/arm/tcg-target.c.inc | 61 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.opc.h
+++ b/tcg/arm/tcg-target.opc.h
@@ -XXX,XX +XXX,XX @@
  * emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
  * consider these to be UNSPEC with names.
  */
+
+DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC)
+DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     INSN_VSHLI     = 0xf2800510,  /* VSHL (immediate) */
     INSN_VSARI     = 0xf2800010,  /* VSHR.S */
     INSN_VSHRI     = 0xf3800010,  /* VSHR.U */
+    INSN_VSHL_S    = 0xf2000400,  /* VSHL.S (register) */
+    INSN_VSHL_U    = 0xf3000400,  /* VSHL.U (register) */
 
     INSN_VBSL      = 0xf3100110,
     INSN_VBIT      = 0xf3200110,
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_usadd_vec:
     case INDEX_op_ussub_vec:
     case INDEX_op_xor_vec:
+    case INDEX_op_arm_sshl_vec:
+    case INDEX_op_arm_ushl_vec:
         return C_O1_I2(w, w, w);
     case INDEX_op_or_vec:
     case INDEX_op_andc_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_xor_vec:
         tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2);
         return;
+    case INDEX_op_arm_sshl_vec:
+        /*
+         * Note that Vm is the data and Vn is the shift count,
+         * therefore the arguments appear reversed.
+         */
+        tcg_out_vreg3(s, INSN_VSHL_S, q, vece, a0, a2, a1);
+        return;
+    case INDEX_op_arm_ushl_vec:
+        /* See above. */
+        tcg_out_vreg3(s, INSN_VSHL_U, q, vece, a0, a2, a1);
+        return;
     case INDEX_op_shli_vec:
         tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece));
         return;
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_umax_vec:
     case INDEX_op_umin_vec:
         return vece < MO_64;
+    case INDEX_op_shlv_vec:
+    case INDEX_op_shrv_vec:
+    case INDEX_op_sarv_vec:
+        return -1;
     default:
         return 0;
     }
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                        TCGArg a0, ...)
 {
-    g_assert_not_reached();
+    va_list va;
+    TCGv_vec v0, v1, v2, t1;
+    TCGArg a2;
+
+    va_start(va, a0);
+    v0 = temp_tcgv_vec(arg_temp(a0));
+    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+    a2 = va_arg(va, TCGArg);
+    va_end(va);
+
+    switch (opc) {
+    case INDEX_op_shlv_vec:
+        /*
+         * Merely propagate shlv_vec to arm_ushl_vec.
+         * In this way we don't set TCG_TARGET_HAS_shv_vec
+         * because everything is done via expansion.
+         */
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        break;
+
+    case INDEX_op_shrv_vec:
+    case INDEX_op_sarv_vec:
+        /* Right shifts are negative left shifts for NEON.  */
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        t1 = tcg_temp_new_vec(type);
+        tcg_gen_neg_vec(vece, t1, v2);
+        if (opc == INDEX_op_shrv_vec) {
+            opc = INDEX_op_arm_ushl_vec;
+        } else {
+            opc = INDEX_op_arm_sshl_vec;
+        }
+        vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+        tcg_temp_free_vec(t1);
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
 }
 
 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
-- 
2.25.1

Implement via expansion, so don't actually set TCG_TARGET_HAS_roti_vec.
For NEON, this is shift-right followed by shift-left-and-insert.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |  1 +
 tcg/arm/tcg-target.opc.h     |  1 +
 tcg/arm/tcg-target.c.inc     | 15 +++++++++++++++
 3 files changed, 17 insertions(+)

Implement via expansion, so don't actually set TCG_TARGET_HAS_rotv_vec.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
     case INDEX_op_rotli_vec:
+    case INDEX_op_rotlv_vec:
+    case INDEX_op_rotrv_vec:
         return -1;
     default:
         return 0;
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                        TCGArg a0, ...)
 {
     va_list va;
-    TCGv_vec v0, v1, v2, t1;
+    TCGv_vec v0, v1, v2, t1, t2, c1;
     TCGArg a2;
 
     va_start(va, a0);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
         tcg_temp_free_vec(t1);
         break;
 
+    case INDEX_op_rotlv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        t1 = tcg_temp_new_vec(type);
+        c1 = tcg_constant_vec(type, vece, 8 << vece);
+        tcg_gen_sub_vec(vece, t1, v2, c1);
+        /* Right shifts are negative left shifts for NEON.  */
+        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        tcg_gen_or_vec(vece, v0, v0, t1);
+        tcg_temp_free_vec(t1);
+        break;
+
+    case INDEX_op_rotrv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        t1 = tcg_temp_new_vec(type);
+        t2 = tcg_temp_new_vec(type);
+        c1 = tcg_constant_vec(type, vece, 8 << vece);
+        tcg_gen_neg_vec(vece, t1, v2);
+        tcg_gen_sub_vec(vece, t2, c1, v2);
+        /* Right shifts are negative left shifts for NEON.  */
+        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+        vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t2),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t2));
+        tcg_gen_or_vec(vece, v0, t1, t2);
+        tcg_temp_free_vec(t1);
+        tcg_temp_free_vec(t2);
+        break;
+
     default:
         g_assert_not_reached();
     }
-- 
2.25.1