The following changes since commit a36d64f43325fa503075cc9408ddabb69b32f829:

  Merge remote-tracking branch 'remotes/stsquad/tags/pull-testing-and-gdbstub-060520-1' into staging (2020-05-06 14:06:00 +0100)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20200506

for you to fetch changes up to 07dada0336a83002dfa8673a9220a88e13d9a45c:

  tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64} (2020-05-06 09:25:10 -0700)

----------------------------------------------------------------
Add tcg_gen_gvec_dup_imm
Misc tcg patches

----------------------------------------------------------------
Richard Henderson (10):
      tcg: Add tcg_gen_gvec_dup_imm
      target/s390x: Use tcg_gen_gvec_dup_imm
      target/ppc: Use tcg_gen_gvec_dup_imm
      target/arm: Use tcg_gen_gvec_dup_imm
      tcg: Use tcg_gen_gvec_dup_imm in logical simplifications
      tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i
      tcg: Add tcg_gen_gvec_dup_tl
      tcg: Improve vector tail clearing
      tcg: Add load_dest parameter to GVecGen2
      tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64}

 include/tcg/tcg-op-gvec.h           |  13 ++-
 include/tcg/tcg-op.h                |   8 +-
 target/arm/translate-a64.c          |  10 +--
 target/arm/translate-sve.c          |  12 ++-
 target/arm/translate.c              |   9 +-
 target/ppc/translate/vmx-impl.inc.c |  32 +++----
 target/ppc/translate/vsx-impl.inc.c |   2 +-
 target/s390x/translate_vx.inc.c     |  41 ++-------
 tcg/tcg-op-gvec.c                   | 162 ++++++++++++++++++++++++-----------
 tcg/tcg-op.c                        |  16 ++--
 10 files changed, 166 insertions(+), 139 deletions(-)
Subject: [PULL 01/10] tcg: Add tcg_gen_gvec_dup_imm

Add a version of tcg_gen_dup_* that takes both immediate and
a vector element size operand.  This will replace the set of
tcg_gen_gvec_dup{8,16,32,64}i functions that encode the element
size within the function name.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
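[Editor's note, not part of the commit: a minimal usage sketch.  The
offset `dofs` and the 16-byte lengths are illustrative assumptions,
not taken from the patch.

    /* Old interface: element size encoded in the function name. */
    tcg_gen_gvec_dup32i(dofs, 16, 16, 0x01020304);

    /* New interface: element size passed as a MO_* constant. */
    tcg_gen_gvec_dup_imm(MO_32, dofs, 16, 16, 0x01020304);

Both store four copies of 0x01020304 into the 16 bytes at dofs.]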
 include/tcg/tcg-op-gvec.h | 2 ++
 tcg/tcg-op-gvec.c         | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t s, uint32_t m);
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
+                          uint32_t m, uint64_t imm);
 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i32);
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
     do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
 }
 
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
+                          uint32_t maxsz, uint64_t x)
+{
+    check_size_align(oprsz, maxsz, dofs);
+    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t oprsz, uint32_t maxsz)
 {
--
2.20.1
Subject: [PULL 02/10] target/s390x: Use tcg_gen_gvec_dup_imm

The gen_gvec_dupi switch is unnecessary with the new function.
Replace it with a local gen_gvec_dup_imm that takes care of the
register to offset conversion and length arguments.

Drop zero_vec and use gen_gvec_dup_imm with 0.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
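[Editor's note, not part of the commit: what the local wrapper buys,
with v1 standing for a vector register number.  A call such as

    gen_gvec_dup_imm(ES_8, v1, 0);

expands, for the fixed 16-byte vector registers, to

    tcg_gen_gvec_dup_imm(ES_8, vec_full_reg_offset(v1), 16, 16, 0);

so every caller states the element size explicitly instead of
routing through the old gen_gvec_dupi switch.]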
 target/s390x/translate_vx.inc.c | 41 +++++++--------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/translate_vx.inc.c
+++ b/target/s390x/translate_vx.inc.c
@@ -XXX,XX +XXX,XX @@ static void get_vec_element_ptr_i64(TCGv_ptr ptr, uint8_t reg, TCGv_i64 enr,
 #define gen_gvec_mov(v1, v2) \
     tcg_gen_gvec_mov(0, vec_full_reg_offset(v1), vec_full_reg_offset(v2), 16, \
                      16)
-#define gen_gvec_dup64i(v1, c) \
-    tcg_gen_gvec_dup64i(vec_full_reg_offset(v1), 16, 16, c)
+#define gen_gvec_dup_imm(es, v1, c) \
+    tcg_gen_gvec_dup_imm(es, vec_full_reg_offset(v1), 16, 16, c);
 #define gen_gvec_fn_2(fn, es, v1, v2) \
     tcg_gen_gvec_##fn(es, vec_full_reg_offset(v1), vec_full_reg_offset(v2), \
                       16, 16)
@@ -XXX,XX +XXX,XX @@ static void gen_gvec128_4_i64(gen_gvec128_4_i64_fn fn, uint8_t d, uint8_t a,
     tcg_temp_free_i64(cl);
 }
 
-static void gen_gvec_dupi(uint8_t es, uint8_t reg, uint64_t c)
-{
-    switch (es) {
-    case ES_8:
-        tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_16:
-        tcg_gen_gvec_dup16i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_32:
-        tcg_gen_gvec_dup32i(vec_full_reg_offset(reg), 16, 16, c);
-        break;
-    case ES_64:
-        gen_gvec_dup64i(reg, c);
-        break;
-    default:
-        g_assert_not_reached();
-    }
-}
-
-static void zero_vec(uint8_t reg)
-{
-    tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, 0);
-}
-
 static void gen_addi2_i64(TCGv_i64 dl, TCGv_i64 dh, TCGv_i64 al, TCGv_i64 ah,
                           uint64_t b)
 {
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
          * Masks for both 64 bit elements of the vector are the same.
          * Trust tcg to produce a good constant loading.
          */
-        gen_gvec_dup64i(get_field(s, v1),
-                        generate_byte_mask(i2 & 0xff));
+        gen_gvec_dup_imm(ES_64, get_field(s, v1),
+                         generate_byte_mask(i2 & 0xff));
     } else {
         TCGv_i64 t = tcg_temp_new_i64();
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vgm(DisasContext *s, DisasOps *o)
         }
     }
 
-    gen_gvec_dupi(es, get_field(s, v1), mask);
+    gen_gvec_dup_imm(es, get_field(s, v1), mask);
     return DISAS_NEXT;
 }
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vllez(DisasContext *s, DisasOps *o)
 
     t = tcg_temp_new_i64();
     tcg_gen_qemu_ld_i64(t, o->addr1, get_mem_index(s), MO_TE | es);
-    zero_vec(get_field(s, v1));
+    gen_gvec_dup_imm(es, get_field(s, v1), 0);
     write_vec_element_i64(t, get_field(s, v1), enr, es);
     tcg_temp_free_i64(t);
     return DISAS_NEXT;
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vrepi(DisasContext *s, DisasOps *o)
         return DISAS_NORETURN;
     }
 
-    gen_gvec_dupi(es, get_field(s, v1), data);
+    gen_gvec_dup_imm(es, get_field(s, v1), data);
     return DISAS_NEXT;
 }
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_vcksm(DisasContext *s, DisasOps *o)
         read_vec_element_i32(tmp, get_field(s, v2), i, ES_32);
         tcg_gen_add2_i32(tmp, sum, sum, sum, tmp, tmp);
     }
-    zero_vec(get_field(s, v1));
+    gen_gvec_dup_imm(ES_32, get_field(s, v1), 0);
     write_vec_element_i32(sum, get_field(s, v1), 1, ES_32);
 
     tcg_temp_free_i32(tmp);
--
2.20.1
Subject: [PULL 03/10] target/ppc: Use tcg_gen_gvec_dup_imm

We can now unify the implementation of the 3 VSPLTI instructions.

Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/translate/vmx-impl.inc.c | 32 ++++++++++++++++-------------
 target/ppc/translate/vsx-impl.inc.c |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -XXX,XX +XXX,XX @@ GEN_VXRFORM_DUAL(vcmpbfp, PPC_ALTIVEC, PPC_NONE, \
 GEN_VXRFORM_DUAL(vcmpgtfp, PPC_ALTIVEC, PPC_NONE, \
                  vcmpgtud, PPC_NONE, PPC2_ALTIVEC_207)
 
-#define GEN_VXFORM_DUPI(name, tcg_op, opc2, opc3) \
-static void glue(gen_, name)(DisasContext *ctx) \
-    { \
-        int simm; \
-        if (unlikely(!ctx->altivec_enabled)) { \
-            gen_exception(ctx, POWERPC_EXCP_VPU); \
-            return; \
-        } \
-        simm = SIMM5(ctx->opcode); \
-        tcg_op(avr_full_offset(rD(ctx->opcode)), 16, 16, simm); \
+static void gen_vsplti(DisasContext *ctx, int vece)
+{
+    int simm;
+
+    if (unlikely(!ctx->altivec_enabled)) {
+        gen_exception(ctx, POWERPC_EXCP_VPU);
+        return;
     }
 
-GEN_VXFORM_DUPI(vspltisb, tcg_gen_gvec_dup8i, 6, 12);
-GEN_VXFORM_DUPI(vspltish, tcg_gen_gvec_dup16i, 6, 13);
-GEN_VXFORM_DUPI(vspltisw, tcg_gen_gvec_dup32i, 6, 14);
+    simm = SIMM5(ctx->opcode);
+    tcg_gen_gvec_dup_imm(vece, avr_full_offset(rD(ctx->opcode)), 16, 16, simm);
+}
+
+#define GEN_VXFORM_VSPLTI(name, vece, opc2, opc3) \
+static void glue(gen_, name)(DisasContext *ctx) { gen_vsplti(ctx, vece); }
+
+GEN_VXFORM_VSPLTI(vspltisb, MO_8, 6, 12);
+GEN_VXFORM_VSPLTI(vspltish, MO_16, 6, 13);
+GEN_VXFORM_VSPLTI(vspltisw, MO_32, 6, 14);
 
 #define GEN_VXFORM_NOA(name, opc2, opc3) \
 static void glue(gen_, name)(DisasContext *ctx) \
@@ -XXX,XX +XXX,XX @@ GEN_VXFORM_DUAL(vsldoi, PPC_ALTIVEC, PPC_NONE,
 #undef GEN_VXRFORM_DUAL
 #undef GEN_VXRFORM1
 #undef GEN_VXRFORM
-#undef GEN_VXFORM_DUPI
+#undef GEN_VXFORM_VSPLTI
 #undef GEN_VXFORM_NOA
 #undef GEN_VXFORM_UIMM
 #undef GEN_VAFORM_PAIRED
diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate/vsx-impl.inc.c
+++ b/target/ppc/translate/vsx-impl.inc.c
@@ -XXX,XX +XXX,XX @@ static void gen_xxspltib(DisasContext *ctx)
             return;
         }
     }
-    tcg_gen_gvec_dup8i(vsr_full_offset(rt), 16, 16, uim8);
+    tcg_gen_gvec_dup_imm(MO_8, vsr_full_offset(rt), 16, 16, uim8);
 }
 
 static void gen_xxsldwi(DisasContext *ctx)
--
2.20.1
Subject: [PULL 04/10] target/arm: Use tcg_gen_gvec_dup_imm

In a few cases, we're able to remove some manual replication.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
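[Editor's note, not part of the commit: the "manual replication" being
removed is the dup_const() step.  tcg_gen_gvec_dup_imm replicates the
immediate according to its vece argument itself, so trans_FDUP goes from

    imm = dup_const(a->esz, imm);
    tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);

to the single call

    tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);]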
 target/arm/translate-a64.c | 10 +++++-----
 target/arm/translate-sve.c | 12 +++++-------
 target/arm/translate.c     |  9 ++++++---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
         tcg_temp_free_i64(tcg_zero);
     }
     if (vsz > 16) {
-        tcg_gen_gvec_dup8i(ofs + 16, vsz - 16, vsz - 16, 0);
+        tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
 
     if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
         /* MOVI or MVNI, with MVNI negation handled above. */
-        tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8,
-                            vec_full_reg_size(s), imm);
+        tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8,
+                             vec_full_reg_size(s), imm);
     } else {
         /* ORR or BIC, with BIC negation to AND handled above. */
         if (is_neg) {
@@ -XXX,XX +XXX,XX @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
     if (is_u) {
         if (shift == 8 << size) {
             /* Shift count the same size as element size produces zero. */
-            tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd),
-                               is_q ? 16 : 8, vec_full_reg_size(s), 0);
+            tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd),
+                                 is_q ? 16 : 8, vec_full_reg_size(s), 0);
         } else {
             gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size);
         }
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -XXX,XX +XXX,XX @@ static bool do_mov_z(DisasContext *s, int rd, int rn)
 static void do_dupi_z(DisasContext *s, int rd, uint64_t word)
 {
     unsigned vsz = vec_full_reg_size(s);
-    tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), vsz, vsz, word);
+    tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word);
 }
 
 /* Invoke a vector expander on two Pregs. */
@@ -XXX,XX +XXX,XX @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag)
         unsigned oprsz = size_for_gvec(setsz / 8);
 
         if (oprsz * 8 == setsz) {
-            tcg_gen_gvec_dup64i(ofs, oprsz, maxsz, word);
+            tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word);
             goto done;
         }
     }
@@ -XXX,XX +XXX,XX @@ static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a)
             unsigned nofs = vec_reg_offset(s, a->rn, index, esz);
             tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz);
         } else {
-            tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0);
+            tcg_gen_gvec_dup_imm(esz, dofs, vsz, vsz, 0);
         }
     }
     return true;
@@ -XXX,XX +XXX,XX @@ static bool trans_FDUP(DisasContext *s, arg_FDUP *a)
 
         /* Decode the VFP immediate. */
         imm = vfp_expand_imm(a->esz, a->imm);
-        imm = dup_const(a->esz, imm);
-
-        tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);
+        tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);
     }
     return true;
 }
@@ -XXX,XX +XXX,XX @@ static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a)
         unsigned vsz = vec_full_reg_size(s);
         int dofs = vec_full_reg_offset(s, a->rd);
 
-        tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm));
+        tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm);
     }
     return true;
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                                   MIN(shift, (8 << size) - 1),
                                   vec_size, vec_size);
                 } else if (shift >= 8 << size) {
-                    tcg_gen_gvec_dup8i(rd_ofs, vec_size, vec_size, 0);
+                    tcg_gen_gvec_dup_imm(MO_8, rd_ofs, vec_size,
+                                         vec_size, 0);
                 } else {
                     tcg_gen_gvec_shri(size, rd_ofs, rm_ofs, shift,
                                       vec_size, vec_size);
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                          * architecturally valid and results in zero.
                          */
                         if (shift >= 8 << size) {
-                            tcg_gen_gvec_dup8i(rd_ofs, vec_size, vec_size, 0);
+                            tcg_gen_gvec_dup_imm(size, rd_ofs,
+                                                 vec_size, vec_size, 0);
                         } else {
                             tcg_gen_gvec_shli(size, rd_ofs, rm_ofs, shift,
                                               vec_size, vec_size);
@@ -XXX,XX +XXX,XX @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 }
                 tcg_temp_free_i64(t64);
             } else {
-                tcg_gen_gvec_dup32i(reg_ofs, vec_size, vec_size, imm);
+                tcg_gen_gvec_dup_imm(MO_32, reg_ofs, vec_size,
+                                     vec_size, imm);
             }
         }
     }
--
2.20.1
Subject: [PULL 05/10] tcg: Use tcg_gen_gvec_dup_imm in logical simplifications

Replace the outgoing interface.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
     };
 
     if (aofs == bofs) {
-        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
     } else {
         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
     }
--
2.20.1
Subject: [PULL 06/10] tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i

These interfaces are now unused.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  5 -----
 tcg/tcg-op-gvec.c         | 28 ----------------------------
 2 files changed, 33 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i64);
 
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
-
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
     }
 }
 
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint64_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint32_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
-                         uint32_t maxsz, uint16_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
-                        uint32_t maxsz, uint8_t x)
-{
-    check_size_align(oprsz, maxsz, dofs);
-    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                           uint32_t maxsz, uint64_t x)
 {
--
2.20.1
Subject: [PULL 07/10] tcg: Add tcg_gen_gvec_dup_tl

For use when a target needs to pass a configure-specific
target_ulong value to duplicate.

Reviewed-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
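[Editor's note, not part of the commit: a hedged usage sketch.  The
vece/dofs/oprsz/maxsz values are placeholders, and the helper flow is
an assumption of this example.  TCGv is i32 or i64 depending on
TARGET_LONG_BITS, which is exactly what the macro tracks:

    TCGv t = tcg_temp_new();
    tcg_gen_movi_tl(t, some_target_ulong_value);   /* hypothetical */
    tcg_gen_gvec_dup_tl(vece, dofs, oprsz, maxsz, t);
    tcg_temp_free(t);]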
 include/tcg/tcg-op-gvec.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                           uint32_t m, TCGv_i64);
 
+#if TARGET_LONG_BITS == 64
+# define tcg_gen_gvec_dup_tl tcg_gen_gvec_dup_i64
+#else
+# define tcg_gen_gvec_dup_tl tcg_gen_gvec_dup_i32
+#endif
+
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
--
2.20.1
Subject: [PULL 08/10] tcg: Improve vector tail clearing

Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
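[Editor's note, not part of the commit: a worked example of the new
check_size_impl() accounting.  With lnsz == 32 and oprsz == 88:

    q = 88 / 32 = 2,  r = 88 % 32 = 24 (binary 11000)
    q += ctpop32(24) = 2,  giving q = 4

i.e. the operation is expanded as 2x32 + 1x16 + 1x8 and accepted when
4 <= MAX_UNROLL.  For lnsz < 16 any remainder still rejects.]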
 tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-    if (oprsz % lnsz == 0) {
-        uint32_t lnct = oprsz / lnsz;
-        return lnct >= 1 && lnct <= MAX_UNROLL;
+    uint32_t q, r;
+
+    if (oprsz < lnsz) {
+        return false;
     }
-    return false;
+
+    q = oprsz / lnsz;
+    r = oprsz % lnsz;
+    tcg_debug_assert((r & 7) == 0);
+
+    if (lnsz < 16) {
+        /* For sizes below 16, accept no remainder. */
+        if (r != 0) {
+            return false;
+        }
+    } else {
+        /*
+         * Recall that ARM SVE allows vector sizes that are not a
+         * power of 2, but always a multiple of 16.  The intent is
+         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+         * In addition, expand_clr needs to handle a multiple of 8.
+         * Thus we can handle the tail with one more operation per
+         * diminishing power of 2.
+         */
+        q += ctpop32(r);
+    }
+
+    return q <= MAX_UNROLL;
 }
 
 static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -XXX,XX +XXX,XX @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                   uint32_t size, bool prefer_i64)
 {
-    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-        /*
-         * Recall that ARM SVE allows vector sizes that are not a
-         * power of 2, but always a multiple of 16.  The intent is
-         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
-         * It is hard to imagine a case in which v256 is supported
-         * but v128 is not, but check anyway.
-         */
-        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-            && (size % 32 == 0
-                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-            return TCG_TYPE_V256;
-        }
+    /*
+     * Recall that ARM SVE allows vector sizes that are not a
+     * power of 2, but always a multiple of 16.  The intent is
+     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+     * It is hard to imagine a case in which v256 is supported
+     * but v128 is not, but check anyway.
+     * In addition, expand_clr needs to handle a multiple of 8.
+     */
+    if (TCG_TARGET_HAS_v256 &&
+        check_size_impl(size, 32) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+        (!(size & 16) ||
+         (TCG_TARGET_HAS_v128 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+        return TCG_TYPE_V256;
     }
-    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+    if (TCG_TARGET_HAS_v128 &&
+        check_size_impl(size, 16) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
         return TCG_TYPE_V128;
     }
     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -XXX,XX +XXX,XX @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 {
     uint32_t i = 0;
 
+    tcg_debug_assert(oprsz >= 8);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
+     * are misaligned wrt the maximum vector size, so do that first.
+     */
+    if (dofs & 8) {
+        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+        i += 8;
+    }
+
     switch (type) {
     case TCG_TYPE_V256:
         /*
--
2.20.1
Subject: [PULL 09/10] tcg: Add load_dest parameter to GVecGen2

We have this same parameter for GVecGen2i, GVecGen3,
and GVecGen3i.  This will make some SVE2 insns easier
to parameterize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
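[Editor's note, not part of the commit: a hedged sketch of how an
expander opts in.  The gen_foo_i64 callback name is hypothetical.

    static const GVecGen2 g = {
        .fni8 = gen_foo_i64,   /* hypothetical per-element generator */
        .load_dest = true,     /* dest is read as a second input */
        .vece = MO_64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);

With load_dest set, each destination element is loaded into the output
temporary before the callback runs, so an operation that accumulates
into the destination can read the old value.]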
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c         | 45 ++++++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ typedef struct {
     uint8_t vece;
     /* Prefer i64 to v64. */
     bool prefer_i64;
+    /* Load dest as a 2nd source operand. */
+    bool load_dest;
 } GVecGen2;
 
 typedef struct {
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static void expand_clr(uint32_t dofs, uint32_t maxsz)
 
 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGv_i32, TCGv_i32))
+                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 {
     TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 4) {
         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
-        fni(t0, t0);
-        tcg_gen_st_i32(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
+        }
+        fni(t1, t0);
+        tcg_gen_st_i32(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
 }
 
 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -XXX,XX +XXX,XX @@ static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 
 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGv_i64, TCGv_i64))
+                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 {
     TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 8) {
         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
-        fni(t0, t0);
-        tcg_gen_st_i64(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
+        }
+        fni(t1, t0);
+        tcg_gen_st_i64(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
 }
 
 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -XXX,XX +XXX,XX @@ static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */
 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
+                         bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 {
     TCGv_vec t0 = tcg_temp_new_vec(type);
+    TCGv_vec t1 = tcg_temp_new_vec(type);
     uint32_t i;
 
     for (i = 0; i < oprsz; i += tysz) {
         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
-        fni(vece, t0, t0);
-        tcg_gen_st_vec(t0, cpu_env, dofs + i);
+        if (load_dest) {
+            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
+        }
+        fni(vece, t1, t0);
+        tcg_gen_st_vec(t1, cpu_env, dofs + i);
     }
     tcg_temp_free_vec(t0);
+    tcg_temp_free_vec(t1);
 }
 
 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
          */
         some = QEMU_ALIGN_DOWN(oprsz, 32);
-        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
         if (some == oprsz) {
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
         maxsz -= some;
         /* fallthru */
     case TCG_TYPE_V128:
-        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
         break;
     case TCG_TYPE_V64:
-        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
+        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+                     g->load_dest, g->fniv);
         break;
 
     case 0:
         if (g->fni8 && check_size_impl(oprsz, 8)) {
-            expand_2_i64(dofs, aofs, oprsz, g->fni8);
+            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
-            expand_2_i32(dofs, aofs, oprsz, g->fni4);
+            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
         } else {
             assert(g->fno != NULL);
             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
--
2.20.1
Subject: [PULL 10/10] tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64}

For the benefit of compatibility of function pointer types,
we have standardized on int32_t and int64_t as the integral
argument to tcg expanders.

We converted most of them in 474b2e8f0f7, but missed the rotates.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
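[Editor's note, not part of the commit: why the signature matters.
Shift and rotate expanders are commonly stored in shared function
pointer tables, which only type-check if every entry agrees:

    typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t);
    static ShiftImmFn * const fns[] = {
        tcg_gen_shli_i32,    /* already int32_t since 474b2e8f0f7 */
        tcg_gen_rotli_i32,   /* int32_t as of this patch */
    };

The added assert (arg2 >= 0) preserves the old unsigned-range check.]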
 include/tcg/tcg-op.h |  8 ++++----
 tcg/tcg-op.c         | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
 void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
 void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
                          unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
 void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
 void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
                          unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
     }
 }
 
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-    tcg_debug_assert(arg2 < 32);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 32);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 }
 
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-    tcg_debug_assert(arg2 < 32);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 32);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     }
 }
 
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-    tcg_debug_assert(arg2 < 64);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 64);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     }
 }
 
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-    tcg_debug_assert(arg2 < 64);
+    tcg_debug_assert(arg2 >= 0 && arg2 < 64);
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
--
2.20.1