Series comparison

-[PULL 00/12] tcg patch queue
+[PULL 0/4] tcg patch queue
-The following changes since commit 7c18f2d663521f1b31b821a13358ce38075eaf7d:
+Pretty small still, but there are two patches that ought
 to get backported to stable, so no point in delaying.
-  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2023-04-29 23:07:17 +0100)
+r~
 The following changes since commit a5ba0a7e4e150d1350a041f0d0ef9ca6c8d7c307:
   Merge tag 'pull-aspeed-20241211' of https://github.com/legoater/qemu into staging (2024-12-11 15:16:47 +0000)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230502
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241212
-for you to fetch changes up to bdc7fba1c5a29ae218b45353daac9308fe1aae82:
+for you to fetch changes up to 7ac87b14a92234b6a89b701b4043ad6cf8bdcccf:
-  tcg: Introduce tcg_out_movext2 (2023-05-02 12:15:41 +0100)
+  target/sparc: Use memcpy() and remove memcpy32() (2024-12-12 14:28:38 -0600)
 ----------------------------------------------------------------
-Misc tcg-related patch queue.
+tcg: Reset free_temps before tcg_optimize
 tcg/riscv: Fix StoreStore barrier generation
 include/exec: Introduce fpst alias in helper-head.h.inc
 target/sparc: Use memcpy() and remove memcpy32()
 ----------------------------------------------------------------
-Dickon Hood (1):
+Philippe Mathieu-Daudé (1):
-      qemu/bitops.h: Limit rotate amounts
+      target/sparc: Use memcpy() and remove memcpy32()
-Kiran Ostrolenk (1):
+Richard Henderson (2):
-      qemu/host-utils.h: Add clz and ctz functions for lower-bit integers
+      tcg: Reset free_temps before tcg_optimize
       include/exec: Introduce fpst alias in helper-head.h.inc
-Nazar Kazakov (2):
+Roman Artemev (1):
-      tcg: Add tcg_gen_gvec_andcs
+      tcg/riscv: Fix StoreStore barrier generation
       tcg: Add tcg_gen_gvec_rotrs
-Richard Henderson (7):
+ include/tcg/tcg-temp-internal.h |  6 ++++++
-      softmmu: Tidy dirtylimit_dirty_ring_full_time
+ accel/tcg/plugin-gen.c          |  2 +-
-      qemu/int128: Re-shuffle Int128Alias members
+ target/sparc/win_helper.c       | 26 ++++++++------------------
-      migration/xbzrle: Use __attribute__((target)) for avx512
+ tcg/tcg.c                       |  5 ++++-
-      accel/tcg: Add cpu_ld*_code_mmu
+ include/exec/helper-head.h.inc  |  3 +++
-      tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
+ tcg/riscv/tcg-target.c.inc      |  2 +-
-      tcg/mips: Conditionalize tcg_out_exts_i32_i64
+files changed, 23 insertions(+), 21 deletions(-)
       tcg: Introduce tcg_out_movext2
-Weiwei Li (1):
-      accel/tcg: Uncache the host address for instruction fetch when tlb size < 1
- meson.build                      |  5 +--
- accel/tcg/tcg-runtime.h          |  1 +
- include/exec/cpu_ldst.h          |  9 ++++++
- include/qemu/bitops.h            | 24 +++++++++-----
- include/qemu/host-utils.h        | 54 +++++++++++++++++++++++++++++++
- include/qemu/int128.h            |  4 +--
- include/tcg/tcg-op-gvec.h        |  4 +++
- accel/tcg/cputlb.c               | 53 ++++++++++++++++++++++++++++++
- accel/tcg/tcg-runtime-gvec.c     | 11 +++++++
- accel/tcg/user-exec.c            | 58 +++++++++++++++++++++++++++++++++
- migration/xbzrle.c               |  9 +++---
- softmmu/dirtylimit.c             | 15 ++++++---
- tcg/tcg-op-gvec.c                | 28 ++++++++++++++++
- tcg/tcg.c                        | 69 +++++++++++++++++++++++++++++++++++++---
- tcg/arm/tcg-target.c.inc         | 44 +++++++++++--------------
- tcg/i386/tcg-target.c.inc        | 19 +++++------
- tcg/loongarch64/tcg-target.c.inc |  4 ++-
- tcg/mips/tcg-target.c.inc        |  4 ++-
-files changed, 347 insertions(+), 68 deletions(-)

-[PULL 12/12] tcg: Introduce tcg_out_movext2
+[PULL 1/4] tcg: Reset free_temps before tcg_optimize
-This is common code in most qemu_{ld,st} slow paths, moving two
+When allocating new temps during tcg_optimize, do not re-use
-registers when there may be overlap between sources and destinations.
+any EBB temps that were used within the TB.  We do not have
-At present, this is only used by 32-bit hosts for 64-bit data,
+any idea of the span of the TB in which the temp was live.
 but will shortly be used for more than that.
+Introduce tcg_temp_ebb_reset_freed and use before tcg_optimize,
+as well as replacing the equivalent in plugin_gen_inject and
+tcg_func_start.
+Cc: qemu-stable@nongnu.org
+Fixes: fb04ab7ddd8 ("tcg/optimize: Lower TCG_COND_TST{EQ,NE} if unsupported")
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2711
+Reported-by: wannacu <wannacu2049@gmail.com>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c                 | 69 ++++++++++++++++++++++++++++++++++++---
+ include/tcg/tcg-temp-internal.h | 6 ++++++
- tcg/arm/tcg-target.c.inc  | 44 ++++++++++---------------
+ accel/tcg/plugin-gen.c          | 2 +-
- tcg/i386/tcg-target.c.inc | 19 +++++------
+ tcg/tcg.c                       | 5 ++++-
-files changed, 90 insertions(+), 42 deletions(-)
+files changed, 11 insertions(+), 2 deletions(-)
+diff --git a/include/tcg/tcg-temp-internal.h b/include/tcg/tcg-temp-internal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/tcg/tcg-temp-internal.h
++++ b/include/tcg/tcg-temp-internal.h
+@@ -XXX,XX +XXX,XX @@ TCGv_i64 tcg_temp_ebb_new_i64(void);
+ TCGv_ptr tcg_temp_ebb_new_ptr(void);
+ TCGv_i128 tcg_temp_ebb_new_i128(void);
++/* Forget all freed EBB temps, so that new allocations produce new temps. */
++static inline void tcg_temp_ebb_reset_freed(TCGContext *s)
++{
++    memset(s->free_temps, 0, sizeof(s->free_temps));
++}
++
+ #endif /* TCG_TEMP_FREE_H */
+diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/plugin-gen.c
++++ b/accel/tcg/plugin-gen.c
+@@ -XXX,XX +XXX,XX @@ static void plugin_gen_inject(struct qemu_plugin_tb *plugin_tb)
+      * that might be live within the existing opcode stream.
+      * The simplest solution is to release them all and create new.
+      */
+-    memset(tcg_ctx->free_temps, 0, sizeof(tcg_ctx->free_temps));
++    tcg_temp_ebb_reset_freed(tcg_ctx);
+     QTAILQ_FOREACH_SAFE(op, &tcg_ctx->ops, link, next) {
+         switch (op->opc) {
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
+@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
- static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
+     s->nb_temps = s->nb_globals;
- static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg);
- static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
+     /* No temps have been previously allocated for size or locality.  */
--static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
+-    memset(s->free_temps, 0, sizeof(s->free_temps));
--    __attribute__((unused));
++    tcg_temp_ebb_reset_freed(s);
-+static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2);
- static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
+     /* No constant temps have been previously allocated. */
- static void tcg_out_goto_tb(TCGContext *s, int which);
+     for (int i = 0; i < TCG_TYPE_COUNT; ++i) {
- static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
-@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
+     }
-     siglongjmp(s->jmp_trans, -2);
+ #endif
- }
++    /* Do not reuse any EBB that may be allocated within the TB. */
-+typedef struct TCGMovExtend {
++    tcg_temp_ebb_reset_freed(s);
 +    TCGReg dst;
 +    TCGReg src;
 +    TCGType dst_type;
 +    TCGType src_type;
 +    MemOp src_ext;
 +} TCGMovExtend;
 +
- /**
+     tcg_optimize(s);
-  * tcg_out_movext -- move and extend
-  * @s: tcg context
+     reachable_code_pass(s);
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
   *
   * Move or extend @src into @dst, depending on @src_ext and the types.
   */
 -static void __attribute__((unused))
 -tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
 -               TCGType src_type, MemOp src_ext, TCGReg src)
 +static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
 +                           TCGType src_type, MemOp src_ext, TCGReg src)
  {
      switch (src_ext) {
      case MO_UB:
@@ -XXX,XX +XXX,XX @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
      }
  }
 +/* Minor variations on a theme, using a structure. */
 +static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i,
 +                                    TCGReg src)
 +{
 +    tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src);
 +}
 +
 +static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i)
 +{
 +    tcg_out_movext1_new_src(s, i, i->src);
 +}
 +
 +/**
 + * tcg_out_movext2 -- move and extend two pairs
 + * @s: tcg context
 + * @i1: first move description
 + * @i2: second move description
 + * @scratch: temporary register, or -1 for none
 + *
 + * As tcg_out_movext, for both @i1 and @i2, caring for overlap
 + * between the sources and destinations.
 + */
 +
 +static void __attribute__((unused))
 +tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
 +                const TCGMovExtend *i2, int scratch)
 +{
 +    TCGReg src1 = i1->src;
 +    TCGReg src2 = i2->src;
 +
 +    if (i1->dst != src2) {
 +        tcg_out_movext1(s, i1);
 +        tcg_out_movext1(s, i2);
 +        return;
 +    }
 +    if (i2->dst == src1) {
 +        TCGType src1_type = i1->src_type;
 +        TCGType src2_type = i2->src_type;
 +
 +        if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) {
 +            /* The data is now in the correct registers, now extend. */
 +            src1 = i2->src;
 +            src2 = i1->src;
 +        } else {
 +            tcg_debug_assert(scratch >= 0);
 +            tcg_out_mov(s, src1_type, scratch, src1);
 +            src1 = scratch;
 +        }
 +    }
 +    tcg_out_movext1_new_src(s, i2, src2);
 +    tcg_out_movext1_new_src(s, i1, src1);
 +}
 +
  #define C_PFX1(P, A)                    P##A
  #define C_PFX2(P, A, B)                 P##A##_##B
  #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
  static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
  {
 -    TCGReg argreg, datalo, datahi;
 +    TCGReg argreg;
      MemOpIdx oi = lb->oi;
      MemOp opc = get_memop(oi);
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
      /* Use the canonical unsigned helpers and minimize icache usage. */
      tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
 -    datalo = lb->datalo_reg;
 -    datahi = lb->datahi_reg;
      if ((opc & MO_SIZE) == MO_64) {
 -        if (datalo != TCG_REG_R1) {
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -        } else if (datahi != TCG_REG_R0) {
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
 -        } else {
 -            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
      } else {
 -        tcg_out_movext(s, TCG_TYPE_I32, datalo,
 +        tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg,
                         TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0);
      }
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
      if (TARGET_LONG_BITS == 64) {
          /* 64-bit target address is aligned into R2:R3. */
 -        if (l->addrhi_reg != TCG_REG_R2) {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
 -        } else if (l->addrlo_reg != TCG_REG_R3) {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
 -        } else {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
 +              .src = l->addrlo_reg,
 +              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
 +              .src = l->addrhi_reg,
 +              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
      } else {
          tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
      }
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
  {
      MemOpIdx oi = l->oi;
      MemOp opc = get_memop(oi);
 -    TCGReg data_reg;
      tcg_insn_unit **label_ptr = &l->label_ptr[0];
      /* resolve label address */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
      tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 -    data_reg = l->datalo_reg;
      if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
 -        if (data_reg == TCG_REG_EDX) {
 -            /* xchg %edx, %eax */
 -            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
 -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
 -        } else {
 -            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
 -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], -1);
      } else {
 -        tcg_out_movext(s, l->type, data_reg,
 +        tcg_out_movext(s, l->type, l->datalo_reg,
                         TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
      }
 --
-.34.1
+.43.0

-[PULL 11/12] tcg/mips: Conditionalize tcg_out_exts_i32_i64
+[PULL 2/4] tcg/riscv: Fix StoreStore barrier generation
-Since TCG_TYPE_I32 values are kept sign-extended in registers, we need not
+From: Roman Artemev <roman.artemev@syntacore.com>
 extend if the register matches.  This is already relied upon by comparisons.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+On RISC-V, a StoreStore barrier corresponds to
 `fence w, w` not `fence r, r`
 Cc: qemu-stable@nongnu.org
 Fixes: efbea94c76b ("tcg/riscv: Add slowpath load and store instructions")
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Denis Tomashev <denis.tomashev@syntacore.com>
 Signed-off-by: Roman Artemev <roman.artemev@syntacore.com>
 Message-ID: <e2f2131e294a49e79959d4fa9ec02cf4@syntacore.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/mips/tcg-target.c.inc | 4 +++-
+ tcg/riscv/tcg-target.c.inc | 2 +-
-file changed, 3 insertions(+), 1 deletion(-)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
+diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/mips/tcg-target.c.inc
+--- a/tcg/riscv/tcg-target.c.inc
-+++ b/tcg/mips/tcg-target.c.inc
++++ b/tcg/riscv/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rs)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
+         insn |= 0x02100000;
- static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
+     }
- {
+     if (a0 & TCG_MO_ST_ST) {
--    tcg_out_ext32s(s, rd, rs);
+-        insn |= 0x02200000;
-+    if (rd != rs) {
++        insn |= 0x01100000;
-+        tcg_out_ext32s(s, rd, rs);
+     }
-+    }
+     tcg_out32(s, insn);
  }
- static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
 --
-.34.1
+.43.0

-[PULL 10/12] tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
+[PULL 3/4] include/exec: Introduce fpst alias in helper-head.h.inc
-Since TCG_TYPE_I32 values are kept sign-extended in registers,
+This allows targets to declare that the helper requires a
-via ".w" instructions, we need not extend if the register matches.
+float_status pointer instead of a generic void pointer.
 This is already relied upon by comparisons.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/loongarch64/tcg-target.c.inc | 4 +++-
+ include/exec/helper-head.h.inc | 3 +++
-file changed, 3 insertions(+), 1 deletion(-)
+file changed, 3 insertions(+)
-diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+diff --git a/include/exec/helper-head.h.inc b/include/exec/helper-head.h.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/loongarch64/tcg-target.c.inc
+--- a/include/exec/helper-head.h.inc
-+++ b/tcg/loongarch64/tcg-target.c.inc
++++ b/include/exec/helper-head.h.inc
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg ret, TCGReg arg)
+@@ -XXX,XX +XXX,XX @@
+ #define dh_alias_ptr ptr
- static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
+ #define dh_alias_cptr ptr
- {
+ #define dh_alias_env ptr
--    tcg_out_ext32s(s, ret, arg);
++#define dh_alias_fpst ptr
-+    if (ret != arg) {
+ #define dh_alias_void void
-+        tcg_out_ext32s(s, ret, arg);
+ #define dh_alias_noreturn noreturn
-+    }
+ #define dh_alias(t) glue(dh_alias_, t)
- }
+@@ -XXX,XX +XXX,XX @@
+ #define dh_ctype_ptr void *
- static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
+ #define dh_ctype_cptr const void *
  #define dh_ctype_env CPUArchState *
 +#define dh_ctype_fpst float_status *
  #define dh_ctype_void void
  #define dh_ctype_noreturn G_NORETURN void
  #define dh_ctype(t) dh_ctype_##t
@@ -XXX,XX +XXX,XX @@
  #define dh_typecode_f64 dh_typecode_i64
  #define dh_typecode_cptr dh_typecode_ptr
  #define dh_typecode_env dh_typecode_ptr
 +#define dh_typecode_fpst dh_typecode_ptr
  #define dh_typecode(t) dh_typecode_##t
  #define dh_callflag_i32  0
 --
-.34.1
+.43.0

-[PULL 01/12] softmmu: Tidy dirtylimit_dirty_ring_full_time
+[PULL 4/4] target/sparc: Use memcpy() and remove memcpy32()
-Drop inline marker: let compiler decide.
+From: Philippe Mathieu-Daudé <philmd@linaro.org>
-Change return type to uint64_t: this matches the computation in the
+Rather than manually copying each register, use
-return statement and the local variable assignment in the caller.
+the libc memcpy(), which is well optimized nowadays.
-Rename local to dirty_ring_size_MB to fix typo.
+Suggested-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
-Simplify conversion to MiB via qemu_target_page_bits and right shift.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
-Reviewed-by: Thomas Huth <thuth@redhat.com>
+Message-ID: <20241205205418.67613-1-philmd@linaro.org>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- softmmu/dirtylimit.c | 15 ++++++++++-----
+ target/sparc/win_helper.c | 26 ++++++++------------------
-file changed, 10 insertions(+), 5 deletions(-)
+file changed, 8 insertions(+), 18 deletions(-)
-diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
+diff --git a/target/sparc/win_helper.c b/target/sparc/win_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/softmmu/dirtylimit.c
+--- a/target/sparc/win_helper.c
-+++ b/softmmu/dirtylimit.c
++++ b/target/sparc/win_helper.c
-@@ -XXX,XX +XXX,XX @@ bool dirtylimit_vcpu_index_valid(int cpu_index)
+@@ -XXX,XX +XXX,XX @@
-              cpu_index >= ms->smp.max_cpus);
+ #include "exec/helper-proto.h"
  #include "trace.h"
 -static inline void memcpy32(target_ulong *dst, const target_ulong *src)
 -{
 -    dst[0] = src[0];
 -    dst[1] = src[1];
 -    dst[2] = src[2];
 -    dst[3] = src[3];
 -    dst[4] = src[4];
 -    dst[5] = src[5];
 -    dst[6] = src[6];
 -    dst[7] = src[7];
 -}
 -
  void cpu_set_cwp(CPUSPARCState *env, int new_cwp)
  {
      /* put the modified wrap registers at their proper location */
      if (env->cwp == env->nwindows - 1) {
 -        memcpy32(env->regbase, env->regbase + env->nwindows * 16);
 +        memcpy(env->regbase, env->regbase + env->nwindows * 16,
 +               sizeof(env->gregs));
      }
      env->cwp = new_cwp;
      /* put the wrap registers at their temporary location */
      if (new_cwp == env->nwindows - 1) {
 -        memcpy32(env->regbase + env->nwindows * 16, env->regbase);
 +        memcpy(env->regbase + env->nwindows * 16, env->regbase,
 +               sizeof(env->gregs));
      }
      env->regwptr = env->regbase + (new_cwp * 16);
  }
+@@ -XXX,XX +XXX,XX @@ void cpu_gl_switch_gregs(CPUSPARCState *env, uint32_t new_gl)
--static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+     dst = get_gl_gregset(env, env->gl);
-+static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
- {
+     if (src != dst) {
-     static uint64_t max_dirtyrate;
+-        memcpy32(dst, env->gregs);
--    uint32_t dirty_ring_size = kvm_dirty_ring_size();
+-        memcpy32(env->gregs, src);
--    uint64_t dirty_ring_size_meory_MB =
++        memcpy(dst, env->gregs, sizeof(env->gregs));
--        dirty_ring_size * qemu_target_page_size() >> 20;
++        memcpy(env->gregs, src, sizeof(env->gregs));
 +    unsigned target_page_bits = qemu_target_page_bits();
 +    uint64_t dirty_ring_size_MB;
 +
 +    /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
 +    assert(target_page_bits < 20);
 +
 +    /* Convert ring size (pages) to MiB (2**20). */
 +    dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits);
      if (max_dirtyrate < dirtyrate) {
          max_dirtyrate = dirtyrate;
      }
--    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
-+    return dirty_ring_size_MB * 1000000 / max_dirtyrate;
  }
- static inline bool dirtylimit_done(uint64_t quota,
+@@ -XXX,XX +XXX,XX @@ void cpu_change_pstate(CPUSPARCState *env, uint32_t new_pstate)
          /* Switch global register bank */
          src = get_gregset(env, new_pstate_regs);
          dst = get_gregset(env, pstate_regs);
 -        memcpy32(dst, env->gregs);
 -        memcpy32(env->gregs, src);
 +        memcpy(dst, env->gregs, sizeof(env->gregs));
 +        memcpy(env->gregs, src, sizeof(env->gregs));
      } else {
          trace_win_helper_no_switch_pstate(new_pstate_regs);
      }
 --
-.34.1
+.43.0

-[PULL 02/12] accel/tcg: Uncache the host address for instruction fetch when tlb size < 1
+Deleted patch
-From: Weiwei Li <liweiwei@iscas.ac.cn>
-When PMP entry overlap part of the page, we'll set the tlb_size to 1, which
-will make the address in tlb entry set with TLB_INVALID_MASK, and the next
-access will again go through tlb_fill. However, this way will not work in
-tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be
-cached, and the following instructions can use this host address directly
-which may lead to the bypass of PMP related check.
-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542.
-Signed-off-by: Weiwei Li <liweiwei@iscas.ac.cn>
-Signed-off-by: Junqiang Wang <wangjunqiang@iscas.ac.cn>
-Reviewed-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn>
----
- accel/tcg/cputlb.c | 5 +++++
-file changed, 5 insertions(+)
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cputlb.c
-+++ b/accel/tcg/cputlb.c
-@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
-     if (p == NULL) {
-         return -1;
-     }
-+
-+    if (full->lg_page_size < TARGET_PAGE_BITS) {
-+        return -1;
-+    }
-+
-     if (hostp) {
-         *hostp = p;
-     }
---
-.34.1

-[PULL 03/12] qemu/bitops.h: Limit rotate amounts
+Deleted patch
-From: Dickon Hood <dickon.hood@codethink.co.uk>
-Rotates have been fixed up to only allow for reasonable rotate amounts
-(ie, no rotates >7 on an 8b value etc.)  This fixes a problem with riscv
-vector rotate instructions.
-Signed-off-by: Dickon Hood <dickon.hood@codethink.co.uk>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- include/qemu/bitops.h | 24 ++++++++++++++++--------
-file changed, 16 insertions(+), 8 deletions(-)
-diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/bitops.h
-+++ b/include/qemu/bitops.h
-@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr,
-  */
- static inline uint8_t rol8(uint8_t word, unsigned int shift)
- {
--    return (word << shift) | (word >> ((8 - shift) & 7));
-+    shift &= 7;
-+    return (word << shift) | (word >> (8 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint8_t rol8(uint8_t word, unsigned int shift)
-  */
- static inline uint8_t ror8(uint8_t word, unsigned int shift)
- {
--    return (word >> shift) | (word << ((8 - shift) & 7));
-+    shift &= 7;
-+    return (word >> shift) | (word << (8 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint8_t ror8(uint8_t word, unsigned int shift)
-  */
- static inline uint16_t rol16(uint16_t word, unsigned int shift)
- {
--    return (word << shift) | (word >> ((16 - shift) & 15));
-+    shift &= 15;
-+    return (word << shift) | (word >> (16 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint16_t rol16(uint16_t word, unsigned int shift)
-  */
- static inline uint16_t ror16(uint16_t word, unsigned int shift)
- {
--    return (word >> shift) | (word << ((16 - shift) & 15));
-+    shift &= 15;
-+    return (word >> shift) | (word << (16 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint16_t ror16(uint16_t word, unsigned int shift)
-  */
- static inline uint32_t rol32(uint32_t word, unsigned int shift)
- {
--    return (word << shift) | (word >> ((32 - shift) & 31));
-+    shift &= 31;
-+    return (word << shift) | (word >> (32 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint32_t rol32(uint32_t word, unsigned int shift)
-  */
- static inline uint32_t ror32(uint32_t word, unsigned int shift)
- {
--    return (word >> shift) | (word << ((32 - shift) & 31));
-+    shift &= 31;
-+    return (word >> shift) | (word << (32 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint32_t ror32(uint32_t word, unsigned int shift)
-  */
- static inline uint64_t rol64(uint64_t word, unsigned int shift)
- {
--    return (word << shift) | (word >> ((64 - shift) & 63));
-+    shift &= 63;
-+    return (word << shift) | (word >> (64 - shift));
- }
- /**
-@@ -XXX,XX +XXX,XX @@ static inline uint64_t rol64(uint64_t word, unsigned int shift)
-  */
- static inline uint64_t ror64(uint64_t word, unsigned int shift)
- {
--    return (word >> shift) | (word << ((64 - shift) & 63));
-+    shift &= 63;
-+    return (word >> shift) | (word << (64 - shift));
- }
- /**
---
-.34.1

-[PULL 04/12] qemu/host-utils.h: Add clz and ctz functions for lower-bit integers
+Deleted patch
-From: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
-This is for use in the RISC-V vclz and vctz instructions (implemented in
-a subsequent commit).
-Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++++++++
-file changed, 54 insertions(+)
-diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/host-utils.h
-+++ b/include/qemu/host-utils.h
-@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
- }
- #endif
-+/**
-+ * clz8 - count leading zeros in a 8-bit value.
-+ * @val: The value to search
-+ *
-+ * Returns 8 if the value is zero.  Note that the GCC builtin is
-+ * undefined if the value is zero.
-+ *
-+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
-+ * so this function subtracts off the number of prepended zeroes.
-+ */
-+static inline int clz8(uint8_t val)
-+{
-+    return val ? __builtin_clz(val) - 24 : 8;
-+}
-+
-+/**
-+ * clz16 - count leading zeros in a 16-bit value.
-+ * @val: The value to search
-+ *
-+ * Returns 16 if the value is zero.  Note that the GCC builtin is
-+ * undefined if the value is zero.
-+ *
-+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
-+ * so this function subtracts off the number of prepended zeroes.
-+ */
-+static inline int clz16(uint16_t val)
-+{
-+    return val ? __builtin_clz(val) - 16 : 16;
-+}
-+
- /**
-  * clz32 - count leading zeros in a 32-bit value.
-  * @val: The value to search
-@@ -XXX,XX +XXX,XX @@ static inline int clo64(uint64_t val)
-     return clz64(~val);
- }
-+/**
-+ * ctz8 - count trailing zeros in a 8-bit value.
-+ * @val: The value to search
-+ *
-+ * Returns 8 if the value is zero.  Note that the GCC builtin is
-+ * undefined if the value is zero.
-+ */
-+static inline int ctz8(uint8_t val)
-+{
-+    return val ? __builtin_ctz(val) : 8;
-+}
-+
-+/**
-+ * ctz16 - count trailing zeros in a 16-bit value.
-+ * @val: The value to search
-+ *
-+ * Returns 16 if the value is zero.  Note that the GCC builtin is
-+ * undefined if the value is zero.
-+ */
-+static inline int ctz16(uint16_t val)
-+{
-+    return val ? __builtin_ctz(val) : 16;
-+}
-+
- /**
-  * ctz32 - count trailing zeros in a 32-bit value.
-  * @val: The value to search
---
-.34.1

-[PULL 05/12] tcg: Add tcg_gen_gvec_andcs
+Deleted patch
-From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
-Add tcg expander and helper functions for and-complement
-vector with scalar operand.
-Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
-Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
-[rth: Split out of larger patch.]
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- accel/tcg/tcg-runtime.h      |  1 +
- include/tcg/tcg-op-gvec.h    |  2 ++
- accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++
- tcg/tcg-op-gvec.c            | 17 +++++++++++++++++
-files changed, 31 insertions(+)
-diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-runtime.h
-+++ b/accel/tcg/tcg-runtime.h
-@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
- DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
- DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
-+DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
- DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
- DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
-diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op-gvec.h
-+++ b/include/tcg/tcg-op-gvec.h
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
- void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
- void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
-diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-runtime-gvec.c
-+++ b/accel/tcg/tcg-runtime-gvec.c
-@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
-     clear_high(d, oprsz, desc);
- }
-+void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
-+{
-+    intptr_t oprsz = simd_oprsz(desc);
-+    intptr_t i;
-+
-+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
-+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
-+    }
-+    clear_high(d, oprsz, desc);
-+}
-+
- void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
- {
-     intptr_t oprsz = simd_oprsz(desc);
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-op-gvec.c
-+++ b/tcg/tcg-op-gvec.c
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
-     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
- }
-+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
-+{
-+    static GVecGen2s g = {
-+        .fni8 = tcg_gen_andc_i64,
-+        .fniv = tcg_gen_andc_vec,
-+        .fno = gen_helper_gvec_andcs,
-+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
-+        .vece = MO_64
-+    };
-+
-+    TCGv_i64 tmp = tcg_temp_ebb_new_i64();
-+    tcg_gen_dup_i64(vece, tmp, c);
-+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g);
-+    tcg_temp_free_i64(tmp);
-+}
-+
- static const GVecGen2s gop_xors = {
-     .fni8 = tcg_gen_xor_i64,
-     .fniv = tcg_gen_xor_vec,
---
-.34.1

-[PULL 06/12] tcg: Add tcg_gen_gvec_rotrs
+Deleted patch
-From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
-Add tcg expander and helper functions for rotate right
-vector with scalar operand.
-Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
-Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
-[rth: Split out of larger patch; mask rotation count.]
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- include/tcg/tcg-op-gvec.h |  2 ++
- tcg/tcg-op-gvec.c         | 11 +++++++++++
-files changed, 13 insertions(+)
-diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op-gvec.h
-+++ b/include/tcg/tcg-op-gvec.h
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
- void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
-                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
- /*
-  * Perform vector shift by vector element, modulo the element size.
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-op-gvec.c
-+++ b/tcg/tcg-op-gvec.c
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
-     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
- }
-+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
-+{
-+    TCGv_i32 tmp = tcg_temp_ebb_new_i32();
-+
-+    tcg_gen_neg_i32(tmp, shift);
-+    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
-+    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
-+    tcg_temp_free_i32(tmp);
-+}
-+
- /*
-  * Expand D = A << (B % element bits)
-  *
---
-.34.1

-[PULL 07/12] qemu/int128: Re-shuffle Int128Alias members
+Deleted patch
-Clang 14, with --enable-tcg-interpreter errors with
-include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
-  does not match the alignment of the first field in transparent union;
-  transparent_union attribute ignored [-Werror,-Wignored-attributes]
-    __int128_t i;
-               ^
-include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
-    Int128 s;
-           ^
-error generated.
-By placing the __uint128_t member first, this is avoided.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org>
----
- include/qemu/int128.h | 4 ++--
-file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/include/qemu/int128.h b/include/qemu/int128.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/int128.h
-+++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@ static inline void bswap128s(Int128 *s)
-  */
- #ifdef CONFIG_INT128
- typedef union {
--    Int128 s;
--    __int128_t i;
-     __uint128_t u;
-+    __int128_t i;
-+    Int128 s;
- } Int128Alias __attribute__((transparent_union));
- #else
- typedef Int128 Int128Alias;
---
-.34.1

-[PULL 08/12] migration/xbzrle: Use __attribute__((target)) for avx512
+Deleted patch
-Use the attribute, which is supported by clang, instead of
-the #pragma, which is not supported and, for some reason,
-also not detected by the meson probe, so we fail by -Werror.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Juan Quintela <quintela@redhat.com>
-Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org>
----
- meson.build        | 5 +----
- migration/xbzrle.c | 9 ++++-----
-files changed, 5 insertions(+), 9 deletions(-)
-diff --git a/meson.build b/meson.build
-index XXXXXXX..XXXXXXX 100644
---- a/meson.build
-+++ b/meson.build
-@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
- config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
-   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
-   .require(cc.links('''
--    #pragma GCC push_options
--    #pragma GCC target("avx512bw")
-     #include <cpuid.h>
-     #include <immintrin.h>
--    static int bar(void *a) {
--
-+    static int __attribute__((target("avx512bw"))) bar(void *a) {
-       __m512i *x = a;
-       __m512i res= _mm512_abs_epi8(*x);
-       return res[1];
-diff --git a/migration/xbzrle.c b/migration/xbzrle.c
-index XXXXXXX..XXXXXXX 100644
---- a/migration/xbzrle.c
-+++ b/migration/xbzrle.c
-@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
- }
- #if defined(CONFIG_AVX512BW_OPT)
--#pragma GCC push_options
--#pragma GCC target("avx512bw")
- #include <immintrin.h>
--int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
--                             uint8_t *dst, int dlen)
-+
-+int __attribute__((target("avx512bw")))
-+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
-+                            uint8_t *dst, int dlen)
- {
-     uint32_t zrun_len = 0, nzrun_len = 0;
-     int d = 0, i = 0, num = 0;
-@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
-     }
-     return d;
- }
--#pragma GCC pop_options
- #endif
---
-.34.1

-[PULL 09/12] accel/tcg: Add cpu_ld*_code_mmu
+Deleted patch
-At least RISC-V has the need to be able to perform a read
-using execute permissions, outside of translation.
-Add helpers to facilitate this.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Acked-by: Alistair Francis <alistair.francis@wdc.com>
-Reviewed-by: Weiwei Li <liweiwei@iscas.ac.cn>
-Tested-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
-Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org>
-Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org>
----
- include/exec/cpu_ldst.h |  9 +++++++
- accel/tcg/cputlb.c      | 48 ++++++++++++++++++++++++++++++++++
- accel/tcg/user-exec.c   | 58 +++++++++++++++++++++++++++++++++++++++++
-files changed, 115 insertions(+)
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
-+++ b/include/exec/cpu_ldst.h
-@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
- # define cpu_stq_mmu          cpu_stq_le_mmu
- #endif
-+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
-+                         MemOpIdx oi, uintptr_t ra);
-+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra);
-+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra);
-+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra);
-+
- uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
- uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
- uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cputlb.c
-+++ b/accel/tcg/cputlb.c
-@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
-     MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true));
-     return full_ldq_code(env, addr, oi, 0);
- }
-+
-+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
-+                         MemOpIdx oi, uintptr_t retaddr)
-+{
-+    return full_ldub_code(env, addr, oi, retaddr);
-+}
-+
-+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t retaddr)
-+{
-+    MemOp mop = get_memop(oi);
-+    int idx = get_mmuidx(oi);
-+    uint16_t ret;
-+
-+    ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr);
-+    if ((mop & MO_BSWAP) != MO_TE) {
-+        ret = bswap16(ret);
-+    }
-+    return ret;
-+}
-+
-+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t retaddr)
-+{
-+    MemOp mop = get_memop(oi);
-+    int idx = get_mmuidx(oi);
-+    uint32_t ret;
-+
-+    ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr);
-+    if ((mop & MO_BSWAP) != MO_TE) {
-+        ret = bswap32(ret);
-+    }
-+    return ret;
-+}
-+
-+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t retaddr)
-+{
-+    MemOp mop = get_memop(oi);
-+    int idx = get_mmuidx(oi);
-+    uint64_t ret;
-+
-+    ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr);
-+    if ((mop & MO_BSWAP) != MO_TE) {
-+        ret = bswap64(ret);
-+    }
-+    return ret;
-+}
-diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/user-exec.c
-+++ b/accel/tcg/user-exec.c
-@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
-     return ret;
- }
-+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
-+                         MemOpIdx oi, uintptr_t ra)
-+{
-+    void *haddr;
-+    uint8_t ret;
-+
-+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
-+    ret = ldub_p(haddr);
-+    clear_helper_retaddr();
-+    return ret;
-+}
-+
-+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra)
-+{
-+    void *haddr;
-+    uint16_t ret;
-+
-+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
-+    ret = lduw_p(haddr);
-+    clear_helper_retaddr();
-+    if (get_memop(oi) & MO_BSWAP) {
-+        ret = bswap16(ret);
-+    }
-+    return ret;
-+}
-+
-+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra)
-+{
-+    void *haddr;
-+    uint32_t ret;
-+
-+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
-+    ret = ldl_p(haddr);
-+    clear_helper_retaddr();
-+    if (get_memop(oi) & MO_BSWAP) {
-+        ret = bswap32(ret);
-+    }
-+    return ret;
-+}
-+
-+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
-+                          MemOpIdx oi, uintptr_t ra)
-+{
-+    void *haddr;
-+    uint64_t ret;
-+
-+    validate_memop(oi, MO_BEUQ);
-+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-+    ret = ldq_p(haddr);
-+    clear_helper_retaddr();
-+    if (get_memop(oi) & MO_BSWAP) {
-+        ret = bswap64(ret);
-+    }
-+    return ret;
-+}
-+
- #include "ldst_common.c.inc"
- /*
---
-.34.1

The following changes since commit 7c18f2d663521f1b31b821a13358ce38075eaf7d:

Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2023-04-29 23:07:17 +0100)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230502

for you to fetch changes up to bdc7fba1c5a29ae218b45353daac9308fe1aae82:

tcg: Introduce tcg_out_movext2 (2023-05-02 12:15:41 +0100)

----------------------------------------------------------------
Misc tcg-related patch queue.

----------------------------------------------------------------
Dickon Hood (1):
      qemu/bitops.h: Limit rotate amounts

Kiran Ostrolenk (1):
      qemu/host-utils.h: Add clz and ctz functions for lower-bit integers

Nazar Kazakov (2):
      tcg: Add tcg_gen_gvec_andcs
      tcg: Add tcg_gen_gvec_rotrs

Richard Henderson (7):
      softmmu: Tidy dirtylimit_dirty_ring_full_time
      qemu/int128: Re-shuffle Int128Alias members
      migration/xbzrle: Use __attribute__((target)) for avx512
      accel/tcg: Add cpu_ld*_code_mmu
      tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
      tcg/mips: Conditionalize tcg_out_exts_i32_i64
      tcg: Introduce tcg_out_movext2

Weiwei Li (1):
      accel/tcg: Uncache the host address for instruction fetch when tlb size < 1

Drop inline marker: let compiler decide.

Change return type to uint64_t: this matches the computation in the
return statement and the local variable assignment in the caller.

Rename local to dirty_ring_size_MB to fix typo.
Simplify conversion to MiB via qemu_target_page_bits and right shift.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/dirtylimit.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -XXX,XX +XXX,XX @@ bool dirtylimit_vcpu_index_valid(int cpu_index)
              cpu_index >= ms->smp.max_cpus);
 }
 
-static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
 {
     static uint64_t max_dirtyrate;
-    uint32_t dirty_ring_size = kvm_dirty_ring_size();
-    uint64_t dirty_ring_size_meory_MB =
-        dirty_ring_size * qemu_target_page_size() >> 20;
+    unsigned target_page_bits = qemu_target_page_bits();
+    uint64_t dirty_ring_size_MB;
+
+    /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
+    assert(target_page_bits < 20);
+
+    /* Convert ring size (pages) to MiB (2**20). */
+    dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits);
 
     if (max_dirtyrate < dirtyrate) {
         max_dirtyrate = dirtyrate;
     }
 
-    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+    return dirty_ring_size_MB * 1000000 / max_dirtyrate;
 }
 
 static inline bool dirtylimit_done(uint64_t quota,
-- 
2.34.1

From: Weiwei Li <liweiwei@iscas.ac.cn>

When a PMP entry overlaps part of the page, we'll set the tlb_size to 1, which
will make the address in the tlb entry set with TLB_INVALID_MASK, and the next
access will again go through tlb_fill. However, this way will not work in
tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be
cached, and the following instructions can use this host address directly,
which may lead to the bypass of the PMP-related check.
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542.

Signed-off-by: Weiwei Li <liweiwei@iscas.ac.cn>
Signed-off-by: Junqiang Wang <wangjunqiang@iscas.ac.cn>
Reviewed-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn>
---
 accel/tcg/cputlb.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
     if (p == NULL) {
         return -1;
     }
+
+    if (full->lg_page_size < TARGET_PAGE_BITS) {
+        return -1;
+    }
+
     if (hostp) {
         *hostp = p;
     }
-- 
2.34.1

From: Dickon Hood <dickon.hood@codethink.co.uk>

Rotates have been fixed up to only allow for reasonable rotate amounts
(i.e. no rotates >7 on an 8-bit value, etc.).  This fixes a problem with
RISC-V vector rotate instructions.

Signed-off-by: Dickon Hood <dickon.hood@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/bitops.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr,
  */
 static inline uint8_t rol8(uint8_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((8 - shift) & 7));
+    shift &= 7;
+    return (word << shift) | (word >> (8 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t rol8(uint8_t word, unsigned int shift)
  */
 static inline uint8_t ror8(uint8_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((8 - shift) & 7));
+    shift &= 7;
+    return (word >> shift) | (word << (8 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t ror8(uint8_t word, unsigned int shift)
  */
 static inline uint16_t rol16(uint16_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((16 - shift) & 15));
+    shift &= 15;
+    return (word << shift) | (word >> (16 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t rol16(uint16_t word, unsigned int shift)
  */
 static inline uint16_t ror16(uint16_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((16 - shift) & 15));
+    shift &= 15;
+    return (word >> shift) | (word << (16 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t ror16(uint16_t word, unsigned int shift)
  */
 static inline uint32_t rol32(uint32_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((32 - shift) & 31));
+    shift &= 31;
+    return (word << shift) | (word >> (32 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t rol32(uint32_t word, unsigned int shift)
  */
 static inline uint32_t ror32(uint32_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((32 - shift) & 31));
+    shift &= 31;
+    return (word >> shift) | (word << (32 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t ror32(uint32_t word, unsigned int shift)
  */
 static inline uint64_t rol64(uint64_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((64 - shift) & 63));
+    shift &= 63;
+    return (word << shift) | (word >> (64 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint64_t rol64(uint64_t word, unsigned int shift)
  */
 static inline uint64_t ror64(uint64_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((64 - shift) & 63));
+    shift &= 63;
+    return (word >> shift) | (word << (64 - shift));
 }
 
 /**
-- 
2.34.1

From: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>

This is for use in the RISC-V vclz and vctz instructions (implemented in
a subsequent commit).

Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 }
 #endif
 
+/**
+ * clz8 - count leading zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz8(uint8_t val)
+{
+    return val ? __builtin_clz(val) - 24 : 8;
+}
+
+/**
+ * clz16 - count leading zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz16(uint16_t val)
+{
+    return val ? __builtin_clz(val) - 16 : 16;
+}
+
 /**
  * clz32 - count leading zeros in a 32-bit value.
  * @val: The value to search
@@ -XXX,XX +XXX,XX @@ static inline int clo64(uint64_t val)
     return clz64(~val);
 }
 
+/**
+ * ctz8 - count trailing zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz8(uint8_t val)
+{
+    return val ? __builtin_ctz(val) : 8;
+}
+
+/**
+ * ctz16 - count trailing zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz16(uint16_t val)
+{
+    return val ? __builtin_ctz(val) : 16;
+}
+
 /**
  * ctz32 - count trailing zeros in a 32-bit value.
  * @val: The value to search
-- 
2.34.1

From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for and-complement
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime.h      |  1 +
 include/tcg/tcg-op-gvec.h    |  2 ++
 accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++
 tcg/tcg-op-gvec.c            | 17 +++++++++++++++++
 4 files changed, 31 insertions(+)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }
 
+void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
+    }
+    clear_high(d, oprsz, desc);
+}
+
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
 }
 
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+    static GVecGen2s g = {
+        .fni8 = tcg_gen_andc_i64,
+        .fniv = tcg_gen_andc_vec,
+        .fno = gen_helper_gvec_andcs,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .vece = MO_64
+    };
+
+    TCGv_i64 tmp = tcg_temp_ebb_new_i64();
+    tcg_gen_dup_i64(vece, tmp, c);
+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g);
+    tcg_temp_free_i64(tmp);
+}
+
 static const GVecGen2s gop_xors = {
     .fni8 = tcg_gen_xor_i64,
     .fniv = tcg_gen_xor_vec,
-- 
2.34.1

From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for rotate right
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch; mask rotation count.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c         | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 
 /*
  * Perform vector shift by vector element, modulo the element size.
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
 }
 
+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+    TCGv_i32 tmp = tcg_temp_ebb_new_i32();
+
+    tcg_gen_neg_i32(tmp, shift);
+    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
+    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
+    tcg_temp_free_i32(tmp);
+}
+
 /*
  * Expand D = A << (B % element bits)
  *
-- 
2.34.1

Clang 14, with --enable-tcg-interpreter errors with

include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
  does not match the alignment of the first field in transparent union;
  transparent_union attribute ignored [-Werror,-Wignored-attributes]
    __int128_t i;
               ^
include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
    Int128 s;
           ^
1 error generated.

By placing the __uint128_t member first, this is avoided.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org>
---
 include/qemu/int128.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline void bswap128s(Int128 *s)
  */
 #ifdef CONFIG_INT128
 typedef union {
-    Int128 s;
-    __int128_t i;
     __uint128_t u;
+    __int128_t i;
+    Int128 s;
 } Int128Alias __attribute__((transparent_union));
 #else
 typedef Int128 Int128Alias;
-- 
2.34.1

Use the attribute, which is supported by clang, instead of
the #pragma, which is not supported and, for some reason,
also not detected by the meson probe, so we fail by -Werror.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org>
---
 meson.build        | 5 +----
 migration/xbzrle.c | 9 ++++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
 config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
   .require(cc.links('''
-    #pragma GCC push_options
-    #pragma GCC target("avx512bw")
     #include <cpuid.h>
     #include <immintrin.h>
-    static int bar(void *a) {
-
+    static int __attribute__((target("avx512bw"))) bar(void *a) {
       __m512i *x = a;
       __m512i res= _mm512_abs_epi8(*x);
       return res[1];
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 }
 
 #if defined(CONFIG_AVX512BW_OPT)
-#pragma GCC push_options
-#pragma GCC target("avx512bw")
 #include <immintrin.h>
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
-                             uint8_t *dst, int dlen)
+
+int __attribute__((target("avx512bw")))
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                            uint8_t *dst, int dlen)
 {
     uint32_t zrun_len = 0, nzrun_len = 0;
     int d = 0, i = 0, num = 0;
@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
     }
     return d;
 }
-#pragma GCC pop_options
 #endif
-- 
2.34.1

At least RISC-V has the need to be able to perform a read
using execute permissions, outside of translation.
Add helpers to facilitate this.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Weiwei Li <liweiwei@iscas.ac.cn>
Tested-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org>
Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h |  9 +++++++
 accel/tcg/cputlb.c      | 48 ++++++++++++++++++++++++++++++++++
 accel/tcg/user-exec.c   | 58 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
 # define cpu_stq_mmu          cpu_stq_le_mmu
 #endif
 
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra);
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
     MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true));
     return full_ldq_code(env, addr, oi, 0);
 }
+
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t retaddr)
+{
+    return full_ldub_code(env, addr, oi, retaddr);
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint16_t ret;
+
+    ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint32_t ret;
+
+    ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint64_t ret;
+
+    ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
     return ret;
 }
 
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint8_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldub_p(haddr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint16_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = lduw_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint32_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldl_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint64_t ret;
+
+    /* Look up as an instruction fetch, like the b/w/l variants. */
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldq_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
+
 #include "ldst_common.c.inc"
 
 /*
-- 
2.34.1

This is common code in most qemu_{ld,st} slow paths, moving two
registers when there may be overlap between sources and destinations.
At present, this is only used by 32-bit hosts for 64-bit data,
but will shortly be used for more than that.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                 | 69 ++++++++++++++++++++++++++++++++++++---
 tcg/arm/tcg-target.c.inc  | 44 ++++++++++---------------
 tcg/i386/tcg-target.c.inc | 19 +++++------
 3 files changed, 90 insertions(+), 42 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
-static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
-    __attribute__((unused));
+static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2);
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
 static void tcg_out_goto_tb(TCGContext *s, int which);
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
     siglongjmp(s->jmp_trans, -2);
 }
 
+typedef struct TCGMovExtend {
+    TCGReg dst;
+    TCGReg src;
+    TCGType dst_type;
+    TCGType src_type;
+    MemOp src_ext;
+} TCGMovExtend;
+
 /**
  * tcg_out_movext -- move and extend
  * @s: tcg context
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
  *
  * Move or extend @src into @dst, depending on @src_ext and the types.
  */
-static void __attribute__((unused))
-tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
-               TCGType src_type, MemOp src_ext, TCGReg src)
+static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
+                           TCGType src_type, MemOp src_ext, TCGReg src)
 {
     switch (src_ext) {
     case MO_UB:
@@ -XXX,XX +XXX,XX @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
     }
 }
 
+/* Minor variations on a theme, using a structure. */
+static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i,
+                                    TCGReg src)
+{
+    tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src);
+}
+
+static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i)
+{
+    tcg_out_movext1_new_src(s, i, i->src);
+}
+
+/**
+ * tcg_out_movext2 -- move and extend two pairs
+ * @s: tcg context
+ * @i1: first move description
+ * @i2: second move description
+ * @scratch: temporary register, or -1 for none
+ *
+ * As tcg_out_movext, for both @i1 and @i2, caring for overlap
+ * between the sources and destinations.
+ */
+
+static void __attribute__((unused))
+tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
+                const TCGMovExtend *i2, int scratch)
+{
+    TCGReg src1 = i1->src;
+    TCGReg src2 = i2->src;
+
+    if (i1->dst != src2) {
+        tcg_out_movext1(s, i1);
+        tcg_out_movext1(s, i2);
+        return;
+    }
+    if (i2->dst == src1) {
+        TCGType src1_type = i1->src_type;
+        TCGType src2_type = i2->src_type;
+
+        if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) {
+            /* The data is now in the correct registers, now extend. */
+            src1 = i2->src;
+            src2 = i1->src;
+        } else {
+            tcg_debug_assert(scratch >= 0);
+            tcg_out_mov(s, src1_type, scratch, src1);
+            src1 = scratch;
+        }
+    }
+    tcg_out_movext1_new_src(s, i2, src2);
+    tcg_out_movext1_new_src(s, i1, src1);
+}
+
 #define C_PFX1(P, A)                    P##A
 #define C_PFX2(P, A, B)                 P##A##_##B
 #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
 
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, datalo, datahi;
+    TCGReg argreg;
     MemOpIdx oi = lb->oi;
     MemOp opc = get_memop(oi);
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     /* Use the canonical unsigned helpers and minimize icache usage. */
     tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
 
-    datalo = lb->datalo_reg;
-    datahi = lb->datahi_reg;
     if ((opc & MO_SIZE) == MO_64) {
-        if (datalo != TCG_REG_R1) {
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-        } else if (datahi != TCG_REG_R0) {
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-        } else {
-            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
-        tcg_out_movext(s, TCG_TYPE_I32, datalo,
+        tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg,
                        TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0);
     }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
 
     if (TARGET_LONG_BITS == 64) {
         /* 64-bit target address is aligned into R2:R3. */
-        if (l->addrhi_reg != TCG_REG_R2) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-        } else if (l->addrlo_reg != TCG_REG_R3) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
+              .src = l->addrlo_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
+              .src = l->addrhi_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
         tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
     }
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
     MemOpIdx oi = l->oi;
     MemOp opc = get_memop(oi);
-    TCGReg data_reg;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
     /* resolve label address */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
-    data_reg = l->datalo_reg;
     if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-        if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], -1);
     } else {
-        tcg_out_movext(s, l->type, data_reg,
+        tcg_out_movext(s, l->type, l->datalo_reg,
                        TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
     }
 
-- 
2.34.1

Pretty small still, but there are two patches that ought
to get backported to stable, so no point in delaying.

The following changes since commit a5ba0a7e4e150d1350a041f0d0ef9ca6c8d7c307:

Merge tag 'pull-aspeed-20241211' of https://github.com/legoater/qemu into staging (2024-12-11 15:16:47 +0000)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241212

for you to fetch changes up to 7ac87b14a92234b6a89b701b4043ad6cf8bdcccf:

target/sparc: Use memcpy() and remove memcpy32() (2024-12-12 14:28:38 -0600)

----------------------------------------------------------------
tcg: Reset free_temps before tcg_optimize
tcg/riscv: Fix StoreStore barrier generation
include/exec: Introduce fpst alias in helper-head.h.inc
target/sparc: Use memcpy() and remove memcpy32()

----------------------------------------------------------------
Philippe Mathieu-Daudé (1):
      target/sparc: Use memcpy() and remove memcpy32()

Richard Henderson (2):
      tcg: Reset free_temps before tcg_optimize
      include/exec: Introduce fpst alias in helper-head.h.inc

Roman Artemev (1):
      tcg/riscv: Fix StoreStore barrier generation

When allocating new temps during tcg_optimize, do not re-use
any EBB temps that were used within the TB.  We do not know
over what span of the TB each such temp was live.

Introduce tcg_temp_ebb_reset_freed and use before tcg_optimize,
as well as replacing the equivalent in plugin_gen_inject and
tcg_func_start.

Cc: qemu-stable@nongnu.org
Fixes: fb04ab7ddd8 ("tcg/optimize: Lower TCG_COND_TST{EQ,NE} if unsupported")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2711
Reported-by: wannacu <wannacu2049@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 include/tcg/tcg-temp-internal.h | 6 ++++++
 accel/tcg/plugin-gen.c          | 2 +-
 tcg/tcg.c                       | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/tcg/tcg-temp-internal.h b/include/tcg/tcg-temp-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-temp-internal.h
+++ b/include/tcg/tcg-temp-internal.h
@@ -XXX,XX +XXX,XX @@ TCGv_i64 tcg_temp_ebb_new_i64(void);
 TCGv_ptr tcg_temp_ebb_new_ptr(void);
 TCGv_i128 tcg_temp_ebb_new_i128(void);
 
+/* Forget all freed EBB temps, so that new allocations produce new temps. */
+static inline void tcg_temp_ebb_reset_freed(TCGContext *s)
+{
+    memset(s->free_temps, 0, sizeof(s->free_temps));
+}
+
 #endif /* TCG_TEMP_FREE_H */
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@ static void plugin_gen_inject(struct qemu_plugin_tb *plugin_tb)
      * that might be live within the existing opcode stream.
      * The simplest solution is to release them all and create new.
      */
-    memset(tcg_ctx->free_temps, 0, sizeof(tcg_ctx->free_temps));
+    tcg_temp_ebb_reset_freed(tcg_ctx);
 
     QTAILQ_FOREACH_SAFE(op, &tcg_ctx->ops, link, next) {
         switch (op->opc) {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
     s->nb_temps = s->nb_globals;
 
     /* No temps have been previously allocated for size or locality.  */
-    memset(s->free_temps, 0, sizeof(s->free_temps));
+    tcg_temp_ebb_reset_freed(s);
 
     /* No constant temps have been previously allocated. */
     for (int i = 0; i < TCG_TYPE_COUNT; ++i) {
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     }
 #endif
 
+    /* Do not reuse any EBB temp that may be allocated within the TB. */
+    tcg_temp_ebb_reset_freed(s);
+
     tcg_optimize(s);
 
     reachable_code_pass(s);
-- 
2.43.0

From: Roman Artemev <roman.artemev@syntacore.com>

On RISC-V, a StoreStore barrier corresponds to
`fence w, w`, not `fence r, r`.

Cc: qemu-stable@nongnu.org
Fixes: efbea94c76b ("tcg/riscv: Add slowpath load and store instructions")
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Denis Tomashev <denis.tomashev@syntacore.com>
Signed-off-by: Roman Artemev <roman.artemev@syntacore.com>
Message-ID: <e2f2131e294a49e79959d4fa9ec02cf4@syntacore.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
         insn |= 0x02100000;
     }
     if (a0 & TCG_MO_ST_ST) {
-        insn |= 0x02200000;
+        insn |= 0x01100000;
     }
     tcg_out32(s, insn);
 }
-- 
2.43.0

This allows targets to declare that the helper requires a
float_status pointer instead of a generic void pointer.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-head.h.inc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/exec/helper-head.h.inc b/include/exec/helper-head.h.inc
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h.inc
+++ b/include/exec/helper-head.h.inc
@@ -XXX,XX +XXX,XX @@
 #define dh_alias_ptr ptr
 #define dh_alias_cptr ptr
 #define dh_alias_env ptr
+#define dh_alias_fpst ptr
 #define dh_alias_void void
 #define dh_alias_noreturn noreturn
 #define dh_alias(t) glue(dh_alias_, t)
@@ -XXX,XX +XXX,XX @@
 #define dh_ctype_ptr void *
 #define dh_ctype_cptr const void *
 #define dh_ctype_env CPUArchState *
+#define dh_ctype_fpst float_status *
 #define dh_ctype_void void
 #define dh_ctype_noreturn G_NORETURN void
 #define dh_ctype(t) dh_ctype_##t
@@ -XXX,XX +XXX,XX @@
 #define dh_typecode_f64 dh_typecode_i64
 #define dh_typecode_cptr dh_typecode_ptr
 #define dh_typecode_env dh_typecode_ptr
+#define dh_typecode_fpst dh_typecode_ptr
 #define dh_typecode(t) dh_typecode_##t
 
 #define dh_callflag_i32  0
-- 
2.43.0

From: Philippe Mathieu-Daudé <philmd@linaro.org>

Rather than manually copying each register, use
the libc memcpy(), which is well optimized nowadays.

Suggested-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20241205205418.67613-1-philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sparc/win_helper.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/target/sparc/win_helper.c b/target/sparc/win_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/win_helper.c
+++ b/target/sparc/win_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "trace.h"
 
-static inline void memcpy32(target_ulong *dst, const target_ulong *src)
-{
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-}
-
 void cpu_set_cwp(CPUSPARCState *env, int new_cwp)
 {
     /* put the modified wrap registers at their proper location */
     if (env->cwp == env->nwindows - 1) {
-        memcpy32(env->regbase, env->regbase + env->nwindows * 16);
+        memcpy(env->regbase, env->regbase + env->nwindows * 16,
+               sizeof(env->gregs));
     }
     env->cwp = new_cwp;
 
     /* put the wrap registers at their temporary location */
     if (new_cwp == env->nwindows - 1) {
-        memcpy32(env->regbase + env->nwindows * 16, env->regbase);
+        memcpy(env->regbase + env->nwindows * 16, env->regbase,
+               sizeof(env->gregs));
     }
     env->regwptr = env->regbase + (new_cwp * 16);
 }
@@ -XXX,XX +XXX,XX @@ void cpu_gl_switch_gregs(CPUSPARCState *env, uint32_t new_gl)
     dst = get_gl_gregset(env, env->gl);
 
     if (src != dst) {
-        memcpy32(dst, env->gregs);
-        memcpy32(env->gregs, src);
+        memcpy(dst, env->gregs, sizeof(env->gregs));
+        memcpy(env->gregs, src, sizeof(env->gregs));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void cpu_change_pstate(CPUSPARCState *env, uint32_t new_pstate)
         /* Switch global register bank */
         src = get_gregset(env, new_pstate_regs);
         dst = get_gregset(env, pstate_regs);
-        memcpy32(dst, env->gregs);
-        memcpy32(env->gregs, src);
+        memcpy(dst, env->gregs, sizeof(env->gregs));
+        memcpy(env->gregs, src, sizeof(env->gregs));
     } else {
         trace_win_helper_no_switch_pstate(new_pstate_regs);
     }
-- 
2.43.0