Series comparison

-[PULL 0/4] tcg patch queue
+[PULL v2 00/39] tcg patch queue
-Pretty small still, but there are two patches that ought
+v2: tcg/loongarch64 patch set without last minute tweaks.
 to get backported to stable, so no point in delaying.
 r~
-The following changes since commit a5ba0a7e4e150d1350a041f0d0ef9ca6c8d7c307:
+The following changes since commit 005ad32358f12fe9313a4a01918a55e60d4f39e5:
-  Merge tag 'pull-aspeed-20241211' of https://github.com/legoater/qemu into staging (2024-12-11 15:16:47 +0000)
+  Merge tag 'pull-tpm-2023-09-12-3' of https://github.com/stefanberger/qemu-tpm into staging (2023-09-13 13:41:57 -0400)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241212
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230915-2
-for you to fetch changes up to 7ac87b14a92234b6a89b701b4043ad6cf8bdcccf:
+for you to fetch changes up to a97a83753c90d79ed15a716610af23fabd84aaed:
-  target/sparc: Use memcpy() and remove memcpy32() (2024-12-12 14:28:38 -0600)
+  tcg: Map code_gen_buffer with PROT_BTI (2023-09-16 14:57:16 +0000)
 ----------------------------------------------------------------
-tcg: Reset free_temps before tcg_optimize
+*: Delete checks for old host definitions
-tcg/riscv: Fix StoreStore barrier generation
+tcg/loongarch64: Generate LSX instructions
-include/exec: Introduce fpst alias in helper-head.h.inc
+fpu: Add conversions between bfloat16 and [u]int8
-target/sparc: Use memcpy() and remove memcpy32()
+fpu: Handle m68k extended precision denormals properly
 accel/tcg: Improve cputlb i/o organization
 accel/tcg: Simplify tlb_plugin_lookup
 accel/tcg: Remove false-negative halted assertion
 tcg: Add gvec compare with immediate and scalar operand
 tcg/aarch64: Emit BTI insns at jump landing pads
 ----------------------------------------------------------------
-Philippe Mathieu-Daudé (1):
+Akihiko Odaki (3):
-      target/sparc: Use memcpy() and remove memcpy32()
+      util: Delete checks for old host definitions
       softmmu: Delete checks for old host definitions
       thunk: Delete checks for old host definitions
-Richard Henderson (2):
+Jiajie Chen (16):
-      tcg: Reset free_temps before tcg_optimize
+      tcg/loongarch64: Import LSX instructions
-      include/exec: Introduce fpst alias in helper-head.h.inc
+      tcg/loongarch64: Lower basic tcg vec ops to LSX
       tcg: pass vece to tcg_target_const_match()
       tcg/loongarch64: Lower cmp_vec to vseq/vsle/vslt
       tcg/loongarch64: Lower add/sub_vec to vadd/vsub
       tcg/loongarch64: Lower vector bitwise operations
       tcg/loongarch64: Lower neg_vec to vneg
       tcg/loongarch64: Lower mul_vec to vmul
       tcg/loongarch64: Lower vector min max ops
       tcg/loongarch64: Lower vector saturated ops
       tcg/loongarch64: Lower vector shift vector ops
       tcg/loongarch64: Lower bitsel_vec to vbitsel
       tcg/loongarch64: Lower vector shift integer ops
       tcg/loongarch64: Lower rotv_vec ops to LSX
       tcg/loongarch64: Lower rotli_vec to vrotri
       tcg/loongarch64: Implement 128-bit load & store
-Roman Artemev (1):
+LIU Zhiwei (2):
-      tcg/riscv: Fix StoreStore barrier generation
+      accel/tcg: Fix the comment for CPUTLBEntryFull
       fpu: Add conversions between bfloat16 and [u]int8
- include/tcg/tcg-temp-internal.h |  6 ++++++
+Nicholas Piggin (1):
- accel/tcg/plugin-gen.c          |  2 +-
+      accel/tcg: mttcg remove false-negative halted assertion
  target/sparc/win_helper.c       | 26 ++++++++------------------
  tcg/tcg.c                       |  5 ++++-
  include/exec/helper-head.h.inc  |  3 +++
  tcg/riscv/tcg-target.c.inc      |  2 +-
 files changed, 23 insertions(+), 21 deletions(-)
+Richard Henderson (17):
+      tcg: Add gvec compare with immediate and scalar operand
+      target/arm: Use tcg_gen_gvec_cmpi for compare vs 0
+      accel/tcg: Simplify tlb_plugin_lookup
+      accel/tcg: Split out io_prepare and io_failed
+      accel/tcg: Use CPUTLBEntryFull.phys_addr in io_failed
+      plugin: Simplify struct qemu_plugin_hwaddr
+      accel/tcg: Merge cpu_transaction_failed into io_failed
+      accel/tcg: Replace direct use of io_readx/io_writex in do_{ld,st}_1
+      accel/tcg: Merge io_readx into do_ld_mmio_beN
+      accel/tcg: Merge io_writex into do_st_mmio_leN
+      accel/tcg: Introduce do_ld16_mmio_beN
+      accel/tcg: Introduce do_st16_mmio_leN
+      fpu: Handle m68k extended precision denormals properly
+      tcg: Add tcg_out_tb_start backend hook
+      util/cpuinfo-aarch64: Add CPUINFO_BTI
+      tcg/aarch64: Emit BTI insns at jump landing pads
+      tcg: Map code_gen_buffer with PROT_BTI
+ accel/tcg/tcg-runtime.h              |   25 +
+ host/include/aarch64/host/cpuinfo.h  |    1 +
+ include/exec/cpu-defs.h              |   12 +-
+ include/exec/user/thunk.h            |    3 +-
+ include/fpu/softfloat.h              |   12 +
+ include/hw/core/cpu.h                |   13 -
+ include/qemu/plugin-memory.h         |   11 +-
+ include/qemu/typedefs.h              |    1 -
+ include/tcg/tcg-op-gvec-common.h     |    6 +
+ tcg/loongarch64/tcg-target-con-set.h |    9 +
+ tcg/loongarch64/tcg-target-con-str.h |    3 +
+ tcg/loongarch64/tcg-target.h         |   40 +-
+ tcg/loongarch64/tcg-target.opc.h     |   12 +
+ accel/tcg/cputlb.c                   |  437 ++-
+ accel/tcg/tcg-accel-ops-mttcg.c      |    9 +-
+ accel/tcg/tcg-runtime-gvec.c         |   26 +
+ fpu/softfloat.c                      |   67 +-
+ plugins/api.c                        |   27 +-
+ softmmu/async-teardown.c             |    3 -
+ target/arm/tcg/translate.c           |   56 +-
+ tcg/region.c                         |   41 +-
+ tcg/tcg-op-gvec.c                    |  149 +
+ tcg/tcg.c                            |    7 +-
+ tests/tcg/m68k/denormal.c            |   53 +
+ util/cpuinfo-aarch64.c               |    7 +
+ util/oslib-posix.c                   |   15 +-
+ fpu/softfloat-parts.c.inc            |    7 +-
+ tcg/aarch64/tcg-target.c.inc         |   59 +-
+ tcg/arm/tcg-target.c.inc             |    7 +-
+ tcg/i386/tcg-target.c.inc            |    7 +-
+ tcg/loongarch64/tcg-insn-defs.c.inc  | 6019 +++++++++++++++++++++++++++++++++-
+ tcg/loongarch64/tcg-target.c.inc     |  624 +++-
+ tcg/mips/tcg-target.c.inc            |    7 +-
+ tcg/ppc/tcg-target.c.inc             |    7 +-
+ tcg/riscv/tcg-target.c.inc           |    7 +-
+ tcg/s390x/tcg-target.c.inc           |    7 +-
+ tcg/sparc64/tcg-target.c.inc         |    7 +-
+ tcg/tci/tcg-target.c.inc             |    7 +-
+ tests/tcg/m68k/Makefile.target       |    2 +-
+files changed, 7419 insertions(+), 393 deletions(-)
+ create mode 100644 tcg/loongarch64/tcg-target.opc.h
+ create mode 100644 tests/tcg/m68k/denormal.c

-[PULL 1/4] tcg: Reset free_temps before tcg_optimize
+Deleted patch
-When allocating new temps during tcg_optimize, do not re-use
-any EBB temps that were used within the TB.  We do not have
-any idea of the span of the TB in which the temp was live.
-Introduce tcg_temp_ebb_reset_freed and use before tcg_optimize,
-as well as replacing the equivalent in plugin_gen_inject and
-tcg_func_start.
-Cc: qemu-stable@nongnu.org
-Fixes: fb04ab7ddd8 ("tcg/optimize: Lower TCG_COND_TST{EQ,NE} if unsupported")
-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2711
-Reported-by: wannacu <wannacu2049@gmail.com>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
----
- include/tcg/tcg-temp-internal.h | 6 ++++++
- accel/tcg/plugin-gen.c          | 2 +-
- tcg/tcg.c                       | 5 ++++-
-files changed, 11 insertions(+), 2 deletions(-)
-diff --git a/include/tcg/tcg-temp-internal.h b/include/tcg/tcg-temp-internal.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-temp-internal.h
-+++ b/include/tcg/tcg-temp-internal.h
-@@ -XXX,XX +XXX,XX @@ TCGv_i64 tcg_temp_ebb_new_i64(void);
- TCGv_ptr tcg_temp_ebb_new_ptr(void);
- TCGv_i128 tcg_temp_ebb_new_i128(void);
-+/* Forget all freed EBB temps, so that new allocations produce new temps. */
-+static inline void tcg_temp_ebb_reset_freed(TCGContext *s)
-+{
-+    memset(s->free_temps, 0, sizeof(s->free_temps));
-+}
-+
- #endif /* TCG_TEMP_FREE_H */
-diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/plugin-gen.c
-+++ b/accel/tcg/plugin-gen.c
-@@ -XXX,XX +XXX,XX @@ static void plugin_gen_inject(struct qemu_plugin_tb *plugin_tb)
-      * that might be live within the existing opcode stream.
-      * The simplest solution is to release them all and create new.
-      */
--    memset(tcg_ctx->free_temps, 0, sizeof(tcg_ctx->free_temps));
-+    tcg_temp_ebb_reset_freed(tcg_ctx);
-     QTAILQ_FOREACH_SAFE(op, &tcg_ctx->ops, link, next) {
-         switch (op->opc) {
-diff --git a/tcg/tcg.c b/tcg/tcg.c
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
-+++ b/tcg/tcg.c
-@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
-     s->nb_temps = s->nb_globals;
-     /* No temps have been previously allocated for size or locality.  */
--    memset(s->free_temps, 0, sizeof(s->free_temps));
-+    tcg_temp_ebb_reset_freed(s);
-     /* No constant temps have been previously allocated. */
-     for (int i = 0; i < TCG_TYPE_COUNT; ++i) {
-@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
-     }
- #endif
-+    /* Do not reuse any EBB that may be allocated within the TB. */
-+    tcg_temp_ebb_reset_freed(s);
-+
-     tcg_optimize(s);
-     reachable_code_pass(s);
---
-.43.0

-[PULL 2/4] tcg/riscv: Fix StoreStore barrier generation
+Deleted patch
-From: Roman Artemev <roman.artemev@syntacore.com>
-On RISC-V, a StoreStore barrier corresponds to
-`fence w, w`, not `fence r, r`.
-Cc: qemu-stable@nongnu.org
-Fixes: efbea94c76b ("tcg/riscv: Add slowpath load and store instructions")
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Signed-off-by: Denis Tomashev <denis.tomashev@syntacore.com>
-Signed-off-by: Roman Artemev <roman.artemev@syntacore.com>
-Message-ID: <e2f2131e294a49e79959d4fa9ec02cf4@syntacore.com>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- tcg/riscv/tcg-target.c.inc | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/riscv/tcg-target.c.inc
-+++ b/tcg/riscv/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
-         insn |= 0x02100000;
-     }
-     if (a0 & TCG_MO_ST_ST) {
--        insn |= 0x02200000;
-+        insn |= 0x01100000;
-     }
-     tcg_out32(s, insn);
- }
---
-.43.0

-[PULL 3/4] include/exec: Introduce fpst alias in helper-head.h.inc
+Deleted patch
-This allows targets to declare that the helper requires a
-float_status pointer instead of a generic void pointer.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
----
- include/exec/helper-head.h.inc | 3 +++
-file changed, 3 insertions(+)
-diff --git a/include/exec/helper-head.h.inc b/include/exec/helper-head.h.inc
-index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-head.h.inc
-+++ b/include/exec/helper-head.h.inc
-@@ -XXX,XX +XXX,XX @@
- #define dh_alias_ptr ptr
- #define dh_alias_cptr ptr
- #define dh_alias_env ptr
-+#define dh_alias_fpst ptr
- #define dh_alias_void void
- #define dh_alias_noreturn noreturn
- #define dh_alias(t) glue(dh_alias_, t)
-@@ -XXX,XX +XXX,XX @@
- #define dh_ctype_ptr void *
- #define dh_ctype_cptr const void *
- #define dh_ctype_env CPUArchState *
-+#define dh_ctype_fpst float_status *
- #define dh_ctype_void void
- #define dh_ctype_noreturn G_NORETURN void
- #define dh_ctype(t) dh_ctype_##t
-@@ -XXX,XX +XXX,XX @@
- #define dh_typecode_f64 dh_typecode_i64
- #define dh_typecode_cptr dh_typecode_ptr
- #define dh_typecode_env dh_typecode_ptr
-+#define dh_typecode_fpst dh_typecode_ptr
- #define dh_typecode(t) dh_typecode_##t
- #define dh_callflag_i32  0
---
-.43.0

-[PULL 4/4] target/sparc: Use memcpy() and remove memcpy32()
+[PULL v2 21/39] tcg/loongarch64: Implement 128-bit load & store
-From: Philippe Mathieu-Daudé <philmd@linaro.org>
+From: Jiajie Chen <c@jia.je>
-Rather than manually copying each register, use
+If LSX is available, use LSX instructions to implement 128-bit load &
-the libc memcpy(), which is well optimized nowadays.
+store when MO_128 is required, otherwise use two 64-bit loads & stores.
-Suggested-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Jiajie Chen <c@jia.je>
-Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Message-Id: <20230908022302.180442-17-c@jia.je>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
-Message-ID: <20241205205418.67613-1-philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/sparc/win_helper.c | 26 ++++++++------------------
+ tcg/loongarch64/tcg-target-con-set.h |  2 +
-file changed, 8 insertions(+), 18 deletions(-)
+ tcg/loongarch64/tcg-target.h         |  2 +-
  tcg/loongarch64/tcg-target.c.inc     | 59 ++++++++++++++++++++++++++++
 files changed, 62 insertions(+), 1 deletion(-)
-diff --git a/target/sparc/win_helper.c b/target/sparc/win_helper.c
+diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
 index XXXXXXX..XXXXXXX 100644
---- a/target/sparc/win_helper.c
+--- a/tcg/loongarch64/tcg-target-con-set.h
-+++ b/target/sparc/win_helper.c
++++ b/tcg/loongarch64/tcg-target-con-set.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ C_O0_I1(r)
- #include "exec/helper-proto.h"
+ C_O0_I2(rZ, r)
- #include "trace.h"
+ C_O0_I2(rZ, rZ)
+ C_O0_I2(w, r)
--static inline void memcpy32(target_ulong *dst, const target_ulong *src)
++C_O0_I3(r, r, r)
--{
+ C_O1_I1(r, r)
--    dst[0] = src[0];
+ C_O1_I1(w, r)
--    dst[1] = src[1];
+ C_O1_I1(w, w)
--    dst[2] = src[2];
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wM)
--    dst[3] = src[3];
+ C_O1_I2(w, w, wA)
--    dst[4] = src[4];
+ C_O1_I3(w, w, w, w)
--    dst[5] = src[5];
+ C_O1_I4(r, rZ, rJ, rZ, rZ)
--    dst[6] = src[6];
++C_O2_I1(r, r, r)
--    dst[7] = src[7];
+diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
--}
+index XXXXXXX..XXXXXXX 100644
--
+--- a/tcg/loongarch64/tcg-target.h
- void cpu_set_cwp(CPUSPARCState *env, int new_cwp)
++++ b/tcg/loongarch64/tcg-target.h
- {
+@@ -XXX,XX +XXX,XX @@ extern bool use_lsx_instructions;
-     /* put the modified wrap registers at their proper location */
+ #define TCG_TARGET_HAS_muluh_i64        1
-     if (env->cwp == env->nwindows - 1) {
+ #define TCG_TARGET_HAS_mulsh_i64        1
--        memcpy32(env->regbase, env->regbase + env->nwindows * 16);
-+        memcpy(env->regbase, env->regbase + env->nwindows * 16,
+-#define TCG_TARGET_HAS_qemu_ldst_i128   0
-+               sizeof(env->gregs));
++#define TCG_TARGET_HAS_qemu_ldst_i128   use_lsx_instructions
-     }
-     env->cwp = new_cwp;
+ #define TCG_TARGET_HAS_v64              0
+ #define TCG_TARGET_HAS_v128             use_lsx_instructions
-     /* put the wrap registers at their temporary location */
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
-     if (new_cwp == env->nwindows - 1) {
+index XXXXXXX..XXXXXXX 100644
--        memcpy32(env->regbase + env->nwindows * 16, env->regbase);
+--- a/tcg/loongarch64/tcg-target.c.inc
-+        memcpy(env->regbase + env->nwindows * 16, env->regbase,
++++ b/tcg/loongarch64/tcg-target.c.inc
-+               sizeof(env->gregs));
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
      }
      env->regwptr = env->regbase + (new_cwp * 16);
  }
@@ -XXX,XX +XXX,XX @@ void cpu_gl_switch_gregs(CPUSPARCState *env, uint32_t new_gl)
      dst = get_gl_gregset(env, env->gl);
      if (src != dst) {
 -        memcpy32(dst, env->gregs);
 -        memcpy32(env->gregs, src);
 +        memcpy(dst, env->gregs, sizeof(env->gregs));
 +        memcpy(env->gregs, src, sizeof(env->gregs));
      }
  }
-@@ -XXX,XX +XXX,XX @@ void cpu_change_pstate(CPUSPARCState *env, uint32_t new_pstate)
++static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg data_lo, TCGReg data_hi,
-         /* Switch global register bank */
++                                   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
-         src = get_gregset(env, new_pstate_regs);
++{
-         dst = get_gregset(env, pstate_regs);
++    TCGLabelQemuLdst *ldst;
--        memcpy32(dst, env->gregs);
++    HostAddress h;
--        memcpy32(env->gregs, src);
++
-+        memcpy(dst, env->gregs, sizeof(env->gregs));
++    ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
-+        memcpy(env->gregs, src, sizeof(env->gregs));
++
-     } else {
++    if (h.aa.atom == MO_128) {
-         trace_win_helper_no_switch_pstate(new_pstate_regs);
++        /*
-     }
++         * Use VLDX/VSTX when 128-bit atomicity is required.
 +         * If address is aligned to 16-bytes, the 128-bit load/store is atomic.
 +         */
 +        if (is_ld) {
 +            tcg_out_opc_vldx(s, TCG_VEC_TMP0, h.base, h.index);
 +            tcg_out_opc_vpickve2gr_d(s, data_lo, TCG_VEC_TMP0, 0);
 +            tcg_out_opc_vpickve2gr_d(s, data_hi, TCG_VEC_TMP0, 1);
 +        } else {
 +            tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_lo, 0);
 +            tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_hi, 1);
 +            tcg_out_opc_vstx(s, TCG_VEC_TMP0, h.base, h.index);
 +        }
 +    } else {
 +        /* Otherwise use a pair of LD/ST. */
 +        tcg_out_opc_add_d(s, TCG_REG_TMP0, h.base, h.index);
 +        if (is_ld) {
 +            tcg_out_opc_ld_d(s, data_lo, TCG_REG_TMP0, 0);
 +            tcg_out_opc_ld_d(s, data_hi, TCG_REG_TMP0, 8);
 +        } else {
 +            tcg_out_opc_st_d(s, data_lo, TCG_REG_TMP0, 0);
 +            tcg_out_opc_st_d(s, data_hi, TCG_REG_TMP0, 8);
 +        }
 +    }
 +
 +    if (ldst) {
 +        ldst->type = TCG_TYPE_I128;
 +        ldst->datalo_reg = data_lo;
 +        ldst->datahi_reg = data_hi;
 +        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
 +    }
 +}
 +
  /*
   * Entry-points
   */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      TCGArg a0 = args[0];
      TCGArg a1 = args[1];
      TCGArg a2 = args[2];
 +    TCGArg a3 = args[3];
      int c2 = const_args[2];
      switch (opc) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_qemu_ld_a64_i64:
          tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
          break;
 +    case INDEX_op_qemu_ld_a32_i128:
 +    case INDEX_op_qemu_ld_a64_i128:
 +        tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, true);
 +        break;
      case INDEX_op_qemu_st_a32_i32:
      case INDEX_op_qemu_st_a64_i32:
          tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_qemu_st_a64_i64:
          tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64);
          break;
 +    case INDEX_op_qemu_st_a32_i128:
 +    case INDEX_op_qemu_st_a64_i128:
 +        tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, false);
 +        break;
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_qemu_st_a64_i64:
          return C_O0_I2(rZ, r);
 +    case INDEX_op_qemu_ld_a32_i128:
 +    case INDEX_op_qemu_ld_a64_i128:
 +        return C_O2_I1(r, r, r);
 +
 +    case INDEX_op_qemu_st_a32_i128:
 +    case INDEX_op_qemu_st_a64_i128:
 +        return C_O0_I3(r, r, r);
 +
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
          return C_O0_I2(rZ, rZ);
 --
-.43.0
+.34.1

Pretty small still, but there are two patches that ought
to get backported to stable, so no point in delaying.

The following changes since commit a5ba0a7e4e150d1350a041f0d0ef9ca6c8d7c307:

Merge tag 'pull-aspeed-20241211' of https://github.com/legoater/qemu into staging (2024-12-11 15:16:47 +0000)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241212

for you to fetch changes up to 7ac87b14a92234b6a89b701b4043ad6cf8bdcccf:

target/sparc: Use memcpy() and remove memcpy32() (2024-12-12 14:28:38 -0600)

----------------------------------------------------------------
tcg: Reset free_temps before tcg_optimize
tcg/riscv: Fix StoreStore barrier generation
include/exec: Introduce fpst alias in helper-head.h.inc
target/sparc: Use memcpy() and remove memcpy32()

----------------------------------------------------------------
Philippe Mathieu-Daudé (1):
      target/sparc: Use memcpy() and remove memcpy32()

Richard Henderson (2):
      tcg: Reset free_temps before tcg_optimize
      include/exec: Introduce fpst alias in helper-head.h.inc

Roman Artemev (1):
      tcg/riscv: Fix StoreStore barrier generation

When allocating new temps during tcg_optimize, do not re-use
any EBB temps that were used within the TB.  We do not have
any idea of the span of the TB in which the temp was live.

Introduce tcg_temp_ebb_reset_freed and use before tcg_optimize,
as well as replacing the equivalent in plugin_gen_inject and
tcg_func_start.

Cc: qemu-stable@nongnu.org
Fixes: fb04ab7ddd8 ("tcg/optimize: Lower TCG_COND_TST{EQ,NE} if unsupported")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2711
Reported-by: wannacu <wannacu2049@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 include/tcg/tcg-temp-internal.h | 6 ++++++
 accel/tcg/plugin-gen.c          | 2 +-
 tcg/tcg.c                       | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/tcg/tcg-temp-internal.h b/include/tcg/tcg-temp-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-temp-internal.h
+++ b/include/tcg/tcg-temp-internal.h
@@ -XXX,XX +XXX,XX @@ TCGv_i64 tcg_temp_ebb_new_i64(void);
 TCGv_ptr tcg_temp_ebb_new_ptr(void);
 TCGv_i128 tcg_temp_ebb_new_i128(void);
 
+/* Forget all freed EBB temps, so that new allocations produce new temps. */
+static inline void tcg_temp_ebb_reset_freed(TCGContext *s)
+{
+    memset(s->free_temps, 0, sizeof(s->free_temps));
+}
+
 #endif /* TCG_TEMP_FREE_H */
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@ static void plugin_gen_inject(struct qemu_plugin_tb *plugin_tb)
      * that might be live within the existing opcode stream.
      * The simplest solution is to release them all and create new.
      */
-    memset(tcg_ctx->free_temps, 0, sizeof(tcg_ctx->free_temps));
+    tcg_temp_ebb_reset_freed(tcg_ctx);
 
     QTAILQ_FOREACH_SAFE(op, &tcg_ctx->ops, link, next) {
         switch (op->opc) {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
     s->nb_temps = s->nb_globals;
 
     /* No temps have been previously allocated for size or locality.  */
-    memset(s->free_temps, 0, sizeof(s->free_temps));
+    tcg_temp_ebb_reset_freed(s);
 
     /* No constant temps have been previously allocated. */
     for (int i = 0; i < TCG_TYPE_COUNT; ++i) {
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     }
 #endif
 
+    /* Do not reuse any EBB that may be allocated within the TB. */
+    tcg_temp_ebb_reset_freed(s);
+
     tcg_optimize(s);
 
     reachable_code_pass(s);
-- 
2.43.0

From: Roman Artemev <roman.artemev@syntacore.com>

On RISC-V, a StoreStore barrier corresponds to
`fence w, w`, not `fence r, r`.

Cc: qemu-stable@nongnu.org
Fixes: efbea94c76b ("tcg/riscv: Add slowpath load and store instructions")
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Denis Tomashev <denis.tomashev@syntacore.com>
Signed-off-by: Roman Artemev <roman.artemev@syntacore.com>
Message-ID: <e2f2131e294a49e79959d4fa9ec02cf4@syntacore.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
         insn |= 0x02100000;
     }
     if (a0 & TCG_MO_ST_ST) {
-        insn |= 0x02200000;
+        insn |= 0x01100000;
     }
     tcg_out32(s, insn);
 }
-- 
2.43.0

This allows targets to declare that the helper requires a
float_status pointer instead of a generic void pointer.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-head.h.inc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/exec/helper-head.h.inc b/include/exec/helper-head.h.inc
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h.inc
+++ b/include/exec/helper-head.h.inc
@@ -XXX,XX +XXX,XX @@
 #define dh_alias_ptr ptr
 #define dh_alias_cptr ptr
 #define dh_alias_env ptr
+#define dh_alias_fpst ptr
 #define dh_alias_void void
 #define dh_alias_noreturn noreturn
 #define dh_alias(t) glue(dh_alias_, t)
@@ -XXX,XX +XXX,XX @@
 #define dh_ctype_ptr void *
 #define dh_ctype_cptr const void *
 #define dh_ctype_env CPUArchState *
+#define dh_ctype_fpst float_status *
 #define dh_ctype_void void
 #define dh_ctype_noreturn G_NORETURN void
 #define dh_ctype(t) dh_ctype_##t
@@ -XXX,XX +XXX,XX @@
 #define dh_typecode_f64 dh_typecode_i64
 #define dh_typecode_cptr dh_typecode_ptr
 #define dh_typecode_env dh_typecode_ptr
+#define dh_typecode_fpst dh_typecode_ptr
 #define dh_typecode(t) dh_typecode_##t
 
 #define dh_callflag_i32  0
-- 
2.43.0

From: Philippe Mathieu-Daudé <philmd@linaro.org>

Rather than manually copying each register, use
the libc memcpy(), which is well optimized nowadays.

Suggested-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20241205205418.67613-1-philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sparc/win_helper.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/target/sparc/win_helper.c b/target/sparc/win_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/win_helper.c
+++ b/target/sparc/win_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "trace.h"
 
-static inline void memcpy32(target_ulong *dst, const target_ulong *src)
-{
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-}
-
 void cpu_set_cwp(CPUSPARCState *env, int new_cwp)
 {
     /* put the modified wrap registers at their proper location */
     if (env->cwp == env->nwindows - 1) {
-        memcpy32(env->regbase, env->regbase + env->nwindows * 16);
+        memcpy(env->regbase, env->regbase + env->nwindows * 16,
+               sizeof(env->gregs));
     }
     env->cwp = new_cwp;
 
     /* put the wrap registers at their temporary location */
     if (new_cwp == env->nwindows - 1) {
-        memcpy32(env->regbase + env->nwindows * 16, env->regbase);
+        memcpy(env->regbase + env->nwindows * 16, env->regbase,
+               sizeof(env->gregs));
     }
     env->regwptr = env->regbase + (new_cwp * 16);
 }
@@ -XXX,XX +XXX,XX @@ void cpu_gl_switch_gregs(CPUSPARCState *env, uint32_t new_gl)
     dst = get_gl_gregset(env, env->gl);
 
     if (src != dst) {
-        memcpy32(dst, env->gregs);
-        memcpy32(env->gregs, src);
+        memcpy(dst, env->gregs, sizeof(env->gregs));
+        memcpy(env->gregs, src, sizeof(env->gregs));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void cpu_change_pstate(CPUSPARCState *env, uint32_t new_pstate)
         /* Switch global register bank */
         src = get_gregset(env, new_pstate_regs);
         dst = get_gregset(env, pstate_regs);
-        memcpy32(dst, env->gregs);
-        memcpy32(env->gregs, src);
+        memcpy(dst, env->gregs, sizeof(env->gregs));
+        memcpy(env->gregs, src, sizeof(env->gregs));
     } else {
         trace_win_helper_no_switch_pstate(new_pstate_regs);
     }
-- 
2.43.0

v2: tcg/loongarch64 patch set without last minute tweaks.

The following changes since commit 005ad32358f12fe9313a4a01918a55e60d4f39e5:

Merge tag 'pull-tpm-2023-09-12-3' of https://github.com/stefanberger/qemu-tpm into staging (2023-09-13 13:41:57 -0400)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230915-2

for you to fetch changes up to a97a83753c90d79ed15a716610af23fabd84aaed:

tcg: Map code_gen_buffer with PROT_BTI (2023-09-16 14:57:16 +0000)

----------------------------------------------------------------
*: Delete checks for old host definitions
tcg/loongarch64: Generate LSX instructions
fpu: Add conversions between bfloat16 and [u]int8
fpu: Handle m68k extended precision denormals properly
accel/tcg: Improve cputlb i/o organization
accel/tcg: Simplify tlb_plugin_lookup
accel/tcg: Remove false-negative halted assertion
tcg: Add gvec compare with immediate and scalar operand
tcg/aarch64: Emit BTI insns at jump landing pads

----------------------------------------------------------------
Akihiko Odaki (3):
      util: Delete checks for old host definitions
      softmmu: Delete checks for old host definitions
      thunk: Delete checks for old host definitions

Jiajie Chen (16):
      tcg/loongarch64: Import LSX instructions
      tcg/loongarch64: Lower basic tcg vec ops to LSX
      tcg: pass vece to tcg_target_const_match()
      tcg/loongarch64: Lower cmp_vec to vseq/vsle/vslt
      tcg/loongarch64: Lower add/sub_vec to vadd/vsub
      tcg/loongarch64: Lower vector bitwise operations
      tcg/loongarch64: Lower neg_vec to vneg
      tcg/loongarch64: Lower mul_vec to vmul
      tcg/loongarch64: Lower vector min max ops
      tcg/loongarch64: Lower vector saturated ops
      tcg/loongarch64: Lower vector shift vector ops
      tcg/loongarch64: Lower bitsel_vec to vbitsel
      tcg/loongarch64: Lower vector shift integer ops
      tcg/loongarch64: Lower rotv_vec ops to LSX
      tcg/loongarch64: Lower rotli_vec to vrotri
      tcg/loongarch64: Implement 128-bit load & store

LIU Zhiwei (2):
      accel/tcg: Fix the comment for CPUTLBEntryFull
      fpu: Add conversions between bfloat16 and [u]int8

Nicholas Piggin (1):
      accel/tcg: mttcg remove false-negative halted assertion

Richard Henderson (17):
      tcg: Add gvec compare with immediate and scalar operand
      target/arm: Use tcg_gen_gvec_cmpi for compare vs 0
      accel/tcg: Simplify tlb_plugin_lookup
      accel/tcg: Split out io_prepare and io_failed
      accel/tcg: Use CPUTLBEntryFull.phys_addr in io_failed
      plugin: Simplify struct qemu_plugin_hwaddr
      accel/tcg: Merge cpu_transaction_failed into io_failed
      accel/tcg: Replace direct use of io_readx/io_writex in do_{ld,st}_1
      accel/tcg: Merge io_readx into do_ld_mmio_beN
      accel/tcg: Merge io_writex into do_st_mmio_leN
      accel/tcg: Introduce do_ld16_mmio_beN
      accel/tcg: Introduce do_st16_mmio_leN
      fpu: Handle m68k extended precision denormals properly
      tcg: Add tcg_out_tb_start backend hook
      util/cpuinfo-aarch64: Add CPUINFO_BTI
      tcg/aarch64: Emit BTI insns at jump landing pads
      tcg: Map code_gen_buffer with PROT_BTI

From: Jiajie Chen <c@jia.je>

If LSX is available, use LSX instructions to implement 128-bit load &
store when MO_128 is required, otherwise use two 64-bit loads & stores.

Signed-off-by: Jiajie Chen <c@jia.je>
Message-Id: <20230908022302.180442-17-c@jia.je>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h |  2 +
 tcg/loongarch64/tcg-target.h         |  2 +-
 tcg/loongarch64/tcg-target.c.inc     | 59 ++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
 C_O0_I2(w, r)
+C_O0_I3(r, r, r)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
 C_O1_I1(w, w)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wM)
 C_O1_I2(w, w, wA)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, rZ, rJ, rZ, rZ)
+C_O2_I1(r, r, r)
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_lsx_instructions;
 #define TCG_TARGET_HAS_muluh_i64        1
 #define TCG_TARGET_HAS_mulsh_i64        1
 
-#define TCG_TARGET_HAS_qemu_ldst_i128   0
+#define TCG_TARGET_HAS_qemu_ldst_i128   use_lsx_instructions
 
 #define TCG_TARGET_HAS_v64              0
 #define TCG_TARGET_HAS_v128             use_lsx_instructions
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
     }
 }
 
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg data_lo, TCGReg data_hi,
+                                   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{
+    TCGLabelQemuLdst *ldst;
+    HostAddress h;
+
+    ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
+
+    if (h.aa.atom == MO_128) {
+        /*
+         * Use VLDX/VSTX when 128-bit atomicity is required.
+         * If the address is aligned to 16 bytes, the 128-bit load/store is atomic.
+         */
+        if (is_ld) {
+            tcg_out_opc_vldx(s, TCG_VEC_TMP0, h.base, h.index);
+            tcg_out_opc_vpickve2gr_d(s, data_lo, TCG_VEC_TMP0, 0);
+            tcg_out_opc_vpickve2gr_d(s, data_hi, TCG_VEC_TMP0, 1);
+        } else {
+            tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_lo, 0);
+            tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_hi, 1);
+            tcg_out_opc_vstx(s, TCG_VEC_TMP0, h.base, h.index);
+        }
+    } else {
+        /* Otherwise use a pair of LD/ST. */
+        tcg_out_opc_add_d(s, TCG_REG_TMP0, h.base, h.index);
+        if (is_ld) {
+            tcg_out_opc_ld_d(s, data_lo, TCG_REG_TMP0, 0);
+            tcg_out_opc_ld_d(s, data_hi, TCG_REG_TMP0, 8);
+        } else {
+            tcg_out_opc_st_d(s, data_lo, TCG_REG_TMP0, 0);
+            tcg_out_opc_st_d(s, data_hi, TCG_REG_TMP0, 8);
+        }
+    }
+
+    if (ldst) {
+        ldst->type = TCG_TYPE_I128;
+        ldst->datalo_reg = data_lo;
+        ldst->datahi_reg = data_hi;
+        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
+    }
+}
+
 /*
  * Entry-points
  */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     TCGArg a0 = args[0];
     TCGArg a1 = args[1];
     TCGArg a2 = args[2];
+    TCGArg a3 = args[3];
     int c2 = const_args[2];
 
     switch (opc) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_qemu_ld_a64_i64:
         tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
         break;
+    case INDEX_op_qemu_ld_a32_i128:
+    case INDEX_op_qemu_ld_a64_i128:
+        tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, true);
+        break;
     case INDEX_op_qemu_st_a32_i32:
     case INDEX_op_qemu_st_a64_i32:
         tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_qemu_st_a64_i64:
         tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64);
         break;
+    case INDEX_op_qemu_st_a32_i128:
+    case INDEX_op_qemu_st_a64_i128:
+        tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, false);
+        break;
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_st_a64_i64:
         return C_O0_I2(rZ, r);
 
+    case INDEX_op_qemu_ld_a32_i128:
+    case INDEX_op_qemu_ld_a64_i128:
+        return C_O2_I1(r, r, r);
+
+    case INDEX_op_qemu_st_a32_i128:
+    case INDEX_op_qemu_st_a64_i128:
+        return C_O0_I3(r, r, r);
+
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
         return C_O0_I2(rZ, rZ);
-- 
2.34.1