The following changes since commit 7c18f2d663521f1b31b821a13358ce38075eaf7d:

  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2023-04-29 23:07:17 +0100)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230502

for you to fetch changes up to bdc7fba1c5a29ae218b45353daac9308fe1aae82:

  tcg: Introduce tcg_out_movext2 (2023-05-02 12:15:41 +0100)

----------------------------------------------------------------
Misc tcg-related patch queue.

----------------------------------------------------------------
Dickon Hood (1):
      qemu/bitops.h: Limit rotate amounts

Kiran Ostrolenk (1):
      qemu/host-utils.h: Add clz and ctz functions for lower-bit integers

Nazar Kazakov (2):
      tcg: Add tcg_gen_gvec_andcs
      tcg: Add tcg_gen_gvec_rotrs

Richard Henderson (7):
      softmmu: Tidy dirtylimit_dirty_ring_full_time
      qemu/int128: Re-shuffle Int128Alias members
      migration/xbzrle: Use __attribute__((target)) for avx512
      accel/tcg: Add cpu_ld*_code_mmu
      tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
      tcg/mips: Conditionalize tcg_out_exts_i32_i64
      tcg: Introduce tcg_out_movext2

Weiwei Li (1):
      accel/tcg: Uncache the host address for instruction fetch when tlb size < 1

 meson.build                      |  5 +--
 accel/tcg/tcg-runtime.h          |  1 +
 include/exec/cpu_ldst.h          |  9 ++++++
 include/qemu/bitops.h            | 24 +++++++++-----
 include/qemu/host-utils.h        | 54 +++++++++++++++++++++++++++++++
 include/qemu/int128.h            |  4 +--
 include/tcg/tcg-op-gvec.h        |  4 +++
 accel/tcg/cputlb.c               | 53 ++++++++++++++++++++++++++++++
 accel/tcg/tcg-runtime-gvec.c     | 11 +++++++
 accel/tcg/user-exec.c            | 58 +++++++++++++++++++++++++++++++++
 migration/xbzrle.c               |  9 +++---
 softmmu/dirtylimit.c             | 15 ++++++---
 tcg/tcg-op-gvec.c                | 28 ++++++++++++++++
 tcg/tcg.c                        | 69 +++++++++++++++++++++++++++++++++++---
 tcg/arm/tcg-target.c.inc         | 44 +++++++++++--------------
 tcg/i386/tcg-target.c.inc        | 19 +++++------
 tcg/loongarch64/tcg-target.c.inc |  4 ++-
 tcg/mips/tcg-target.c.inc        |  4 ++-
 18 files changed, 347 insertions(+), 68 deletions(-)
Drop inline marker: let compiler decide.

Change return type to uint64_t: this matches the computation in the
return statement and the local variable assignment in the caller.

Rename local to dirty_ring_size_MB to fix typo.
Simplify conversion to MiB via qemu_target_page_bits and right shift.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/dirtylimit.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -XXX,XX +XXX,XX @@ bool dirtylimit_vcpu_index_valid(int cpu_index)
             cpu_index >= ms->smp.max_cpus);
 }

-static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
 {
     static uint64_t max_dirtyrate;
-    uint32_t dirty_ring_size = kvm_dirty_ring_size();
-    uint64_t dirty_ring_size_meory_MB =
-        dirty_ring_size * qemu_target_page_size() >> 20;
+    unsigned target_page_bits = qemu_target_page_bits();
+    uint64_t dirty_ring_size_MB;
+
+    /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
+    assert(target_page_bits < 20);
+
+    /* Convert ring size (pages) to MiB (2**20). */
+    dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits);

     if (max_dirtyrate < dirtyrate) {
         max_dirtyrate = dirtyrate;
     }

-    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+    return dirty_ring_size_MB * 1000000 / max_dirtyrate;
 }

 static inline bool dirtylimit_done(uint64_t quota,
--
2.34.1

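[Not from the series: a standalone sketch of the arithmetic behind the
new conversion. With the ring size in pages and target_page_bits < 20,
multiplying by the page size and then shifting down to MiB collapses
into a single right shift; the ring and page sizes below are made up.]

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t ring_size = 4096;   /* pages, hypothetical value */
        unsigned page_bits = 12;     /* 4 KiB target pages */

        /* pages * 2**page_bits bytes, then / 2**20 bytes per MiB ... */
        uint64_t by_multiply = (ring_size << page_bits) >> 20;
        /* ... equals one shift while page_bits stays below 20. */
        uint64_t by_shift = ring_size >> (20 - page_bits);

        assert(by_multiply == by_shift);
        printf("%" PRIu64 " MiB\n", by_shift);   /* prints: 16 MiB */
        return 0;
    }
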
From: Weiwei Li <liweiwei@iscas.ac.cn>

When a PMP entry overlaps part of the page, we set the tlb_size to 1,
so that the address in the TLB entry is set with TLB_INVALID_MASK and
the next access goes through tlb_fill again. However, this does not
work in tb_gen_code() => get_page_addr_code_hostp(): the TLB host
address is cached, and the following instructions can use this host
address directly, which may bypass the PMP-related checks.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542

Signed-off-by: Weiwei Li <liweiwei@iscas.ac.cn>
Signed-off-by: Junqiang Wang <wangjunqiang@iscas.ac.cn>
Reviewed-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn>
---
 accel/tcg/cputlb.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
     if (p == NULL) {
         return -1;
     }
+
+    if (full->lg_page_size < TARGET_PAGE_BITS) {
+        return -1;
+    }
+
     if (hostp) {
         *hostp = p;
     }
--
2.34.1

From: Dickon Hood <dickon.hood@codethink.co.uk>

Rotates have been fixed up to only allow for reasonable rotate amounts
(i.e., no rotates >7 on an 8-bit value, etc.). This fixes a problem
with the riscv vector rotate instructions.

Signed-off-by: Dickon Hood <dickon.hood@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/bitops.h | 24 ++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr,
  */
 static inline uint8_t rol8(uint8_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((8 - shift) & 7));
+    shift &= 7;
+    return (word << shift) | (word >> (8 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t rol8(uint8_t word, unsigned int shift)
  */
 static inline uint8_t ror8(uint8_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((8 - shift) & 7));
+    shift &= 7;
+    return (word >> shift) | (word << (8 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t ror8(uint8_t word, unsigned int shift)
  */
 static inline uint16_t rol16(uint16_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((16 - shift) & 15));
+    shift &= 15;
+    return (word << shift) | (word >> (16 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t rol16(uint16_t word, unsigned int shift)
  */
 static inline uint16_t ror16(uint16_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((16 - shift) & 15));
+    shift &= 15;
+    return (word >> shift) | (word << (16 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t ror16(uint16_t word, unsigned int shift)
  */
 static inline uint32_t rol32(uint32_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((32 - shift) & 31));
+    shift &= 31;
+    return (word << shift) | (word >> (32 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t rol32(uint32_t word, unsigned int shift)
  */
 static inline uint32_t ror32(uint32_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((32 - shift) & 31));
+    shift &= 31;
+    return (word >> shift) | (word << (32 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t ror32(uint32_t word, unsigned int shift)
  */
 static inline uint64_t rol64(uint64_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((64 - shift) & 63));
+    shift &= 63;
+    return (word << shift) | (word >> (64 - shift));
 }

 /**
@@ -XXX,XX +XXX,XX @@ static inline uint64_t rol64(uint64_t word, unsigned int shift)
  */
 static inline uint64_t ror64(uint64_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((64 - shift) & 63));
+    shift &= 63;
+    return (word >> shift) | (word << (64 - shift));
 }

 /**
--
2.34.1

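[Not from the series: a self-contained check of the masked form,
mirroring the new rol8. Reducing the amount first keeps every shift
strictly below the promoted operand width, so out-of-range rotate
amounts now wrap modulo the element size instead of misbehaving.]

    #include <assert.h>
    #include <stdint.h>

    static uint8_t rol8(uint8_t word, unsigned int shift)
    {
        shift &= 7;
        return (word << shift) | (word >> (8 - shift));
    }

    int main(void)
    {
        assert(rol8(0x81, 1) == 0x03);           /* plain rotate */
        assert(rol8(0x81, 0) == 0x81);           /* word >> 8 is safe here:
                                                    uint8_t promotes to int */
        assert(rol8(0x81, 9) == rol8(0x81, 1));  /* amounts reduce mod 8 */
        return 0;
    }
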
From: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>

This is for use in the RISC-V vclz and vctz instructions (implemented
in a following commit).

Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 }
 #endif

+/**
+ * clz8 - count leading zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero. Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz8(uint8_t val)
+{
+    return val ? __builtin_clz(val) - 24 : 8;
+}
+
+/**
+ * clz16 - count leading zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero. Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz16(uint16_t val)
+{
+    return val ? __builtin_clz(val) - 16 : 16;
+}
+
 /**
  * clz32 - count leading zeros in a 32-bit value.
  * @val: The value to search
@@ -XXX,XX +XXX,XX @@ static inline int clo64(uint64_t val)
     return clz64(~val);
 }

+/**
+ * ctz8 - count trailing zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero. Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz8(uint8_t val)
+{
+    return val ? __builtin_ctz(val) : 8;
+}
+
+/**
+ * ctz16 - count trailing zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero. Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz16(uint16_t val)
+{
+    return val ? __builtin_ctz(val) : 16;
+}
+
 /**
  * ctz32 - count trailing zeros in a 32-bit value.
  * @val: The value to search
--
2.34.1

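[Not from the series: the upcast correction is easy to check in
isolation. __builtin_clz() operates on unsigned int, so an 8-bit value
gains 24 phantom leading zeros; trailing zeros need no correction.]

    #include <assert.h>
    #include <stdint.h>

    static int clz8(uint8_t val)
    {
        return val ? __builtin_clz(val) - 24 : 8;
    }

    static int ctz8(uint8_t val)
    {
        return val ? __builtin_ctz(val) : 8;
    }

    int main(void)
    {
        assert(clz8(0x10) == 3);   /* 0001 0000: three leading zeros */
        assert(ctz8(0x10) == 4);   /* four trailing zeros */
        assert(clz8(0) == 8);      /* the builtin alone is undefined here */
        assert(ctz8(0) == 8);
        return 0;
    }
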
From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for and-complement
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime.h      |  1 +
 include/tcg/tcg-op-gvec.h    |  2 ++
 accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++
 tcg/tcg-op-gvec.c            | 17 +++++++++++++++++
 4 files changed, 31 insertions(+)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

 DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,

 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }

+void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
+    }
+    clear_high(d, oprsz, desc);
+}
+
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
 }

+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+    static GVecGen2s g = {
+        .fni8 = tcg_gen_andc_i64,
+        .fniv = tcg_gen_andc_vec,
+        .fno = gen_helper_gvec_andcs,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .vece = MO_64
+    };
+
+    TCGv_i64 tmp = tcg_temp_ebb_new_i64();
+    tcg_gen_dup_i64(vece, tmp, c);
+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
+    tcg_temp_free_i64(tmp);
+}
+
 static const GVecGen2s gop_xors = {
     .fni8 = tcg_gen_xor_i64,
     .fniv = tcg_gen_xor_vec,
--
2.34.1

From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for rotate right
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch; mask rotation count.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c         | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);

 /*
  * Perform vector shift by vector element, modulo the element size.
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
 }

+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+    TCGv_i32 tmp = tcg_temp_ebb_new_i32();
+
+    tcg_gen_neg_i32(tmp, shift);
+    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
+    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
+    tcg_temp_free_i32(tmp);
+}
+
 /*
  * Expand D = A << (B % element bits)
  *
--
2.34.1

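[Not from the series: the expander relies on the identity
rotr(x, s) == rotl(x, -s mod width), which is exactly what the
negate-and-mask of the shift count computes. A scalar sketch:]

    #include <assert.h>
    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned s)
    {
        s &= 31;
        return (x << s) | (x >> (-s & 31));
    }

    static uint32_t rotr32(uint32_t x, unsigned s)
    {
        /* Same trick as tcg_gen_gvec_rotrs: negate, then mask. */
        return rotl32(x, -s & 31);
    }

    int main(void)
    {
        assert(rotr32(0x80000001u, 4) == 0x18000000u);
        assert(rotr32(0xdeadbeefu, 0) == 0xdeadbeefu);
        return 0;
    }
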
Clang 14, with --enable-tcg-interpreter, errors with

include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
  does not match the alignment of the first field in transparent union;
  transparent_union attribute ignored [-Werror,-Wignored-attributes]
        __int128_t i;
                   ^
include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
        Int128 s;
               ^
1 error generated.

By placing the __uint128_t member first, this is avoided.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org>
---
 include/qemu/int128.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline void bswap128s(Int128 *s)
  */
 #ifdef CONFIG_INT128
 typedef union {
-    Int128 s;
-    __int128_t i;
     __uint128_t u;
+    __int128_t i;
+    Int128 s;
 } Int128Alias __attribute__((transparent_union));
 #else
 typedef Int128 Int128Alias;
--
2.34.1

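[Not from the series: a minimal reproducer of the diagnostic, assuming
clang checks later members only against the alignment of the first.
FakeInt128 is an invented stand-in for QEMU's two-word Int128.]

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } FakeInt128;  /* 8-byte aligned */

    /* clang 14: "transparent_union attribute ignored"; -Werror fails. */
    typedef union {
        FakeInt128 s;    /* first member: 64-bit alignment */
        __int128_t i;    /* 128-bit alignment mismatches the first */
    } BadAlias __attribute__((transparent_union));

    /* Most-aligned member first: accepted. */
    typedef union {
        __int128_t i;
        FakeInt128 s;
    } GoodAlias __attribute__((transparent_union));
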
Use the attribute, which is supported by clang, instead of
the #pragma, which is not supported and, for some reason,
also not detected by the meson probe, so we fail by -Werror.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org>
---
 meson.build        | 5 +----
 migration/xbzrle.c | 9 ++++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
 config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
   .require(cc.links('''
-    #pragma GCC push_options
-    #pragma GCC target("avx512bw")
     #include <cpuid.h>
     #include <immintrin.h>
-    static int bar(void *a) {
-
+    static int __attribute__((target("avx512bw"))) bar(void *a) {
       __m512i *x = a;
       __m512i res= _mm512_abs_epi8(*x);
       return res[1];
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 }

 #if defined(CONFIG_AVX512BW_OPT)
-#pragma GCC push_options
-#pragma GCC target("avx512bw")
 #include <immintrin.h>
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
-                                uint8_t *dst, int dlen)
+
+int __attribute__((target("avx512bw")))
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                            uint8_t *dst, int dlen)
 {
     uint32_t zrun_len = 0, nzrun_len = 0;
     int d = 0, i = 0, num = 0;
@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
     }
     return d;
 }
-#pragma GCC pop_options
 #endif
--
2.34.1

At least RISC-V has the need to be able to perform a read
using execute permissions, outside of translation.
Add helpers to facilitate this.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Weiwei Li <liweiwei@iscas.ac.cn>
Tested-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org>
Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h |  9 +++++++
 accel/tcg/cputlb.c      | 48 ++++++++++++++++++++++++++++++++++
 accel/tcg/user-exec.c   | 58 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
 # define cpu_stq_mmu cpu_stq_le_mmu
 #endif

+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra);
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
     MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true));
     return full_ldq_code(env, addr, oi, 0);
 }
+
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t retaddr)
+{
+    return full_ldub_code(env, addr, oi, retaddr);
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint16_t ret;
+
+    ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint32_t ret;
+
+    ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint64_t ret;
+
+    ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
     return ret;
 }

+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint8_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldub_p(haddr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint16_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = lduw_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint32_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldl_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint64_t ret;
+
+    validate_memop(oi, MO_BEUQ);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+    ret = ldq_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
+
 #include "ldst_common.c.inc"

 /*
--
2.34.1

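[Not from the series: a hypothetical call site. reread_insn() and its
mmu_idx/ra parameters are invented for illustration; make_memop_idx()
and MO_TEUL are the existing interfaces the new helpers expect.]

    /* Re-read the 32-bit instruction at @pc with execute permission,
     * outside of translation; @ra is the host return address for
     * unwinding, or 0 when not called from generated code. */
    static uint32_t reread_insn(CPUArchState *env, abi_ptr pc,
                                int mmu_idx, uintptr_t ra)
    {
        MemOpIdx oi = make_memop_idx(MO_TEUL, mmu_idx);
        return cpu_ldl_code_mmu(env, pc, oi, ra);
    }
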
Since TCG_TYPE_I32 values are kept sign-extended in registers,
via ".w" instructions, we need not extend if the register matches.
This is already relied upon by comparisons.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg ret, TCGReg arg)

 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
 {
-    tcg_out_ext32s(s, ret, arg);
+    if (ret != arg) {
+        tcg_out_ext32s(s, ret, arg);
+    }
 }

 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
--
2.34.1

Since TCG_TYPE_I32 values are kept sign-extended in registers, we need not
extend if the register matches. This is already relied upon by comparisons.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/mips/tcg-target.c.inc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rs)

 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
 {
-    tcg_out_ext32s(s, rd, rs);
+    if (rd != rs) {
+        tcg_out_ext32s(s, rd, rs);
+    }
 }

 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
--
2.34.1

This is common code in most qemu_{ld,st} slow paths, moving two
registers when there may be overlap between sources and destinations.
At present, this is only used by 32-bit hosts for 64-bit data,
but will shortly be used for more than that.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                 | 69 ++++++++++++++++++++++++++++++++++++---
 tcg/arm/tcg-target.c.inc  | 44 ++++++++++---------------
 tcg/i386/tcg-target.c.inc | 19 +++++------
 3 files changed, 90 insertions(+), 42 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
-static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
-    __attribute__((unused));
+static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2);
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
 static void tcg_out_goto_tb(TCGContext *s, int which);
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
     siglongjmp(s->jmp_trans, -2);
 }

+typedef struct TCGMovExtend {
+    TCGReg dst;
+    TCGReg src;
+    TCGType dst_type;
+    TCGType src_type;
+    MemOp src_ext;
+} TCGMovExtend;
+
 /**
  * tcg_out_movext -- move and extend
  * @s: tcg context
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
  *
  * Move or extend @src into @dst, depending on @src_ext and the types.
  */
-static void __attribute__((unused))
-tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
-               TCGType src_type, MemOp src_ext, TCGReg src)
+static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
+                           TCGType src_type, MemOp src_ext, TCGReg src)
 {
     switch (src_ext) {
     case MO_UB:
@@ -XXX,XX +XXX,XX @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
     }
 }

+/* Minor variations on a theme, using a structure. */
+static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i,
+                                    TCGReg src)
+{
+    tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src);
+}
+
+static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i)
+{
+    tcg_out_movext1_new_src(s, i, i->src);
+}
+
+/**
+ * tcg_out_movext2 -- move and extend two pair
+ * @s: tcg context
+ * @i1: first move description
+ * @i2: second move description
+ * @scratch: temporary register, or -1 for none
+ *
+ * As tcg_out_movext, for both @i1 and @i2, caring for overlap
+ * between the sources and destinations.
+ */
+
+static void __attribute__((unused))
+tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
+                const TCGMovExtend *i2, int scratch)
+{
+    TCGReg src1 = i1->src;
+    TCGReg src2 = i2->src;
+
+    if (i1->dst != src2) {
+        tcg_out_movext1(s, i1);
+        tcg_out_movext1(s, i2);
+        return;
+    }
+    if (i2->dst == src1) {
+        TCGType src1_type = i1->src_type;
+        TCGType src2_type = i2->src_type;
+
+        if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) {
+            /* The data is now in the correct registers, now extend. */
+            src1 = i2->src;
+            src2 = i1->src;
+        } else {
+            tcg_debug_assert(scratch >= 0);
+            tcg_out_mov(s, src1_type, scratch, src1);
+            src1 = scratch;
+        }
+    }
+    tcg_out_movext1_new_src(s, i2, src2);
+    tcg_out_movext1_new_src(s, i1, src1);
+}
+
 #define C_PFX1(P, A) P##A
 #define C_PFX2(P, A, B) P##A##_##B
 #define C_PFX3(P, A, B, C) P##A##_##B##_##C
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,

 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, datalo, datahi;
+    TCGReg argreg;
     MemOpIdx oi = lb->oi;
     MemOp opc = get_memop(oi);

@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     /* Use the canonical unsigned helpers and minimize icache usage. */
     tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);

-    datalo = lb->datalo_reg;
-    datahi = lb->datahi_reg;
     if ((opc & MO_SIZE) == MO_64) {
-        if (datalo != TCG_REG_R1) {
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-        } else if (datahi != TCG_REG_R0) {
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-        } else {
-            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
-        tcg_out_movext(s, TCG_TYPE_I32, datalo,
+        tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg,
                        TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0);
     }

@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)

     if (TARGET_LONG_BITS == 64) {
         /* 64-bit target address is aligned into R2:R3. */
-        if (l->addrhi_reg != TCG_REG_R2) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-        } else if (l->addrlo_reg != TCG_REG_R3) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
+              .src = l->addrlo_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
+              .src = l->addrhi_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
         tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
     }
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
     MemOpIdx oi = l->oi;
     MemOp opc = get_memop(oi);
-    TCGReg data_reg;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];

     /* resolve label address */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)

     tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);

-    data_reg = l->datalo_reg;
     if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-        if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], -1);
     } else {
-        tcg_out_movext(s, l->type, data_reg,
+        tcg_out_movext(s, l->type, l->datalo_reg,
                        TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
     }

--
2.34.1

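[Not from the series: the move-ordering problem tcg_out_movext2 solves,
simulated on a toy register file with the extension handling elided.
The three branches match the function body above: disjoint moves,
one-way overlap, and a full swap.]

    #include <assert.h>

    static int regs[4];

    static void mov(int dst, int src) { regs[dst] = regs[src]; }

    /* Parallel move {d1 <- s1, d2 <- s2}; @scratch is only needed for
     * the full-swap case when no xchg instruction is available. */
    static void movext2(int d1, int s1, int d2, int s2, int scratch)
    {
        if (d1 != s2) {          /* disjoint: either order works */
            mov(d1, s1);
            mov(d2, s2);
            return;
        }
        if (d2 == s1) {          /* full swap: save one side first */
            mov(scratch, s1);
            s1 = scratch;
        }
        mov(d2, s2);             /* d1 == s2, so write d2 first */
        mov(d1, s1);
    }

    int main(void)
    {
        regs[0] = 10; regs[1] = 20;
        movext2(0, 1, 1, 0, 3);              /* swap r0 and r1 */
        assert(regs[0] == 20 && regs[1] == 10);

        regs[0] = 5; regs[1] = 6; regs[2] = 7;
        movext2(0, 2, 1, 0, 3);              /* r1 needs the old r0 */
        assert(regs[0] == 7 && regs[1] == 5);
        return 0;
    }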