Series comparison

-[PULL 0/3] tcg patch queue
+[PULL 00/12] tcg patch queue
-The following changes since commit e18e5501d8ac692d32657a3e1ef545b14e72b730:
+The following changes since commit 7c18f2d663521f1b31b821a13358ce38075eaf7d:
-  Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200210' into staging (2020-02-10 18:09:14 +0000)
+  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2023-04-29 23:07:17 +0100)
 are available in the Git repository at:
-  https://github.com/rth7680/qemu.git tags/pull-tcg-20200212
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230502
-for you to fetch changes up to 2445971604c1cfd3ec484457159f4ac300fb04d2:
+for you to fetch changes up to bdc7fba1c5a29ae218b45353daac9308fe1aae82:
-  tcg: Add tcg_gen_gvec_5_ptr (2020-02-12 14:58:36 -0800)
+  tcg: Introduce tcg_out_movext2 (2023-05-02 12:15:41 +0100)
 ----------------------------------------------------------------
-Fix breakpoint invalidation.
+Misc tcg-related patch queue.
 Add support for tcg helpers with 7 arguments.
 Add support for gvec helpers with 5 arguments.
 ----------------------------------------------------------------
-Max Filippov (1):
+Dickon Hood (1):
-      exec: flush CPU TB cache in breakpoint_invalidate
+      qemu/bitops.h: Limit rotate amounts
-Richard Henderson (1):
+Kiran Ostrolenk (1):
-      tcg: Add tcg_gen_gvec_5_ptr
+      qemu/host-utils.h: Add clz and ctz functions for lower-bit integers
-Taylor Simpson (1):
+Nazar Kazakov (2):
-      tcg: Add support for a helper with 7 arguments
+      tcg: Add tcg_gen_gvec_andcs
       tcg: Add tcg_gen_gvec_rotrs
- include/exec/helper-gen.h   | 13 +++++++++++++
+Richard Henderson (7):
- include/exec/helper-head.h  |  2 ++
+      softmmu: Tidy dirtylimit_dirty_ring_full_time
- include/exec/helper-proto.h |  6 ++++++
+      qemu/int128: Re-shuffle Int128Alias members
- include/exec/helper-tcg.h   |  7 +++++++
+      migration/xbzrle: Use __attribute__((target)) for avx512
- include/tcg/tcg-op-gvec.h   |  7 +++++++
+      accel/tcg: Add cpu_ld*_code_mmu
- exec.c                      | 15 +++++++--------
+      tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
- tcg/tcg-op-gvec.c           | 32 ++++++++++++++++++++++++++++++++
+      tcg/mips: Conditionalize tcg_out_exts_i32_i64
-files changed, 74 insertions(+), 8 deletions(-)
+      tcg: Introduce tcg_out_movext2
+Weiwei Li (1):
+      accel/tcg: Uncache the host address for instruction fetch when tlb size < 1
+ meson.build                      |  5 +--
+ accel/tcg/tcg-runtime.h          |  1 +
+ include/exec/cpu_ldst.h          |  9 ++++++
+ include/qemu/bitops.h            | 24 +++++++++-----
+ include/qemu/host-utils.h        | 54 +++++++++++++++++++++++++++++++
+ include/qemu/int128.h            |  4 +--
+ include/tcg/tcg-op-gvec.h        |  4 +++
+ accel/tcg/cputlb.c               | 53 ++++++++++++++++++++++++++++++
+ accel/tcg/tcg-runtime-gvec.c     | 11 +++++++
+ accel/tcg/user-exec.c            | 58 +++++++++++++++++++++++++++++++++
+ migration/xbzrle.c               |  9 +++---
+ softmmu/dirtylimit.c             | 15 ++++++---
+ tcg/tcg-op-gvec.c                | 28 ++++++++++++++++
+ tcg/tcg.c                        | 69 +++++++++++++++++++++++++++++++++++++---
+ tcg/arm/tcg-target.c.inc         | 44 +++++++++++--------------
+ tcg/i386/tcg-target.c.inc        | 19 +++++------
+ tcg/loongarch64/tcg-target.c.inc |  4 ++-
+ tcg/mips/tcg-target.c.inc        |  4 ++-
+files changed, 347 insertions(+), 68 deletions(-)

-New patch
+[PULL 01/12] softmmu: Tidy dirtylimit_dirty_ring_full_time
+Drop inline marker: let compiler decide.
+Change return type to uint64_t: this matches the computation in the
+return statement and the local variable assignment in the caller.
+Rename local to dirty_ring_size_MB to fix typo.
+Simplify conversion to MiB via qemu_target_page_bits and right shift.
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Thomas Huth <thuth@redhat.com>
+Reviewed-by: Juan Quintela <quintela@redhat.com>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ softmmu/dirtylimit.c | 15 ++++++++++-----
+file changed, 10 insertions(+), 5 deletions(-)
+diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
+index XXXXXXX..XXXXXXX 100644
+--- a/softmmu/dirtylimit.c
++++ b/softmmu/dirtylimit.c
+@@ -XXX,XX +XXX,XX @@ bool dirtylimit_vcpu_index_valid(int cpu_index)
+              cpu_index >= ms->smp.max_cpus);
+ }
+-static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
++static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+ {
+     static uint64_t max_dirtyrate;
+-    uint32_t dirty_ring_size = kvm_dirty_ring_size();
+-    uint64_t dirty_ring_size_meory_MB =
+-        dirty_ring_size * qemu_target_page_size() >> 20;
++    unsigned target_page_bits = qemu_target_page_bits();
++    uint64_t dirty_ring_size_MB;
++
++    /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
++    assert(target_page_bits < 20);
++
++    /* Convert ring size (pages) to MiB (2**20). */
++    dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits);
+     if (max_dirtyrate < dirtyrate) {
+         max_dirtyrate = dirtyrate;
+     }
+-    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
++    return dirty_ring_size_MB * 1000000 / max_dirtyrate;
+ }
+ static inline bool dirtylimit_done(uint64_t quota,
+--
+.34.1

-New patch
+[PULL 02/12] accel/tcg: Uncache the host address for instruction fetch when tlb size < 1
+From: Weiwei Li <liweiwei@iscas.ac.cn>
+When a PMP entry overlaps part of a page, we set the tlb_size to 1, which
+causes the address in the tlb entry to be set with TLB_INVALID_MASK, so the
+next access will again go through tlb_fill. However, this does not work in
+tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be
+cached, and the following instructions can use this host address directly,
+which may lead to the bypass of PMP related checks.
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542.
+Signed-off-by: Weiwei Li <liweiwei@iscas.ac.cn>
+Signed-off-by: Junqiang Wang <wangjunqiang@iscas.ac.cn>
+Reviewed-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn>
+---
+ accel/tcg/cputlb.c | 5 +++++
+file changed, 5 insertions(+)
+diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/cputlb.c
++++ b/accel/tcg/cputlb.c
+@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
+     if (p == NULL) {
+         return -1;
+     }
++
++    if (full->lg_page_size < TARGET_PAGE_BITS) {
++        return -1;
++    }
++
+     if (hostp) {
+         *hostp = p;
+     }
+--
+.34.1

-New patch
+[PULL 03/12] qemu/bitops.h: Limit rotate amounts
+From: Dickon Hood <dickon.hood@codethink.co.uk>
+Rotates have been fixed up to only allow for reasonable rotate amounts
+(i.e., no rotates >7 on an 8-bit value, etc.).  This fixes a problem with
+riscv vector rotate instructions.
+Signed-off-by: Dickon Hood <dickon.hood@codethink.co.uk>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ include/qemu/bitops.h | 24 ++++++++++++++++--------
+file changed, 16 insertions(+), 8 deletions(-)
+diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/bitops.h
++++ b/include/qemu/bitops.h
+@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr,
+  */
+ static inline uint8_t rol8(uint8_t word, unsigned int shift)
+ {
+-    return (word << shift) | (word >> ((8 - shift) & 7));
++    shift &= 7;
++    return (word << shift) | (word >> (8 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint8_t rol8(uint8_t word, unsigned int shift)
+  */
+ static inline uint8_t ror8(uint8_t word, unsigned int shift)
+ {
+-    return (word >> shift) | (word << ((8 - shift) & 7));
++    shift &= 7;
++    return (word >> shift) | (word << (8 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint8_t ror8(uint8_t word, unsigned int shift)
+  */
+ static inline uint16_t rol16(uint16_t word, unsigned int shift)
+ {
+-    return (word << shift) | (word >> ((16 - shift) & 15));
++    shift &= 15;
++    return (word << shift) | (word >> (16 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint16_t rol16(uint16_t word, unsigned int shift)
+  */
+ static inline uint16_t ror16(uint16_t word, unsigned int shift)
+ {
+-    return (word >> shift) | (word << ((16 - shift) & 15));
++    shift &= 15;
++    return (word >> shift) | (word << (16 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint16_t ror16(uint16_t word, unsigned int shift)
+  */
+ static inline uint32_t rol32(uint32_t word, unsigned int shift)
+ {
+-    return (word << shift) | (word >> ((32 - shift) & 31));
++    shift &= 31;
++    return (word << shift) | (word >> (32 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint32_t rol32(uint32_t word, unsigned int shift)
+  */
+ static inline uint32_t ror32(uint32_t word, unsigned int shift)
+ {
+-    return (word >> shift) | (word << ((32 - shift) & 31));
++    shift &= 31;
++    return (word >> shift) | (word << (32 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint32_t ror32(uint32_t word, unsigned int shift)
+  */
+ static inline uint64_t rol64(uint64_t word, unsigned int shift)
+ {
+-    return (word << shift) | (word >> ((64 - shift) & 63));
++    shift &= 63;
++    return (word << shift) | (word >> (64 - shift));
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t rol64(uint64_t word, unsigned int shift)
+  */
+ static inline uint64_t ror64(uint64_t word, unsigned int shift)
+ {
+-    return (word >> shift) | (word << ((64 - shift) & 63));
++    shift &= 63;
++    return (word >> shift) | (word << (64 - shift));
+ }
+ /**
+--
+.34.1

-New patch
+[PULL 04/12] qemu/host-utils.h: Add clz and ctz functions for lower-bit integers
+From: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
+This is for use in the RISC-V vclz and vctz instructions (implemented in
+a subsequent commit).
+Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++++++++
+file changed, 54 insertions(+)
+diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/host-utils.h
++++ b/include/qemu/host-utils.h
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+ }
+ #endif
++/**
+ * clz8 - count leading zeros in an 8-bit value.
++ * @val: The value to search
++ *
++ * Returns 8 if the value is zero.  Note that the GCC builtin is
++ * undefined if the value is zero.
++ *
++ * Note that the GCC builtin will upcast its argument to an `unsigned int`
++ * so this function subtracts off the number of prepended zeroes.
++ */
++static inline int clz8(uint8_t val)
++{
++    return val ? __builtin_clz(val) - 24 : 8;
++}
++
++/**
++ * clz16 - count leading zeros in a 16-bit value.
++ * @val: The value to search
++ *
++ * Returns 16 if the value is zero.  Note that the GCC builtin is
++ * undefined if the value is zero.
++ *
++ * Note that the GCC builtin will upcast its argument to an `unsigned int`
++ * so this function subtracts off the number of prepended zeroes.
++ */
++static inline int clz16(uint16_t val)
++{
++    return val ? __builtin_clz(val) - 16 : 16;
++}
++
+ /**
+  * clz32 - count leading zeros in a 32-bit value.
+  * @val: The value to search
+@@ -XXX,XX +XXX,XX @@ static inline int clo64(uint64_t val)
+     return clz64(~val);
+ }
++/**
+ * ctz8 - count trailing zeros in an 8-bit value.
++ * @val: The value to search
++ *
++ * Returns 8 if the value is zero.  Note that the GCC builtin is
++ * undefined if the value is zero.
++ */
++static inline int ctz8(uint8_t val)
++{
++    return val ? __builtin_ctz(val) : 8;
++}
++
++/**
++ * ctz16 - count trailing zeros in a 16-bit value.
++ * @val: The value to search
++ *
++ * Returns 16 if the value is zero.  Note that the GCC builtin is
++ * undefined if the value is zero.
++ */
++static inline int ctz16(uint16_t val)
++{
++    return val ? __builtin_ctz(val) : 16;
++}
++
+ /**
+  * ctz32 - count trailing zeros in a 32-bit value.
+  * @val: The value to search
+--
+.34.1

-New patch
+[PULL 05/12] tcg: Add tcg_gen_gvec_andcs
+From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
+Add tcg expander and helper functions for and-complement
+vector with scalar operand.
+Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
+Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
+[rth: Split out of larger patch.]
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ accel/tcg/tcg-runtime.h      |  1 +
+ include/tcg/tcg-op-gvec.h    |  2 ++
+ accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++
+ tcg/tcg-op-gvec.c            | 17 +++++++++++++++++
+files changed, 31 insertions(+)
+diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/tcg-runtime.h
++++ b/accel/tcg/tcg-runtime.h
+@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+ DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+ DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
++DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+ DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+ DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/tcg/tcg-op-gvec.h
++++ b/include/tcg/tcg-op-gvec.h
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
+ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
++void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
++                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
+diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/tcg-runtime-gvec.c
++++ b/accel/tcg/tcg-runtime-gvec.c
+@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
+     clear_high(d, oprsz, desc);
+ }
++void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
++{
++    intptr_t oprsz = simd_oprsz(desc);
++    intptr_t i;
++
++    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
++        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
++    }
++    clear_high(d, oprsz, desc);
++}
++
+ void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
+ {
+     intptr_t oprsz = simd_oprsz(desc);
+diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/tcg-op-gvec.c
++++ b/tcg/tcg-op-gvec.c
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
+     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
+ }
++void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
++                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
++{
++    static GVecGen2s g = {
++        .fni8 = tcg_gen_andc_i64,
++        .fniv = tcg_gen_andc_vec,
++        .fno = gen_helper_gvec_andcs,
++        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
++        .vece = MO_64
++    };
++
++    TCGv_i64 tmp = tcg_temp_ebb_new_i64();
++    tcg_gen_dup_i64(vece, tmp, c);
++    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g);
++    tcg_temp_free_i64(tmp);
++}
++
+ static const GVecGen2s gop_xors = {
+     .fni8 = tcg_gen_xor_i64,
+     .fniv = tcg_gen_xor_vec,
+--
+.34.1

-[PULL 3/3] tcg: Add tcg_gen_gvec_5_ptr
+[PULL 06/12] tcg: Add tcg_gen_gvec_rotrs
-Extend the vector generator infrastructure to handle
+From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
 vector arguments.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Add tcg expander and helper functions for rotate right
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+vector with scalar operand.
-Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
 Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
 Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
 [rth: Split out of larger patch; mask rotation count.]
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op-gvec.h |  7 +++++++
+ include/tcg/tcg-op-gvec.h |  2 ++
- tcg/tcg-op-gvec.c         | 32 ++++++++++++++++++++++++++++++++
+ tcg/tcg-op-gvec.c         | 11 +++++++++++
-files changed, 39 insertions(+)
+files changed, 13 insertions(+)
 diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg-op-gvec.h
 +++ b/include/tcg/tcg-op-gvec.h
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
-                         uint32_t maxsz, int32_t data,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-                         gen_helper_gvec_4_ptr *fn);
+ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-+typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
++void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
++                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+ /*
-+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+  * Perform vector shift by vector element, modulo the element size.
 +                        gen_helper_gvec_5_ptr *fn);
 +
  /* Expand a gvec operation.  Either inline or out-of-line depending on
     the actual vector size and the operations supported by the host.  */
  typedef struct {
 diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-gvec.c
 +++ b/tcg/tcg-op-gvec.c
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
-     tcg_temp_free_i32(desc);
+     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
  }
-+/* Generate a call to a gvec-style helper with five vector operands
++void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-+   and an extra pointer operand.  */
++                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
 +void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 +                        gen_helper_gvec_5_ptr *fn)
 +{
-+    TCGv_ptr a0, a1, a2, a3, a4;
++    TCGv_i32 tmp = tcg_temp_ebb_new_i32();
 +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +
-+    a0 = tcg_temp_new_ptr();
++    tcg_gen_neg_i32(tmp, shift);
-+    a1 = tcg_temp_new_ptr();
++    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
-+    a2 = tcg_temp_new_ptr();
++    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
-+    a3 = tcg_temp_new_ptr();
++    tcg_temp_free_i32(tmp);
 +    a4 = tcg_temp_new_ptr();
 +
 +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 +    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 +    tcg_gen_addi_ptr(a4, cpu_env, eofs);
 +
 +    fn(a0, a1, a2, a3, a4, ptr, desc);
 +
 +    tcg_temp_free_ptr(a0);
 +    tcg_temp_free_ptr(a1);
 +    tcg_temp_free_ptr(a2);
 +    tcg_temp_free_ptr(a3);
 +    tcg_temp_free_ptr(a4);
 +    tcg_temp_free_i32(desc);
 +}
 +
- /* Return true if we want to implement something of OPRSZ bytes
+ /*
-    in units of LNSZ.  This limits the expansion of inline code.  */
+  * Expand D = A << (B % element bits)
- static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
+  *
 --
-.20.1
+.34.1

-New patch
+[PULL 07/12] qemu/int128: Re-shuffle Int128Alias members
+Clang 14, with --enable-tcg-interpreter errors with
+include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
+  does not match the alignment of the first field in transparent union;
+  transparent_union attribute ignored [-Werror,-Wignored-attributes]
+    __int128_t i;
+               ^
+include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
+    Int128 s;
+           ^
+error generated.
+By placing the __uint128_t member first, this is avoided.
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org>
+---
+ include/qemu/int128.h | 4 ++--
+file changed, 2 insertions(+), 2 deletions(-)
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/int128.h
++++ b/include/qemu/int128.h
+@@ -XXX,XX +XXX,XX @@ static inline void bswap128s(Int128 *s)
+  */
+ #ifdef CONFIG_INT128
+ typedef union {
+-    Int128 s;
+-    __int128_t i;
+     __uint128_t u;
++    __int128_t i;
++    Int128 s;
+ } Int128Alias __attribute__((transparent_union));
+ #else
+ typedef Int128 Int128Alias;
+--
+.34.1

-New patch
+[PULL 08/12] migration/xbzrle: Use __attribute__((target)) for avx512
+Use the attribute, which is supported by clang, instead of
+the #pragma, which is not supported and, for some reason,
+also not detected by the meson probe, so we fail by -Werror.
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Juan Quintela <quintela@redhat.com>
+Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org>
+---
+ meson.build        | 5 +----
+ migration/xbzrle.c | 9 ++++-----
+files changed, 5 insertions(+), 9 deletions(-)
+diff --git a/meson.build b/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/meson.build
++++ b/meson.build
+@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
+ config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
+   .require(cc.links('''
+-    #pragma GCC push_options
+-    #pragma GCC target("avx512bw")
+     #include <cpuid.h>
+     #include <immintrin.h>
+-    static int bar(void *a) {
+-
++    static int __attribute__((target("avx512bw"))) bar(void *a) {
+       __m512i *x = a;
+       __m512i res= _mm512_abs_epi8(*x);
+       return res[1];
+diff --git a/migration/xbzrle.c b/migration/xbzrle.c
+index XXXXXXX..XXXXXXX 100644
+--- a/migration/xbzrle.c
++++ b/migration/xbzrle.c
+@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
+ }
+ #if defined(CONFIG_AVX512BW_OPT)
+-#pragma GCC push_options
+-#pragma GCC target("avx512bw")
+ #include <immintrin.h>
+-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+-                             uint8_t *dst, int dlen)
++
++int __attribute__((target("avx512bw")))
++xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
++                            uint8_t *dst, int dlen)
+ {
+     uint32_t zrun_len = 0, nzrun_len = 0;
+     int d = 0, i = 0, num = 0;
+@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+     }
+     return d;
+ }
+-#pragma GCC pop_options
+ #endif
+--
+.34.1

-New patch
+[PULL 09/12] accel/tcg: Add cpu_ld*_code_mmu
+At least RISC-V has the need to be able to perform a read
+using execute permissions, outside of translation.
+Add helpers to facilitate this.
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Acked-by: Alistair Francis <alistair.francis@wdc.com>
+Reviewed-by: Weiwei Li <liweiwei@iscas.ac.cn>
+Tested-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
+Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org>
+Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org>
+---
+ include/exec/cpu_ldst.h |  9 +++++++
+ accel/tcg/cputlb.c      | 48 ++++++++++++++++++++++++++++++++++
+ accel/tcg/user-exec.c   | 58 +++++++++++++++++++++++++++++++++++++++++
+files changed, 115 insertions(+)
+diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/exec/cpu_ldst.h
++++ b/include/exec/cpu_ldst.h
+@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
+ # define cpu_stq_mmu          cpu_stq_le_mmu
+ #endif
++uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
++                         MemOpIdx oi, uintptr_t ra);
++uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra);
++uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra);
++uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra);
++
+ uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
+ uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
+ uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
+diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/cputlb.c
++++ b/accel/tcg/cputlb.c
+@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
+     MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true));
+     return full_ldq_code(env, addr, oi, 0);
+ }
++
++uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
++                         MemOpIdx oi, uintptr_t retaddr)
++{
++    return full_ldub_code(env, addr, oi, retaddr);
++}
++
++uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t retaddr)
++{
++    MemOp mop = get_memop(oi);
++    int idx = get_mmuidx(oi);
++    uint16_t ret;
++
++    ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr);
++    if ((mop & MO_BSWAP) != MO_TE) {
++        ret = bswap16(ret);
++    }
++    return ret;
++}
++
++uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t retaddr)
++{
++    MemOp mop = get_memop(oi);
++    int idx = get_mmuidx(oi);
++    uint32_t ret;
++
++    ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr);
++    if ((mop & MO_BSWAP) != MO_TE) {
++        ret = bswap32(ret);
++    }
++    return ret;
++}
++
++uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t retaddr)
++{
++    MemOp mop = get_memop(oi);
++    int idx = get_mmuidx(oi);
++    uint64_t ret;
++
++    ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr);
++    if ((mop & MO_BSWAP) != MO_TE) {
++        ret = bswap64(ret);
++    }
++    return ret;
++}
+diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/user-exec.c
++++ b/accel/tcg/user-exec.c
+@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
+     return ret;
+ }
++uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
++                         MemOpIdx oi, uintptr_t ra)
++{
++    void *haddr;
++    uint8_t ret;
++
++    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
++    ret = ldub_p(haddr);
++    clear_helper_retaddr();
++    return ret;
++}
++
++uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra)
++{
++    void *haddr;
++    uint16_t ret;
++
++    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
++    ret = lduw_p(haddr);
++    clear_helper_retaddr();
++    if (get_memop(oi) & MO_BSWAP) {
++        ret = bswap16(ret);
++    }
++    return ret;
++}
++
++uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra)
++{
++    void *haddr;
++    uint32_t ret;
++
++    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
++    ret = ldl_p(haddr);
++    clear_helper_retaddr();
++    if (get_memop(oi) & MO_BSWAP) {
++        ret = bswap32(ret);
++    }
++    return ret;
++}
++
++uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
++                          MemOpIdx oi, uintptr_t ra)
++{
++    void *haddr;
++    uint64_t ret;
++
++    validate_memop(oi, MO_BEUQ);
++    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
++    ret = ldq_p(haddr);
++    clear_helper_retaddr();
++    if (get_memop(oi) & MO_BSWAP) {
++        ret = bswap64(ret);
++    }
++    return ret;
++}
++
+ #include "ldst_common.c.inc"
+ /*
+--
+.34.1

-[PULL 2/3] tcg: Add support for a helper with 7 arguments
+[PULL 10/12] tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
-From: Taylor Simpson <tsimpson@quicinc.com>
+Since TCG_TYPE_I32 values are kept sign-extended in registers,
 via ".w" instructions, we need not extend if the register matches.
 This is already relied upon by comparisons.
-Currently, helpers can only take up to 6 arguments.  This patch adds the
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 capability for up to 7 arguments.  I have tested it with the Hexagon port
 that I am preparing for submission.
 Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
 Message-Id: <1580942510-2820-1-git-send-email-tsimpson@quicinc.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-gen.h   | 13 +++++++++++++
+ tcg/loongarch64/tcg-target.c.inc | 4 +++-
- include/exec/helper-head.h  |  2 ++
+file changed, 3 insertions(+), 1 deletion(-)
  include/exec/helper-proto.h |  6 ++++++
  include/exec/helper-tcg.h   |  7 +++++++
 files changed, 28 insertions(+)
-diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-gen.h
+--- a/tcg/loongarch64/tcg-target.c.inc
-+++ b/include/exec/helper-gen.h
++++ b/tcg/loongarch64/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg ret, TCGReg arg)
-   tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
  static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
  {
 -    tcg_out_ext32s(s, ret, arg);
 +    if (ret != arg) {
 +        tcg_out_ext32s(s, ret, arg);
 +    }
  }
-+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
+ static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg)
 +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 +    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
 +    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
 +    dh_arg_decl(t7, 7))                                                 \
 +{                                                                       \
 +  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
 +                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
 +                     dh_arg(t7, 7) };                                   \
 +  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
 +}
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "trace/generated-helpers-wrappers.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #undef GEN_HELPER
  #endif /* HELPER_GEN_H */
 diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-head.h
 +++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
      DEF_HELPER_FLAGS_5(name, 0, ret, t1, t2, t3, t4, t5)
  #define DEF_HELPER_6(name, ret, t1, t2, t3, t4, t5, t6) \
      DEF_HELPER_FLAGS_6(name, 0, ret, t1, t2, t3, t4, t5, t6)
 +#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
 +    DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
  /* MAX_OPC_PARAM_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
 diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-proto.h
 +++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                              dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
 +#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
 +dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 +                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
 +                            dh_ctype(t7));
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #endif /* HELPER_PROTO_H */
 diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-tcg.h
 +++ b/include/exec/helper-tcg.h
@@ -XXX,XX +XXX,XX @@
      | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
      | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) },
 +#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
 +  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
 +    .sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
 +    | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
 +    | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) | dh_sizemask(t7, 7) },
 +
  #include "helper.h"
  #include "trace/generated-helpers.h"
  #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@
  #undef DEF_HELPER_FLAGS_4
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
  #endif /* HELPER_TCG_H */
 --
-.20.1
+.34.1

-[PULL 1/3] exec: flush CPU TB cache in breakpoint_invalidate
+[PULL 11/12] tcg/mips: Conditionalize tcg_out_exts_i32_i64
-From: Max Filippov <jcmvbkbc@gmail.com>
+Since TCG_TYPE_I32 values are kept sign-extended in registers, we need not
 extend if the register matches.  This is already relied upon by comparisons.
-When a breakpoint is inserted at location for which there's currently no
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 virtual to physical translation no action is taken on CPU TB cache. If a
 TB for that virtual address already exists but is not visible ATM the
 breakpoint won't be hit next time an instruction at that address will be
 executed.
 Flush entire CPU TB cache in breakpoint_invalidate to force
 re-translation of all TBs for the breakpoint address.
 This change fixes the following scenario:
 - linux user application is running
 - a breakpoint is inserted from QEMU gdbstub for a user address that is
   not currently present in the target CPU TLB
 - an instruction at that address is executed, but the external debugger
   doesn't get control.
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
 Message-Id: <20191127220602.10827-2-jcmvbkbc@gmail.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- exec.c | 15 +++++++--------
+ tcg/mips/tcg-target.c.inc | 4 +++-
-file changed, 7 insertions(+), 8 deletions(-)
+file changed, 3 insertions(+), 1 deletion(-)
-diff --git a/exec.c b/exec.c
+diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/exec.c
+--- a/tcg/mips/tcg-target.c.inc
-+++ b/exec.c
++++ b/tcg/mips/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rs)
- static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
+ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
  {
--    MemTxAttrs attrs;
+-    tcg_out_ext32s(s, rd, rs);
--    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
++    if (rd != rs) {
--    int asidx = cpu_asidx_from_attrs(cpu, attrs);
++        tcg_out_ext32s(s, rd, rs);
--    if (phys != -1) {
++    }
 -        /* Locks grabbed by tb_invalidate_phys_addr */
 -        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 -                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
 -    }
 +    /*
 +     * There may not be a virtual to physical translation for the pc
 +     * right now, but there may exist cached TB for this pc.
 +     * Flush the whole TB cache to force re-translation of such TBs.
 +     * This is heavyweight, but we're debugging anyway.
 +     */
 +    tb_flush(cpu);
  }
- #endif
+ static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs)
 --
-.20.1
+.34.1

-New patch
+[PULL 12/12] tcg: Introduce tcg_out_movext2
+This is common code in most qemu_{ld,st} slow paths, moving two
 registers when there may be overlap between sources and destinations.
 At present, this is only used by 32-bit hosts for 64-bit data,
 but will shortly be used for more than that.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/tcg.c                 | 69 ++++++++++++++++++++++++++++++++++++---
  tcg/arm/tcg-target.c.inc  | 44 ++++++++++---------------
  tcg/i386/tcg-target.c.inc | 19 +++++------
 files changed, 90 insertions(+), 42 deletions(-)
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
  static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
  static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg);
  static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
 -static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
 -    __attribute__((unused));
 +static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2);
  static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
  static void tcg_out_goto_tb(TCGContext *s, int which);
  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
      siglongjmp(s->jmp_trans, -2);
  }
 +typedef struct TCGMovExtend {
 +    TCGReg dst;
 +    TCGReg src;
 +    TCGType dst_type;
 +    TCGType src_type;
 +    MemOp src_ext;
 +} TCGMovExtend;
 +
  /**
   * tcg_out_movext -- move and extend
   * @s: tcg context
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
   *
   * Move or extend @src into @dst, depending on @src_ext and the types.
   */
 -static void __attribute__((unused))
 -tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
 -               TCGType src_type, MemOp src_ext, TCGReg src)
 +static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
 +                           TCGType src_type, MemOp src_ext, TCGReg src)
  {
      switch (src_ext) {
      case MO_UB:
@@ -XXX,XX +XXX,XX @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
      }
  }
 +/* Minor variations on a theme, using a structure. */
 +static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i,
 +                                    TCGReg src)
 +{
 +    tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src);
 +}
 +
 +static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i)
 +{
 +    tcg_out_movext1_new_src(s, i, i->src);
 +}
 +
 +/**
 + * tcg_out_movext2 -- move and extend two pair
 + * @s: tcg context
 + * @i1: first move description
 + * @i2: second move description
 + * @scratch: temporary register, or -1 for none
 + *
 + * As tcg_out_movext, for both @i1 and @i2, caring for overlap
 + * between the sources and destinations.
 + */
 +
 +static void __attribute__((unused))
 +tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
 +                const TCGMovExtend *i2, int scratch)
 +{
 +    TCGReg src1 = i1->src;
 +    TCGReg src2 = i2->src;
 +
 +    if (i1->dst != src2) {
 +        tcg_out_movext1(s, i1);
 +        tcg_out_movext1(s, i2);
 +        return;
 +    }
 +    if (i2->dst == src1) {
 +        TCGType src1_type = i1->src_type;
 +        TCGType src2_type = i2->src_type;
 +
 +        if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) {
 +            /* The data is now in the correct registers, now extend. */
 +            src1 = i2->src;
 +            src2 = i1->src;
 +        } else {
 +            tcg_debug_assert(scratch >= 0);
 +            tcg_out_mov(s, src1_type, scratch, src1);
 +            src1 = scratch;
 +        }
 +    }
 +    tcg_out_movext1_new_src(s, i2, src2);
 +    tcg_out_movext1_new_src(s, i1, src1);
 +}
 +
  #define C_PFX1(P, A)                    P##A
  #define C_PFX2(P, A, B)                 P##A##_##B
  #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
  static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
  {
 -    TCGReg argreg, datalo, datahi;
 +    TCGReg argreg;
      MemOpIdx oi = lb->oi;
      MemOp opc = get_memop(oi);
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
      /* Use the canonical unsigned helpers and minimize icache usage. */
      tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
 -    datalo = lb->datalo_reg;
 -    datahi = lb->datahi_reg;
      if ((opc & MO_SIZE) == MO_64) {
 -        if (datalo != TCG_REG_R1) {
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -        } else if (datahi != TCG_REG_R0) {
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
 -        } else {
 -            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
 -            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
 -            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
      } else {
 -        tcg_out_movext(s, TCG_TYPE_I32, datalo,
 +        tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg,
                         TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0);
      }
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
      if (TARGET_LONG_BITS == 64) {
          /* 64-bit target address is aligned into R2:R3. */
 -        if (l->addrhi_reg != TCG_REG_R2) {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
 -        } else if (l->addrlo_reg != TCG_REG_R3) {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
 -        } else {
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
 -            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
 +              .src = l->addrlo_reg,
 +              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
 +              .src = l->addrhi_reg,
 +              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
      } else {
          tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
      }
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
  {
      MemOpIdx oi = l->oi;
      MemOp opc = get_memop(oi);
 -    TCGReg data_reg;
      tcg_insn_unit **label_ptr = &l->label_ptr[0];
      /* resolve label address */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
      tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 -    data_reg = l->datalo_reg;
      if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
 -        if (data_reg == TCG_REG_EDX) {
 -            /* xchg %edx, %eax */
 -            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
 -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
 -        } else {
 -            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
 -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
 -        }
 +        TCGMovExtend ext[2] = {
 +            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
 +              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
 +        };
 +        tcg_out_movext2(s, &ext[0], &ext[1], -1);
      } else {
 -        tcg_out_movext(s, l->type, data_reg,
 +        tcg_out_movext(s, l->type, l->datalo_reg,
                         TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
      }
 --
 .34.1

The following changes since commit e18e5501d8ac692d32657a3e1ef545b14e72b730:

Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200210' into staging (2020-02-10 18:09:14 +0000)

are available in the Git repository at:

https://github.com/rth7680/qemu.git tags/pull-tcg-20200212

for you to fetch changes up to 2445971604c1cfd3ec484457159f4ac300fb04d2:

tcg: Add tcg_gen_gvec_5_ptr (2020-02-12 14:58:36 -0800)

----------------------------------------------------------------
Fix breakpoint invalidation.
Add support for tcg helpers with 7 arguments.
Add support for gvec helpers with 5 arguments.

----------------------------------------------------------------
Max Filippov (1):
      exec: flush CPU TB cache in breakpoint_invalidate

Richard Henderson (1):
      tcg: Add tcg_gen_gvec_5_ptr

Taylor Simpson (1):
      tcg: Add support for a helper with 7 arguments

From: Max Filippov <jcmvbkbc@gmail.com>

When a breakpoint is inserted at a location for which there's currently no
virtual to physical translation, no action is taken on the CPU TB cache. If a
TB for that virtual address already exists but is not visible at the moment,
the breakpoint won't be hit the next time an instruction at that address is
executed.

Flush entire CPU TB cache in breakpoint_invalidate to force
re-translation of all TBs for the breakpoint address.

This change fixes the following scenario:
- linux user application is running
- a breakpoint is inserted from QEMU gdbstub for a user address that is
  not currently present in the target CPU TLB
- an instruction at that address is executed, but the external debugger
  doesn't get control.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Message-Id: <20191127220602.10827-2-jcmvbkbc@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 exec.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
 
 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
-    MemTxAttrs attrs;
-    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
-    int asidx = cpu_asidx_from_attrs(cpu, attrs);
-    if (phys != -1) {
-        /* Locks grabbed by tb_invalidate_phys_addr */
-        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
-                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
-    }
+    /*
+     * There may not be a virtual to physical translation for the pc
+     * right now, but there may exist cached TB for this pc.
+     * Flush the whole TB cache to force re-translation of such TBs.
+     * This is heavyweight, but we're debugging anyway.
+     */
+    tb_flush(cpu);
 }
 #endif
 
-- 
2.20.1

From: Taylor Simpson <tsimpson@quicinc.com>

Currently, helpers can only take up to 6 arguments.  This patch adds the
capability for up to 7 arguments.  I have tested it with the Hexagon port
that I am preparing for submission.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <1580942510-2820-1-git-send-email-tsimpson@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-gen.h   | 13 +++++++++++++
 include/exec/helper-head.h  |  2 ++
 include/exec/helper-proto.h |  6 ++++++
 include/exec/helper-tcg.h   |  7 +++++++
 4 files changed, 28 insertions(+)

diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-gen.h
+++ b/include/exec/helper-gen.h
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
   tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
 }
 
+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
+    dh_arg_decl(t7, 7))                                                 \
+{                                                                       \
+  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
+                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
+                     dh_arg(t7, 7) };                                   \
+  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
+}
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "trace/generated-helpers-wrappers.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 #undef GEN_HELPER
 
 #endif /* HELPER_GEN_H */
diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
     DEF_HELPER_FLAGS_5(name, 0, ret, t1, t2, t3, t4, t5)
 #define DEF_HELPER_6(name, ret, t1, t2, t3, t4, t5, t6) \
     DEF_HELPER_FLAGS_6(name, 0, ret, t1, t2, t3, t4, t5, t6)
+#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
+    DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
 
 /* MAX_OPC_PARAM_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
 
diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                             dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
 
+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
+                            dh_ctype(t7));
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 
 #endif /* HELPER_PROTO_H */
diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-tcg.h
+++ b/include/exec/helper-tcg.h
@@ -XXX,XX +XXX,XX @@
     | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
     | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) },
 
+#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
+  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
+    .sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
+    | dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
+    | dh_sizemask(t5, 5) | dh_sizemask(t6, 6) | dh_sizemask(t7, 7) },
+
 #include "helper.h"
 #include "trace/generated-helpers.h"
 #include "tcg-runtime.h"
@@ -XXX,XX +XXX,XX @@
 #undef DEF_HELPER_FLAGS_4
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
 
 #endif /* HELPER_TCG_H */
-- 
2.20.1

Extend the vector generator infrastructure to handle
5 vector arguments.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  7 +++++++
 tcg/tcg-op-gvec.c         | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t maxsz, int32_t data,
                         gen_helper_gvec_4_ptr *fn);
 
+typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_5_ptr *fn);
+
 /* Expand a gvec operation.  Either inline or out-of-line depending on
    the actual vector size and the operations supported by the host.  */
 typedef struct {
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_i32(desc);
 }
 
+/* Generate a call to a gvec-style helper with five vector operands
+   and an extra pointer operand.  */
+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_5_ptr *fn)
+{
+    TCGv_ptr a0, a1, a2, a3, a4;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+    a2 = tcg_temp_new_ptr();
+    a3 = tcg_temp_new_ptr();
+    a4 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, cpu_env, dofs);
+    tcg_gen_addi_ptr(a1, cpu_env, aofs);
+    tcg_gen_addi_ptr(a2, cpu_env, bofs);
+    tcg_gen_addi_ptr(a3, cpu_env, cofs);
+    tcg_gen_addi_ptr(a4, cpu_env, eofs);
+
+    fn(a0, a1, a2, a3, a4, ptr, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_ptr(a2);
+    tcg_temp_free_ptr(a3);
+    tcg_temp_free_ptr(a4);
+    tcg_temp_free_i32(desc);
+}
+
 /* Return true if we want to implement something of OPRSZ bytes
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
-- 
2.20.1

The following changes since commit 7c18f2d663521f1b31b821a13358ce38075eaf7d:

Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging (2023-04-29 23:07:17 +0100)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230502

for you to fetch changes up to bdc7fba1c5a29ae218b45353daac9308fe1aae82:

tcg: Introduce tcg_out_movext2 (2023-05-02 12:15:41 +0100)

----------------------------------------------------------------
Misc tcg-related patch queue.

----------------------------------------------------------------
Dickon Hood (1):
      qemu/bitops.h: Limit rotate amounts

Kiran Ostrolenk (1):
      qemu/host-utils.h: Add clz and ctz functions for lower-bit integers

Nazar Kazakov (2):
      tcg: Add tcg_gen_gvec_andcs
      tcg: Add tcg_gen_gvec_rotrs

Richard Henderson (7):
      softmmu: Tidy dirtylimit_dirty_ring_full_time
      qemu/int128: Re-shuffle Int128Alias members
      migration/xbzrle: Use __attribute__((target)) for avx512
      accel/tcg: Add cpu_ld*_code_mmu
      tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64
      tcg/mips: Conditionalize tcg_out_exts_i32_i64
      tcg: Introduce tcg_out_movext2

Weiwei Li (1):
      accel/tcg: Uncache the host address for instruction fetch when tlb size < 1

Drop inline marker: let compiler decide.

Change return type to uint64_t: this matches the computation in the
return statement and the local variable assignment in the caller.

Rename local to dirty_ring_size_MB to fix typo.
Simplify conversion to MiB via qemu_target_page_bits and right shift.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/dirtylimit.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -XXX,XX +XXX,XX @@ bool dirtylimit_vcpu_index_valid(int cpu_index)
              cpu_index >= ms->smp.max_cpus);
 }
 
-static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
 {
     static uint64_t max_dirtyrate;
-    uint32_t dirty_ring_size = kvm_dirty_ring_size();
-    uint64_t dirty_ring_size_meory_MB =
-        dirty_ring_size * qemu_target_page_size() >> 20;
+    unsigned target_page_bits = qemu_target_page_bits();
+    uint64_t dirty_ring_size_MB;
+
+    /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */
+    assert(target_page_bits < 20);
+
+    /* Convert ring size (pages) to MiB (2**20). */
+    dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits);
 
     if (max_dirtyrate < dirtyrate) {
         max_dirtyrate = dirtyrate;
     }
 
-    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+    return dirty_ring_size_MB * 1000000 / max_dirtyrate;
 }
 
 static inline bool dirtylimit_done(uint64_t quota,
-- 
2.34.1

From: Weiwei Li <liweiwei@iscas.ac.cn>

When a PMP entry overlaps part of the page, we'll set the tlb_size to 1, which
will make the address in the TLB entry set with TLB_INVALID_MASK, and the next
access will again go through tlb_fill. However, this does not work in
tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be
cached, and the following instructions can use this host address directly,
which may lead to bypassing the PMP-related check.
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542.

Signed-off-by: Weiwei Li <liweiwei@iscas.ac.cn>
Signed-off-by: Junqiang Wang <wangjunqiang@iscas.ac.cn>
Reviewed-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn>
---
 accel/tcg/cputlb.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
     if (p == NULL) {
         return -1;
     }
+
+    if (full->lg_page_size < TARGET_PAGE_BITS) {
+        return -1;
+    }
+
     if (hostp) {
         *hostp = p;
     }
-- 
2.34.1

From: Dickon Hood <dickon.hood@codethink.co.uk>

Rotates have been fixed up to only allow for reasonable rotate amounts
(i.e., no rotates >7 on an 8-bit value, etc.).  This fixes a problem with
riscv vector rotate instructions.

Signed-off-by: Dickon Hood <dickon.hood@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/bitops.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr,
  */
 static inline uint8_t rol8(uint8_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((8 - shift) & 7));
+    shift &= 7;
+    return (word << shift) | (word >> (8 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t rol8(uint8_t word, unsigned int shift)
  */
 static inline uint8_t ror8(uint8_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((8 - shift) & 7));
+    shift &= 7;
+    return (word >> shift) | (word << (8 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint8_t ror8(uint8_t word, unsigned int shift)
  */
 static inline uint16_t rol16(uint16_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((16 - shift) & 15));
+    shift &= 15;
+    return (word << shift) | (word >> (16 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t rol16(uint16_t word, unsigned int shift)
  */
 static inline uint16_t ror16(uint16_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((16 - shift) & 15));
+    shift &= 15;
+    return (word >> shift) | (word << (16 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint16_t ror16(uint16_t word, unsigned int shift)
  */
 static inline uint32_t rol32(uint32_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((32 - shift) & 31));
+    shift &= 31;
+    return (word << shift) | (word >> (32 - shift));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t rol32(uint32_t word, unsigned int shift)
  */
 static inline uint32_t ror32(uint32_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((32 - shift) & 31));
+    shift &= 31; /* shift may be 0: 32 - shift is re-masked below */
+    return (word >> shift) | (word << ((32 - shift) & 31));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint32_t ror32(uint32_t word, unsigned int shift)
  */
 static inline uint64_t rol64(uint64_t word, unsigned int shift)
 {
-    return (word << shift) | (word >> ((64 - shift) & 63));
+    shift &= 63; /* shift may be 0: 64 - shift is re-masked below */
+    return (word << shift) | (word >> ((64 - shift) & 63));
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static inline uint64_t rol64(uint64_t word, unsigned int shift)
  */
 static inline uint64_t ror64(uint64_t word, unsigned int shift)
 {
-    return (word >> shift) | (word << ((64 - shift) & 63));
+    shift &= 63; /* shift may be 0: 64 - shift is re-masked below */
+    return (word >> shift) | (word << ((64 - shift) & 63));
 }
 
 /**
-- 
2.34.1

From: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>

This is for use in the RISC-V vclz and vctz instructions (implemented in
a following commit).

Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 }
 #endif
 
+/**
+ * clz8 - count leading zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz8(uint8_t val)
+{
+    return val ? __builtin_clz(val) - 24 : 8;
+}
+
+/**
+ * clz16 - count leading zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz16(uint16_t val)
+{
+    return val ? __builtin_clz(val) - 16 : 16;
+}
+
 /**
  * clz32 - count leading zeros in a 32-bit value.
  * @val: The value to search
@@ -XXX,XX +XXX,XX @@ static inline int clo64(uint64_t val)
     return clz64(~val);
 }
 
+/**
+ * ctz8 - count trailing zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz8(uint8_t val)
+{
+    return val ? __builtin_ctz(val) : 8;
+}
+
+/**
+ * ctz16 - count trailing zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz16(uint16_t val)
+{
+    return val ? __builtin_ctz(val) : 16;
+}
+
 /**
  * ctz32 - count trailing zeros in a 32-bit value.
  * @val: The value to search
-- 
2.34.1

From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for and-complement
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime.h      |  1 +
 include/tcg/tcg-op-gvec.h    |  2 ++
 accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++
 tcg/tcg-op-gvec.c            | 17 +++++++++++++++++
 4 files changed, 31 insertions(+)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
     clear_high(d, oprsz, desc);
 }
 
+void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
+    }
+    clear_high(d, oprsz, desc);
+}
+
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
 }
 
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen2s g = {
+        .fni8 = tcg_gen_andc_i64,
+        .fniv = tcg_gen_andc_vec,
+        .fno = gen_helper_gvec_andcs,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .vece = MO_64
+    };
+
+    TCGv_i64 tmp = tcg_temp_ebb_new_i64();
+    tcg_gen_dup_i64(vece, tmp, c); /* tmp, not c, feeds the expander */
+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
+    tcg_temp_free_i64(tmp);
+}
+
 static const GVecGen2s gop_xors = {
     .fni8 = tcg_gen_xor_i64,
     .fniv = tcg_gen_xor_vec,
-- 
2.34.1

From: Nazar Kazakov <nazar.kazakov@codethink.co.uk>

Add tcg expander and helper functions for rotate right
vector with scalar operand.

Signed-off-by: Nazar Kazakov <nazar.kazakov@codethink.co.uk>
Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk>
[rth: Split out of larger patch; mask rotation count.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c         | 11 +++++++++++
 2 files changed, 13 insertions(+)

Clang 14, with --enable-tcg-interpreter errors with

include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
  does not match the alignment of the first field in transparent union;
  transparent_union attribute ignored [-Werror,-Wignored-attributes]
    __int128_t i;
               ^
include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
    Int128 s;
           ^
1 error generated.

By placing the __uint128_t member first, this is avoided.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org>
---
 include/qemu/int128.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline void bswap128s(Int128 *s)
  */
 #ifdef CONFIG_INT128
 typedef union {
-    Int128 s;
-    __int128_t i;
     __uint128_t u;
+    __int128_t i;
+    Int128 s;
 } Int128Alias __attribute__((transparent_union));
 #else
 typedef Int128 Int128Alias;
-- 
2.34.1

Use the attribute, which is supported by clang, instead of
the #pragma, which is not supported and, for some reason,
also not detected by the meson probe, so we fail by -Werror.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org>
---
 meson.build        | 5 +----
 migration/xbzrle.c | 9 ++++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
 config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
   .require(cc.links('''
-    #pragma GCC push_options
-    #pragma GCC target("avx512bw")
     #include <cpuid.h>
     #include <immintrin.h>
-    static int bar(void *a) {
-
+    static int __attribute__((target("avx512bw"))) bar(void *a) {
       __m512i *x = a;
       __m512i res= _mm512_abs_epi8(*x);
       return res[1];
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -XXX,XX +XXX,XX @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 }
 
 #if defined(CONFIG_AVX512BW_OPT)
-#pragma GCC push_options
-#pragma GCC target("avx512bw")
 #include <immintrin.h>
-int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
-                             uint8_t *dst, int dlen)
+
+int __attribute__((target("avx512bw")))
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                            uint8_t *dst, int dlen)
 {
     uint32_t zrun_len = 0, nzrun_len = 0;
     int d = 0, i = 0, num = 0;
@@ -XXX,XX +XXX,XX @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
     }
     return d;
 }
-#pragma GCC pop_options
 #endif
-- 
2.34.1

At least RISC-V has the need to be able to perform a read
using execute permissions, outside of translation.
Add helpers to facilitate this.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Acked-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Weiwei Li <liweiwei@iscas.ac.cn>
Tested-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org>
Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h |  9 +++++++
 accel/tcg/cputlb.c      | 48 ++++++++++++++++++++++++++++++++++
 accel/tcg/user-exec.c   | 58 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
 # define cpu_stq_mmu          cpu_stq_le_mmu
 #endif
 
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra);
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra);
+
 uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
 uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
     MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true));
     return full_ldq_code(env, addr, oi, 0);
 }
+
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t retaddr)
+{
+    return full_ldub_code(env, addr, oi, retaddr);
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint16_t ret;
+
+    ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint32_t ret;
+
+    ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t retaddr)
+{
+    MemOp mop = get_memop(oi);
+    int idx = get_mmuidx(oi);
+    uint64_t ret;
+
+    ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr);
+    if ((mop & MO_BSWAP) != MO_TE) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
     return ret;
 }
 
+uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr,
+                         MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint8_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldub_p(haddr);
+    clear_helper_retaddr();
+    return ret;
+}
+
+uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint16_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = lduw_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap16(ret);
+    }
+    return ret;
+}
+
+uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint32_t ret;
+
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldl_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap32(ret);
+    }
+    return ret;
+}
+
+uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr,
+                          MemOpIdx oi, uintptr_t ra)
+{
+    void *haddr;
+    uint64_t ret;
+
+    validate_memop(oi, MO_BEUQ);
+    haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH);
+    ret = ldq_p(haddr);
+    clear_helper_retaddr();
+    if (get_memop(oi) & MO_BSWAP) {
+        ret = bswap64(ret);
+    }
+    return ret;
+}
+
 #include "ldst_common.c.inc"
 
 /*
-- 
2.34.1

This is common code in most qemu_{ld,st} slow paths, moving two
registers when there may be overlap between sources and destinations.
At present, this is only used by 32-bit hosts for 64-bit data,
but will shortly be used for more than that.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                 | 69 ++++++++++++++++++++++++++++++++++++---
 tcg/arm/tcg-target.c.inc  | 44 ++++++++++---------------
 tcg/i386/tcg-target.c.inc | 19 +++++------
 3 files changed, 90 insertions(+), 42 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg);
 static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long);
-static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
-    __attribute__((unused));
+static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2);
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg);
 static void tcg_out_goto_tb(TCGContext *s, int which);
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
     siglongjmp(s->jmp_trans, -2);
 }
 
+typedef struct TCGMovExtend {
+    TCGReg dst;
+    TCGReg src;
+    TCGType dst_type;
+    TCGType src_type;
+    MemOp src_ext;
+} TCGMovExtend;
+
 /**
  * tcg_out_movext -- move and extend
  * @s: tcg context
@@ -XXX,XX +XXX,XX @@ void tcg_raise_tb_overflow(TCGContext *s)
  *
  * Move or extend @src into @dst, depending on @src_ext and the types.
  */
-static void __attribute__((unused))
-tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
-               TCGType src_type, MemOp src_ext, TCGReg src)
+static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
+                           TCGType src_type, MemOp src_ext, TCGReg src)
 {
     switch (src_ext) {
     case MO_UB:
@@ -XXX,XX +XXX,XX @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst,
     }
 }
 
+/* Minor variations on a theme, using a structure. */
+static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i,
+                                    TCGReg src)
+{
+    tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src);
+}
+
+static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i)
+{
+    tcg_out_movext1_new_src(s, i, i->src);
+}
+
+/**
+ * tcg_out_movext2 -- move and extend two pairs
+ * @s: tcg context
+ * @i1: first move description
+ * @i2: second move description
+ * @scratch: temporary register, or -1 for none
+ *
+ * As tcg_out_movext, for both @i1 and @i2, caring for overlap
+ * between the sources and destinations.
+ */
+
+static void __attribute__((unused))
+tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
+                const TCGMovExtend *i2, int scratch)
+{
+    TCGReg src1 = i1->src;
+    TCGReg src2 = i2->src;
+
+    if (i1->dst != src2) {
+        tcg_out_movext1(s, i1);
+        tcg_out_movext1(s, i2);
+        return;
+    }
+    if (i2->dst == src1) {
+        TCGType src1_type = i1->src_type;
+        TCGType src2_type = i2->src_type;
+
+        if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) {
+            /* The data is now in the correct registers, now extend. */
+            src1 = i2->src;
+            src2 = i1->src;
+        } else {
+            tcg_debug_assert(scratch >= 0);
+            tcg_out_mov(s, src1_type, scratch, src1);
+            src1 = scratch;
+        }
+    }
+    tcg_out_movext1_new_src(s, i2, src2);
+    tcg_out_movext1_new_src(s, i1, src1);
+}
+
 #define C_PFX1(P, A)                    P##A
 #define C_PFX2(P, A, B)                 P##A##_##B
 #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
 
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, datalo, datahi;
+    TCGReg argreg;
     MemOpIdx oi = lb->oi;
     MemOp opc = get_memop(oi);
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     /* Use the canonical unsigned helpers and minimize icache usage. */
     tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
 
-    datalo = lb->datalo_reg;
-    datahi = lb->datahi_reg;
     if ((opc & MO_SIZE) == MO_64) {
-        if (datalo != TCG_REG_R1) {
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-        } else if (datahi != TCG_REG_R0) {
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-        } else {
-            tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
-            tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-            tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
-        tcg_out_movext(s, TCG_TYPE_I32, datalo,
+        tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg,
                        TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0);
     }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
 
     if (TARGET_LONG_BITS == 64) {
         /* 64-bit target address is aligned into R2:R3. */
-        if (l->addrhi_reg != TCG_REG_R2) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-        } else if (l->addrlo_reg != TCG_REG_R3) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
+              .src = l->addrlo_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
+              .src = l->addrhi_reg,
+              .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
     } else {
         tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
     }
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
     MemOpIdx oi = l->oi;
     MemOp opc = get_memop(oi);
-    TCGReg data_reg;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
     /* resolve label address */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
-    data_reg = l->datalo_reg;
     if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-        if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
+        TCGMovExtend ext[2] = {
+            { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+            { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
+              .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
+        };
+        tcg_out_movext2(s, &ext[0], &ext[1], -1);
     } else {
-        tcg_out_movext(s, l->type, data_reg,
+        tcg_out_movext(s, l->type, l->datalo_reg,
                        TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
     }
 
-- 
2.34.1