The following changes since commit 15df33ceb73cb6bb3c6736cf4d2cff51129ed4b4:

  Merge remote-tracking branch 'remotes/quic/tags/pull-hex-20220312-1' into staging (2022-03-13 17:29:18 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220314

for you to fetch changes up to 76cff100beeae8d3676bb658cccd45ef5ced8aa9:

  tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1 (2022-03-14 10:31:51 -0700)

----------------------------------------------------------------
Fixes for s390x host vectors
Fix for arm ldrd unpredictable case

----------------------------------------------------------------
Richard Henderson (4):
      tcg/s390x: Fix tcg_out_dupi_vec vs VGM
      tcg/s390x: Fix INDEX_op_bitsel_vec vs VSEL
      tcg/s390x: Fix tcg_out_dup_vec vs general registers
      tcg/arm: Don't emit UNPREDICTABLE LDRD with Rm == Rt or Rt+1

 tcg/arm/tcg-target.c.inc   | 17 +++++++++++++++--
 tcg/s390x/tcg-target.c.inc |  7 ++++---
 2 files changed, 19 insertions(+), 5 deletions(-)
Deleted patch

This has no functional change.

The current function structure is:

    inline QEMU_ALWAYSINLINE
    store_memop() {
        switch () {
        ...
        default:
            qemu_build_not_reached();
        }
    }
    inline QEMU_ALWAYSINLINE
    store_helper() {
        ...
        if (span_two_pages_or_io) {
            ...
            helper_ret_stb_mmu();
        }
        store_memop();
    }
    helper_ret_stb_mmu() {
        store_helper();
    }

Whereas GCC will generate an error at compile-time when an always_inline
function is not inlined, Clang does not.  Nor does Clang prioritize the
inlining of always_inline functions.  Both of these are arguably bugs.

Both `store_memop` and `store_helper` need to be inlined and allow
constant propagation to eliminate the `qemu_build_not_reached` call.

However, if the compiler instead chooses to inline helper_ret_stb_mmu
into store_helper, then store_helper is now self-recursive and the
compiler is no longer able to propagate the constant in the same way.

This does not reproduce at current QEMU head, but was reproducible
at v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.

The inline recursion problem can be fixed solely by marking
helper_ret_stb_mmu as noinline, so the compiler does not make an
incorrect decision about which functions to inline.

In addition, extract store_helper_unaligned as a noinline subroutine
that can be shared by all of the helpers.  This saves about 6k code
size in an optimized x86_64 build.

Reported-by: Shu-Chun Weng <scw@google.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 138 ++++++++++++++++++++++++++-------------------
 1 file changed, 79 insertions(+), 59 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ store_memop(void *haddr, uint64_t val, MemOp op)
     }
 }
 
+static void __attribute__((noinline))
+store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
+                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
+                       bool big_endian)
+{
+    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+    uintptr_t index, index2;
+    CPUTLBEntry *entry, *entry2;
+    target_ulong page2, tlb_addr, tlb_addr2;
+    TCGMemOpIdx oi;
+    size_t size2;
+    int i;
+
+    /*
+     * Ensure the second page is in the TLB.  Note that the first page
+     * is already guaranteed to be filled, and that the second page
+     * cannot evict the first.
+     */
+    page2 = (addr + size) & TARGET_PAGE_MASK;
+    size2 = (addr + size) & ~TARGET_PAGE_MASK;
+    index2 = tlb_index(env, mmu_idx, page2);
+    entry2 = tlb_entry(env, mmu_idx, page2);
+
+    tlb_addr2 = tlb_addr_write(entry2);
+    if (!tlb_hit_page(tlb_addr2, page2)) {
+        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
+            index2 = tlb_index(env, mmu_idx, page2);
+            entry2 = tlb_entry(env, mmu_idx, page2);
+        }
+        tlb_addr2 = tlb_addr_write(entry2);
+    }
+
+    index = tlb_index(env, mmu_idx, addr);
+    entry = tlb_entry(env, mmu_idx, addr);
+    tlb_addr = tlb_addr_write(entry);
+
+    /*
+     * Handle watchpoints.  Since this may trap, all checks
+     * must happen before any store.
+     */
+    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), page2, size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+
+    /*
+     * XXX: not efficient, but simple.
+     * This loop must go in the forward direction to avoid issues
+     * with self-modifying code in Windows 64-bit.
+     */
+    oi = make_memop_idx(MO_UB, mmu_idx);
+    if (big_endian) {
+        for (i = 0; i < size; ++i) {
+            /* Big-endian extract.  */
+            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    } else {
+        for (i = 0; i < size; ++i) {
+            /* Little-endian extract.  */
+            uint8_t val8 = val >> (i * 8);
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    }
+}
+
 static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     if (size > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
                     >= TARGET_PAGE_SIZE)) {
-        int i;
-        uintptr_t index2;
-        CPUTLBEntry *entry2;
-        target_ulong page2, tlb_addr2;
-        size_t size2;
-
     do_unaligned_access:
-        /*
-         * Ensure the second page is in the TLB.  Note that the first page
-         * is already guaranteed to be filled, and that the second page
-         * cannot evict the first.
-         */
-        page2 = (addr + size) & TARGET_PAGE_MASK;
-        size2 = (addr + size) & ~TARGET_PAGE_MASK;
-        index2 = tlb_index(env, mmu_idx, page2);
-        entry2 = tlb_entry(env, mmu_idx, page2);
-        tlb_addr2 = tlb_addr_write(entry2);
-        if (!tlb_hit_page(tlb_addr2, page2)) {
-            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
-                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
-                         mmu_idx, retaddr);
-                index2 = tlb_index(env, mmu_idx, page2);
-                entry2 = tlb_entry(env, mmu_idx, page2);
-            }
-            tlb_addr2 = tlb_addr_write(entry2);
-        }
-
-        /*
-         * Handle watchpoints.  Since this may trap, all checks
-         * must happen before any store.
-         */
-        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), page2, size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-
-        /*
-         * XXX: not efficient, but simple.
-         * This loop must go in the forward direction to avoid issues
-         * with self-modifying code in Windows 64-bit.
-         */
-        for (i = 0; i < size; ++i) {
-            uint8_t val8;
-            if (memop_big_endian(op)) {
-                /* Big-endian extract.  */
-                val8 = val >> (((size - 1) * 8) - (i * 8));
-            } else {
-                /* Little-endian extract.  */
-                val8 = val >> (i * 8);
-            }
-            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
-        }
+        store_helper_unaligned(env, addr, val, retaddr, size,
+                               mmu_idx, memop_big_endian(op));
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     store_memop(haddr, val, op);
 }
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        TCGMemOpIdx oi, uintptr_t retaddr)
+void __attribute__((noinline))
+helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+                   TCGMemOpIdx oi, uintptr_t retaddr)
 {
     store_helper(env, addr, val, oi, retaddr, MO_UB);
 }
-- 
2.25.1
The immediate operands to VGM were in the wrong order,
producing an inverse mask.
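
As a rough standalone illustration of the mask semantics involved (plain C,
not the s390x backend; the MSB-first bit numbering and the wraparound
behaviour are simplifying assumptions for this sketch):

    #include <stdint.h>
    #include <stdio.h>

    /* Build a 32-bit mask whose set bits run from position 'from' through
     * 'to' inclusive, with bit 0 being the most significant bit.  When
     * from > to, the range wraps around. */
    static uint32_t gen_mask32(unsigned from, unsigned to)
    {
        uint32_t m = 0;
        for (unsigned i = from; ; i = (i + 1) & 31) {
            m |= 0x80000000u >> i;
            if (i == to) {
                break;
            }
        }
        return m;
    }

    int main(void)
    {
        uint32_t val = 0x00ffff00;                   /* contiguous run of ones */
        unsigned msb = __builtin_clz(val);           /* 8  */
        unsigned lsb = 31 - __builtin_ctz(val);      /* 23 */

        printf("%08x\n", gen_mask32(msb, lsb));      /* 00ffff00: intended mask */
        /* Swapping the two positions makes the range wrap, producing
         * (roughly) the inverse of the intended mask: */
        printf("%08x\n", gen_mask32(lsb, msb));      /* ff8001ff */
        return 0;
    }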

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz32(val);
                 lsb = 31 - ctz32(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_32);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_32);
             return;
         }
     } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                 msb = clz64(val);
                 lsb = 63 - ctz64(val);
             }
-            tcg_out_insn(s, VRIb, VGM, dst, lsb, msb, MO_64);
+            tcg_out_insn(s, VRIb, VGM, dst, msb, lsb, MO_64);
             return;
         }
     }
-- 
2.25.1
The operands are output in the wrong order: the tcg selector
argument is first, whereas the s390x selector argument is last.
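
As a rough standalone illustration of the bitwise-select operation and why
the operand order matters (plain C, not the backend code; the semantics
shown follow the tcg bitsel definition assumed from the message above):

    #include <stdint.h>
    #include <assert.h>

    /* Bit-by-bit select: where 'sel' has a 1 take the bit from 'b',
     * otherwise take it from 'c'. */
    static uint64_t bitsel(uint64_t sel, uint64_t b, uint64_t c)
    {
        return (b & sel) | (c & ~sel);
    }

    int main(void)
    {
        uint64_t sel = 0x00ff00ff00ff00ffull;
        uint64_t b   = 0x1111111111111111ull;
        uint64_t c   = 0x2222222222222222ull;

        /* Selector passed in the selector position: the expected result. */
        assert(bitsel(sel, b, c) == 0x2211221122112211ull);

        /* Passing the selector where a data operand belongs, as the buggy
         * operand ordering effectively did, gives a different answer. */
        assert(bitsel(b, c, sel) != bitsel(sel, b, c));
        return 0;
    }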

Tested-by: Thomas Huth <thuth@redhat.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/898
Fixes: 9bca986df88 ("tcg/s390x: Implement TCG_TARGET_HAS_bitsel_vec")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bitsel_vec:
-        tcg_out_insn(s, VRRe, VSEL, a0, a1, a2, args[3]);
+        tcg_out_insn(s, VRRe, VSEL, a0, a2, args[3], a1);
         break;
 
     case INDEX_op_cmp_vec:
-- 
2.25.1
We copied the data from the general register input to the
vector register output, but have not yet replicated it.
We intended to fall through into the vector-vector case,
but failed to redirect the input register.

This is caught by an assertion failure in tcg_out_insn_VRIc,
which diagnosed the incorrect register class.
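
As a loose standalone sketch of the "insert, then fall through and
replicate" pattern being fixed (plain C on an array, not the s390x backend;
the structure is assumed for illustration only):

    #include <stdint.h>
    #include <assert.h>

    /* Shared tail: replicate element 0 across the whole vector. */
    static void replicate16(uint16_t v[4])
    {
        for (int i = 1; i < 4; i++) {
            v[i] = v[0];
        }
    }

    /* Duplicate a scalar into all elements.  The scalar is first moved
     * into element 0 of dst; from that point on the shared replicate
     * step must read from dst itself -- the missing "src = dst". */
    static void dup_from_scalar(uint16_t dst[4], uint16_t scalar)
    {
        dst[0] = scalar;
        replicate16(dst);
    }

    int main(void)
    {
        uint16_t v[4] = { 0, 0, 0, 0 };
        dup_from_scalar(v, 0xabcd);
        for (int i = 0; i < 4; i++) {
            assert(v[i] == 0xabcd);
        }
        return 0;
    }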

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
         if (vece == MO_64) {
             return true;
         }
+        src = dst;
     }
 
     /*
-- 
2.25.1
1 | Do not set parallel_cpus if there is only one cpu instantiated. | 1 | The LDRD (register) instruction is UNPREDICTABLE if the Rm register |
---|---|---|---|
2 | This will allow tcg to use serial code to implement atomics. | 2 | is the same as either Rt or Rt+1 (the two registers being loaded to). |
3 | We weren't making sure we avoided this, with the result that on some | ||
4 | host CPUs like the Cortex-A7 we would get a SIGILL because the CPU | ||
5 | chooses to UNDEF for this particular UNPREDICTABLE case. | ||
3 | 6 | ||
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | 7 | Since we've already checked that datalo is aligned, we can simplify |
8 | the test vs the Rm operand by aligning it before comparison. Check | ||
9 | for the two orderings before falling back to two ldr instructions. | ||
10 | |||
11 | We don't bother to do anything similar for tcg_out_ldrd_rwb(), | ||
12 | because it is only used in tcg_out_tlb_read() with a fixed set of | ||
13 | registers which don't overlap. | ||
14 | |||
15 | There is no equivalent UNPREDICTABLE case for STRD. | ||
16 | |||
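
A rough standalone sketch of the operand check being added (register
numbers as plain ints, not the actual tcg/arm backend; it relies on the
constraint stated above that datalo is even and datahi == datalo + 1, so
"Rm overlaps {Rt, Rt+1}" reduces to "(Rm & ~1) == datalo"):

    #include <stdio.h>

    enum load_kind {
        LDRD_BASE_INDEX,   /* ldrd datalo, [addrlo, addend] */
        LDRD_INDEX_BASE,   /* ldrd datalo, [addend, addrlo] */
        TWO_LDR            /* both orderings would be UNPREDICTABLE */
    };

    static enum load_kind pick_ldrd_form(int datalo, int addrlo, int addend)
    {
        if ((addend & ~1) != datalo) {
            return LDRD_BASE_INDEX;
        }
        if ((addrlo & ~1) != datalo) {
            return LDRD_INDEX_BASE;
        }
        return TWO_LDR;
    }

    int main(void)
    {
        /* addend (r5) overlaps datalo/datahi (r4/r5), so the address
         * operands are swapped; addend r0 does not overlap, so the
         * normal form is fine. */
        printf("%d\n", pick_ldrd_form(4, 2, 5));   /* 1: LDRD_INDEX_BASE */
        printf("%d\n", pick_ldrd_form(4, 2, 0));   /* 0: LDRD_BASE_INDEX */
        return 0;
    }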

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/896
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         /* LDRD requires alignment; double-check that. */
         if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
-            tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
-        } else if (scratch_addend) {
+            /*
+             * Rm (the second address op) must not overlap Rt or Rt + 1.
+             * Since datalo is aligned, we can simplify the test via alignment.
+             * Flip the two address arguments if that works.
+             */
+            if ((addend & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+                break;
+            }
+            if ((addrlo & ~1) != datalo) {
+                tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo);
+                break;
+            }
+        }
+        if (scratch_addend) {
             tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
             tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
         } else {
-- 
2.25.1