Second try's the charm today, right?

r~


The following changes since commit 00b1faea41d283e931256aa78aa975a369ec3ae6:

  Merge tag 'pull-target-arm-20230123' of https://git.linaro.org/people/pmaydell/qemu-arm into staging (2023-01-23 13:40:28 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230123

for you to fetch changes up to 709bcd7da3f6b4655d910634a0d520fa1439df38:

  tcg/loongarch64: Reorg goto_tb implementation (2023-01-23 16:00:13 -1000)

----------------------------------------------------------------
common-user: Re-enable ppc32 host
tcg: Avoid recursion in tcg_gen_mulu2_i32
tcg: Mark tcg helpers noinline to avoid an issue with LTO
tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
disas: Enable loongarch disassembler, and fixes
tcg/loongarch64: Improve move immediate
tcg/loongarch64: Improve add immediate
tcg/loongarch64: Improve setcond
tcg/loongarch64: Implement movcond
tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
tcg/loongarch64: Reorg goto_tb implementation

----------------------------------------------------------------
Richard Henderson (14):
      tcg: Avoid recursion in tcg_gen_mulu2_i32
      tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
      common-user/host/ppc: Implement safe-syscall.inc.S
      linux-user: Implement host/ppc/host-signal.h
      tcg: Mark tcg helpers noinline to avoid an issue with LTO
      target/loongarch: Enable the disassembler for host tcg
      target/loongarch: Disassemble jirl properly
      target/loongarch: Disassemble pcadd* addresses
      tcg/loongarch64: Update tcg-insn-defs.c.inc
      tcg/loongarch64: Introduce tcg_out_addi
      tcg/loongarch64: Improve setcond expansion
      tcg/loongarch64: Implement movcond
      tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
      tcg/loongarch64: Reorg goto_tb implementation

Rui Wang (1):
      tcg/loongarch64: Optimize immediate loading

 include/exec/helper-proto.h | 32 ++-
 include/tcg/tcg.h | 7 -
 linux-user/include/host/ppc/host-signal.h | 39 +++
 tcg/arm/tcg-target-con-set.h | 7 +-
 tcg/arm/tcg-target-con-str.h | 2 +
 tcg/loongarch64/tcg-target-con-set.h | 5 +-
 tcg/loongarch64/tcg-target-con-str.h | 2 +-
 tcg/loongarch64/tcg-target.h | 11 +-
 target/loongarch/insns.decode | 3 +-
 disas.c | 2 +
 target/loongarch/disas.c | 39 ++-
 tcg/tcg-op.c | 4 +-
 target/loongarch/insn_trans/trans_branch.c.inc | 2 +-
 tcg/arm/tcg-target.c.inc | 28 +-
 tcg/loongarch64/tcg-insn-defs.c.inc | 10 +-
 tcg/loongarch64/tcg-target.c.inc | 364 ++++++++++++++++---------
 common-user/host/ppc/safe-syscall.inc.S | 107 ++++++++
 target/loongarch/meson.build | 3 +-
 18 files changed, 497 insertions(+), 170 deletions(-)
 create mode 100644 linux-user/include/host/ppc/host-signal.h
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S
We have a test for one of TCG_TARGET_HAS_mulu2_i32 or
TCG_TARGET_HAS_muluh_i32 being defined, but the test
became non-functional when we changed to always define
all of these macros.

Replace this with a build-time test in tcg_gen_mulu2_i32.

Fixes: 25c4d9cc845 ("tcg: Always define all of the TCGOpcode enum members.")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1435
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 7 -------
 tcg/tcg-op.c | 4 +++-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
#define TCG_TARGET_HAS_rem_i64 0
#endif

-/* For 32-bit targets, some sort of unsigned widening multiply is required. */
-#if TCG_TARGET_REG_BITS == 32 \
- && !(defined(TCG_TARGET_HAS_mulu2_i32) \
- || defined(TCG_TARGET_HAS_muluh_i32))
-# error "Missing unsigned widening multiply"
-#endif
-
#if !defined(TCG_TARGET_HAS_v64) \
 && !defined(TCG_TARGET_HAS_v128) \
 && !defined(TCG_TARGET_HAS_v256)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
 tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
 tcg_gen_mov_i32(rl, t);
 tcg_temp_free_i32(t);
- } else {
+ } else if (TCG_TARGET_REG_BITS == 64) {
 TCGv_i64 t0 = tcg_temp_new_i64();
 TCGv_i64 t1 = tcg_temp_new_i64();
 tcg_gen_extu_i32_i64(t0, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
 tcg_gen_extr_i64_i32(rl, rh, t0);
 tcg_temp_free_i64(t0);
 tcg_temp_free_i64(t1);
+ } else {
+ qemu_build_not_reached();
 }
}

--
2.34.1
Although we still can't use ldrd and strd for all operations,
increase the chances by getting the register allocation correct.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h | 7 ++++---
 tcg/arm/tcg-target-con-str.h | 2 ++
 tcg/arm/tcg-target.c.inc | 28 ++++++++++++++++++----------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I2(r, rIN)
C_O0_I2(s, s)
C_O0_I2(w, r)
C_O0_I3(s, s, s)
+C_O0_I3(S, p, s)
C_O0_I4(r, r, rI, rI)
-C_O0_I4(s, s, s, s)
+C_O0_I4(S, p, s, s)
C_O1_I1(r, l)
C_O1_I1(r, r)
C_O1_I1(w, r)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wZ)
C_O1_I3(w, w, w, w)
C_O1_I4(r, r, r, rI, rI)
C_O1_I4(r, r, rIN, rIK, 0)
-C_O2_I1(r, r, l)
-C_O2_I2(r, r, l, l)
+C_O2_I1(e, p, l)
+C_O2_I2(e, p, l, l)
C_O2_I2(r, r, r, r)
C_O2_I4(r, r, r, r, rIN, rIK)
C_O2_I4(r, r, rI, rI, rIN, rIK)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@
 * Define constraint letters for register sets:
 * REGS(letter, register_mask)
 */
+REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */
REGS('r', ALL_GENERAL_REGS)
REGS('l', ALL_QLOAD_REGS)
REGS('s', ALL_QSTORE_REGS)
+REGS('S', ALL_QSTORE_REGS & 0x5555) /* even qstore */
REGS('w', ALL_VECTOR_REGS)

/*
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
 tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
 break;
 case MO_UQ:
+ /* We used pair allocation for datalo, so already should be aligned. */
+ tcg_debug_assert((datalo & 1) == 0);
+ tcg_debug_assert(datahi == datalo + 1);
 /* LDRD requires alignment; double-check that. */
- if (get_alignment_bits(opc) >= MO_64
- && (datalo & 1) == 0 && datahi == datalo + 1) {
+ if (get_alignment_bits(opc) >= MO_64) {
 /*
 * Rm (the second address op) must not overlap Rt or Rt + 1.
 * Since datalo is aligned, we can simplify the test via alignment.
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
 tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
 break;
 case MO_UQ:
+ /* We used pair allocation for datalo, so already should be aligned. */
+ tcg_debug_assert((datalo & 1) == 0);
+ tcg_debug_assert(datahi == datalo + 1);
 /* LDRD requires alignment; double-check that. */
- if (get_alignment_bits(opc) >= MO_64
- && (datalo & 1) == 0 && datahi == datalo + 1) {
+ if (get_alignment_bits(opc) >= MO_64) {
 tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
 } else if (datalo == addrlo) {
 tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
 tcg_out_st32_r(s, cond, datalo, addrlo, addend);
 break;
 case MO_64:
+ /* We used pair allocation for datalo, so already should be aligned. */
+ tcg_debug_assert((datalo & 1) == 0);
+ tcg_debug_assert(datahi == datalo + 1);
 /* STRD requires alignment; double-check that. */
- if (get_alignment_bits(opc) >= MO_64
- && (datalo & 1) == 0 && datahi == datalo + 1) {
+ if (get_alignment_bits(opc) >= MO_64) {
 tcg_out_strd_r(s, cond, datalo, addrlo, addend);
 } else if (scratch_addend) {
 tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
 tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
 break;
 case MO_64:
+ /* We used pair allocation for datalo, so already should be aligned. */
+ tcg_debug_assert((datalo & 1) == 0);
+ tcg_debug_assert(datahi == datalo + 1);
 /* STRD requires alignment; double-check that. */
- if (get_alignment_bits(opc) >= MO_64
- && (datalo & 1) == 0 && datahi == datalo + 1) {
+ if (get_alignment_bits(opc) >= MO_64) {
 tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
 } else {
 tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 case INDEX_op_qemu_ld_i32:
 return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
 case INDEX_op_qemu_ld_i64:
- return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
+ return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
 case INDEX_op_qemu_st_i32:
 return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
 case INDEX_op_qemu_st_i64:
- return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
+ return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);

 case INDEX_op_st_vec:
 return C_O0_I2(w, r);
--
2.34.1
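
As a side note on the 0x5555 masks above: LDRD/STRD want the low data register even and datahi equal to datalo + 1, and the new 'e'/'S' constraints express "even registers only" as a bitmask with one bit per register. A tiny standalone C sketch of that mask arithmetic (not code from the QEMU tree):

    #include <assert.h>
    #include <stdio.h>

    #define ALL_GENERAL_REGS 0xffffu          /* r0..r15, one bit each */
    #define EVEN_REGS        (ALL_GENERAL_REGS & 0x5555u)

    int main(void)
    {
        for (unsigned reg = 0; reg < 16; reg++) {
            int allowed = (EVEN_REGS >> reg) & 1;
            /* 0x5555 has bits 0, 2, 4, ... set, i.e. exactly the even regs. */
            assert(allowed == (reg % 2 == 0));
            printf("r%u: %s\n", reg, allowed ? "usable as datalo" : "odd, skipped");
        }
        return 0;
    }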
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-2-richard.henderson@linaro.org>
---
 common-user/host/ppc/safe-syscall.inc.S | 107 ++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S

diff --git a/common-user/host/ppc/safe-syscall.inc.S b/common-user/host/ppc/safe-syscall.inc.S
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/common-user/host/ppc/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@
+/*
+ * safe-syscall.inc.S : host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ * This is intended to be included by common-user/safe-syscall.S
+ *
+ * Copyright (C) 2022 Linaro, Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * Standardize on the _CALL_FOO symbols used by GCC:
+ * Apple XCode does not define _CALL_DARWIN.
+ * Clang defines _CALL_ELF (64-bit) but not _CALL_SYSV (32-bit).
+ */
+#if !defined(_CALL_SYSV) && \
+ !defined(_CALL_DARWIN) && \
+ !defined(_CALL_AIX) && \
+ !defined(_CALL_ELF)
+# if defined(__APPLE__)
+# define _CALL_DARWIN
+# elif defined(__ELF__) && TCG_TARGET_REG_BITS == 32
+# define _CALL_SYSV
+# else
+# error "Unknown ABI"
+# endif
+#endif
+
+#ifndef _CALL_SYSV
+# error "Unsupported ABI"
+#endif
+
+
+ .global safe_syscall_base
+ .global safe_syscall_start
+ .global safe_syscall_end
+ .type safe_syscall_base, @function
+
+ .text
+
+ /*
+ * This is the entry point for making a system call. The calling
+ * convention here is that of a C varargs function with the
+ * first argument an 'int *' to the signal_pending flag, the
+ * second one the system call number (as a 'long'), and all further
+ * arguments being syscall arguments (also 'long').
+ */
+safe_syscall_base:
+ .cfi_startproc
+ stwu 1, -8(1)
+ .cfi_def_cfa_offset 8
+ stw 30, 4(1)
+ .cfi_offset 30, -4
+
+ /*
+ * We enter with r3 == &signal_pending
+ * r4 == syscall number
+ * r5 ... r10 == syscall arguments
+ * and return the result in r3
+ * and the syscall instruction needs
+ * r0 == syscall number
+ * r3 ... r8 == syscall arguments
+ * and returns the result in r3
+ * Shuffle everything around appropriately.
+ */
+ mr 30, 3 /* signal_pending */
+ mr 0, 4 /* syscall number */
+ mr 3, 5 /* syscall arguments */
+ mr 4, 6
+ mr 5, 7
+ mr 6, 8
+ mr 7, 9
+ mr 8, 10
+
+ /*
+ * This next sequence of code works in conjunction with the
+ * rewind_if_safe_syscall_function(). If a signal is taken
+ * and the interrupted PC is anywhere between 'safe_syscall_start'
+ * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
+ * The code sequence must therefore be able to cope with this, and
+ * the syscall instruction must be the final one in the sequence.
+ */
+safe_syscall_start:
+ /* if signal_pending is non-zero, don't do the call */
+ lwz 12, 0(30)
+ cmpwi 0, 12, 0
+ bne- 2f
+ sc
+safe_syscall_end:
+ /* code path when we did execute the syscall */
+ lwz 30, 4(1) /* restore r30 */
+ addi 1, 1, 8 /* restore stack */
+ .cfi_restore 30
+ .cfi_def_cfa_offset 0
+ bnslr+ /* return on success */
+ b safe_syscall_set_errno_tail
+
+ /* code path when we didn't execute the syscall */
+2: lwz 30, 4(1)
+ addi 1, 1, 8
+ addi 3, 0, QEMU_ERESTARTSYS
+ b safe_syscall_set_errno_tail
+
+ .cfi_endproc
+
+ .size safe_syscall_base, .-safe_syscall_base
--
2.34.1
This commit re-enables ppc32 as a linux-user host,
as the existence of the directory is noted by configure.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1097
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-3-richard.henderson@linaro.org>
---
 linux-user/include/host/ppc/host-signal.h | 39 +++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 linux-user/include/host/ppc/host-signal.h

diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/linux-user/include/host/ppc/host-signal.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * host-signal.h: signal info dependent on the host architecture
+ *
+ * Copyright (c) 2022 Linaro Ltd.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef PPC_HOST_SIGNAL_H
+#define PPC_HOST_SIGNAL_H
+
+#include <asm/ptrace.h>
+
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+{
+ return uc->uc_mcontext.regs->nip;
+}
+
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+{
+ uc->uc_mcontext.regs->nip = pc;
+}
+
+static inline void *host_signal_mask(host_sigcontext *uc)
+{
+ return &uc->uc_sigmask;
+}
+
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+{
+ return uc->uc_mcontext.regs->trap != 0x400
+ && (uc->uc_mcontext.regs->dsisr & 0x02000000);
+}
+
+#endif
--
2.34.1
Marking helpers __attribute__((noinline)) prevents an issue
with GCC's ipa-split pass under --enable-lto.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1454
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Idan Horowitz <idan.horowitz@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-proto.h | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@

#include "exec/helper-head.h"

+/*
+ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
+ * decides to split out the noreturn code paths that raise an exception,
+ * taking the __builtin_return_address() along into the new function,
+ * where it no longer computes a value that returns to TCG generated code.
+ * Despite the name, the noinline attribute affects the splitter, so this
+ * prevents the optimization in question. Given that helpers should not
+ * otherwise be called directly, this should not have any other visible effect.
+ *
+ * See https://gitlab.com/qemu-project/qemu/-/issues/1454
+ */
+#define DEF_HELPER_ATTR __attribute__((noinline))
+
#define DEF_HELPER_FLAGS_0(name, flags, ret) \
-dh_ctype(ret) HELPER(name) (void);
+dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
+ dh_ctype(t3)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
- dh_ctype(t4));
+ dh_ctype(t4)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
- dh_ctype(t4), dh_ctype(t5));
+ dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
- dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
+ dh_ctype(t4), dh_ctype(t5), \
+ dh_ctype(t6)) DEF_HELPER_ATTR;

#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
- dh_ctype(t7));
+ dh_ctype(t7)) DEF_HELPER_ATTR;

#define IN_HELPER_PROTO

@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
#undef DEF_HELPER_FLAGS_5
#undef DEF_HELPER_FLAGS_6
#undef DEF_HELPER_FLAGS_7
+#undef DEF_HELPER_ATTR

#endif /* HELPER_PROTO_H */
--
2.34.1
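
To make the failure mode above concrete: QEMU's GETPC() expands to __builtin_return_address(0), and if the slow path that uses it is split into a separate function, the value observed is no longer the TCG-generated call site. The following is only a rough, self-contained illustration of that pattern; cpu_loop_exit_restore_stub and helper_div are stand-in names, not the QEMU API:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for QEMU's GETPC()/cpu_loop_exit_restore(). */
    #define GETPC() ((uintptr_t)__builtin_return_address(0))

    static void cpu_loop_exit_restore_stub(uintptr_t retaddr)
    {
        /* QEMU would unwind guest state via retaddr; here we just report it. */
        fprintf(stderr, "unwinding from return address %#lx\n",
                (unsigned long)retaddr);
        exit(1);
    }

    /*
     * Without noinline, GCC's ipa-split pass may outline the unlikely branch
     * below into helper_div.part.0(); GETPC() would then observe the return
     * address of that internal call rather than the TCG-generated call site.
     */
    __attribute__((noinline))
    static uint32_t helper_div(uint32_t a, uint32_t b)
    {
        if (b == 0) {
            cpu_loop_exit_restore_stub(GETPC());
        }
        return a / b;
    }

    int main(void)
    {
        printf("%u\n", helper_div(10, 2));
        return 0;
    }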
Reuse the decodetree based disassembler from
target/loongarch/ for tcg/loongarch64/.

The generation of decode-insns.c.inc into ./libcommon.fa.p/ could
eventually result in conflict, if any other host requires the same
trick, but this is good enough for now.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 disas.c | 2 ++
 target/loongarch/meson.build | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/disas.c b/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/disas.c
+++ b/disas.c
@@ -XXX,XX +XXX,XX @@ static void initialize_debug_host(CPUDebug *s)
 s->info.cap_insn_split = 6;
#elif defined(__hppa__)
 s->info.print_insn = print_insn_hppa;
+#elif defined(__loongarch__)
+ s->info.print_insn = print_insn_loongarch;
#endif
}

diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/meson.build
+++ b/target/loongarch/meson.build
@@ -XXX,XX +XXX,XX @@ gen = decodetree.process('insns.decode')
loongarch_ss = ss.source_set()
loongarch_ss.add(files(
 'cpu.c',
- 'disas.c',
))
loongarch_tcg_ss = ss.source_set()
loongarch_tcg_ss.add(gen)
@@ -XXX,XX +XXX,XX @@ loongarch_softmmu_ss.add(files(
 'iocsr_helper.c',
))

+common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen])
+
loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss])

target_arch += {'loongarch': loongarch_ss}
--
2.34.1
While jirl shares the same instruction format as bne etc,
it is not assembled the same. In particular, rd is printed
first not second and the immediate is not pc-relative.

Decode into the arg_rr_i structure, which prints correctly.
This changes the "offs" member to "imm", which requires
updating translate as well.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/insns.decode | 3 ++-
 target/loongarch/disas.c | 2 +-
 target/loongarch/insn_trans/trans_branch.c.inc | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -XXX,XX +XXX,XX @@
@rr_ui12 .... ...... imm:12 rj:5 rd:5 &rr_i
@rr_i14s2 .... .... .............. rj:5 rd:5 &rr_i imm=%i14s2
@rr_i16 .... .. imm:s16 rj:5 rd:5 &rr_i
+@rr_i16s2 .... .. ................ rj:5 rd:5 &rr_i imm=%offs16
@hint_r_i12 .... ...... imm:s12 rj:5 hint:5 &hint_r_i
@rrr_sa2p1 .... ........ ... .. rk:5 rj:5 rd:5 &rrr_sa sa=%sa2p1
@rrr_sa2 .... ........ ... sa:2 rk:5 rj:5 rd:5 &rrr_sa
@@ -XXX,XX +XXX,XX @@ beqz 0100 00 ................ ..... ..... @r_offs21
bnez 0100 01 ................ ..... ..... @r_offs21
bceqz 0100 10 ................ 00 ... ..... @c_offs21
bcnez 0100 10 ................ 01 ... ..... @c_offs21
-jirl 0100 11 ................ ..... ..... @rr_offs16
+jirl 0100 11 ................ ..... ..... @rr_i16s2
b 0101 00 .......................... @offs26
bl 0101 01 .......................... @offs26
beq 0101 10 ................ ..... ..... @rr_offs16
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(beqz, r_offs)
INSN(bnez, r_offs)
INSN(bceqz, c_offs)
INSN(bcnez, c_offs)
-INSN(jirl, rr_offs)
+INSN(jirl, rr_i)
INSN(b, offs)
INSN(bl, offs)
INSN(beq, rr_offs)
diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/insn_trans/trans_branch.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insn_trans/trans_branch.c.inc
+++ b/target/loongarch/insn_trans/trans_branch.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_jirl(DisasContext *ctx, arg_jirl *a)
 TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
 TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);

- tcg_gen_addi_tl(cpu_pc, src1, a->offs);
+ tcg_gen_addi_tl(cpu_pc, src1, a->imm);
 tcg_gen_movi_tl(dest, ctx->base.pc_next + 4);
 gen_set_gpr(a->rd, dest, EXT_NONE);
 tcg_gen_lookup_and_goto_ptr();
--
2.34.1
Print both the raw field and the resolved pc-relative
address, as we do for branches.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/disas.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(fsel, fffc)
INSN(addu16i_d, rr_i)
INSN(lu12i_w, r_i)
INSN(lu32i_d, r_i)
-INSN(pcaddi, r_i)
-INSN(pcalau12i, r_i)
-INSN(pcaddu12i, r_i)
-INSN(pcaddu18i, r_i)
INSN(ll_w, rr_i)
INSN(sc_w, rr_i)
INSN(ll_d, rr_i)
@@ -XXX,XX +XXX,XX @@ static bool trans_fcmp_cond_##suffix(DisasContext *ctx, \

FCMP_INSN(s)
FCMP_INSN(d)
+
+#define PCADD_INSN(name) \
+static bool trans_##name(DisasContext *ctx, arg_##name *a) \
+{ \
+ output(ctx, #name, "r%d, %d # 0x%" PRIx64, \
+ a->rd, a->imm, gen_##name(ctx->pc, a->imm)); \
+ return true; \
+}
+
+static uint64_t gen_pcaddi(uint64_t pc, int imm)
+{
+ return pc + (imm << 2);
+}
+
+static uint64_t gen_pcalau12i(uint64_t pc, int imm)
+{
+ return (pc + (imm << 12)) & ~0xfff;
+}
+
+static uint64_t gen_pcaddu12i(uint64_t pc, int imm)
+{
+ return pc + (imm << 12);
+}
+
+static uint64_t gen_pcaddu18i(uint64_t pc, int imm)
+{
+ return pc + ((uint64_t)(imm) << 18);
+}
+
+PCADD_INSN(pcaddi)
+PCADD_INSN(pcalau12i)
+PCADD_INSN(pcaddu12i)
+PCADD_INSN(pcaddu18i)
--
2.34.1
From: Rui Wang <wangrui@loongson.cn>

diff:
  Imm                 Before                  After
  0000000000000000    addi.w  rd, zero, 0     addi.w  rd, zero, 0
                      lu52i.d rd, zero, 0
  00000000fffff800    lu12i.w rd, -1          addi.w  rd, zero, -2048
                      ori     rd, rd, 2048    lu32i.d rd, 0
                      lu32i.d rd, 0

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Rui Wang <wangrui@loongson.cn>
Message-Id: <20221107144713.845550-1-wangrui@loongson.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 35 +++++++++++---------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 return true;
}

-static bool imm_part_needs_loading(bool high_bits_are_ones,
- tcg_target_long part)
-{
- if (high_bits_are_ones) {
- return part != -1;
- } else {
- return part != 0;
- }
-}
-
/* Loads a 32-bit immediate into rd, sign-extended. */
static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
{
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
 tcg_target_long hi12 = sextreg(val, 12, 20);

 /* Single-instruction cases. */
- if (lo == val) {
- /* val fits in simm12: addi.w rd, zero, val */
- tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
- return;
- }
- if (0x800 <= val && val <= 0xfff) {
+ if (hi12 == 0) {
 /* val fits in uimm12: ori rd, zero, val */
 tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
 return;
 }
+ if (hi12 == sextreg(lo, 12, 20)) {
+ /* val fits in simm12: addi.w rd, zero, val */
+ tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+ return;
+ }

 /* High bits must be set; load with lu12i.w + optional ori. */
 tcg_out_opc_lu12i_w(s, rd, hi12);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,

 intptr_t pc_offset;
 tcg_target_long val_lo, val_hi, pc_hi, offset_hi;
- tcg_target_long hi32, hi52;
- bool rd_high_bits_are_ones;
+ tcg_target_long hi12, hi32, hi52;

 /* Value fits in signed i32. */
 if (type == TCG_TYPE_I32 || val == (int32_t)val) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
 return;
 }

+ hi12 = sextreg(val, 12, 20);
 hi32 = sextreg(val, 32, 20);
 hi52 = sextreg(val, 52, 12);

 /* Single cu52i.d case. */
- if (ctz64(val) >= 52) {
+ if ((hi52 != 0) && (ctz64(val) >= 52)) {
 tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
 return;
 }

 /* Slow path. Initialize the low 32 bits, then concat high bits. */
 tcg_out_movi_i32(s, rd, val);
- rd_high_bits_are_ones = (int32_t)val < 0;

- if (imm_part_needs_loading(rd_high_bits_are_ones, hi32)) {
+ /* Load hi32 and hi52 explicitly when they are unexpected values. */
+ if (hi32 != sextreg(hi12, 20, 20)) {
 tcg_out_opc_cu32i_d(s, rd, hi32);
- rd_high_bits_are_ones = hi32 < 0;
 }

- if (imm_part_needs_loading(rd_high_bits_are_ones, hi52)) {
+ if (hi52 != sextreg(hi32, 20, 12)) {
 tcg_out_opc_cu52i_d(s, rd, rd, hi52);
 }
}
--
2.34.1
Regenerate with ADDU16I included:

  $ cd loongarch-opcodes/scripts/go
  $ go run ./genqemutcgdefs > $QEMU/tcg/loongarch64/tcg-insn-defs.c.inc

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-insn-defs.c.inc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-insn-defs.c.inc b/tcg/loongarch64/tcg-insn-defs.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-insn-defs.c.inc
+++ b/tcg/loongarch64/tcg-insn-defs.c.inc
@@ -XXX,XX +XXX,XX @@
 *
 * This file is auto-generated by genqemutcgdefs from
 * https://github.com/loongson-community/loongarch-opcodes,
- * from commit 961f0c60f5b63e574d785995600c71ad5413fdc4.
+ * from commit 25ca7effe9d88101c1cf96c4005423643386d81f.
 * DO NOT EDIT.
 */

@@ -XXX,XX +XXX,XX @@ typedef enum {
 OPC_ANDI = 0x03400000,
 OPC_ORI = 0x03800000,
 OPC_XORI = 0x03c00000,
+ OPC_ADDU16I_D = 0x10000000,
 OPC_LU12I_W = 0x14000000,
 OPC_CU32I_D = 0x16000000,
 OPC_PCADDU2I = 0x18000000,
@@ -XXX,XX +XXX,XX @@ tcg_out_opc_xori(TCGContext *s, TCGReg d, TCGReg j, uint32_t uk12)
 tcg_out32(s, encode_djuk12_insn(OPC_XORI, d, j, uk12));
}

+/* Emits the `addu16i.d d, j, sk16` instruction. */
+static void __attribute__((unused))
+tcg_out_opc_addu16i_d(TCGContext *s, TCGReg d, TCGReg j, int32_t sk16)
+{
+ tcg_out32(s, encode_djsk16_insn(OPC_ADDU16I_D, d, j, sk16));
+}
+
/* Emits the `lu12i.w d, sj20` instruction. */
static void __attribute__((unused))
tcg_out_opc_lu12i_w(TCGContext *s, TCGReg d, int32_t sj20)
--
2.34.1
Adjust the constraints to allow any int32_t for immediate
addition. Split immediate adds into addu16i + addi, which
covers quite a lot of the immediate space. For the hole in
the middle, load the constant into TMP0 instead.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h | 4 +-
 tcg/loongarch64/tcg-target-con-str.h | 2 +-
 tcg/loongarch64/tcg-target.c.inc | 57 ++++++++++++++++++++++++----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I1(r, L)
C_O1_I2(r, r, rC)
C_O1_I2(r, r, ri)
C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rJ)
C_O1_I2(r, r, rU)
C_O1_I2(r, r, rW)
C_O1_I2(r, r, rZ)
C_O1_I2(r, 0, rZ)
-C_O1_I2(r, rZ, rN)
+C_O1_I2(r, rZ, ri)
+C_O1_I2(r, rZ, rJ)
C_O1_I2(r, rZ, rZ)
diff --git a/tcg/loongarch64/tcg-target-con-str.h b/tcg/loongarch64/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-str.h
+++ b/tcg/loongarch64/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@ REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
 * CONST(letter, TCG_CT_CONST_* bit set)
 */
CONST('I', TCG_CT_CONST_S12)
-CONST('N', TCG_CT_CONST_N12)
+CONST('J', TCG_CT_CONST_S32)
CONST('U', TCG_CT_CONST_U12)
CONST('Z', TCG_CT_CONST_ZERO)
CONST('C', TCG_CT_CONST_C12)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {

#define TCG_CT_CONST_ZERO 0x100
#define TCG_CT_CONST_S12 0x200
-#define TCG_CT_CONST_N12 0x400
+#define TCG_CT_CONST_S32 0x400
#define TCG_CT_CONST_U12 0x800
#define TCG_CT_CONST_C12 0x1000
#define TCG_CT_CONST_WSZ 0x2000
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 if ((ct & TCG_CT_CONST_S12) && val == sextreg(val, 0, 12)) {
 return true;
 }
- if ((ct & TCG_CT_CONST_N12) && -val == sextreg(-val, 0, 12)) {
+ if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 return true;
 }
 if ((ct & TCG_CT_CONST_U12) && val >= 0 && val <= 0xfff) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
 }
}

+static void tcg_out_addi(TCGContext *s, TCGType type, TCGReg rd,
+ TCGReg rs, tcg_target_long imm)
+{
+ tcg_target_long lo12 = sextreg(imm, 0, 12);
+ tcg_target_long hi16 = sextreg(imm - lo12, 16, 16);
+
+ /*
+ * Note that there's a hole in between hi16 and lo12:
+ *
+ * 3 2 1 0
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * ...+-------------------------------+-------+-----------------------+
+ * | hi16 | | lo12 |
+ * ...+-------------------------------+-------+-----------------------+
+ *
+ * For bits within that hole, it's more efficient to use LU12I and ADD.
+ */
+ if (imm == (hi16 << 16) + lo12) {
+ if (hi16) {
+ tcg_out_opc_addu16i_d(s, rd, rs, hi16);
+ rs = rd;
+ }
+ if (type == TCG_TYPE_I32) {
+ tcg_out_opc_addi_w(s, rd, rs, lo12);
+ } else if (lo12) {
+ tcg_out_opc_addi_d(s, rd, rs, lo12);
+ } else {
+ tcg_out_mov(s, type, rd, rs);
+ }
+ } else {
+ tcg_out_movi(s, type, TCG_REG_TMP0, imm);
+ if (type == TCG_TYPE_I32) {
+ tcg_out_opc_add_w(s, rd, rs, TCG_REG_TMP0);
+ } else {
+ tcg_out_opc_add_d(s, rd, rs, TCG_REG_TMP0);
+ }
+ }
+}
+
static void tcg_out_ext8u(TCGContext *s, TCGReg ret, TCGReg arg)
{
 tcg_out_opc_andi(s, ret, arg, 0xff);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,

 case INDEX_op_add_i32:
 if (c2) {
- tcg_out_opc_addi_w(s, a0, a1, a2);
+ tcg_out_addi(s, TCG_TYPE_I32, a0, a1, a2);
 } else {
 tcg_out_opc_add_w(s, a0, a1, a2);
 }
 break;
 case INDEX_op_add_i64:
 if (c2) {
- tcg_out_opc_addi_d(s, a0, a1, a2);
+ tcg_out_addi(s, TCG_TYPE_I64, a0, a1, a2);
 } else {
 tcg_out_opc_add_d(s, a0, a1, a2);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,

 case INDEX_op_sub_i32:
 if (c2) {
- tcg_out_opc_addi_w(s, a0, a1, -a2);
+ tcg_out_addi(s, TCG_TYPE_I32, a0, a1, -a2);
 } else {
 tcg_out_opc_sub_w(s, a0, a1, a2);
 }
 break;
 case INDEX_op_sub_i64:
 if (c2) {
- tcg_out_opc_addi_d(s, a0, a1, -a2);
+ tcg_out_addi(s, TCG_TYPE_I64, a0, a1, -a2);
 } else {
 tcg_out_opc_sub_d(s, a0, a1, a2);
 }
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 return C_O1_I2(r, r, ri);

 case INDEX_op_add_i32:
+ return C_O1_I2(r, r, ri);
 case INDEX_op_add_i64:
- return C_O1_I2(r, r, rI);
+ return C_O1_I2(r, r, rJ);

 case INDEX_op_and_i32:
 case INDEX_op_and_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 return C_O1_I2(r, 0, rZ);

 case INDEX_op_sub_i32:
+ return C_O1_I2(r, rZ, ri);
 case INDEX_op_sub_i64:
- return C_O1_I2(r, rZ, rN);
+ return C_O1_I2(r, rZ, rJ);

 case INDEX_op_mul_i32:
 case INDEX_op_mul_i64:
--
2.34.1
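
The hi16/lo12 decomposition used by tcg_out_addi above can be checked in isolation. Below is a small host-side sketch of the same arithmetic; sextreg is re-implemented locally (relying on arithmetic right shift of signed values, as the QEMU helper does) and the sample constants are arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    /* Local stand-in for the backend's sign-extraction helper. */
    static int64_t sextreg(int64_t val, int pos, int len)
    {
        return (val << (64 - pos - len)) >> (64 - len);
    }

    int main(void)
    {
        int64_t samples[] = { 0x7ff, 0x90000, 0x2fffc, 0x12345 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            int64_t imm = samples[i];
            int64_t lo12 = sextreg(imm, 0, 12);
            int64_t hi16 = sextreg(imm - lo12, 16, 16);
            int ok = (imm == (hi16 << 16) + lo12);

            /* When !ok, bits 12..15 are set: load into TMP0 and add instead. */
            printf("imm=%#llx -> addu16i %+lld, addi %+lld : %s\n",
                   (unsigned long long)imm, (long long)hi16, (long long)lo12,
                   ok ? "two-insn form" : "falls in the hole, use movi+add");
        }
        return 0;
    }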
Split out a helper function, tcg_out_setcond_int, which
does not always produce the complete boolean result, but
returns a set of flags to do so.

Accept all int32_t as constant input, so that LE/GT can
adjust the constant to LT.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 165 +++++++++++++++++++++----------
 1 file changed, 115 insertions(+), 50 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clzctz(TCGContext *s, LoongArchInsn opc,
 tcg_out_opc_or(s, a0, TCG_REG_TMP0, a0);
}

-static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
- TCGReg arg1, TCGReg arg2, bool c2)
-{
- TCGReg tmp;
+#define SETCOND_INV TCG_TARGET_NB_REGS
+#define SETCOND_NEZ (SETCOND_INV << 1)
+#define SETCOND_FLAGS (SETCOND_INV | SETCOND_NEZ)

- if (c2) {
- tcg_debug_assert(arg2 == 0);
+static int tcg_out_setcond_int(TCGContext *s, TCGCond cond, TCGReg ret,
+ TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+ int flags = 0;
+
+ switch (cond) {
+ case TCG_COND_EQ: /* -> NE */
+ case TCG_COND_GE: /* -> LT */
+ case TCG_COND_GEU: /* -> LTU */
+ case TCG_COND_GT: /* -> LE */
+ case TCG_COND_GTU: /* -> LEU */
+ cond = tcg_invert_cond(cond);
+ flags ^= SETCOND_INV;
+ break;
+ default:
+ break;
 }

 switch (cond) {
- case TCG_COND_EQ:
- if (c2) {
- tmp = arg1;
- } else {
- tcg_out_opc_sub_d(s, ret, arg1, arg2);
- tmp = ret;
- }
- tcg_out_opc_sltui(s, ret, tmp, 1);
- break;
- case TCG_COND_NE:
- if (c2) {
- tmp = arg1;
- } else {
- tcg_out_opc_sub_d(s, ret, arg1, arg2);
- tmp = ret;
- }
- tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
- break;
- case TCG_COND_LT:
- tcg_out_opc_slt(s, ret, arg1, arg2);
- break;
- case TCG_COND_GE:
- tcg_out_opc_slt(s, ret, arg1, arg2);
- tcg_out_opc_xori(s, ret, ret, 1);
- break;
 case TCG_COND_LE:
- tcg_out_setcond(s, TCG_COND_GE, ret, arg2, arg1, false);
- break;
- case TCG_COND_GT:
- tcg_out_setcond(s, TCG_COND_LT, ret, arg2, arg1, false);
- break;
- case TCG_COND_LTU:
- tcg_out_opc_sltu(s, ret, arg1, arg2);
- break;
- case TCG_COND_GEU:
- tcg_out_opc_sltu(s, ret, arg1, arg2);
- tcg_out_opc_xori(s, ret, ret, 1);
- break;
 case TCG_COND_LEU:
- tcg_out_setcond(s, TCG_COND_GEU, ret, arg2, arg1, false);
+ /*
+ * If we have a constant input, the most efficient way to implement
+ * LE is by adding 1 and using LT. Watch out for wrap around for LEU.
+ * We don't need to care for this for LE because the constant input
+ * is still constrained to int32_t, and INT32_MAX+1 is representable
+ * in the 64-bit temporary register.
+ */
+ if (c2) {
+ if (cond == TCG_COND_LEU) {
+ /* unsigned <= -1 is true */
+ if (arg2 == -1) {
+ tcg_out_movi(s, TCG_TYPE_REG, ret, !(flags & SETCOND_INV));
+ return ret;
+ }
+ cond = TCG_COND_LTU;
+ } else {
+ cond = TCG_COND_LT;
+ }
+ arg2 += 1;
+ } else {
+ TCGReg tmp = arg2;
+ arg2 = arg1;
+ arg1 = tmp;
+ cond = tcg_swap_cond(cond); /* LE -> GE */
+ cond = tcg_invert_cond(cond); /* GE -> LT */
+ flags ^= SETCOND_INV;
+ }
 break;
- case TCG_COND_GTU:
- tcg_out_setcond(s, TCG_COND_LTU, ret, arg2, arg1, false);
+ default:
 break;
+ }
+
+ switch (cond) {
+ case TCG_COND_NE:
+ flags |= SETCOND_NEZ;
+ if (!c2) {
+ tcg_out_opc_xor(s, ret, arg1, arg2);
+ } else if (arg2 == 0) {
+ ret = arg1;
+ } else if (arg2 >= 0 && arg2 <= 0xfff) {
+ tcg_out_opc_xori(s, ret, arg1, arg2);
+ } else {
+ tcg_out_addi(s, TCG_TYPE_REG, ret, arg1, -arg2);
+ }
+ break;
+
+ case TCG_COND_LT:
+ case TCG_COND_LTU:
+ if (c2) {
+ if (arg2 >= -0x800 && arg2 <= 0x7ff) {
+ if (cond == TCG_COND_LT) {
+ tcg_out_opc_slti(s, ret, arg1, arg2);
+ } else {
+ tcg_out_opc_sltui(s, ret, arg1, arg2);
+ }
+ break;
+ }
+ tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_TMP0, arg2);
+ arg2 = TCG_REG_TMP0;
+ }
+ if (cond == TCG_COND_LT) {
+ tcg_out_opc_slt(s, ret, arg1, arg2);
+ } else {
+ tcg_out_opc_sltu(s, ret, arg1, arg2);
+ }
+ break;
+
 default:
 g_assert_not_reached();
 break;
 }
+
+ return ret | flags;
+}
+
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+ TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+ int tmpflags = tcg_out_setcond_int(s, cond, ret, arg1, arg2, c2);
+
+ if (tmpflags != ret) {
+ TCGReg tmp = tmpflags & ~SETCOND_FLAGS;
+
+ switch (tmpflags & SETCOND_FLAGS) {
+ case SETCOND_INV:
+ /* Intermediate result is boolean: simply invert. */
+ tcg_out_opc_xori(s, ret, tmp, 1);
+ break;
+ case SETCOND_NEZ:
+ /* Intermediate result is zero/non-zero: test != 0. */
+ tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
+ break;
+ case SETCOND_NEZ | SETCOND_INV:
+ /* Intermediate result is zero/non-zero: test == 0. */
+ tcg_out_opc_sltui(s, ret, tmp, 1);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
}

/*
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 case INDEX_op_ctz_i64:
 return C_O1_I2(r, r, rW);

- case INDEX_op_setcond_i32:
- case INDEX_op_setcond_i64:
- return C_O1_I2(r, r, rZ);
-
 case INDEX_op_deposit_i32:
 case INDEX_op_deposit_i64:
 /* Must deposit into the same register as input */
 return C_O1_I2(r, 0, rZ);

 case INDEX_op_sub_i32:
+ case INDEX_op_setcond_i32:
 return C_O1_I2(r, rZ, ri);
 case INDEX_op_sub_i64:
+ case INDEX_op_setcond_i64:
 return C_O1_I2(r, rZ, rJ);

 case INDEX_op_mul_i32:
 case INDEX_op_mul_i64:
--
2.34.1
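
For readers following the SETCOND_INV/SETCOND_NEZ encoding: the helper returns a register number with up to two flag bits OR'd in, and the caller materializes the final 0/1 value from that. A compact host-side illustration of the same packing (names reused for readability only; this is not the backend code):

    #include <assert.h>
    #include <stdio.h>

    #define NB_REGS       32
    #define SETCOND_INV   NB_REGS             /* invert the final boolean */
    #define SETCOND_NEZ   (SETCOND_INV << 1)  /* value is zero/non-zero, not 0/1 */
    #define SETCOND_FLAGS (SETCOND_INV | SETCOND_NEZ)

    /* Turn an intermediate (reg | flags) result into a final 0/1 value. */
    static int finish_setcond(int tmpflags, long regval)
    {
        long v = regval;            /* value held in reg (tmpflags & ~FLAGS) */

        if (tmpflags & SETCOND_NEZ) {
            v = (v != 0);           /* sltu zero, v  or  sltui v, 1 */
        }
        if (tmpflags & SETCOND_INV) {
            v ^= 1;                 /* xori v, 1 */
        }
        return (int)v;
    }

    int main(void)
    {
        /* EQ is computed as "NE, then invert": NEZ | INV over (a ^ b). */
        long a = 42, b = 42;
        int tmpflags = 5 | SETCOND_NEZ | SETCOND_INV;   /* pretend result in r5 */
        assert((tmpflags & ~SETCOND_FLAGS) == 5);
        printf("a == b -> %d\n", finish_setcond(tmpflags, a ^ b));
        return 0;
    }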
Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h | 1 +
 tcg/loongarch64/tcg-target.h | 4 ++--
 tcg/loongarch64/tcg-target.c.inc | 33 ++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, 0, rZ)
C_O1_I2(r, rZ, ri)
C_O1_I2(r, rZ, rJ)
C_O1_I2(r, rZ, rZ)
+C_O1_I4(r, rZ, rJ, rZ, rZ)
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL

/* optional instructions */
-#define TCG_TARGET_HAS_movcond_i32 0
+#define TCG_TARGET_HAS_movcond_i32 1
#define TCG_TARGET_HAS_div_i32 1
#define TCG_TARGET_HAS_rem_i32 1
#define TCG_TARGET_HAS_div2_i32 0
@@ -XXX,XX +XXX,XX @@ typedef enum {
#define TCG_TARGET_HAS_qemu_st8_i32 0

/* 64-bit operations */
-#define TCG_TARGET_HAS_movcond_i64 0
+#define TCG_TARGET_HAS_movcond_i64 1
#define TCG_TARGET_HAS_div_i64 1
#define TCG_TARGET_HAS_rem_i64 1
#define TCG_TARGET_HAS_div2_i64 0
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
 }
}

+static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
+ TCGReg c1, tcg_target_long c2, bool const2,
+ TCGReg v1, TCGReg v2)
+{
+ int tmpflags = tcg_out_setcond_int(s, cond, TCG_REG_TMP0, c1, c2, const2);
+ TCGReg t;
+
+ /* Standardize the test below to t != 0. */
+ if (tmpflags & SETCOND_INV) {
+ t = v1, v1 = v2, v2 = t;
+ }
+
+ t = tmpflags & ~SETCOND_FLAGS;
+ if (v1 == TCG_REG_ZERO) {
+ tcg_out_opc_masknez(s, ret, v2, t);
+ } else if (v2 == TCG_REG_ZERO) {
+ tcg_out_opc_maskeqz(s, ret, v1, t);
+ } else {
+ tcg_out_opc_masknez(s, TCG_REG_TMP2, v2, t); /* t ? 0 : v2 */
+ tcg_out_opc_maskeqz(s, TCG_REG_TMP1, v1, t); /* t ? v1 : 0 */
+ tcg_out_opc_or(s, ret, TCG_REG_TMP1, TCG_REG_TMP2);
+ }
+}
+
/*
 * Branch helpers
 */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 tcg_out_setcond(s, args[3], a0, a1, a2, c2);
 break;

+ case INDEX_op_movcond_i32:
+ case INDEX_op_movcond_i64:
+ tcg_out_movcond(s, args[5], a0, a1, a2, c2, args[3], args[4]);
+ break;
+
 case INDEX_op_ld8s_i32:
 case INDEX_op_ld8s_i64:
 tcg_out_ldst(s, OPC_LD_B, a0, a1, a2);
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 case INDEX_op_remu_i64:
 return C_O1_I2(r, rZ, rZ);

+ case INDEX_op_movcond_i32:
+ case INDEX_op_movcond_i64:
+ return C_O1_I4(r, rZ, rJ, rZ, rZ);
+
 default:
 g_assert_not_reached();
 }
--
2.34.1
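
The masked-or sequence above is the usual branchless select: maskeqz keeps v1 when the test value is non-zero, masknez keeps v2 when it is zero, and OR combines them. A host-side C model of those two instructions (a sketch, not the backend code):

    #include <stdint.h>
    #include <stdio.h>

    /* maskeqz: v if t != 0, else 0.  masknez: v if t == 0, else 0. */
    static uint64_t maskeqz(uint64_t v, uint64_t t) { return t ? v : 0; }
    static uint64_t masknez(uint64_t v, uint64_t t) { return t ? 0 : v; }

    static uint64_t movcond(uint64_t t, uint64_t v1, uint64_t v2)
    {
        return maskeqz(v1, t) | masknez(v2, t);   /* t ? v1 : v2 */
    }

    int main(void)
    {
        printf("%llu %llu\n",
               (unsigned long long)movcond(1, 111, 222),
               (unsigned long long)movcond(0, 111, 222));
        return 0;
    }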
Take the w^x split into account when computing the
pc-relative distance to an absolute pointer.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_ldst(TCGContext *s, LoongArchInsn opc, TCGReg data,
 intptr_t imm12 = sextreg(offset, 0, 12);

 if (offset != imm12) {
- intptr_t diff = offset - (uintptr_t)s->code_ptr;
+ intptr_t diff = tcg_pcrel_diff(s, (void *)offset);

 if (addr == TCG_REG_ZERO && diff == (int32_t)diff) {
 imm12 = sextreg(diff, 0, 12);
--
2.34.1
The old implementation replaces two insns, swapping between

    b       <dest>
    nop
and
    pcaddu18i tmp, <dest>
    jirl      zero, tmp, <dest> & 0xffff

There is a race condition in which a thread could be stopped at
the jirl, i.e. with the top of the address loaded, and when
restarted we have re-linked to a different TB, so that the top
half no longer matches the bottom half.

Note that while we never directly re-link to a different TB, we
can link, unlink, and link again all while the stopped thread
remains stopped.

The new implementation replaces only one insn, swapping between

    b       <dest>
and
    pcadd   tmp, <jmp_addr>

falling through to load the address from tmp, and branch.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.h | 7 +---
 tcg/loongarch64/tcg-target.c.inc | 72 ++++++++++++++------------------
 2 files changed, 33 insertions(+), 46 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@

#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_NB_REGS 32
-/*
- * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits
- * signed offset, which is +/- 128 GiB.
- */
-#define MAX_CODE_GEN_BUFFER_SIZE (128 * GiB)
+
+#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)

typedef enum {
 TCG_REG_ZERO,
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
#endif
}

-/* LoongArch uses `andi zero, zero, 0` as NOP. */
-#define NOP OPC_ANDI
-static void tcg_out_nop(TCGContext *s)
-{
- tcg_out32(s, NOP);
-}
-
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
- uintptr_t jmp_rx, uintptr_t jmp_rw)
-{
- tcg_insn_unit i1, i2;
- ptrdiff_t upper, lower;
- uintptr_t addr = tb->jmp_target_addr[n];
- ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
-
- if (offset == sextreg(offset, 0, 26)) {
- i1 = encode_sd10k16_insn(OPC_B, offset);
- i2 = NOP;
- } else {
- tcg_debug_assert(offset == sextreg(offset, 0, 36));
- lower = (int16_t)offset;
- upper = (offset - lower) >> 16;
-
- i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper);
- i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower);
- }
- uint64_t pair = ((uint64_t)i2 << 32) | i1;
- qatomic_set((uint64_t *)jmp_rw, pair);
- flush_idcache_range(jmp_rx, jmp_rw, 8);
-}
-
/*
 * Entry-points
 */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
static void tcg_out_goto_tb(TCGContext *s, int which)
{
 /*
- * Ensure that patch area is 8-byte aligned so that an
- * atomic write can be used to patch the target address.
+ * Direct branch, or load indirect address, to be patched
+ * by tb_target_set_jmp_target. Check indirect load offset
+ * in range early, regardless of direct branch distance,
+ * via assert within tcg_out_opc_pcaddu2i.
 */
- if ((uintptr_t)s->code_ptr & 7) {
- tcg_out_nop(s);
- }
+ uintptr_t i_addr = get_jmp_target_addr(s, which);
+ intptr_t i_disp = tcg_pcrel_diff(s, (void *)i_addr);
+
 set_jmp_insn_offset(s, which);
- /*
- * actual branch destination will be patched by
- * tb_target_set_jmp_target later
- */
- tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
+ tcg_out_opc_pcaddu2i(s, TCG_REG_TMP0, i_disp >> 2);
+
+ /* Finish the load and indirect branch. */
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_TMP0, 0);
 tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
 set_jmp_reset_offset(s, which);
}

+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
+ uintptr_t jmp_rx, uintptr_t jmp_rw)
+{
+ uintptr_t d_addr = tb->jmp_target_addr[n];
+ ptrdiff_t d_disp = (ptrdiff_t)(d_addr - jmp_rx) >> 2;
+ tcg_insn_unit insn;
+
+ /* Either directly branch, or load slot address for indirect branch. */
+ if (d_disp == sextreg(d_disp, 0, 26)) {
+ insn = encode_sd10k16_insn(OPC_B, d_disp);
+ } else {
+ uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
+ intptr_t i_disp = i_addr - jmp_rx;
+ insn = encode_dsj20_insn(OPC_PCADDU2I, TCG_REG_TMP0, i_disp >> 2);
+ }
+
+ qatomic_set((tcg_insn_unit *)jmp_rw, insn);
+ flush_idcache_range(jmp_rx, jmp_rw, 4);
+}
+
static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 const TCGArg args[TCG_MAX_OP_ARGS],
 const int const_args[TCG_MAX_OP_ARGS])
--
2.34.1
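
The safety argument in this patch reduces to: only a single aligned 32-bit instruction word is ever rewritten, so a concurrently executing thread observes either the old or the new encoding, never half of each. A generic sketch of that style of patching using plain C11 atomics (not the QEMU qatomic_set/flush_idcache_range helpers, and the instruction word below is an arbitrary stand-in):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One instruction slot that another thread may be reading/executing. */
    static _Atomic uint32_t insn_slot;

    static void patch_insn(uint32_t new_insn)
    {
        /*
         * A single aligned 32-bit store is indivisible: readers see either
         * the previous encoding or new_insn in full.  Patching two adjacent
         * words (as the old pcaddu18i+jirl pair required) has no such
         * guarantee unless every other thread is stopped first.
         */
        atomic_store_explicit(&insn_slot, new_insn, memory_order_release);
        /* A real JIT would also flush the icache for this address here. */
    }

    int main(void)
    {
        patch_insn(0x50000000u);   /* stand-in for a direct-branch encoding */
        printf("slot = %#x\n", (unsigned)atomic_load(&insn_slot));
        return 0;
    }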