Series comparison

-[PULL 0/4] tcg patch queue
+[PULL v2 00/15] tcg patch queue
-The following changes since commit 67e41fe0cfb62e6cdfa659f0155417d17e5274ea:
+Second try's the charm today, right?
-  Merge tag 'pull-ppc-20220104' of https://github.com/legoater/qemu into staging (2022-01-04 07:23:27 -0800)
 r~
 The following changes since commit 00b1faea41d283e931256aa78aa975a369ec3ae6:
   Merge tag 'pull-target-arm-20230123' of https://git.linaro.org/people/pmaydell/qemu-arm into staging (2023-01-23 13:40:28 +0000)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220104
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230123
-for you to fetch changes up to d7478d4229f0a2b2817a55487e6b17081099fae4:
+for you to fetch changes up to 709bcd7da3f6b4655d910634a0d520fa1439df38:
-  common-user: Fix tail calls to safe_syscall_set_errno_tail (2022-01-04 15:41:03 -0800)
+  tcg/loongarch64: Reorg goto_tb implementation (2023-01-23 16:00:13 -1000)
 ----------------------------------------------------------------
-Fix for safe_syscall_base.
+common-user: Re-enable ppc32 host
-Fix for folding of vector add/sub.
+tcg: Avoid recursion in tcg_gen_mulu2_i32
-Fix build on loongarch64 with gcc 8.
+tcg: Mark tcg helpers noinline to avoid an issue with LTO
-Remove decl for qemu_run_machine_init_done_notifiers.
+tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
 disas: Enable loongarch disassembler, and fixes
 tcg/loongarch64: Improve move immediate
 tcg/loongarch64: Improve add immediate
 tcg/loongarch64: Improve setcond
 tcg/loongarch64: Implement movcond
 tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
 tcg/loongarch64: Reorg goto_tb implementation
 ----------------------------------------------------------------
-Philippe Mathieu-Daudé (1):
+Richard Henderson (14):
-      linux-user: Fix trivial build error on loongarch64 hosts
+      tcg: Avoid recursion in tcg_gen_mulu2_i32
       tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
       common-user/host/ppc: Implement safe-syscall.inc.S
       linux-user: Implement host/ppc/host-signal.h
       tcg: Mark tcg helpers noinline to avoid an issue with LTO
       target/loongarch: Enable the disassembler for host tcg
       target/loongarch: Disassemble jirl properly
       target/loongarch: Disassemble pcadd* addresses
       tcg/loongarch64: Update tcg-insn-defs.c.inc
       tcg/loongarch64: Introduce tcg_out_addi
       tcg/loongarch64: Improve setcond expansion
       tcg/loongarch64: Implement movcond
       tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
       tcg/loongarch64: Reorg goto_tb implementation
-Richard Henderson (2):
+Rui Wang (1):
-      tcg/optimize: Fix folding of vector ops
+      tcg/loongarch64: Optimize immediate loading
       common-user: Fix tail calls to safe_syscall_set_errno_tail
-Xiaoyao Li (1):
+ include/exec/helper-proto.h                    |  32 ++-
-      sysemu: Cleanup qemu_run_machine_init_done_notifiers()
+ include/tcg/tcg.h                              |   7 -
+ linux-user/include/host/ppc/host-signal.h      |  39 +++
- include/sysemu/sysemu.h                    |  1 -
+ tcg/arm/tcg-target-con-set.h                   |   7 +-
- linux-user/host/loongarch64/host-signal.h  |  4 +--
+ tcg/arm/tcg-target-con-str.h                   |   2 +
- tcg/optimize.c                             | 49 +++++++++++++++++++++++-------
+ tcg/loongarch64/tcg-target-con-set.h           |   5 +-
- common-user/host/i386/safe-syscall.inc.S   |  1 +
+ tcg/loongarch64/tcg-target-con-str.h           |   2 +-
- common-user/host/mips/safe-syscall.inc.S   |  1 +
+ tcg/loongarch64/tcg-target.h                   |  11 +-
- common-user/host/x86_64/safe-syscall.inc.S |  1 +
+ target/loongarch/insns.decode                  |   3 +-
-files changed, 42 insertions(+), 15 deletions(-)
+ disas.c                                        |   2 +
+ target/loongarch/disas.c                       |  39 ++-
  tcg/tcg-op.c                                   |   4 +-
  target/loongarch/insn_trans/trans_branch.c.inc |   2 +-
  tcg/arm/tcg-target.c.inc                       |  28 +-
  tcg/loongarch64/tcg-insn-defs.c.inc            |  10 +-
  tcg/loongarch64/tcg-target.c.inc               | 364 ++++++++++++++++---------
  common-user/host/ppc/safe-syscall.inc.S        | 107 ++++++++
  target/loongarch/meson.build                   |   3 +-
 files changed, 497 insertions(+), 170 deletions(-)
  create mode 100644 linux-user/include/host/ppc/host-signal.h
  create mode 100644 common-user/host/ppc/safe-syscall.inc.S

-New patch
+[PULL v2 01/15] tcg: Avoid recursion in tcg_gen_mulu2_i32
+We have a test for one of TCG_TARGET_HAS_mulu2_i32 or
+TCG_TARGET_HAS_muluh_i32 being defined, but the test
+became non-functional when we changed to always define
+all of these macros.
+Replace this with a build-time test in tcg_gen_mulu2_i32.
+Fixes: 25c4d9cc845 ("tcg: Always define all of the TCGOpcode enum members.")
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1435
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ include/tcg/tcg.h | 7 -------
+ tcg/tcg-op.c      | 4 +++-
+files changed, 3 insertions(+), 8 deletions(-)
+diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/tcg/tcg.h
++++ b/include/tcg/tcg.h
+@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
+ #define TCG_TARGET_HAS_rem_i64          0
+ #endif
+-/* For 32-bit targets, some sort of unsigned widening multiply is required.  */
+-#if TCG_TARGET_REG_BITS == 32 \
+-    && !(defined(TCG_TARGET_HAS_mulu2_i32) \
+-         || defined(TCG_TARGET_HAS_muluh_i32))
+-# error "Missing unsigned widening multiply"
+-#endif
+-
+ #if !defined(TCG_TARGET_HAS_v64) \
+     && !defined(TCG_TARGET_HAS_v128) \
+     && !defined(TCG_TARGET_HAS_v256)
+diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/tcg-op.c
++++ b/tcg/tcg-op.c
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+         tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
+         tcg_gen_mov_i32(rl, t);
+         tcg_temp_free_i32(t);
+-    } else {
++    } else if (TCG_TARGET_REG_BITS == 64) {
+         TCGv_i64 t0 = tcg_temp_new_i64();
+         TCGv_i64 t1 = tcg_temp_new_i64();
+         tcg_gen_extu_i32_i64(t0, arg1);
+@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+         tcg_gen_extr_i64_i32(rl, rh, t0);
+         tcg_temp_free_i64(t0);
+         tcg_temp_free_i64(t1);
++    } else {
++        qemu_build_not_reached();
+     }
+ }
+--
+.34.1

-New patch
+[PULL v2 02/15] tcg/arm: Use register pair allocation for qemu_{ld, st}_i64
+Although we still can't use ldrd and strd for all operations,
+increase the chances by getting the register allocation correct.
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target-con-set.h |  7 ++++---
+ tcg/arm/tcg-target-con-str.h |  2 ++
+ tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++++----------
+files changed, 24 insertions(+), 13 deletions(-)
+diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-set.h
++++ b/tcg/arm/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O0_I2(r, rIN)
+ C_O0_I2(s, s)
+ C_O0_I2(w, r)
+ C_O0_I3(s, s, s)
++C_O0_I3(S, p, s)
+ C_O0_I4(r, r, rI, rI)
+-C_O0_I4(s, s, s, s)
++C_O0_I4(S, p, s, s)
+ C_O1_I1(r, l)
+ C_O1_I1(r, r)
+ C_O1_I1(w, r)
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wZ)
+ C_O1_I3(w, w, w, w)
+ C_O1_I4(r, r, r, rI, rI)
+ C_O1_I4(r, r, rIN, rIK, 0)
+-C_O2_I1(r, r, l)
+-C_O2_I2(r, r, l, l)
++C_O2_I1(e, p, l)
++C_O2_I2(e, p, l, l)
+ C_O2_I2(r, r, r, r)
+ C_O2_I4(r, r, r, r, rIN, rIK)
+ C_O2_I4(r, r, rI, rI, rIN, rIK)
+diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target-con-str.h
++++ b/tcg/arm/tcg-target-con-str.h
+@@ -XXX,XX +XXX,XX @@
+  * Define constraint letters for register sets:
+  * REGS(letter, register_mask)
+  */
++REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */
+ REGS('r', ALL_GENERAL_REGS)
+ REGS('l', ALL_QLOAD_REGS)
+ REGS('s', ALL_QSTORE_REGS)
++REGS('S', ALL_QSTORE_REGS & 0x5555)  /* even qstore */
+ REGS('w', ALL_VECTOR_REGS)
+ /*
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
+         break;
+     case MO_UQ:
++        /* We used pair allocation for datalo, so already should be aligned. */
++        tcg_debug_assert((datalo & 1) == 0);
++        tcg_debug_assert(datahi == datalo + 1);
+         /* LDRD requires alignment; double-check that. */
+-        if (get_alignment_bits(opc) >= MO_64
+-            && (datalo & 1) == 0 && datahi == datalo + 1) {
++        if (get_alignment_bits(opc) >= MO_64) {
+             /*
+              * Rm (the second address op) must not overlap Rt or Rt + 1.
+              * Since datalo is aligned, we can simplify the test via alignment.
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
+         break;
+     case MO_UQ:
++        /* We used pair allocation for datalo, so already should be aligned. */
++        tcg_debug_assert((datalo & 1) == 0);
++        tcg_debug_assert(datahi == datalo + 1);
+         /* LDRD requires alignment; double-check that. */
+-        if (get_alignment_bits(opc) >= MO_64
+-            && (datalo & 1) == 0 && datahi == datalo + 1) {
++        if (get_alignment_bits(opc) >= MO_64) {
+             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
+         } else if (datalo == addrlo) {
+             tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
+         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
+         break;
+     case MO_64:
++        /* We used pair allocation for datalo, so already should be aligned. */
++        tcg_debug_assert((datalo & 1) == 0);
++        tcg_debug_assert(datahi == datalo + 1);
+         /* STRD requires alignment; double-check that. */
+-        if (get_alignment_bits(opc) >= MO_64
+-            && (datalo & 1) == 0 && datahi == datalo + 1) {
++        if (get_alignment_bits(opc) >= MO_64) {
+             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
+         } else if (scratch_addend) {
+             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
+         break;
+     case MO_64:
++        /* We used pair allocation for datalo, so already should be aligned. */
++        tcg_debug_assert((datalo & 1) == 0);
++        tcg_debug_assert(datahi == datalo + 1);
+         /* STRD requires alignment; double-check that. */
+-        if (get_alignment_bits(opc) >= MO_64
+-            && (datalo & 1) == 0 && datahi == datalo + 1) {
++        if (get_alignment_bits(opc) >= MO_64) {
+             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
+         } else {
+             tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_qemu_ld_i32:
+         return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
+     case INDEX_op_qemu_ld_i64:
+-        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
++        return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
+     case INDEX_op_qemu_st_i32:
+         return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
+     case INDEX_op_qemu_st_i64:
+-        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
++        return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);
+     case INDEX_op_st_vec:
+         return C_O0_I2(w, r);
+--
+.34.1

-New patch
+[PULL v2 03/15] common-user/host/ppc: Implement safe-syscall.inc.S
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
+Message-Id: <20220729172141.1789105-2-richard.henderson@linaro.org>
+---
+ common-user/host/ppc/safe-syscall.inc.S | 107 ++++++++++++++++++++++++
+file changed, 107 insertions(+)
+ create mode 100644 common-user/host/ppc/safe-syscall.inc.S
+diff --git a/common-user/host/ppc/safe-syscall.inc.S b/common-user/host/ppc/safe-syscall.inc.S
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/common-user/host/ppc/safe-syscall.inc.S
+@@ -XXX,XX +XXX,XX @@
++/*
++ * safe-syscall.inc.S : host-specific assembly fragment
++ * to handle signals occurring at the same time as system calls.
++ * This is intended to be included by common-user/safe-syscall.S
++ *
++ * Copyright (C) 2022 Linaro, Ltd.
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++/*
++ * Standardize on the _CALL_FOO symbols used by GCC:
++ * Apple XCode does not define _CALL_DARWIN.
++ * Clang defines _CALL_ELF (64-bit) but not _CALL_SYSV (32-bit).
++ */
++#if !defined(_CALL_SYSV) && \
++    !defined(_CALL_DARWIN) && \
++    !defined(_CALL_AIX) && \
++    !defined(_CALL_ELF)
++# if defined(__APPLE__)
++#  define _CALL_DARWIN
++# elif defined(__ELF__) && TCG_TARGET_REG_BITS == 32
++#  define _CALL_SYSV
++# else
++#  error "Unknown ABI"
++# endif
++#endif
++
++#ifndef _CALL_SYSV
++# error "Unsupported ABI"
++#endif
++
++
++        .global safe_syscall_base
++        .global safe_syscall_start
++        .global safe_syscall_end
++        .type   safe_syscall_base, @function
++
++        .text
++
++        /*
++         * This is the entry point for making a system call. The calling
++         * convention here is that of a C varargs function with the
++         * first argument an 'int *' to the signal_pending flag, the
++         * second one the system call number (as a 'long'), and all further
++         * arguments being syscall arguments (also 'long').
++         */
++safe_syscall_base:
++        .cfi_startproc
++        stwu    1, -8(1)
++        .cfi_def_cfa_offset 8
++        stw     30, 4(1)
++        .cfi_offset 30, -4
++
++        /*
++         * We enter with r3 == &signal_pending
++         *               r4 == syscall number
++         *               r5 ... r10 == syscall arguments
++         *               and return the result in r3
++         * and the syscall instruction needs
++         *               r0 == syscall number
++         *               r3 ... r8 == syscall arguments
++         *               and returns the result in r3
++         * Shuffle everything around appropriately.
++         */
++        mr      30, 3           /* signal_pending */
++        mr      0, 4            /* syscall number */
++        mr      3, 5            /* syscall arguments */
++        mr      4, 6
++        mr      5, 7
++        mr      6, 8
++        mr      7, 9
++        mr      8, 10
++
++        /*
++         * This next sequence of code works in conjunction with the
++         * rewind_if_safe_syscall_function(). If a signal is taken
++         * and the interrupted PC is anywhere between 'safe_syscall_start'
++         * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
++         * The code sequence must therefore be able to cope with this, and
++         * the syscall instruction must be the final one in the sequence.
++         */
++safe_syscall_start:
++        /* if signal_pending is non-zero, don't do the call */
++        lwz     12, 0(30)
++        cmpwi   0, 12, 0
++        bne-    2f
++        sc
++safe_syscall_end:
++        /* code path when we did execute the syscall */
++        lwz     30, 4(1)        /* restore r30 */
++        addi    1, 1, 8         /* restore stack */
++        .cfi_restore 30
++        .cfi_def_cfa_offset 0
++        bnslr+                  /* return on success */
++        b       safe_syscall_set_errno_tail
++
++        /* code path when we didn't execute the syscall */
++2:      lwz     30, 4(1)
++        addi    1, 1, 8
++        addi    3, 0, QEMU_ERESTARTSYS
++        b       safe_syscall_set_errno_tail
++
++        .cfi_endproc
++
++        .size   safe_syscall_base, .-safe_syscall_base
+--
+.34.1

-New patch
+[PULL v2 04/15] linux-user: Implement host/ppc/host-signal.h
+This commit re-enables ppc32 as a linux-user host,
+as existence of the directory is noted by configure.
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1097
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
+Message-Id: <20220729172141.1789105-3-richard.henderson@linaro.org>
+---
+ linux-user/include/host/ppc/host-signal.h | 39 +++++++++++++++++++++++
+file changed, 39 insertions(+)
+ create mode 100644 linux-user/include/host/ppc/host-signal.h
+diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/linux-user/include/host/ppc/host-signal.h
+@@ -XXX,XX +XXX,XX @@
++/*
++ * host-signal.h: signal info dependent on the host architecture
++ *
++ * Copyright (c) 2022 Linaro Ltd.
++ *
++ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#ifndef PPC_HOST_SIGNAL_H
++#define PPC_HOST_SIGNAL_H
++
++#include <asm/ptrace.h>
++
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
++{
++    return uc->uc_mcontext.regs->nip;
++}
++
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
++{
++    uc->uc_mcontext.regs->nip = pc;
++}
++
++static inline void *host_signal_mask(host_sigcontext *uc)
++{
++    return &uc->uc_sigmask;
++}
++
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
++{
++    return uc->uc_mcontext.regs->trap != 0x400
++        && (uc->uc_mcontext.regs->dsisr & 0x02000000);
++}
++
++#endif
+--
+.34.1

-[PULL 4/4] common-user: Fix tail calls to safe_syscall_set_errno_tail
+[PULL v2 05/15] tcg: Mark tcg helpers noinline to avoid an issue with LTO
-For the ABIs in which the syscall return register is not
+Marking helpers __attribute__((noinline)) prevents an issue
-also the first function argument register, move the errno
+with GCC's ipa-split pass under --enable-lto.
 value into the correct place.
-Fixes: a3310c0397e2 ("linux-user: Move syscall error detection into safe_syscall_base")
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1454
-Reported-by: Laurent Vivier <laurent@vivier.eu>
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+Tested-by: Idan Horowitz <idan.horowitz@gmail.com>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Message-Id: <20220104190454.542225-1-richard.henderson@linaro.org>
 ---
- common-user/host/i386/safe-syscall.inc.S   | 1 +
+ include/exec/helper-proto.h | 32 ++++++++++++++++++++++++--------
- common-user/host/mips/safe-syscall.inc.S   | 1 +
+file changed, 24 insertions(+), 8 deletions(-)
  common-user/host/x86_64/safe-syscall.inc.S | 1 +
 files changed, 3 insertions(+)
-diff --git a/common-user/host/i386/safe-syscall.inc.S b/common-user/host/i386/safe-syscall.inc.S
+diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
 index XXXXXXX..XXXXXXX 100644
---- a/common-user/host/i386/safe-syscall.inc.S
+--- a/include/exec/helper-proto.h
-+++ b/common-user/host/i386/safe-syscall.inc.S
++++ b/include/exec/helper-proto.h
-@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
+@@ -XXX,XX +XXX,XX @@
-         pop     %ebp
-         .cfi_adjust_cfa_offset -4
+ #include "exec/helper-head.h"
-         .cfi_restore ebp
-+        mov     %eax, (%esp)
++/*
-         jmp     safe_syscall_set_errno_tail
++ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
++ * decides to split out the noreturn code paths that raise an exception,
-         .cfi_endproc
++ * taking the __builtin_return_address() along into the new function,
-diff --git a/common-user/host/mips/safe-syscall.inc.S b/common-user/host/mips/safe-syscall.inc.S
++ * where it no longer computes a value that returns to TCG generated code.
-index XXXXXXX..XXXXXXX 100644
++ * Despite the name, the noinline attribute affects splitter, so this
---- a/common-user/host/mips/safe-syscall.inc.S
++ * prevents the optimization in question.  Given that helpers should not
-+++ b/common-user/host/mips/safe-syscall.inc.S
++ * otherwise be called directly, this should not have any other visible effect.
-@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
++ *
-:      USE_ALT_CP(t0)
++ * See https://gitlab.com/qemu-project/qemu/-/issues/1454
-         SETUP_GPX(t1)
++ */
-         SETUP_GPX64(t0, t1)
++#define DEF_HELPER_ATTR  __attribute__((noinline))
-+        move    a0, v0
++
-         PTR_LA  t9, safe_syscall_set_errno_tail
+ #define DEF_HELPER_FLAGS_0(name, flags, ret) \
-         jr      t9
+-dh_ctype(ret) HELPER(name) (void);
++dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
-diff --git a/common-user/host/x86_64/safe-syscall.inc.S b/common-user/host/x86_64/safe-syscall.inc.S
-index XXXXXXX..XXXXXXX 100644
+ #define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
---- a/common-user/host/x86_64/safe-syscall.inc.S
+-dh_ctype(ret) HELPER(name) (dh_ctype(t1));
-+++ b/common-user/host/x86_64/safe-syscall.inc.S
++dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
-@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
-:      pop     %rbp
+ #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-         .cfi_def_cfa_offset 8
+-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
-         .cfi_restore rbp
++dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
-+        mov     %eax, %edi
-         jmp     safe_syscall_set_errno_tail
+ #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
-         .cfi_endproc
+-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3));
++dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
 +                            dh_ctype(t3)) DEF_HELPER_ATTR;
  #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                                   dh_ctype(t4));
 +                            dh_ctype(t4)) DEF_HELPER_ATTR;
  #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4), dh_ctype(t5));
 +                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
  #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
 +                            dh_ctype(t4), dh_ctype(t5), \
 +                            dh_ctype(t6)) DEF_HELPER_ATTR;
  #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
  dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                              dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
 -                            dh_ctype(t7));
 +                            dh_ctype(t7)) DEF_HELPER_ATTR;
  #define IN_HELPER_PROTO
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
  #undef DEF_HELPER_FLAGS_7
 +#undef DEF_HELPER_ATTR
  #endif /* HELPER_PROTO_H */
 --
-.25.1
+.34.1

-[PULL 3/4] sysemu: Cleanup qemu_run_machine_init_done_notifiers()
+[PULL v2 06/15] target/loongarch: Enable the disassembler for host tcg
-From: Xiaoyao Li <xiaoyao.li@intel.com>
+Reuse the decodetree based disassembler from
 target/loongarch/ for tcg/loongarch64/.
-Remove qemu_run_machine_init_done_notifiers() since no implementation
+The generation of decode-insns.c.inc into ./libcommon.fa.p/ could
-and user.
+eventually result in conflict, if any other host requires the same
 trick, but this is good enough for now.
-Fixes: f66dc8737c9 ("vl: move all generic initialization out of vl.c")
+Reviewed-by: WANG Xuerui <git@xen0n.name>
-Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20220104024136.1433545-1-xiaoyao.li@intel.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/sysemu/sysemu.h | 1 -
+ disas.c                      | 2 ++
-file changed, 1 deletion(-)
+ target/loongarch/meson.build | 3 ++-
 files changed, 4 insertions(+), 1 deletion(-)
-diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
+diff --git a/disas.c b/disas.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/sysemu/sysemu.h
+--- a/disas.c
-+++ b/include/sysemu/sysemu.h
++++ b/disas.c
-@@ -XXX,XX +XXX,XX @@ extern bool qemu_uuid_set;
+@@ -XXX,XX +XXX,XX @@ static void initialize_debug_host(CPUDebug *s)
- void qemu_add_exit_notifier(Notifier *notify);
+     s->info.cap_insn_split = 6;
- void qemu_remove_exit_notifier(Notifier *notify);
+ #elif defined(__hppa__)
+     s->info.print_insn = print_insn_hppa;
--void qemu_run_machine_init_done_notifiers(void);
++#elif defined(__loongarch__)
- void qemu_add_machine_init_done_notifier(Notifier *notify);
++    s->info.print_insn = print_insn_loongarch;
- void qemu_remove_machine_init_done_notifier(Notifier *notify);
+ #endif
+ }
 diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/meson.build
 +++ b/target/loongarch/meson.build
@@ -XXX,XX +XXX,XX @@ gen = decodetree.process('insns.decode')
  loongarch_ss = ss.source_set()
  loongarch_ss.add(files(
    'cpu.c',
 -  'disas.c',
  ))
  loongarch_tcg_ss = ss.source_set()
  loongarch_tcg_ss.add(gen)
@@ -XXX,XX +XXX,XX @@ loongarch_softmmu_ss.add(files(
    'iocsr_helper.c',
  ))
 +common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen])
 +
  loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss])
  target_arch += {'loongarch': loongarch_ss}
 --
-.25.1
+.34.1

-New patch
+[PULL v2 07/15] target/loongarch: Disassemble jirl properly
+While jirl shares the same instruction format as bne etc,
+it is not assembled the same.  In particular, rd is printed
+first not second and the immediate is not pc-relative.
+Decode into the arg_rr_i structure, which prints correctly.
+This changes the "offs" member to "imm", so update translate to match.
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ target/loongarch/insns.decode                  | 3 ++-
+ target/loongarch/disas.c                       | 2 +-
+ target/loongarch/insn_trans/trans_branch.c.inc | 2 +-
+files changed, 4 insertions(+), 3 deletions(-)
+diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
+index XXXXXXX..XXXXXXX 100644
+--- a/target/loongarch/insns.decode
++++ b/target/loongarch/insns.decode
+@@ -XXX,XX +XXX,XX @@
+ @rr_ui12                 .... ...... imm:12 rj:5 rd:5    &rr_i
+ @rr_i14s2         .... ....  .............. rj:5 rd:5    &rr_i imm=%i14s2
+ @rr_i16                     .... .. imm:s16 rj:5 rd:5    &rr_i
++@rr_i16s2         .... ..  ................ rj:5 rd:5    &rr_i imm=%offs16
+ @hint_r_i12           .... ...... imm:s12 rj:5 hint:5    &hint_r_i
+ @rrr_sa2p1        .... ........ ... .. rk:5 rj:5 rd:5    &rrr_sa  sa=%sa2p1
+ @rrr_sa2        .... ........ ... sa:2 rk:5 rj:5 rd:5    &rrr_sa
+@@ -XXX,XX +XXX,XX @@ beqz            0100 00 ................ ..... .....     @r_offs21
+ bnez            0100 01 ................ ..... .....     @r_offs21
+ bceqz           0100 10 ................ 00 ... .....    @c_offs21
+ bcnez           0100 10 ................ 01 ... .....    @c_offs21
+-jirl            0100 11 ................ ..... .....     @rr_offs16
++jirl            0100 11 ................ ..... .....     @rr_i16s2
+ b               0101 00 ..........................       @offs26
+ bl              0101 01 ..........................       @offs26
+ beq             0101 10 ................ ..... .....     @rr_offs16
+diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/loongarch/disas.c
++++ b/target/loongarch/disas.c
+@@ -XXX,XX +XXX,XX @@ INSN(beqz,         r_offs)
+ INSN(bnez,         r_offs)
+ INSN(bceqz,        c_offs)
+ INSN(bcnez,        c_offs)
+-INSN(jirl,         rr_offs)
++INSN(jirl,         rr_i)
+ INSN(b,            offs)
+ INSN(bl,           offs)
+ INSN(beq,          rr_offs)
+diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/insn_trans/trans_branch.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/target/loongarch/insn_trans/trans_branch.c.inc
++++ b/target/loongarch/insn_trans/trans_branch.c.inc
+@@ -XXX,XX +XXX,XX @@ static bool trans_jirl(DisasContext *ctx, arg_jirl *a)
+     TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
+     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
+-    tcg_gen_addi_tl(cpu_pc, src1, a->offs);
++    tcg_gen_addi_tl(cpu_pc, src1, a->imm);
+     tcg_gen_movi_tl(dest, ctx->base.pc_next + 4);
+     gen_set_gpr(a->rd, dest, EXT_NONE);
+     tcg_gen_lookup_and_goto_ptr();
+--
+.34.1

-New patch
+[PULL v2 08/15] target/loongarch: Disassemble pcadd* addresses
+Print both the raw field and the resolved pc-relative
+address, as we do for branches.
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ target/loongarch/disas.c | 37 +++++++++++++++++++++++++++++++++----
+file changed, 33 insertions(+), 4 deletions(-)
+diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/loongarch/disas.c
++++ b/target/loongarch/disas.c
+@@ -XXX,XX +XXX,XX @@ INSN(fsel,         fffc)
+ INSN(addu16i_d,    rr_i)
+ INSN(lu12i_w,      r_i)
+ INSN(lu32i_d,      r_i)
+-INSN(pcaddi,       r_i)
+-INSN(pcalau12i,    r_i)
+-INSN(pcaddu12i,    r_i)
+-INSN(pcaddu18i,    r_i)
+ INSN(ll_w,         rr_i)
+ INSN(sc_w,         rr_i)
+ INSN(ll_d,         rr_i)
+@@ -XXX,XX +XXX,XX @@ static bool trans_fcmp_cond_##suffix(DisasContext *ctx, \
+ FCMP_INSN(s)
+ FCMP_INSN(d)
++
++#define PCADD_INSN(name)                                        \
++static bool trans_##name(DisasContext *ctx, arg_##name *a)      \
++{                                                               \
++    output(ctx, #name, "r%d, %d # 0x%" PRIx64,                  \
++           a->rd, a->imm, gen_##name(ctx->pc, a->imm));         \
++    return true;                                                \
++}
++
++static uint64_t gen_pcaddi(uint64_t pc, int imm)
++{
++    return pc + (imm << 2);
++}
++
++static uint64_t gen_pcalau12i(uint64_t pc, int imm)
++{
++    return (pc + (imm << 12)) & ~0xfff;
++}
++
++static uint64_t gen_pcaddu12i(uint64_t pc, int imm)
++{
++    return pc + (imm << 12);
++}
++
++static uint64_t gen_pcaddu18i(uint64_t pc, int imm)
++{
++    return pc + ((uint64_t)(imm) << 18);
++}
++
++PCADD_INSN(pcaddi)
++PCADD_INSN(pcalau12i)
++PCADD_INSN(pcaddu12i)
++PCADD_INSN(pcaddu18i)
+--
+.34.1

-New patch
+[PULL v2 09/15] tcg/loongarch64: Optimize immediate loading
+From: Rui Wang <wangrui@loongson.cn>
+diff:
+  Imm                 Before                  After
+  0000000000000000    addi.w  rd, zero, 0     addi.w  rd, zero, 0
+                      lu52i.d rd, zero, 0
+  00000000fffff800    lu12i.w rd, -1          addi.w  rd, zero, -2048
+                      ori     rd, rd, 2048    lu32i.d rd, 0
+                      lu32i.d rd, 0
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Rui Wang <wangrui@loongson.cn>
+Message-Id: <20221107144713.845550-1-wangrui@loongson.cn>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target.c.inc | 35 +++++++++++---------------------
+file changed, 12 insertions(+), 23 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+     return true;
+ }
+-static bool imm_part_needs_loading(bool high_bits_are_ones,
+-                                   tcg_target_long part)
+-{
+-    if (high_bits_are_ones) {
+-        return part != -1;
+-    } else {
+-        return part != 0;
+-    }
+-}
+-
+ /* Loads a 32-bit immediate into rd, sign-extended.  */
+ static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
+ {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
+     tcg_target_long hi12 = sextreg(val, 12, 20);
+     /* Single-instruction cases.  */
+-    if (lo == val) {
+-        /* val fits in simm12: addi.w rd, zero, val */
+-        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+-        return;
+-    }
+-    if (0x800 <= val && val <= 0xfff) {
++    if (hi12 == 0) {
+         /* val fits in uimm12: ori rd, zero, val */
+         tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
+         return;
+     }
++    if (hi12 == sextreg(lo, 12, 20)) {
++        /* val fits in simm12: addi.w rd, zero, val */
++        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
++        return;
++    }
+     /* High bits must be set; load with lu12i.w + optional ori.  */
+     tcg_out_opc_lu12i_w(s, rd, hi12);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+     intptr_t pc_offset;
+     tcg_target_long val_lo, val_hi, pc_hi, offset_hi;
+-    tcg_target_long hi32, hi52;
+-    bool rd_high_bits_are_ones;
++    tcg_target_long hi12, hi32, hi52;
+     /* Value fits in signed i32.  */
+     if (type == TCG_TYPE_I32 || val == (int32_t)val) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+         return;
+     }
++    hi12 = sextreg(val, 12, 20);
+     hi32 = sextreg(val, 32, 20);
+     hi52 = sextreg(val, 52, 12);
+     /* Single cu52i.d case.  */
+-    if (ctz64(val) >= 52) {
++    if ((hi52 != 0) && (ctz64(val) >= 52)) {
+         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
+         return;
+     }
+     /* Slow path.  Initialize the low 32 bits, then concat high bits.  */
+     tcg_out_movi_i32(s, rd, val);
+-    rd_high_bits_are_ones = (int32_t)val < 0;
+-    if (imm_part_needs_loading(rd_high_bits_are_ones, hi32)) {
++    /* Load hi32 and hi52 explicitly when they are unexpected values. */
++    if (hi32 != sextreg(hi12, 20, 20)) {
+         tcg_out_opc_cu32i_d(s, rd, hi32);
+-        rd_high_bits_are_ones = hi32 < 0;
+     }
+-    if (imm_part_needs_loading(rd_high_bits_are_ones, hi52)) {
++    if (hi52 != sextreg(hi32, 20, 12)) {
+         tcg_out_opc_cu52i_d(s, rd, rd, hi52);
+     }
+ }
+--
+.34.1

-[PULL 1/4] tcg/optimize: Fix folding of vector ops
+[PULL v2 10/15] tcg/loongarch64: Update tcg-insn-defs.c.inc
-Bitwise operations are easy to fold, because the operation is
+Regenerate with ADDU16I included:
 identical regardless of element size.  But add and sub need
 extra element size info that is not currently propagated.
-Fixes: 2f9f08ba43d
+   $ cd loongarch-opcodes/scripts/go
-Cc: qemu-stable@nongnu.org
+   $ go run ./genqemutcgdefs > $QEMU/tcg/loongarch64/tcg-insn-defs.c.inc
-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/799
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: WANG Xuerui <git@xen0n.name>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
+ tcg/loongarch64/tcg-insn-defs.c.inc | 10 +++++++++-
-file changed, 38 insertions(+), 11 deletions(-)
+file changed, 9 insertions(+), 1 deletion(-)
-diff --git a/tcg/optimize.c b/tcg/optimize.c
+diff --git a/tcg/loongarch64/tcg-insn-defs.c.inc b/tcg/loongarch64/tcg-insn-defs.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/optimize.c
+--- a/tcg/loongarch64/tcg-insn-defs.c.inc
-+++ b/tcg/optimize.c
++++ b/tcg/loongarch64/tcg-insn-defs.c.inc
-@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
+@@ -XXX,XX +XXX,XX @@
-     CASE_OP_32_64(mul):
+  *
-         return x * y;
+  * This file is auto-generated by genqemutcgdefs from
+  * https://github.com/loongson-community/loongarch-opcodes,
--    CASE_OP_32_64(and):
+- * from commit 961f0c60f5b63e574d785995600c71ad5413fdc4.
-+    CASE_OP_32_64_VEC(and):
++ * from commit 25ca7effe9d88101c1cf96c4005423643386d81f.
-         return x & y;
+  * DO NOT EDIT.
+  */
--    CASE_OP_32_64(or):
-+    CASE_OP_32_64_VEC(or):
+@@ -XXX,XX +XXX,XX @@ typedef enum {
-         return x | y;
+     OPC_ANDI = 0x03400000,
+     OPC_ORI = 0x03800000,
--    CASE_OP_32_64(xor):
+     OPC_XORI = 0x03c00000,
-+    CASE_OP_32_64_VEC(xor):
++    OPC_ADDU16I_D = 0x10000000,
-         return x ^ y;
+     OPC_LU12I_W = 0x14000000,
+     OPC_CU32I_D = 0x16000000,
-     case INDEX_op_shl_i32:
+     OPC_PCADDU2I = 0x18000000,
-@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
+@@ -XXX,XX +XXX,XX @@ tcg_out_opc_xori(TCGContext *s, TCGReg d, TCGReg j, uint32_t uk12)
-     case INDEX_op_rotl_i64:
+     tcg_out32(s, encode_djuk12_insn(OPC_XORI, d, j, uk12));
          return rol64(x, y & 63);
 -    CASE_OP_32_64(not):
 +    CASE_OP_32_64_VEC(not):
          return ~x;
      CASE_OP_32_64(neg):
          return -x;
 -    CASE_OP_32_64(andc):
 +    CASE_OP_32_64_VEC(andc):
          return x & ~y;
 -    CASE_OP_32_64(orc):
 +    CASE_OP_32_64_VEC(orc):
          return x | ~y;
      CASE_OP_32_64(eqv):
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
-+static bool fold_commutative(OptContext *ctx, TCGOp *op)
++/* Emits the `addu16i.d d, j, sk16` instruction.  */
 +static void __attribute__((unused))
 +tcg_out_opc_addu16i_d(TCGContext *s, TCGReg d, TCGReg j, int32_t sk16)
 +{
-+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
++    tcg_out32(s, encode_djsk16_insn(OPC_ADDU16I_D, d, j, sk16));
 +    return false;
 +}
 +
- static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+ /* Emits the `lu12i.w d, sj20` instruction.  */
- {
+ static void __attribute__((unused))
-     swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+ tcg_out_opc_lu12i_w(TCGContext *s, TCGReg d, int32_t sj20)
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
      return false;
  }
 +/* We cannot as yet do_constant_folding with vectors. */
 +static bool fold_add_vec(OptContext *ctx, TCGOp *op)
 +{
 +    if (fold_commutative(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
 +}
 +
  static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  {
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
@@ -XXX,XX +XXX,XX @@ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
      return false;
  }
 -static bool fold_sub(OptContext *ctx, TCGOp *op)
 +/* We cannot as yet do_constant_folding with vectors. */
 +static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0) ||
 +    if (fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_sub_to_neg(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_sub(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op) || fold_sub_vec(ctx, op);
 +}
 +
  static bool fold_sub2(OptContext *ctx, TCGOp *op)
  {
      return fold_addsub2(ctx, op, false);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
           * Sorted alphabetically by opcode as much as possible.
           */
          switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 +        CASE_OP_32_64(add):
              done = fold_add(&ctx, op);
              break;
 +        case INDEX_op_add_vec:
 +            done = fold_add_vec(&ctx, op);
 +            break;
          CASE_OP_32_64(add2):
              done = fold_add2(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(sextract):
              done = fold_sextract(&ctx, op);
              break;
 -        CASE_OP_32_64_VEC(sub):
 +        CASE_OP_32_64(sub):
              done = fold_sub(&ctx, op);
              break;
 +        case INDEX_op_sub_vec:
 +            done = fold_sub_vec(&ctx, op);
 +            break;
          CASE_OP_32_64(sub2):
              done = fold_sub2(&ctx, op);
              break;
 --
-.25.1
+.34.1

-New patch
+[PULL v2 11/15] tcg/loongarch64: Introduce tcg_out_addi
+Adjust the constraints to allow any int32_t for immediate
+addition.  Split immediate adds into addu16i + addi, which
+covers quite a lot of the immediate space.  For the hole in
+the middle, load the constant into TMP0 instead.
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target-con-set.h |  4 +-
+ tcg/loongarch64/tcg-target-con-str.h |  2 +-
+ tcg/loongarch64/tcg-target.c.inc     | 57 ++++++++++++++++++++++++----
+files changed, 53 insertions(+), 10 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target-con-set.h
++++ b/tcg/loongarch64/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O1_I1(r, L)
+ C_O1_I2(r, r, rC)
+ C_O1_I2(r, r, ri)
+ C_O1_I2(r, r, rI)
++C_O1_I2(r, r, rJ)
+ C_O1_I2(r, r, rU)
+ C_O1_I2(r, r, rW)
+ C_O1_I2(r, r, rZ)
+ C_O1_I2(r, 0, rZ)
+-C_O1_I2(r, rZ, rN)
++C_O1_I2(r, rZ, ri)
++C_O1_I2(r, rZ, rJ)
+ C_O1_I2(r, rZ, rZ)
+diff --git a/tcg/loongarch64/tcg-target-con-str.h b/tcg/loongarch64/tcg-target-con-str.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target-con-str.h
++++ b/tcg/loongarch64/tcg-target-con-str.h
+@@ -XXX,XX +XXX,XX @@ REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
+  * CONST(letter, TCG_CT_CONST_* bit set)
+  */
+ CONST('I', TCG_CT_CONST_S12)
+-CONST('N', TCG_CT_CONST_N12)
++CONST('J', TCG_CT_CONST_S32)
+ CONST('U', TCG_CT_CONST_U12)
+ CONST('Z', TCG_CT_CONST_ZERO)
+ CONST('C', TCG_CT_CONST_C12)
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {
+ #define TCG_CT_CONST_ZERO  0x100
+ #define TCG_CT_CONST_S12   0x200
+-#define TCG_CT_CONST_N12   0x400
++#define TCG_CT_CONST_S32   0x400
+ #define TCG_CT_CONST_U12   0x800
+ #define TCG_CT_CONST_C12   0x1000
+ #define TCG_CT_CONST_WSZ   0x2000
+@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
+     if ((ct & TCG_CT_CONST_S12) && val == sextreg(val, 0, 12)) {
+         return true;
+     }
+-    if ((ct & TCG_CT_CONST_N12) && -val == sextreg(-val, 0, 12)) {
++    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
+         return true;
+     }
+     if ((ct & TCG_CT_CONST_U12) && val >= 0 && val <= 0xfff) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+     }
+ }
++static void tcg_out_addi(TCGContext *s, TCGType type, TCGReg rd,
++                         TCGReg rs, tcg_target_long imm)
++{
++    tcg_target_long lo12 = sextreg(imm, 0, 12);
++    tcg_target_long hi16 = sextreg(imm - lo12, 16, 16);
++
++    /*
++     * Note that there's a hole in between hi16 and lo12:
++     *
++     *       3                   2                   1                   0
++     *     1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
++     * ...+-------------------------------+-------+-----------------------+
++     *    |             hi16              |       |          lo12         |
++     * ...+-------------------------------+-------+-----------------------+
++     *
++     * For bits within that hole, it's more efficient to use LU12I and ADD.
++     */
++    if (imm == (hi16 << 16) + lo12) {
++        if (hi16) {
++            tcg_out_opc_addu16i_d(s, rd, rs, hi16);
++            rs = rd;
++        }
++        if (type == TCG_TYPE_I32) {
++            tcg_out_opc_addi_w(s, rd, rs, lo12);
++        } else if (lo12) {
++            tcg_out_opc_addi_d(s, rd, rs, lo12);
++        } else {
++            tcg_out_mov(s, type, rd, rs);
++        }
++    } else {
++        tcg_out_movi(s, type, TCG_REG_TMP0, imm);
++        if (type == TCG_TYPE_I32) {
++            tcg_out_opc_add_w(s, rd, rs, TCG_REG_TMP0);
++        } else {
++            tcg_out_opc_add_d(s, rd, rs, TCG_REG_TMP0);
++        }
++    }
++}
++
+ static void tcg_out_ext8u(TCGContext *s, TCGReg ret, TCGReg arg)
+ {
+     tcg_out_opc_andi(s, ret, arg, 0xff);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_add_i32:
+         if (c2) {
+-            tcg_out_opc_addi_w(s, a0, a1, a2);
++            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, a2);
+         } else {
+             tcg_out_opc_add_w(s, a0, a1, a2);
+         }
+         break;
+     case INDEX_op_add_i64:
+         if (c2) {
+-            tcg_out_opc_addi_d(s, a0, a1, a2);
++            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, a2);
+         } else {
+             tcg_out_opc_add_d(s, a0, a1, a2);
+         }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+     case INDEX_op_sub_i32:
+         if (c2) {
+-            tcg_out_opc_addi_w(s, a0, a1, -a2);
++            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, -a2);
+         } else {
+             tcg_out_opc_sub_w(s, a0, a1, a2);
+         }
+         break;
+     case INDEX_op_sub_i64:
+         if (c2) {
+-            tcg_out_opc_addi_d(s, a0, a1, -a2);
++            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, -a2);
+         } else {
+             tcg_out_opc_sub_d(s, a0, a1, a2);
+         }
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+         return C_O1_I2(r, r, ri);
+     case INDEX_op_add_i32:
++        return C_O1_I2(r, r, ri);
+     case INDEX_op_add_i64:
+-        return C_O1_I2(r, r, rI);
++        return C_O1_I2(r, r, rJ);
+     case INDEX_op_and_i32:
+     case INDEX_op_and_i64:
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+         return C_O1_I2(r, 0, rZ);
+     case INDEX_op_sub_i32:
++        return C_O1_I2(r, rZ, ri);
+     case INDEX_op_sub_i64:
+-        return C_O1_I2(r, rZ, rN);
++        return C_O1_I2(r, rZ, rJ);
+     case INDEX_op_mul_i32:
+     case INDEX_op_mul_i64:
+--
+.34.1

-New patch
+[PULL v2 12/15] tcg/loongarch64: Improve setcond expansion
+Split out a helper function, tcg_out_setcond_int, which
+does not always produce the complete boolean result, but
+returns the result register plus flags describing how to complete it.
+Accept all int32_t as constant input, so that LE/GT can
+adjust the constant to LT.
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target.c.inc | 165 +++++++++++++++++++++----------
+file changed, 115 insertions(+), 50 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_clzctz(TCGContext *s, LoongArchInsn opc,
+     tcg_out_opc_or(s, a0, TCG_REG_TMP0, a0);
+ }
+-static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+-                            TCGReg arg1, TCGReg arg2, bool c2)
+-{
+-    TCGReg tmp;
++#define SETCOND_INV    TCG_TARGET_NB_REGS
++#define SETCOND_NEZ    (SETCOND_INV << 1)
++#define SETCOND_FLAGS  (SETCOND_INV | SETCOND_NEZ)
+-    if (c2) {
+-        tcg_debug_assert(arg2 == 0);
++static int tcg_out_setcond_int(TCGContext *s, TCGCond cond, TCGReg ret,
++                               TCGReg arg1, tcg_target_long arg2, bool c2)
++{
++    int flags = 0;
++
++    switch (cond) {
++    case TCG_COND_EQ:    /* -> NE  */
++    case TCG_COND_GE:    /* -> LT  */
++    case TCG_COND_GEU:   /* -> LTU */
++    case TCG_COND_GT:    /* -> LE  */
++    case TCG_COND_GTU:   /* -> LEU */
++        cond = tcg_invert_cond(cond);
++        flags ^= SETCOND_INV;
++        break;
++    default:
++        break;
+     }
+     switch (cond) {
+-    case TCG_COND_EQ:
+-        if (c2) {
+-            tmp = arg1;
+-        } else {
+-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
+-            tmp = ret;
+-        }
+-        tcg_out_opc_sltui(s, ret, tmp, 1);
+-        break;
+-    case TCG_COND_NE:
+-        if (c2) {
+-            tmp = arg1;
+-        } else {
+-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
+-            tmp = ret;
+-        }
+-        tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
+-        break;
+-    case TCG_COND_LT:
+-        tcg_out_opc_slt(s, ret, arg1, arg2);
+-        break;
+-    case TCG_COND_GE:
+-        tcg_out_opc_slt(s, ret, arg1, arg2);
+-        tcg_out_opc_xori(s, ret, ret, 1);
+-        break;
+     case TCG_COND_LE:
+-        tcg_out_setcond(s, TCG_COND_GE, ret, arg2, arg1, false);
+-        break;
+-    case TCG_COND_GT:
+-        tcg_out_setcond(s, TCG_COND_LT, ret, arg2, arg1, false);
+-        break;
+-    case TCG_COND_LTU:
+-        tcg_out_opc_sltu(s, ret, arg1, arg2);
+-        break;
+-    case TCG_COND_GEU:
+-        tcg_out_opc_sltu(s, ret, arg1, arg2);
+-        tcg_out_opc_xori(s, ret, ret, 1);
+-        break;
+     case TCG_COND_LEU:
+-        tcg_out_setcond(s, TCG_COND_GEU, ret, arg2, arg1, false);
++        /*
++         * If we have a constant input, the most efficient way to implement
++         * LE is by adding 1 and using LT.  Watch out for wrap around for LEU.
++         * We don't need to care for this for LE because the constant input
++         * is still constrained to int32_t, and INT32_MAX+1 is representable
++         * in the 64-bit temporary register.
++         */
++        if (c2) {
++            if (cond == TCG_COND_LEU) {
++                /* unsigned <= -1 is true */
++                if (arg2 == -1) {
++                    tcg_out_movi(s, TCG_TYPE_REG, ret, !(flags & SETCOND_INV));
++                    return ret;
++                }
++                cond = TCG_COND_LTU;
++            } else {
++                cond = TCG_COND_LT;
++            }
++            arg2 += 1;
++        } else {
++            TCGReg tmp = arg2;
++            arg2 = arg1;
++            arg1 = tmp;
++            cond = tcg_swap_cond(cond);    /* LE -> GE */
++            cond = tcg_invert_cond(cond);  /* GE -> LT */
++            flags ^= SETCOND_INV;
++        }
+         break;
+-    case TCG_COND_GTU:
+-        tcg_out_setcond(s, TCG_COND_LTU, ret, arg2, arg1, false);
++    default:
+         break;
++    }
++
++    switch (cond) {
++    case TCG_COND_NE:
++        flags |= SETCOND_NEZ;
++        if (!c2) {
++            tcg_out_opc_xor(s, ret, arg1, arg2);
++        } else if (arg2 == 0) {
++            ret = arg1;
++        } else if (arg2 >= 0 && arg2 <= 0xfff) {
++            tcg_out_opc_xori(s, ret, arg1, arg2);
++        } else {
++            tcg_out_addi(s, TCG_TYPE_REG, ret, arg1, -arg2);
++        }
++        break;
++
++    case TCG_COND_LT:
++    case TCG_COND_LTU:
++        if (c2) {
++            if (arg2 >= -0x800 && arg2 <= 0x7ff) {
++                if (cond == TCG_COND_LT) {
++                    tcg_out_opc_slti(s, ret, arg1, arg2);
++                } else {
++                    tcg_out_opc_sltui(s, ret, arg1, arg2);
++                }
++                break;
++            }
++            tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_TMP0, arg2);
++            arg2 = TCG_REG_TMP0;
++        }
++        if (cond == TCG_COND_LT) {
++            tcg_out_opc_slt(s, ret, arg1, arg2);
++        } else {
++            tcg_out_opc_sltu(s, ret, arg1, arg2);
++        }
++        break;
++
+     default:
+         g_assert_not_reached();
+         break;
+     }
++
++    return ret | flags;
++}
++
++static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
++                            TCGReg arg1, tcg_target_long arg2, bool c2)
++{
++    int tmpflags = tcg_out_setcond_int(s, cond, ret, arg1, arg2, c2);
++
++    if (tmpflags != ret) {
++        TCGReg tmp = tmpflags & ~SETCOND_FLAGS;
++
++        switch (tmpflags & SETCOND_FLAGS) {
++        case SETCOND_INV:
++            /* Intermediate result is boolean: simply invert. */
++            tcg_out_opc_xori(s, ret, tmp, 1);
++            break;
++        case SETCOND_NEZ:
++            /* Intermediate result is zero/non-zero: test != 0. */
++            tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
++            break;
++        case SETCOND_NEZ | SETCOND_INV:
++            /* Intermediate result is zero/non-zero: test == 0. */
++            tcg_out_opc_sltui(s, ret, tmp, 1);
++            break;
++        default:
++            g_assert_not_reached();
++        }
++    }
+ }
+ /*
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_ctz_i64:
+         return C_O1_I2(r, r, rW);
+-    case INDEX_op_setcond_i32:
+-    case INDEX_op_setcond_i64:
+-        return C_O1_I2(r, r, rZ);
+-
+     case INDEX_op_deposit_i32:
+     case INDEX_op_deposit_i64:
+         /* Must deposit into the same register as input */
+         return C_O1_I2(r, 0, rZ);
+     case INDEX_op_sub_i32:
++    case INDEX_op_setcond_i32:
+         return C_O1_I2(r, rZ, ri);
+     case INDEX_op_sub_i64:
++    case INDEX_op_setcond_i64:
+         return C_O1_I2(r, rZ, rJ);
+     case INDEX_op_mul_i32:
+--
+.34.1

-New patch
+[PULL v2 13/15] tcg/loongarch64: Implement movcond
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target-con-set.h |  1 +
+ tcg/loongarch64/tcg-target.h         |  4 ++--
+ tcg/loongarch64/tcg-target.c.inc     | 33 ++++++++++++++++++++++++++++
+files changed, 36 insertions(+), 2 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target-con-set.h
++++ b/tcg/loongarch64/tcg-target-con-set.h
+@@ -XXX,XX +XXX,XX @@ C_O1_I2(r, 0, rZ)
+ C_O1_I2(r, rZ, ri)
+ C_O1_I2(r, rZ, rJ)
+ C_O1_I2(r, rZ, rZ)
++C_O1_I4(r, rZ, rJ, rZ, rZ)
+diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.h
++++ b/tcg/loongarch64/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+ #define TCG_TARGET_CALL_ARG_I64         TCG_CALL_ARG_NORMAL
+ /* optional instructions */
+-#define TCG_TARGET_HAS_movcond_i32      0
++#define TCG_TARGET_HAS_movcond_i32      1
+ #define TCG_TARGET_HAS_div_i32          1
+ #define TCG_TARGET_HAS_rem_i32          1
+ #define TCG_TARGET_HAS_div2_i32         0
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+ #define TCG_TARGET_HAS_qemu_st8_i32     0
+ /* 64-bit operations */
+-#define TCG_TARGET_HAS_movcond_i64      0
++#define TCG_TARGET_HAS_movcond_i64      1
+ #define TCG_TARGET_HAS_div_i64          1
+ #define TCG_TARGET_HAS_rem_i64          1
+ #define TCG_TARGET_HAS_div2_i64         0
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+     }
+ }
++static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
++                            TCGReg c1, tcg_target_long c2, bool const2,
++                            TCGReg v1, TCGReg v2)
++{
++    int tmpflags = tcg_out_setcond_int(s, cond, TCG_REG_TMP0, c1, c2, const2);
++    TCGReg t;
++
++    /* Standardize the test below to t != 0. */
++    if (tmpflags & SETCOND_INV) {
++        t = v1, v1 = v2, v2 = t;
++    }
++
++    t = tmpflags & ~SETCOND_FLAGS;
++    if (v1 == TCG_REG_ZERO) {
++        tcg_out_opc_masknez(s, ret, v2, t);
++    } else if (v2 == TCG_REG_ZERO) {
++        tcg_out_opc_maskeqz(s, ret, v1, t);
++    } else {
++        tcg_out_opc_masknez(s, TCG_REG_TMP2, v2, t); /* t ? 0 : v2 */
++        tcg_out_opc_maskeqz(s, TCG_REG_TMP1, v1, t); /* t ? v1 : 0 */
++        tcg_out_opc_or(s, ret, TCG_REG_TMP1, TCG_REG_TMP2);
++    }
++}
++
+ /*
+  * Branch helpers
+  */
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+         tcg_out_setcond(s, args[3], a0, a1, a2, c2);
+         break;
++    case INDEX_op_movcond_i32:
++    case INDEX_op_movcond_i64:
++        tcg_out_movcond(s, args[5], a0, a1, a2, c2, args[3], args[4]);
++        break;
++
+     case INDEX_op_ld8s_i32:
+     case INDEX_op_ld8s_i64:
+         tcg_out_ldst(s, OPC_LD_B, a0, a1, a2);
+@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
+     case INDEX_op_remu_i64:
+         return C_O1_I2(r, rZ, rZ);
++    case INDEX_op_movcond_i32:
++    case INDEX_op_movcond_i64:
++        return C_O1_I4(r, rZ, rJ, rZ, rZ);
++
+     default:
+         g_assert_not_reached();
+     }
+--
+.34.1

-[PULL 2/4] linux-user: Fix trivial build error on loongarch64 hosts
+[PULL v2 14/15] tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
-From: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Take the w^x split into account when computing the
 pc-relative distance to an absolute pointer.
-When building using GCC 8.3.0 on loongarch64 (Loongnix) we get:
-  In file included from ../linux-user/signal.c:33:
-  ../linux-user/host/loongarch64/host-signal.h: In function ‘host_signal_write’:
-  ../linux-user/host/loongarch64/host-signal.h:57:9: error: a label can only be part of a statement and a declaration is not a statement
-         uint32_t sel = (insn >> 15) & 0b11111111111;
-         ^~~~~~~~
-We don't use the 'sel' variable more than once, so drop it.
-Meson output for the record:
-  Host machine cpu family: loongarch64
-  Host machine cpu: loongarch64
-  C compiler for the host machine: cc (gcc 8.3.0 "cc (Loongnix 8.3.0-6.lnd.vec.27) 8.3.0")
-  C linker for the host machine: cc ld.bfd 2.31.1-system
-Fixes: ad812c3bd65 ("linux-user: Implement CPU-specific signal handler for loongarch64 hosts")
-Reported-by: Song Gao <gaosong@loongson.cn>
-Suggested-by: Song Gao <gaosong@loongson.cn>
-Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: WANG Xuerui <git@xen0n.name>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Message-Id: <20220104215027.2180972-1-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- linux-user/host/loongarch64/host-signal.h | 4 +---
+ tcg/loongarch64/tcg-target.c.inc | 2 +-
-file changed, 1 insertion(+), 3 deletions(-)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/linux-user/host/loongarch64/host-signal.h b/linux-user/host/loongarch64/host-signal.h
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/linux-user/host/loongarch64/host-signal.h
+--- a/tcg/loongarch64/tcg-target.c.inc
-+++ b/linux-user/host/loongarch64/host-signal.h
++++ b/tcg/loongarch64/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_ldst(TCGContext *s, LoongArchInsn opc, TCGReg data,
-         }
+     intptr_t imm12 = sextreg(offset, 0, 12);
-         break;
-     case 0b001110: /* indexed, atomic, bounds-checking memory operations */
+     if (offset != imm12) {
--        uint32_t sel = (insn >> 15) & 0b11111111111;
+-        intptr_t diff = offset - (uintptr_t)s->code_ptr;
--
++        intptr_t diff = tcg_pcrel_diff(s, (void *)offset);
--        switch (sel) {
-+        switch ((insn >> 15) & 0b11111111111) {
+         if (addr == TCG_REG_ZERO && diff == (int32_t)diff) {
-         case 0b00000100000: /* stx.b */
+             imm12 = sextreg(diff, 0, 12);
          case 0b00000101000: /* stx.h */
          case 0b00000110000: /* stx.w */
 --
-.25.1
+.34.1

-New patch
+[PULL v2 15/15] tcg/loongarch64: Reorg goto_tb implementation
+The old implementation replaces two insns, swapping between
+        b       <dest>
+        nop
+and
+        pcaddu18i tmp, <dest>
+        jirl      zero, tmp, <dest> & 0xffff
+There is a race condition in which a thread could be stopped at
+the jirl, i.e. with the top of the address loaded, and when
+restarted we have re-linked to a different TB, so that the top
+half no longer matches the bottom half.
+Note that while we never directly re-link to a different TB, we
+can link, unlink, and link again all while the stopped thread
+remains stopped.
+The new implementation replaces only one insn, swapping between
+        b       <dest>
+and
+        pcadd   tmp, <jmp_addr>
+falling through to load the address from tmp, and branch.
+Reviewed-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target.h     |  7 +---
+ tcg/loongarch64/tcg-target.c.inc | 72 ++++++++++++++------------------
+files changed, 33 insertions(+), 46 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.h
++++ b/tcg/loongarch64/tcg-target.h
+@@ -XXX,XX +XXX,XX @@
+ #define TCG_TARGET_INSN_UNIT_SIZE 4
+ #define TCG_TARGET_NB_REGS 32
+-/*
+- * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits
+- * signed offset, which is +/- 128 GiB.
+- */
+-#define MAX_CODE_GEN_BUFFER_SIZE  (128 * GiB)
++
++#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
+ typedef enum {
+     TCG_REG_ZERO,
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
+ #endif
+ }
+-/* LoongArch uses `andi zero, zero, 0` as NOP.  */
+-#define NOP OPC_ANDI
+-static void tcg_out_nop(TCGContext *s)
+-{
+-    tcg_out32(s, NOP);
+-}
+-
+-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
+-                              uintptr_t jmp_rx, uintptr_t jmp_rw)
+-{
+-    tcg_insn_unit i1, i2;
+-    ptrdiff_t upper, lower;
+-    uintptr_t addr = tb->jmp_target_addr[n];
+-    ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
+-
+-    if (offset == sextreg(offset, 0, 26)) {
+-        i1 = encode_sd10k16_insn(OPC_B, offset);
+-        i2 = NOP;
+-    } else {
+-        tcg_debug_assert(offset == sextreg(offset, 0, 36));
+-        lower = (int16_t)offset;
+-        upper = (offset - lower) >> 16;
+-
+-        i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper);
+-        i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower);
+-    }
+-    uint64_t pair = ((uint64_t)i2 << 32) | i1;
+-    qatomic_set((uint64_t *)jmp_rw, pair);
+-    flush_idcache_range(jmp_rx, jmp_rw, 8);
+-}
+-
+ /*
+  * Entry-points
+  */
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
+ static void tcg_out_goto_tb(TCGContext *s, int which)
+ {
+     /*
+-     * Ensure that patch area is 8-byte aligned so that an
+-     * atomic write can be used to patch the target address.
++     * Direct branch, or load indirect address, to be patched
++     * by tb_target_set_jmp_target.  Check indirect load offset
++     * in range early, regardless of direct branch distance,
++     * via assert within tcg_out_opc_pcaddu2i.
+      */
+-    if ((uintptr_t)s->code_ptr & 7) {
+-        tcg_out_nop(s);
+-    }
++    uintptr_t i_addr = get_jmp_target_addr(s, which);
++    intptr_t i_disp = tcg_pcrel_diff(s, (void *)i_addr);
++
+     set_jmp_insn_offset(s, which);
+-    /*
+-     * actual branch destination will be patched by
+-     * tb_target_set_jmp_target later
+-     */
+-    tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
++    tcg_out_opc_pcaddu2i(s, TCG_REG_TMP0, i_disp >> 2);
++
++    /* Finish the load and indirect branch. */
++    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_TMP0, 0);
+     tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
+     set_jmp_reset_offset(s, which);
+ }
++void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
++                              uintptr_t jmp_rx, uintptr_t jmp_rw)
++{
++    uintptr_t d_addr = tb->jmp_target_addr[n];
++    ptrdiff_t d_disp = (ptrdiff_t)(d_addr - jmp_rx) >> 2;
++    tcg_insn_unit insn;
++
++    /* Either directly branch, or load slot address for indirect branch. */
++    if (d_disp == sextreg(d_disp, 0, 26)) {
++        insn = encode_sd10k16_insn(OPC_B, d_disp);
++    } else {
++        uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
++        intptr_t i_disp = i_addr - jmp_rx;
++        insn = encode_dsj20_insn(OPC_PCADDU2I, TCG_REG_TMP0, i_disp >> 2);
++    }
++
++    qatomic_set((tcg_insn_unit *)jmp_rw, insn);
++    flush_idcache_range(jmp_rx, jmp_rw, 4);
++}
++
+ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+                        const TCGArg args[TCG_MAX_OP_ARGS],
+                        const int const_args[TCG_MAX_OP_ARGS])
+--
+.34.1

The following changes since commit 67e41fe0cfb62e6cdfa659f0155417d17e5274ea:

Merge tag 'pull-ppc-20220104' of https://github.com/legoater/qemu into staging (2022-01-04 07:23:27 -0800)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220104

for you to fetch changes up to d7478d4229f0a2b2817a55487e6b17081099fae4:

common-user: Fix tail calls to safe_syscall_set_errno_tail (2022-01-04 15:41:03 -0800)

----------------------------------------------------------------
Fix for safe_syscall_base.
Fix for folding of vector add/sub.
Fix build on loongarch64 with gcc 8.
Remove decl for qemu_run_machine_init_done_notifiers.

----------------------------------------------------------------
Philippe Mathieu-Daudé (1):
      linux-user: Fix trivial build error on loongarch64 hosts

Richard Henderson (2):
      tcg/optimize: Fix folding of vector ops
      common-user: Fix tail calls to safe_syscall_set_errno_tail

Xiaoyao Li (1):
      sysemu: Cleanup qemu_run_machine_init_done_notifiers()

include/sysemu/sysemu.h                    |  1 -
 linux-user/host/loongarch64/host-signal.h  |  4 +--
 tcg/optimize.c                             | 49 +++++++++++++++++++++++-------
 common-user/host/i386/safe-syscall.inc.S   |  1 +
 common-user/host/mips/safe-syscall.inc.S   |  1 +
 common-user/host/x86_64/safe-syscall.inc.S |  1 +
 6 files changed, 42 insertions(+), 15 deletions(-)

Bitwise operations are easy to fold, because the operation is
identical regardless of element size.  But add and sub need
extra element size info that is not currently propagated.

Fixes: 2f9f08ba43d
Cc: qemu-stable@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/799
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     CASE_OP_32_64(mul):
         return x * y;
 
-    CASE_OP_32_64(and):
+    CASE_OP_32_64_VEC(and):
         return x & y;
 
-    CASE_OP_32_64(or):
+    CASE_OP_32_64_VEC(or):
         return x | y;
 
-    CASE_OP_32_64(xor):
+    CASE_OP_32_64_VEC(xor):
         return x ^ y;
 
     case INDEX_op_shl_i32:
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     case INDEX_op_rotl_i64:
         return rol64(x, y & 63);
 
-    CASE_OP_32_64(not):
+    CASE_OP_32_64_VEC(not):
         return ~x;
 
     CASE_OP_32_64(neg):
         return -x;
 
-    CASE_OP_32_64(andc):
+    CASE_OP_32_64_VEC(andc):
         return x & ~y;
 
-    CASE_OP_32_64(orc):
+    CASE_OP_32_64_VEC(orc):
         return x | ~y;
 
     CASE_OP_32_64(eqv):
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return false;
+}
+
 static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 {
     swap_commutative(op->args[0], &op->args[1], &op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* We cannot as yet do_constant_folding with vectors. */
+static bool fold_add_vec(OptContext *ctx, TCGOp *op)
+{
+    if (fold_commutative(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
+}
+
 static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
@@ -XXX,XX +XXX,XX @@ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub(OptContext *ctx, TCGOp *op)
+/* We cannot as yet do_constant_folding with vectors. */
+static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0) ||
+    if (fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op) || fold_sub_vec(ctx, op);
+}
+
 static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
     return fold_addsub2(ctx, op, false);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          * Sorted alphabetically by opcode as much as possible.
          */
         switch (opc) {
-        CASE_OP_32_64_VEC(add):
+        CASE_OP_32_64(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add_vec:
+            done = fold_add_vec(&ctx, op);
+            break;
         CASE_OP_32_64(add2):
             done = fold_add2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(sextract):
             done = fold_sextract(&ctx, op);
             break;
-        CASE_OP_32_64_VEC(sub):
+        CASE_OP_32_64(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub_vec:
+            done = fold_sub_vec(&ctx, op);
+            break;
         CASE_OP_32_64(sub2):
             done = fold_sub2(&ctx, op);
             break;
-- 
2.25.1

From: Philippe Mathieu-Daudé <f4bug@amsat.org>

When building using GCC 8.3.0 on loongarch64 (Loongnix) we get:

In file included from ../linux-user/signal.c:33:
  ../linux-user/host/loongarch64/host-signal.h: In function ‘host_signal_write’:
  ../linux-user/host/loongarch64/host-signal.h:57:9: error: a label can only be part of a statement and a declaration is not a statement
         uint32_t sel = (insn >> 15) & 0b11111111111;
         ^~~~~~~~

We don't use the 'sel' variable more than once, so drop it.

Meson output for the record:

Host machine cpu family: loongarch64
  Host machine cpu: loongarch64
  C compiler for the host machine: cc (gcc 8.3.0 "cc (Loongnix 8.3.0-6.lnd.vec.27) 8.3.0")
  C linker for the host machine: cc ld.bfd 2.31.1-system

Fixes: ad812c3bd65 ("linux-user: Implement CPU-specific signal handler for loongarch64 hosts")
Reported-by: Song Gao <gaosong@loongson.cn>
Suggested-by: Song Gao <gaosong@loongson.cn>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20220104215027.2180972-1-f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 linux-user/host/loongarch64/host-signal.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/linux-user/host/loongarch64/host-signal.h b/linux-user/host/loongarch64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/host/loongarch64/host-signal.h
+++ b/linux-user/host/loongarch64/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
         }
         break;
     case 0b001110: /* indexed, atomic, bounds-checking memory operations */
-        uint32_t sel = (insn >> 15) & 0b11111111111;
-
-        switch (sel) {
+        switch ((insn >> 15) & 0b11111111111) {
         case 0b00000100000: /* stx.b */
         case 0b00000101000: /* stx.h */
         case 0b00000110000: /* stx.w */
-- 
2.25.1

For the ABIs in which the syscall return register is not
also the first function argument register, move the errno
value into the correct place.

Fixes: a3310c0397e2 ("linux-user: Move syscall error detection into safe_syscall_base")
Reported-by: Laurent Vivier <laurent@vivier.eu>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20220104190454.542225-1-richard.henderson@linaro.org>
---
 common-user/host/i386/safe-syscall.inc.S   | 1 +
 common-user/host/mips/safe-syscall.inc.S   | 1 +
 common-user/host/x86_64/safe-syscall.inc.S | 1 +
 3 files changed, 3 insertions(+)

diff --git a/common-user/host/i386/safe-syscall.inc.S b/common-user/host/i386/safe-syscall.inc.S
index XXXXXXX..XXXXXXX 100644
--- a/common-user/host/i386/safe-syscall.inc.S
+++ b/common-user/host/i386/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
         pop     %ebp
         .cfi_adjust_cfa_offset -4
         .cfi_restore ebp
+        mov     %eax, (%esp)
         jmp     safe_syscall_set_errno_tail
 
         .cfi_endproc
diff --git a/common-user/host/mips/safe-syscall.inc.S b/common-user/host/mips/safe-syscall.inc.S
index XXXXXXX..XXXXXXX 100644
--- a/common-user/host/mips/safe-syscall.inc.S
+++ b/common-user/host/mips/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
 1:      USE_ALT_CP(t0)
         SETUP_GPX(t1)
         SETUP_GPX64(t0, t1)
+        move    a0, v0
         PTR_LA  t9, safe_syscall_set_errno_tail
         jr      t9
 
diff --git a/common-user/host/x86_64/safe-syscall.inc.S b/common-user/host/x86_64/safe-syscall.inc.S
index XXXXXXX..XXXXXXX 100644
--- a/common-user/host/x86_64/safe-syscall.inc.S
+++ b/common-user/host/x86_64/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
 1:      pop     %rbp
         .cfi_def_cfa_offset 8
         .cfi_restore rbp
+        mov     %eax, %edi
         jmp     safe_syscall_set_errno_tail
         .cfi_endproc
 
-- 
2.25.1

Second try's the charm today, right?

The following changes since commit 00b1faea41d283e931256aa78aa975a369ec3ae6:

Merge tag 'pull-target-arm-20230123' of https://git.linaro.org/people/pmaydell/qemu-arm into staging (2023-01-23 13:40:28 +0000)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230123

for you to fetch changes up to 709bcd7da3f6b4655d910634a0d520fa1439df38:

tcg/loongarch64: Reorg goto_tb implementation (2023-01-23 16:00:13 -1000)

----------------------------------------------------------------
common-user: Re-enable ppc32 host
tcg: Avoid recursion in tcg_gen_mulu2_i32
tcg: Mark tcg helpers noinline to avoid an issue with LTO
tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
disas: Enable loongarch disassembler, and fixes
tcg/loongarch64: Improve move immediate
tcg/loongarch64: Improve add immediate
tcg/loongarch64: Improve setcond
tcg/loongarch64: Implement movcond
tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
tcg/loongarch64: Reorg goto_tb implementation

----------------------------------------------------------------
Richard Henderson (14):
      tcg: Avoid recursion in tcg_gen_mulu2_i32
      tcg/arm: Use register pair allocation for qemu_{ld,st}_i64
      common-user/host/ppc: Implement safe-syscall.inc.S
      linux-user: Implement host/ppc/host-signal.h
      tcg: Mark tcg helpers noinline to avoid an issue with LTO
      target/loongarch: Enable the disassembler for host tcg
      target/loongarch: Disassemble jirl properly
      target/loongarch: Disassemble pcadd* addresses
      tcg/loongarch64: Update tcg-insn-defs.c.inc
      tcg/loongarch64: Introduce tcg_out_addi
      tcg/loongarch64: Improve setcond expansion
      tcg/loongarch64: Implement movcond
      tcg/loongarch64: Use tcg_pcrel_diff in tcg_out_ldst
      tcg/loongarch64: Reorg goto_tb implementation

Rui Wang (1):
      tcg/loongarch64: Optimize immediate loading

include/exec/helper-proto.h                    |  32 ++-
 include/tcg/tcg.h                              |   7 -
 linux-user/include/host/ppc/host-signal.h      |  39 +++
 tcg/arm/tcg-target-con-set.h                   |   7 +-
 tcg/arm/tcg-target-con-str.h                   |   2 +
 tcg/loongarch64/tcg-target-con-set.h           |   5 +-
 tcg/loongarch64/tcg-target-con-str.h           |   2 +-
 tcg/loongarch64/tcg-target.h                   |  11 +-
 target/loongarch/insns.decode                  |   3 +-
 disas.c                                        |   2 +
 target/loongarch/disas.c                       |  39 ++-
 tcg/tcg-op.c                                   |   4 +-
 target/loongarch/insn_trans/trans_branch.c.inc |   2 +-
 tcg/arm/tcg-target.c.inc                       |  28 +-
 tcg/loongarch64/tcg-insn-defs.c.inc            |  10 +-
 tcg/loongarch64/tcg-target.c.inc               | 364 ++++++++++++++++---------
 common-user/host/ppc/safe-syscall.inc.S        | 107 ++++++++
 target/loongarch/meson.build                   |   3 +-
 18 files changed, 497 insertions(+), 170 deletions(-)
 create mode 100644 linux-user/include/host/ppc/host-signal.h
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S

We have a test for one of TCG_TARGET_HAS_mulu2_i32 or
TCG_TARGET_HAS_muluh_i32 being defined, but the test
became non-functional when we changed to always define
all of these macros.

Replace this with a build-time test in tcg_gen_mulu2_i32.

Fixes: 25c4d9cc845 ("tcg: Always define all of the TCGOpcode enum members.")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1435
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 7 -------
 tcg/tcg-op.c      | 4 +++-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_rem_i64          0
 #endif
 
-/* For 32-bit targets, some sort of unsigned widening multiply is required.  */
-#if TCG_TARGET_REG_BITS == 32 \
-    && !(defined(TCG_TARGET_HAS_mulu2_i32) \
-         || defined(TCG_TARGET_HAS_muluh_i32))
-# error "Missing unsigned widening multiply"
-#endif
-
 #if !defined(TCG_TARGET_HAS_v64) \
     && !defined(TCG_TARGET_HAS_v128) \
     && !defined(TCG_TARGET_HAS_v256)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
         tcg_gen_mov_i32(rl, t);
         tcg_temp_free_i32(t);
-    } else {
+    } else if (TCG_TARGET_REG_BITS == 64) {
         TCGv_i64 t0 = tcg_temp_new_i64();
         TCGv_i64 t1 = tcg_temp_new_i64();
         tcg_gen_extu_i32_i64(t0, arg1);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
         tcg_gen_extr_i64_i32(rl, rh, t0);
         tcg_temp_free_i64(t0);
         tcg_temp_free_i64(t1);
+    } else {
+        qemu_build_not_reached();
     }
 }
 
-- 
2.34.1

Although we still can't use ldrd and strd for all operations,
increase the chances by getting the register allocation correct.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-con-set.h |  7 ++++---
 tcg/arm/tcg-target-con-str.h |  2 ++
 tcg/arm/tcg-target.c.inc     | 28 ++++++++++++++++++----------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-set.h
+++ b/tcg/arm/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O0_I2(r, rIN)
 C_O0_I2(s, s)
 C_O0_I2(w, r)
 C_O0_I3(s, s, s)
+C_O0_I3(S, p, s)
 C_O0_I4(r, r, rI, rI)
-C_O0_I4(s, s, s, s)
+C_O0_I4(S, p, s, s)
 C_O1_I1(r, l)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
@@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, wZ)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, r, r, rI, rI)
 C_O1_I4(r, r, rIN, rIK, 0)
-C_O2_I1(r, r, l)
-C_O2_I2(r, r, l, l)
+C_O2_I1(e, p, l)
+C_O2_I2(e, p, l, l)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, r, r, rIN, rIK)
 C_O2_I4(r, r, rI, rI, rIN, rIK)
diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target-con-str.h
+++ b/tcg/arm/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@
  * Define constraint letters for register sets:
  * REGS(letter, register_mask)
  */
+REGS('e', ALL_GENERAL_REGS & 0x5555) /* even regs */
 REGS('r', ALL_GENERAL_REGS)
 REGS('l', ALL_QLOAD_REGS)
 REGS('s', ALL_QSTORE_REGS)
+REGS('S', ALL_QSTORE_REGS & 0x5555)  /* even qstore */
 REGS('w', ALL_VECTOR_REGS)
 
 /*
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             /*
              * Rm (the second address op) must not overlap Rt or Rt + 1.
              * Since datalo is aligned, we can simplify the test via alignment.
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_UQ:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* LDRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
         } else if (datalo == addrlo) {
             tcg_out_ld32_12(s, COND_AL, datahi, addrlo, 4);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else if (scratch_addend) {
             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_64:
+        /* We used pair allocation for datalo, so already should be aligned. */
+        tcg_debug_assert((datalo & 1) == 0);
+        tcg_debug_assert(datahi == datalo + 1);
         /* STRD requires alignment; double-check that. */
-        if (get_alignment_bits(opc) >= MO_64
-            && (datalo & 1) == 0 && datahi == datalo + 1) {
+        if (get_alignment_bits(opc) >= MO_64) {
             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
             tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_ld_i32:
         return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
     case INDEX_op_qemu_ld_i64:
-        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
+        return TARGET_LONG_BITS == 32 ? C_O2_I1(e, p, l) : C_O2_I2(e, p, l, l);
     case INDEX_op_qemu_st_i32:
         return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
     case INDEX_op_qemu_st_i64:
-        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
+        return TARGET_LONG_BITS == 32 ? C_O0_I3(S, p, s) : C_O0_I4(S, p, s, s);
 
     case INDEX_op_st_vec:
         return C_O0_I2(w, r);
-- 
2.34.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-2-richard.henderson@linaro.org>
---
 common-user/host/ppc/safe-syscall.inc.S | 107 ++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 common-user/host/ppc/safe-syscall.inc.S

diff --git a/common-user/host/ppc/safe-syscall.inc.S b/common-user/host/ppc/safe-syscall.inc.S
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/common-user/host/ppc/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@
+/*
+ * safe-syscall.inc.S : host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ * This is intended to be included by common-user/safe-syscall.S
+ *
+ * Copyright (C) 2022 Linaro, Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * Standardize on the _CALL_FOO symbols used by GCC:
+ * Apple XCode does not define _CALL_DARWIN.
+ * Clang defines _CALL_ELF (64-bit) but not _CALL_SYSV (32-bit).
+ */
+#if !defined(_CALL_SYSV) && \
+    !defined(_CALL_DARWIN) && \
+    !defined(_CALL_AIX) && \
+    !defined(_CALL_ELF)
+# if defined(__APPLE__)
+#  define _CALL_DARWIN
+# elif defined(__ELF__) && TCG_TARGET_REG_BITS == 32
+#  define _CALL_SYSV
+# else
+#  error "Unknown ABI"
+# endif
+#endif 
+
+#ifndef _CALL_SYSV
+# error "Unsupported ABI"
+#endif
+
+
+        .global safe_syscall_base
+        .global safe_syscall_start
+        .global safe_syscall_end
+        .type   safe_syscall_base, @function
+
+        .text
+
+        /*
+         * This is the entry point for making a system call. The calling
+         * convention here is that of a C varargs function with the
+         * first argument an 'int *' to the signal_pending flag, the
+         * second one the system call number (as a 'long'), and all further
+         * arguments being syscall arguments (also 'long').
+         */
+safe_syscall_base:
+        .cfi_startproc
+        stwu    1, -8(1)
+        .cfi_def_cfa_offset 8
+        stw     30, 4(1)
+        .cfi_offset 30, -4
+
+        /*
+         * We enter with r3 == &signal_pending
+         *               r4 == syscall number
+         *               r5 ... r10 == syscall arguments
+         *               and return the result in r3
+         * and the syscall instruction needs
+         *               r0 == syscall number
+         *               r3 ... r8 == syscall arguments
+         *               and returns the result in r3
+         * Shuffle everything around appropriately.
+         */
+        mr      30, 3           /* signal_pending */
+        mr      0, 4            /* syscall number */
+        mr      3, 5            /* syscall arguments */
+        mr      4, 6
+        mr      5, 7
+        mr      6, 8
+        mr      7, 9
+        mr      8, 10
+
+        /*
+         * This next sequence of code works in conjunction with the
+         * rewind_if_safe_syscall_function(). If a signal is taken
+         * and the interrupted PC is anywhere between 'safe_syscall_start'
+         * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
+         * The code sequence must therefore be able to cope with this, and
+         * the syscall instruction must be the final one in the sequence.
+         */
+safe_syscall_start:
+        /* if signal_pending is non-zero, don't do the call */
+        lwz     12, 0(30)
+        cmpwi   0, 12, 0
+        bne-    2f
+        sc
+safe_syscall_end:
+        /* code path when we did execute the syscall */
+        lwz     30, 4(1)        /* restore r30 */
+        addi    1, 1, 8         /* restore stack */
+        .cfi_restore 30
+        .cfi_def_cfa_offset 0
+        bnslr+                  /* return on success */
+        b       safe_syscall_set_errno_tail
+
+        /* code path when we didn't execute the syscall */
+2:      lwz     30, 4(1)
+        addi    1, 1, 8
+        addi    3, 0, QEMU_ERESTARTSYS
+        b       safe_syscall_set_errno_tail
+
+        .cfi_endproc
+
+        .size   safe_syscall_base, .-safe_syscall_base
-- 
2.34.1

This commit re-enables ppc32 as a linux-user host,
as the existence of the directory is noted by configure.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1097
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20220729172141.1789105-3-richard.henderson@linaro.org>
---
 linux-user/include/host/ppc/host-signal.h | 39 +++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 linux-user/include/host/ppc/host-signal.h

diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/linux-user/include/host/ppc/host-signal.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * host-signal.h: signal info dependent on the host architecture
+ *
+ * Copyright (c) 2022 Linaro Ltd.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef PPC_HOST_SIGNAL_H
+#define PPC_HOST_SIGNAL_H
+
+#include <asm/ptrace.h>
+
+/* SA_SIGINFO handlers receive a ucontext_t as their third argument. */
+typedef ucontext_t host_sigcontext;
+
+/* Read the NIP value saved in the signal context. */
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+{
+    uintptr_t pc = uc->uc_mcontext.regs->nip;
+
+    return pc;
+}
+
+/* Overwrite the NIP value saved in the signal context with @pc. */
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+{
+    uc->uc_mcontext.regs->nip = pc;
+}
+
+/* Return a pointer to the signal mask stored in the context. */
+static inline void *host_signal_mask(host_sigcontext *uc)
+{
+    void *mask = &uc->uc_sigmask;
+
+    return mask;
+}
+
+/*
+ * Return false when the saved trap number is 0x400; otherwise return
+ * whether bit 0x02000000 of the saved DSISR is set.  @info is unused.
+ *
+ * NOTE(review): presumably trap 0x400 is an instruction-fetch fault and
+ * the DSISR bit marks a store access -- confirm against the Power ISA.
+ */
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+{
+    if (uc->uc_mcontext.regs->trap == 0x400) {
+        return false;
+    }
+    return (uc->uc_mcontext.regs->dsisr & 0x02000000) != 0;
+}
+
+#endif
-- 
2.34.1

Marking helpers __attribute__((noinline)) prevents an issue
with GCC's ipa-split pass under --enable-lto.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1454
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Idan Horowitz <idan.horowitz@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-proto.h | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/helper-head.h"
 
+/*
+ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
+ * decides to split out the noreturn code paths that raise an exception,
+ * taking the __builtin_return_address() along into the new function,
+ * where it no longer computes a value that returns to TCG generated code.
+ * Despite the name, the noinline attribute affects the splitter, so this
+ * prevents the optimization in question.  Given that helpers should not
+ * otherwise be called directly, this should not have any other visible effect.
+ *
+ * See https://gitlab.com/qemu-project/qemu/-/issues/1454
+ */
+#define DEF_HELPER_ATTR  __attribute__((noinline))
+
 #define DEF_HELPER_FLAGS_0(name, flags, ret) \
-dh_ctype(ret) HELPER(name) (void);
+dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3));
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
+                            dh_ctype(t3)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                                   dh_ctype(t4));
+                            dh_ctype(t4)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5));
+                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6));
+                            dh_ctype(t4), dh_ctype(t5), \
+                            dh_ctype(t6)) DEF_HELPER_ATTR;
 
 #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
 dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
                             dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
-                            dh_ctype(t7));
+                            dh_ctype(t7)) DEF_HELPER_ATTR;
 
 #define IN_HELPER_PROTO
 
@@ -XXX,XX +XXX,XX @@ dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
 #undef DEF_HELPER_FLAGS_7
+#undef DEF_HELPER_ATTR
 
 #endif /* HELPER_PROTO_H */
-- 
2.34.1

Reuse the decodetree based disassembler from
target/loongarch/ for tcg/loongarch64/.

The generation of decode-insns.c.inc into ./libcommon.fa.p/ could
eventually result in conflict, if any other host requires the same
trick, but this is good enough for now.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 disas.c                      | 2 ++
 target/loongarch/meson.build | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/disas.c b/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/disas.c
+++ b/disas.c
@@ -XXX,XX +XXX,XX @@ static void initialize_debug_host(CPUDebug *s)
     s->info.cap_insn_split = 6;
 #elif defined(__hppa__)
     s->info.print_insn = print_insn_hppa;
+#elif defined(__loongarch__)
+    s->info.print_insn = print_insn_loongarch;
 #endif
 }
 
diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/meson.build
+++ b/target/loongarch/meson.build
@@ -XXX,XX +XXX,XX @@ gen = decodetree.process('insns.decode')
 loongarch_ss = ss.source_set()
 loongarch_ss.add(files(
   'cpu.c',
-  'disas.c',
 ))
 loongarch_tcg_ss = ss.source_set()
 loongarch_tcg_ss.add(gen)
@@ -XXX,XX +XXX,XX @@ loongarch_softmmu_ss.add(files(
   'iocsr_helper.c',
 ))
 
+common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen])
+
 loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss])
 
 target_arch += {'loongarch': loongarch_ss}
-- 
2.34.1

While jirl shares the same instruction format as bne etc,
it is not assembled the same.  In particular, rd is printed
first not second and the immediate is not pc-relative.

Decode into the arg_rr_i structure, which prints correctly.
This changes the "offs" member to "imm", so update translate to match.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/insns.decode                  | 3 ++-
 target/loongarch/disas.c                       | 2 +-
 target/loongarch/insn_trans/trans_branch.c.inc | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -XXX,XX +XXX,XX @@
 @rr_ui12                 .... ...... imm:12 rj:5 rd:5    &rr_i
 @rr_i14s2         .... ....  .............. rj:5 rd:5    &rr_i imm=%i14s2
 @rr_i16                     .... .. imm:s16 rj:5 rd:5    &rr_i
+@rr_i16s2         .... ..  ................ rj:5 rd:5    &rr_i imm=%offs16
 @hint_r_i12           .... ...... imm:s12 rj:5 hint:5    &hint_r_i
 @rrr_sa2p1        .... ........ ... .. rk:5 rj:5 rd:5    &rrr_sa  sa=%sa2p1
 @rrr_sa2        .... ........ ... sa:2 rk:5 rj:5 rd:5    &rrr_sa
@@ -XXX,XX +XXX,XX @@ beqz            0100 00 ................ ..... .....     @r_offs21
 bnez            0100 01 ................ ..... .....     @r_offs21
 bceqz           0100 10 ................ 00 ... .....    @c_offs21
 bcnez           0100 10 ................ 01 ... .....    @c_offs21
-jirl            0100 11 ................ ..... .....     @rr_offs16
+jirl            0100 11 ................ ..... .....     @rr_i16s2
 b               0101 00 ..........................       @offs26
 bl              0101 01 ..........................       @offs26
 beq             0101 10 ................ ..... .....     @rr_offs16
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(beqz,         r_offs)
 INSN(bnez,         r_offs)
 INSN(bceqz,        c_offs)
 INSN(bcnez,        c_offs)
-INSN(jirl,         rr_offs)
+INSN(jirl,         rr_i)
 INSN(b,            offs)
 INSN(bl,           offs)
 INSN(beq,          rr_offs)
diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/insn_trans/trans_branch.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insn_trans/trans_branch.c.inc
+++ b/target/loongarch/insn_trans/trans_branch.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_jirl(DisasContext *ctx, arg_jirl *a)
     TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
 
-    tcg_gen_addi_tl(cpu_pc, src1, a->offs);
+    tcg_gen_addi_tl(cpu_pc, src1, a->imm);
     tcg_gen_movi_tl(dest, ctx->base.pc_next + 4);
     gen_set_gpr(a->rd, dest, EXT_NONE);
     tcg_gen_lookup_and_goto_ptr();
-- 
2.34.1

Print both the raw field and the resolved pc-relative
address, as we do for branches.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/disas.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -XXX,XX +XXX,XX @@ INSN(fsel,         fffc)
 INSN(addu16i_d,    rr_i)
 INSN(lu12i_w,      r_i)
 INSN(lu32i_d,      r_i)
-INSN(pcaddi,       r_i)
-INSN(pcalau12i,    r_i)
-INSN(pcaddu12i,    r_i)
-INSN(pcaddu18i,    r_i)
 INSN(ll_w,         rr_i)
 INSN(sc_w,         rr_i)
 INSN(ll_d,         rr_i)
@@ -XXX,XX +XXX,XX @@ static bool trans_fcmp_cond_##suffix(DisasContext *ctx, \
 
 FCMP_INSN(s)
 FCMP_INSN(d)
+
+/*
+ * Define trans_<name>() for a pc-relative add instruction.  The generated
+ * function prints the destination register, the raw immediate, and the
+ * address that gen_<name>() computes from the current pc and immediate.
+ */
+#define PCADD_INSN(name)                                        \
+static bool trans_##name(DisasContext *ctx, arg_##name *a)      \
+{                                                               \
+    uint64_t addr = gen_##name(ctx->pc, a->imm);                \
+                                                                \
+    output(ctx, #name, "r%d, %d # 0x%" PRIx64,                  \
+           a->rd, a->imm, addr);                                \
+    return true;                                                \
+}
+
+/*
+ * Address produced by pcaddi: pc plus the immediate scaled by 4.
+ *
+ * The immediate is widened to uint64_t before shifting so that a negative
+ * value is never left-shifted as a signed int; the conversion sign-extends,
+ * so the modular sum is still the intended pc-relative address.
+ */
+static uint64_t gen_pcaddi(uint64_t pc, int imm)
+{
+    return pc + ((uint64_t)imm << 2);
+}
+
+/*
+ * Address produced by pcalau12i: pc plus the immediate scaled by 4096,
+ * with the low 12 bits of the sum cleared.
+ *
+ * The immediate is widened to uint64_t before shifting so that a negative
+ * value is never left-shifted as a signed int.
+ */
+static uint64_t gen_pcalau12i(uint64_t pc, int imm)
+{
+    return (pc + ((uint64_t)imm << 12)) & ~(uint64_t)0xfff;
+}
+
+/*
+ * Address produced by pcaddu12i: pc plus the immediate scaled by 4096.
+ *
+ * The immediate is widened to uint64_t before shifting so that a negative
+ * value is never left-shifted as a signed int.
+ */
+static uint64_t gen_pcaddu12i(uint64_t pc, int imm)
+{
+    return pc + ((uint64_t)imm << 12);
+}
+
+/* Address produced by pcaddu18i: pc plus the immediate scaled by 2^18. */
+static uint64_t gen_pcaddu18i(uint64_t pc, int imm)
+{
+    uint64_t offset = (uint64_t)imm << 18;
+
+    return pc + offset;
+}
+
+PCADD_INSN(pcaddi)
+PCADD_INSN(pcalau12i)
+PCADD_INSN(pcaddu12i)
+PCADD_INSN(pcaddu18i)
-- 
2.34.1

From: Rui Wang <wangrui@loongson.cn>

diff:
  Imm                 Before                  After
  0000000000000000    addi.w  rd, zero, 0     addi.w  rd, zero, 0
                      lu52i.d rd, zero, 0
  00000000fffff800    lu12i.w rd, -1          addi.w  rd, zero, -2048
                      ori     rd, rd, 2048    lu32i.d rd, 0
                      lu32i.d rd, 0

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Rui Wang <wangrui@loongson.cn>
Message-Id: <20221107144713.845550-1-wangrui@loongson.cn>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 35 +++++++++++---------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

Regenerate with ADDU16I included:

    $ cd loongarch-opcodes/scripts/go
    $ go run ./genqemutcgdefs > $QEMU/tcg/loongarch64/tcg-insn-defs.c.inc

Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-insn-defs.c.inc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-insn-defs.c.inc b/tcg/loongarch64/tcg-insn-defs.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-insn-defs.c.inc
+++ b/tcg/loongarch64/tcg-insn-defs.c.inc
@@ -XXX,XX +XXX,XX @@
  *
  * This file is auto-generated by genqemutcgdefs from
  * https://github.com/loongson-community/loongarch-opcodes,
- * from commit 961f0c60f5b63e574d785995600c71ad5413fdc4.
+ * from commit 25ca7effe9d88101c1cf96c4005423643386d81f.
  * DO NOT EDIT.
  */
 
@@ -XXX,XX +XXX,XX @@ typedef enum {
     OPC_ANDI = 0x03400000,
     OPC_ORI = 0x03800000,
     OPC_XORI = 0x03c00000,
+    OPC_ADDU16I_D = 0x10000000,
     OPC_LU12I_W = 0x14000000,
     OPC_CU32I_D = 0x16000000,
     OPC_PCADDU2I = 0x18000000,
@@ -XXX,XX +XXX,XX @@ tcg_out_opc_xori(TCGContext *s, TCGReg d, TCGReg j, uint32_t uk12)
     tcg_out32(s, encode_djuk12_insn(OPC_XORI, d, j, uk12));
 }
 
+/* Emits the `addu16i.d d, j, sk16` instruction.  */
+static void __attribute__((unused))
+tcg_out_opc_addu16i_d(TCGContext *s, TCGReg d, TCGReg j, int32_t sk16)
+{
+    tcg_out32(s, encode_djsk16_insn(OPC_ADDU16I_D, d, j, sk16));
+}
+
 /* Emits the `lu12i.w d, sj20` instruction.  */
 static void __attribute__((unused))
 tcg_out_opc_lu12i_w(TCGContext *s, TCGReg d, int32_t sj20)
-- 
2.34.1

Adjust the constraints to allow any int32_t for immediate
addition.  Split immediate adds into addu16i + addi, which
covers quite a lot of the immediate space.  For the hole in
the middle, load the constant into TMP0 instead.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h |  4 +-
 tcg/loongarch64/tcg-target-con-str.h |  2 +-
 tcg/loongarch64/tcg-target.c.inc     | 57 ++++++++++++++++++++++++----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@ C_O1_I1(r, L)
 C_O1_I2(r, r, rC)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rJ)
 C_O1_I2(r, r, rU)
 C_O1_I2(r, r, rW)
 C_O1_I2(r, r, rZ)
 C_O1_I2(r, 0, rZ)
-C_O1_I2(r, rZ, rN)
+C_O1_I2(r, rZ, ri)
+C_O1_I2(r, rZ, rJ)
 C_O1_I2(r, rZ, rZ)
diff --git a/tcg/loongarch64/tcg-target-con-str.h b/tcg/loongarch64/tcg-target-con-str.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target-con-str.h
+++ b/tcg/loongarch64/tcg-target-con-str.h
@@ -XXX,XX +XXX,XX @@ REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
  * CONST(letter, TCG_CT_CONST_* bit set)
  */
 CONST('I', TCG_CT_CONST_S12)
-CONST('N', TCG_CT_CONST_N12)
+CONST('J', TCG_CT_CONST_S32)
 CONST('U', TCG_CT_CONST_U12)
 CONST('Z', TCG_CT_CONST_ZERO)
 CONST('C', TCG_CT_CONST_C12)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {
 
 #define TCG_CT_CONST_ZERO  0x100
 #define TCG_CT_CONST_S12   0x200
-#define TCG_CT_CONST_N12   0x400
+#define TCG_CT_CONST_S32   0x400
 #define TCG_CT_CONST_U12   0x800
 #define TCG_CT_CONST_C12   0x1000
 #define TCG_CT_CONST_WSZ   0x2000
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
     if ((ct & TCG_CT_CONST_S12) && val == sextreg(val, 0, 12)) {
         return true;
     }
-    if ((ct & TCG_CT_CONST_N12) && -val == sextreg(-val, 0, 12)) {
+    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
         return true;
     }
     if ((ct & TCG_CT_CONST_U12) && val >= 0 && val <= 0xfff) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
     }
 }
 
+/*
+ * Emit rd = rs + imm.
+ *
+ * The immediate is split into lo12 (its sign-extended low 12 bits) and
+ * hi16 (the sign-extended 16 bits starting at bit 16 of imm - lo12).
+ * If those two pieces rebuild imm exactly, ADDU16I.D and/or ADDI are
+ * enough; otherwise the constant is loaded into TCG_REG_TMP0 and added.
+ */
+static void tcg_out_addi(TCGContext *s, TCGType type, TCGReg rd,
+                         TCGReg rs, tcg_target_long imm)
+{
+    tcg_target_long lo12 = sextreg(imm, 0, 12);
+    tcg_target_long hi16 = sextreg(imm - lo12, 16, 16);
+
+    /*
+     * hi16 and lo12 do not cover bits 12..15:
+     *
+     *       3                   2                   1                   0
+     *     1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+     * ...+-------------------------------+-------+-----------------------+
+     *    |             hi16              |       |          lo12         |
+     * ...+-------------------------------+-------+-----------------------+
+     *
+     * An immediate that needs bits in that hole (or beyond hi16) cannot be
+     * rebuilt from the two pieces, so load it in full and do a plain add.
+     */
+    if (imm != (hi16 << 16) + lo12) {
+        tcg_out_movi(s, type, TCG_REG_TMP0, imm);
+        if (type == TCG_TYPE_I32) {
+            tcg_out_opc_add_w(s, rd, rs, TCG_REG_TMP0);
+        } else {
+            tcg_out_opc_add_d(s, rd, rs, TCG_REG_TMP0);
+        }
+        return;
+    }
+
+    /* Upper part first; the low part is then applied on top of rd. */
+    if (hi16 != 0) {
+        tcg_out_opc_addu16i_d(s, rd, rs, hi16);
+        rs = rd;
+    }
+
+    if (type == TCG_TYPE_I32) {
+        /* Always emitted for I32, even when lo12 is zero. */
+        tcg_out_opc_addi_w(s, rd, rs, lo12);
+    } else if (lo12 != 0) {
+        tcg_out_opc_addi_d(s, rd, rs, lo12);
+    } else {
+        /* Nothing left to add: just get the value into rd. */
+        tcg_out_mov(s, type, rd, rs);
+    }
+}
+
 static void tcg_out_ext8u(TCGContext *s, TCGReg ret, TCGReg arg)
 {
     tcg_out_opc_andi(s, ret, arg, 0xff);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_add_i32:
         if (c2) {
-            tcg_out_opc_addi_w(s, a0, a1, a2);
+            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, a2);
         } else {
             tcg_out_opc_add_w(s, a0, a1, a2);
         }
         break;
     case INDEX_op_add_i64:
         if (c2) {
-            tcg_out_opc_addi_d(s, a0, a1, a2);
+            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, a2);
         } else {
             tcg_out_opc_add_d(s, a0, a1, a2);
         }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_sub_i32:
         if (c2) {
-            tcg_out_opc_addi_w(s, a0, a1, -a2);
+            tcg_out_addi(s, TCG_TYPE_I32, a0, a1, -a2);
         } else {
             tcg_out_opc_sub_w(s, a0, a1, a2);
         }
         break;
     case INDEX_op_sub_i64:
         if (c2) {
-            tcg_out_opc_addi_d(s, a0, a1, -a2);
+            tcg_out_addi(s, TCG_TYPE_I64, a0, a1, -a2);
         } else {
             tcg_out_opc_sub_d(s, a0, a1, a2);
         }
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
         return C_O1_I2(r, r, ri);
 
     case INDEX_op_add_i32:
+        return C_O1_I2(r, r, ri);
     case INDEX_op_add_i64:
-        return C_O1_I2(r, r, rI);
+        return C_O1_I2(r, r, rJ);
 
     case INDEX_op_and_i32:
     case INDEX_op_and_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
         return C_O1_I2(r, 0, rZ);
 
     case INDEX_op_sub_i32:
+        return C_O1_I2(r, rZ, ri);
     case INDEX_op_sub_i64:
-        return C_O1_I2(r, rZ, rN);
+        return C_O1_I2(r, rZ, rJ);
 
     case INDEX_op_mul_i32:
     case INDEX_op_mul_i64:
-- 
2.34.1

Split out a helper function, tcg_out_setcond_int, which
does not always produce the complete boolean result, but
returns a set of flags to do so.

Accept all int32_t as constant input, so that LE/GT can
adjust the constant to LT.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.c.inc | 165 +++++++++++++++++++++----------
 1 file changed, 115 insertions(+), 50 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_clzctz(TCGContext *s, LoongArchInsn opc,
     tcg_out_opc_or(s, a0, TCG_REG_TMP0, a0);
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
-                            TCGReg arg1, TCGReg arg2, bool c2)
-{
-    TCGReg tmp;
+#define SETCOND_INV    TCG_TARGET_NB_REGS
+#define SETCOND_NEZ    (SETCOND_INV << 1)
+#define SETCOND_FLAGS  (SETCOND_INV | SETCOND_NEZ)
 
-    if (c2) {
-        tcg_debug_assert(arg2 == 0);
+static int tcg_out_setcond_int(TCGContext *s, TCGCond cond, TCGReg ret,
+                               TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+    int flags = 0;
+
+    switch (cond) {
+    case TCG_COND_EQ:    /* -> NE  */
+    case TCG_COND_GE:    /* -> LT  */
+    case TCG_COND_GEU:   /* -> LTU */
+    case TCG_COND_GT:    /* -> LE  */
+    case TCG_COND_GTU:   /* -> LEU */
+        cond = tcg_invert_cond(cond);
+        flags ^= SETCOND_INV;
+        break;
+    default:
+        break;
     }
 
     switch (cond) {
-    case TCG_COND_EQ:
-        if (c2) {
-            tmp = arg1;
-        } else {
-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
-            tmp = ret;
-        }
-        tcg_out_opc_sltui(s, ret, tmp, 1);
-        break;
-    case TCG_COND_NE:
-        if (c2) {
-            tmp = arg1;
-        } else {
-            tcg_out_opc_sub_d(s, ret, arg1, arg2);
-            tmp = ret;
-        }
-        tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, tmp);
-        break;
-    case TCG_COND_LT:
-        tcg_out_opc_slt(s, ret, arg1, arg2);
-        break;
-    case TCG_COND_GE:
-        tcg_out_opc_slt(s, ret, arg1, arg2);
-        tcg_out_opc_xori(s, ret, ret, 1);
-        break;
     case TCG_COND_LE:
-        tcg_out_setcond(s, TCG_COND_GE, ret, arg2, arg1, false);
-        break;
-    case TCG_COND_GT:
-        tcg_out_setcond(s, TCG_COND_LT, ret, arg2, arg1, false);
-        break;
-    case TCG_COND_LTU:
-        tcg_out_opc_sltu(s, ret, arg1, arg2);
-        break;
-    case TCG_COND_GEU:
-        tcg_out_opc_sltu(s, ret, arg1, arg2);
-        tcg_out_opc_xori(s, ret, ret, 1);
-        break;
     case TCG_COND_LEU:
-        tcg_out_setcond(s, TCG_COND_GEU, ret, arg2, arg1, false);
+        /*
+         * If we have a constant input, the most efficient way to implement
+         * LE is by adding 1 and using LT.  Watch out for wrap around for LEU.
+         * We don't need to care for this for LE because the constant input
+         * is still constrained to int32_t, and INT32_MAX+1 is representable
+         * in the 64-bit temporary register.
+         */
+        if (c2) {
+            if (cond == TCG_COND_LEU) {
+                /* unsigned <= -1 is true */
+                if (arg2 == -1) {
+                    tcg_out_movi(s, TCG_TYPE_REG, ret, !(flags & SETCOND_INV));
+                    return ret;
+                }
+                cond = TCG_COND_LTU;
+            } else {
+                cond = TCG_COND_LT;
+            }
+            arg2 += 1;
+        } else {
+            TCGReg tmp = arg2;
+            arg2 = arg1;
+            arg1 = tmp;
+            cond = tcg_swap_cond(cond);    /* LE -> GE */
+            cond = tcg_invert_cond(cond);  /* GE -> LT */
+            flags ^= SETCOND_INV;
+        }
         break;
-    case TCG_COND_GTU:
-        tcg_out_setcond(s, TCG_COND_LTU, ret, arg2, arg1, false);
+    default:
         break;
+    }
+
+    switch (cond) {
+    case TCG_COND_NE:
+        flags |= SETCOND_NEZ;
+        if (!c2) {
+            tcg_out_opc_xor(s, ret, arg1, arg2);
+        } else if (arg2 == 0) {
+            ret = arg1;
+        } else if (arg2 >= 0 && arg2 <= 0xfff) {
+            tcg_out_opc_xori(s, ret, arg1, arg2);
+        } else {
+            tcg_out_addi(s, TCG_TYPE_REG, ret, arg1, -arg2);
+        }
+        break;
+
+    case TCG_COND_LT:
+    case TCG_COND_LTU:
+        if (c2) {
+            if (arg2 >= -0x800 && arg2 <= 0x7ff) {
+                if (cond == TCG_COND_LT) {
+                    tcg_out_opc_slti(s, ret, arg1, arg2);
+                } else {
+                    tcg_out_opc_sltui(s, ret, arg1, arg2);
+                }
+                break;
+            }
+            tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_TMP0, arg2);
+            arg2 = TCG_REG_TMP0;
+        }
+        if (cond == TCG_COND_LT) {
+            tcg_out_opc_slt(s, ret, arg1, arg2);
+        } else {
+            tcg_out_opc_sltu(s, ret, arg1, arg2);
+        }
+        break;
+
     default:
         g_assert_not_reached();
         break;
     }
+
+    return ret | flags;
+}
+
+/*
+ * Emit a complete setcond into @ret.  tcg_out_setcond_int() does the
+ * comparison and returns a register number combined with SETCOND_* flags;
+ * any flagged fixup (invert, test for non-zero, or test for zero) is
+ * applied here.
+ */
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGReg arg1, tcg_target_long arg2, bool c2)
+{
+    int result = tcg_out_setcond_int(s, cond, ret, arg1, arg2, c2);
+    int fixup = result & SETCOND_FLAGS;
+    TCGReg src = result & ~SETCOND_FLAGS;
+
+    if (result == ret) {
+        /* No flags and the value already lives in ret: nothing to do. */
+        return;
+    }
+
+    if (fixup == SETCOND_INV) {
+        /* Intermediate result is boolean: simply invert. */
+        tcg_out_opc_xori(s, ret, src, 1);
+    } else if (fixup == SETCOND_NEZ) {
+        /* Intermediate result is zero/non-zero: test != 0. */
+        tcg_out_opc_sltu(s, ret, TCG_REG_ZERO, src);
+    } else if (fixup == (SETCOND_NEZ | SETCOND_INV)) {
+        /* Intermediate result is zero/non-zero: test == 0. */
+        tcg_out_opc_sltui(s, ret, src, 1);
+    } else {
+        g_assert_not_reached();
+    }
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ctz_i64:
         return C_O1_I2(r, r, rW);
 
-    case INDEX_op_setcond_i32:
-    case INDEX_op_setcond_i64:
-        return C_O1_I2(r, r, rZ);
-
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
         /* Must deposit into the same register as input */
         return C_O1_I2(r, 0, rZ);
 
     case INDEX_op_sub_i32:
+    case INDEX_op_setcond_i32:
         return C_O1_I2(r, rZ, ri);
     case INDEX_op_sub_i64:
+    case INDEX_op_setcond_i64:
         return C_O1_I2(r, rZ, rJ);
 
     case INDEX_op_mul_i32:
-- 
2.34.1

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target-con-set.h |  1 +
 tcg/loongarch64/tcg-target.h         |  4 ++--
 tcg/loongarch64/tcg-target.c.inc     | 33 ++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

The old implementation replaces two insns, swapping between

        b       <dest>
        nop
and
        pcaddu18i tmp, <dest>
        jirl      zero, tmp, <dest> & 0xffff

There is a race condition in which a thread could be stopped at
the jirl, i.e. with the top of the address loaded, and when
restarted we have re-linked to a different TB, so that the top
half no longer matches the bottom half.

Note that while we never directly re-link to a different TB, we
can link, unlink, and link again all while the stopped thread
remains stopped.

The new implementation replaces only one insn, swapping between

        b        <dest>
and
        pcaddu2i tmp, <jmp_addr>

falling through to load the address from tmp, and branch.

Reviewed-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.h     |  7 +---
 tcg/loongarch64/tcg-target.c.inc | 72 ++++++++++++++------------------
 2 files changed, 33 insertions(+), 46 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
-/*
- * PCADDU18I + JIRL sequence can give 20 + 16 + 2 = 38 bits
- * signed offset, which is +/- 128 GiB.
- */
-#define MAX_CODE_GEN_BUFFER_SIZE  (128 * GiB)
+
+#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 
 typedef enum {
     TCG_REG_ZERO,
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
 #endif
 }
 
-/* LoongArch uses `andi zero, zero, 0` as NOP.  */
-#define NOP OPC_ANDI
-static void tcg_out_nop(TCGContext *s)
-{
-    tcg_out32(s, NOP);
-}
-
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
-                              uintptr_t jmp_rx, uintptr_t jmp_rw)
-{
-    tcg_insn_unit i1, i2;
-    ptrdiff_t upper, lower;
-    uintptr_t addr = tb->jmp_target_addr[n];
-    ptrdiff_t offset = (ptrdiff_t)(addr - jmp_rx) >> 2;
-
-    if (offset == sextreg(offset, 0, 26)) {
-        i1 = encode_sd10k16_insn(OPC_B, offset);
-        i2 = NOP;
-    } else {
-        tcg_debug_assert(offset == sextreg(offset, 0, 36));
-        lower = (int16_t)offset;
-        upper = (offset - lower) >> 16;
-
-        i1 = encode_dsj20_insn(OPC_PCADDU18I, TCG_REG_TMP0, upper);
-        i2 = encode_djsk16_insn(OPC_JIRL, TCG_REG_ZERO, TCG_REG_TMP0, lower);
-    }
-    uint64_t pair = ((uint64_t)i2 << 32) | i1;
-    qatomic_set((uint64_t *)jmp_rw, pair);
-    flush_idcache_range(jmp_rx, jmp_rw, 8);
-}
-
 /*
  * Entry-points
  */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
     /*
-     * Ensure that patch area is 8-byte aligned so that an
-     * atomic write can be used to patch the target address.
+     * Direct branch, or load indirect address, to be patched
+     * by tb_target_set_jmp_target.  Check indirect load offset
+     * in range early, regardless of direct branch distance,
+     * via assert within tcg_out_opc_pcaddu2i.
      */
-    if ((uintptr_t)s->code_ptr & 7) {
-        tcg_out_nop(s);
-    }
+    uintptr_t i_addr = get_jmp_target_addr(s, which);
+    intptr_t i_disp = tcg_pcrel_diff(s, (void *)i_addr);
+
     set_jmp_insn_offset(s, which);
-    /*
-     * actual branch destination will be patched by
-     * tb_target_set_jmp_target later
-     */
-    tcg_out_opc_pcaddu18i(s, TCG_REG_TMP0, 0);
+    tcg_out_opc_pcaddu2i(s, TCG_REG_TMP0, i_disp >> 2);
+
+    /* Finish the load and indirect branch. */
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_REG_TMP0, 0);
     tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_TMP0, 0);
     set_jmp_reset_offset(s, which);
 }
 
+/*
+ * Rewrite the single patchable instruction of goto_tb slot @n.
+ *
+ * If the word displacement from @jmp_rx to the target fits in a signed
+ * 26-bit field, the instruction becomes a direct B.  Otherwise it becomes
+ * a PCADDU2I that puts the address of tb->jmp_target_addr[n] into
+ * TCG_REG_TMP0.  The new instruction is stored through @jmp_rw with one
+ * atomic write, and the 4 bytes at @jmp_rx are flushed from the icache.
+ */
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
+                              uintptr_t jmp_rx, uintptr_t jmp_rw)
+{
+    const uintptr_t target = tb->jmp_target_addr[n];
+    const ptrdiff_t branch_disp = (ptrdiff_t)(target - jmp_rx) >> 2;
+    tcg_insn_unit insn;
+
+    if (branch_disp != sextreg(branch_disp, 0, 26)) {
+        /* Out of direct range: load the slot address instead. */
+        const uintptr_t slot = (uintptr_t)&tb->jmp_target_addr[n];
+        const intptr_t slot_disp = slot - jmp_rx;
+
+        insn = encode_dsj20_insn(OPC_PCADDU2I, TCG_REG_TMP0, slot_disp >> 2);
+    } else {
+        insn = encode_sd10k16_insn(OPC_B, branch_disp);
+    }
+
+    qatomic_set((tcg_insn_unit *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
+}
+
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                        const TCGArg args[TCG_MAX_OP_ARGS],
                        const int const_args[TCG_MAX_OP_ARGS])
-- 
2.34.1