Series comparison

-[PULL 0/3] tcg patch queue
+[PULL 00/34] tcg patch queue
-The following changes since commit 2ecfc0657afa5d29a373271b342f704a1a3c6737:
+The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:
-  Merge remote-tracking branch 'remotes/armbru/tags/pull-misc-2020-12-10' into staging (2020-12-10 17:01:05 +0000)
+  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20220208' into staging (2022-02-08 11:40:08 +0000)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20201210
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220211
-for you to fetch changes up to 9e2658d62ebc23efe7df43fc0e306f129510d874:
+for you to fetch changes up to 5c1a101ef6b85537a4ade93c39ea81cadd5c246e:
-  accel/tcg: rename tcg-cpus functions to match module name (2020-12-10 17:44:10 -0600)
+  tests/tcg/multiarch: Add sigbus.c (2022-02-09 09:00:01 +1100)
 ----------------------------------------------------------------
-Split CpusAccel for tcg variants
+Fix safe_syscall_base for sparc64.
 Fix host signal handling for sparc64-linux.
 Speedups for jump cache and work list probing.
 Fix for exception replays.
 Raise guest SIGBUS for user-only misaligned accesses.
 ----------------------------------------------------------------
-Claudio Fontana (3):
+Idan Horowitz (2):
-      accel/tcg: split CpusAccel into three TCG variants
+      accel/tcg: Optimize jump cache flush during tlb range flush
-      accel/tcg: split tcg_start_vcpu_thread
+      softmmu/cpus: Check if the cpu work list is empty atomically
       accel/tcg: rename tcg-cpus functions to match module name
- accel/tcg/tcg-cpus-icount.h |  17 ++
+Pavel Dovgalyuk (1):
- accel/tcg/tcg-cpus-rr.h     |  21 ++
+      replay: use CF_NOIRQ for special exception-replaying TB
  accel/tcg/tcg-cpus.h        |  12 +-
  accel/tcg/tcg-all.c         |  13 +-
  accel/tcg/tcg-cpus-icount.c | 147 +++++++++++++
  accel/tcg/tcg-cpus-mttcg.c  | 140 ++++++++++++
  accel/tcg/tcg-cpus-rr.c     | 305 ++++++++++++++++++++++++++
  accel/tcg/tcg-cpus.c        | 506 +-------------------------------------------
  softmmu/icount.c            |   2 +-
  accel/tcg/meson.build       |   9 +-
  10 files changed, 670 insertions(+), 502 deletions(-)
  create mode 100644 accel/tcg/tcg-cpus-icount.h
  create mode 100644 accel/tcg/tcg-cpus-rr.h
  create mode 100644 accel/tcg/tcg-cpus-icount.c
  create mode 100644 accel/tcg/tcg-cpus-mttcg.c
  create mode 100644 accel/tcg/tcg-cpus-rr.c
+Richard Henderson (29):
+      common-user/host/sparc64: Fix safe_syscall_base
+      linux-user: Introduce host_signal_mask
+      linux-user: Introduce host_sigcontext
+      linux-user: Move sparc/host-signal.h to sparc64/host-signal.h
+      linux-user/include/host/sparc64: Fix host_sigcontext
+      tcg/i386: Support raising sigbus for user-only
+      tcg/aarch64: Support raising sigbus for user-only
+      tcg/ppc: Support raising sigbus for user-only
+      tcg/riscv: Support raising sigbus for user-only
+      tcg/s390x: Support raising sigbus for user-only
+      tcg/tci: Support raising sigbus for user-only
+      tcg/arm: Drop support for armv4 and armv5 hosts
+      tcg/arm: Remove use_armv5t_instructions
+      tcg/arm: Remove use_armv6_instructions
+      tcg/arm: Check alignment for ldrd and strd
+      tcg/arm: Support unaligned access for softmmu
+      tcg/arm: Reserve a register for guest_base
+      tcg/arm: Support raising sigbus for user-only
+      tcg/mips: Support unaligned access for user-only
+      tcg/mips: Support unaligned access for softmmu
+      tcg/sparc: Use tcg_out_movi_imm13 in tcg_out_addsub2_i64
+      tcg/sparc: Split out tcg_out_movi_imm32
+      tcg/sparc: Add scratch argument to tcg_out_movi_int
+      tcg/sparc: Improve code gen for shifted 32-bit constants
+      tcg/sparc: Convert patch_reloc to return bool
+      tcg/sparc: Use the constant pool for 64-bit constants
+      tcg/sparc: Add tcg_out_jmpl_const for better tail calls
+      tcg/sparc: Support unaligned access for user-only
+      tests/tcg/multiarch: Add sigbus.c
+WANG Xuerui (2):
+      tcg/loongarch64: Fix fallout from recent MO_Q renaming
+      tcg/loongarch64: Support raising sigbus for user-only
+ linux-user/include/host/aarch64/host-signal.h     |  16 +-
+ linux-user/include/host/alpha/host-signal.h       |  14 +-
+ linux-user/include/host/arm/host-signal.h         |  14 +-
+ linux-user/include/host/i386/host-signal.h        |  14 +-
+ linux-user/include/host/loongarch64/host-signal.h |  14 +-
+ linux-user/include/host/mips/host-signal.h        |  14 +-
+ linux-user/include/host/ppc/host-signal.h         |  14 +-
+ linux-user/include/host/riscv/host-signal.h       |  14 +-
+ linux-user/include/host/s390/host-signal.h        |  14 +-
+ linux-user/include/host/sparc/host-signal.h       |  63 ----
+ linux-user/include/host/sparc64/host-signal.h     |  65 +++-
+ linux-user/include/host/x86_64/host-signal.h      |  14 +-
+ tcg/aarch64/tcg-target.h                          |   2 -
+ tcg/arm/tcg-target.h                              |   6 +-
+ tcg/i386/tcg-target.h                             |   2 -
+ tcg/loongarch64/tcg-target.h                      |   2 -
+ tcg/mips/tcg-target.h                             |   2 -
+ tcg/ppc/tcg-target.h                              |   2 -
+ tcg/riscv/tcg-target.h                            |   2 -
+ tcg/s390x/tcg-target.h                            |   2 -
+ accel/tcg/cpu-exec.c                              |   3 +-
+ accel/tcg/cputlb.c                                |   9 +
+ linux-user/signal.c                               |  22 +-
+ softmmu/cpus.c                                    |   7 +-
+ tcg/tci.c                                         |  20 +-
+ tests/tcg/multiarch/sigbus.c                      |  68 ++++
+ tcg/aarch64/tcg-target.c.inc                      |  91 ++++-
+ tcg/arm/tcg-target.c.inc                          | 410 +++++++++-------------
+ tcg/i386/tcg-target.c.inc                         | 103 +++++-
+ tcg/loongarch64/tcg-target.c.inc                  |  73 +++-
+ tcg/mips/tcg-target.c.inc                         | 387 ++++++++++++++++++--
+ tcg/ppc/tcg-target.c.inc                          |  98 +++++-
+ tcg/riscv/tcg-target.c.inc                        |  63 +++-
+ tcg/s390x/tcg-target.c.inc                        |  59 +++-
+ tcg/sparc/tcg-target.c.inc                        | 348 +++++++++++++++---
+ common-user/host/sparc64/safe-syscall.inc.S       |   5 +-
+ 36 files changed, 1561 insertions(+), 495 deletions(-)
+ delete mode 100644 linux-user/include/host/sparc/host-signal.h
+ create mode 100644 tests/tcg/multiarch/sigbus.c

-New patch
+[PULL 01/34] common-user/host/sparc64: Fix safe_syscall_base
+Use the "retl" instead of "ret" instruction alias, since we
+do not allocate a register window in this function.
+Fix the offset to the first stacked parameter, which lies
+beyond the register window save area.
+Fixes: 95c021dac835 ("linux-user/host/sparc64: Add safe-syscall.inc.S")
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ common-user/host/sparc64/safe-syscall.inc.S | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+diff --git a/common-user/host/sparc64/safe-syscall.inc.S b/common-user/host/sparc64/safe-syscall.inc.S
+index XXXXXXX..XXXXXXX 100644
+--- a/common-user/host/sparc64/safe-syscall.inc.S
++++ b/common-user/host/sparc64/safe-syscall.inc.S
+@@ -XXX,XX +XXX,XX @@
+         .type   safe_syscall_end, @function
+ #define STACK_BIAS  2047
+-#define PARAM(N)    STACK_BIAS + N*8
++#define WINDOW_SIZE 16 * 8
++#define PARAM(N)    STACK_BIAS + WINDOW_SIZE + N * 8
+         /*
+          * This is the entry point for making a system call. The calling
+@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
+         /* code path for having successfully executed the syscall */
+         bcs,pn  %xcc, 1f
+          nop
+-        ret
++        retl
+          nop
+         /* code path when we didn't execute the syscall */
+--
+2.25.1

-New patch
+[PULL 02/34] linux-user: Introduce host_signal_mask
+Do not directly access the uc_sigmask member.
 This is preparation for a sparc64 fix.
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  linux-user/include/host/aarch64/host-signal.h  |  5 +++++
  linux-user/include/host/alpha/host-signal.h    |  5 +++++
  linux-user/include/host/arm/host-signal.h      |  5 +++++
  linux-user/include/host/i386/host-signal.h     |  5 +++++
  .../include/host/loongarch64/host-signal.h     |  5 +++++
  linux-user/include/host/mips/host-signal.h     |  5 +++++
  linux-user/include/host/ppc/host-signal.h      |  5 +++++
  linux-user/include/host/riscv/host-signal.h    |  5 +++++
  linux-user/include/host/s390/host-signal.h     |  5 +++++
  linux-user/include/host/sparc/host-signal.h    |  5 +++++
  linux-user/include/host/x86_64/host-signal.h   |  5 +++++
  linux-user/signal.c                            | 18 ++++++++----------
  12 files changed, 63 insertions(+), 10 deletions(-)
 diff --git a/linux-user/include/host/aarch64/host-signal.h b/linux-user/include/host/aarch64/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/aarch64/host-signal.h
 +++ b/linux-user/include/host/aarch64/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.pc = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      struct _aarch64_ctx *hdr;
 diff --git a/linux-user/include/host/alpha/host-signal.h b/linux-user/include/host/alpha/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/alpha/host-signal.h
 +++ b/linux-user/include/host/alpha/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.sc_pc = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      uint32_t *pc = (uint32_t *)host_signal_pc(uc);
 diff --git a/linux-user/include/host/arm/host-signal.h b/linux-user/include/host/arm/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/arm/host-signal.h
 +++ b/linux-user/include/host/arm/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.arm_pc = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      /*
 diff --git a/linux-user/include/host/i386/host-signal.h b/linux-user/include/host/i386/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/i386/host-signal.h
 +++ b/linux-user/include/host/i386/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.gregs[REG_EIP] = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
 diff --git a/linux-user/include/host/loongarch64/host-signal.h b/linux-user/include/host/loongarch64/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/loongarch64/host-signal.h
 +++ b/linux-user/include/host/loongarch64/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.__pc = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      const uint32_t *pinsn = (const uint32_t *)host_signal_pc(uc);
 diff --git a/linux-user/include/host/mips/host-signal.h b/linux-user/include/host/mips/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/mips/host-signal.h
 +++ b/linux-user/include/host/mips/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.pc = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  #if defined(__misp16) || defined(__mips_micromips)
  #error "Unsupported encoding"
  #endif
 diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/ppc/host-signal.h
 +++ b/linux-user/include/host/ppc/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.regs->nip = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      return uc->uc_mcontext.regs->trap != 0x400
 diff --git a/linux-user/include/host/riscv/host-signal.h b/linux-user/include/host/riscv/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/riscv/host-signal.h
 +++ b/linux-user/include/host/riscv/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.__gregs[REG_PC] = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      /*
 diff --git a/linux-user/include/host/s390/host-signal.h b/linux-user/include/host/s390/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/s390/host-signal.h
 +++ b/linux-user/include/host/s390/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.psw.addr = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      uint16_t *pinsn = (uint16_t *)host_signal_pc(uc);
 diff --git a/linux-user/include/host/sparc/host-signal.h b/linux-user/include/host/sparc/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/sparc/host-signal.h
 +++ b/linux-user/include/host/sparc/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
  #endif
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      uint32_t insn = *(uint32_t *)host_signal_pc(uc);
 diff --git a/linux-user/include/host/x86_64/host-signal.h b/linux-user/include/host/x86_64/host-signal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/include/host/x86_64/host-signal.h
 +++ b/linux-user/include/host/x86_64/host-signal.h
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
      uc->uc_mcontext.gregs[REG_RIP] = pc;
  }
 +static inline void *host_signal_mask(ucontext_t *uc)
 +{
 +    return &uc->uc_sigmask;
 +}
 +
  static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
  {
      return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
 diff --git a/linux-user/signal.c b/linux-user/signal.c
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/signal.c
 +++ b/linux-user/signal.c
@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
      int guest_sig;
      uintptr_t pc = 0;
      bool sync_sig = false;
 +    void *sigmask = host_signal_mask(uc);
      /*
       * Non-spoofed SIGSEGV and SIGBUS are synchronous, and need special
@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
              if (info->si_code == SEGV_ACCERR && h2g_valid(host_addr)) {
                  /* If this was a write to a TB protected page, restart. */
                  if (is_write &&
 -                    handle_sigsegv_accerr_write(cpu, &uc->uc_sigmask,
 -                                                pc, guest_addr)) {
 +                    handle_sigsegv_accerr_write(cpu, sigmask, pc, guest_addr)) {
                      return;
                  }
@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
                  }
              }
 -            sigprocmask(SIG_SETMASK, &uc->uc_sigmask, NULL);
 +            sigprocmask(SIG_SETMASK, sigmask, NULL);
              cpu_loop_exit_sigsegv(cpu, guest_addr, access_type, maperr, pc);
          } else {
 -            sigprocmask(SIG_SETMASK, &uc->uc_sigmask, NULL);
 +            sigprocmask(SIG_SETMASK, sigmask, NULL);
              if (info->si_code == BUS_ADRALN) {
                  cpu_loop_exit_sigbus(cpu, guest_addr, access_type, pc);
              }
@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
       * now and it getting out to the main loop. Signals will be
       * unblocked again in process_pending_signals().
       *
 -     * WARNING: we cannot use sigfillset() here because the uc_sigmask
 +     * WARNING: we cannot use sigfillset() here because the sigmask
       * field is a kernel sigset_t, which is much smaller than the
       * libc sigset_t which sigfillset() operates on. Using sigfillset()
       * would write 0xff bytes off the end of the structure and trash
       * data on the struct.
 -     * We can't use sizeof(uc->uc_sigmask) either, because the libc
 -     * headers define the struct field with the wrong (too large) type.
       */
 -    memset(&uc->uc_sigmask, 0xff, SIGSET_T_SIZE);
 -    sigdelset(&uc->uc_sigmask, SIGSEGV);
 -    sigdelset(&uc->uc_sigmask, SIGBUS);
 +    memset(sigmask, 0xff, SIGSET_T_SIZE);
 +    sigdelset(sigmask, SIGSEGV);
 +    sigdelset(sigmask, SIGBUS);
      /* interrupt the virtual CPU as soon as possible */
      cpu_exit(thread_cpu);
 --
 2.25.1

-New patch
+[PULL 03/34] linux-user: Introduce host_sigcontext
+Do not directly access ucontext_t as the third signal parameter.
+This is preparation for a sparc64 fix.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ linux-user/include/host/aarch64/host-signal.h     | 13 ++++++++-----
+ linux-user/include/host/alpha/host-signal.h       | 11 +++++++----
+ linux-user/include/host/arm/host-signal.h         | 11 +++++++----
+ linux-user/include/host/i386/host-signal.h        | 11 +++++++----
+ linux-user/include/host/loongarch64/host-signal.h | 11 +++++++----
+ linux-user/include/host/mips/host-signal.h        | 11 +++++++----
+ linux-user/include/host/ppc/host-signal.h         | 11 +++++++----
+ linux-user/include/host/riscv/host-signal.h       | 11 +++++++----
+ linux-user/include/host/s390/host-signal.h        | 11 +++++++----
+ linux-user/include/host/sparc/host-signal.h       | 11 +++++++----
+ linux-user/include/host/x86_64/host-signal.h      | 11 +++++++----
+ linux-user/signal.c                               |  4 ++--
+ 12 files changed, 80 insertions(+), 47 deletions(-)
+diff --git a/linux-user/include/host/aarch64/host-signal.h b/linux-user/include/host/aarch64/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/aarch64/host-signal.h
++++ b/linux-user/include/host/aarch64/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef AARCH64_HOST_SIGNAL_H
+ #define AARCH64_HOST_SIGNAL_H
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
+ /* Pre-3.16 kernel headers don't have these, so provide fallback definitions */
+ #ifndef ESR_MAGIC
+ #define ESR_MAGIC 0x45535201
+@@ -XXX,XX +XXX,XX @@ struct esr_context {
+ };
+ #endif
+-static inline struct _aarch64_ctx *first_ctx(ucontext_t *uc)
++static inline struct _aarch64_ctx *first_ctx(host_sigcontext *uc)
+ {
+     return (struct _aarch64_ctx *)&uc->uc_mcontext.__reserved;
+ }
+@@ -XXX,XX +XXX,XX @@ static inline struct _aarch64_ctx *next_ctx(struct _aarch64_ctx *hdr)
+     return (struct _aarch64_ctx *)((char *)hdr + hdr->size);
+ }
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.pc;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.pc = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     struct _aarch64_ctx *hdr;
+     uint32_t insn;
+diff --git a/linux-user/include/host/alpha/host-signal.h b/linux-user/include/host/alpha/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/alpha/host-signal.h
++++ b/linux-user/include/host/alpha/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef ALPHA_HOST_SIGNAL_H
+ #define ALPHA_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.sc_pc;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.sc_pc = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     uint32_t *pc = (uint32_t *)host_signal_pc(uc);
+     uint32_t insn = *pc;
+diff --git a/linux-user/include/host/arm/host-signal.h b/linux-user/include/host/arm/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/arm/host-signal.h
++++ b/linux-user/include/host/arm/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef ARM_HOST_SIGNAL_H
+ #define ARM_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.arm_pc;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.arm_pc = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     /*
+      * In the FSR, bit 11 is WnR, assuming a v6 or
+diff --git a/linux-user/include/host/i386/host-signal.h b/linux-user/include/host/i386/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/i386/host-signal.h
++++ b/linux-user/include/host/i386/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef I386_HOST_SIGNAL_H
+ #define I386_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.gregs[REG_EIP];
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.gregs[REG_EIP] = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
+         && (uc->uc_mcontext.gregs[REG_ERR] & 0x2);
+diff --git a/linux-user/include/host/loongarch64/host-signal.h b/linux-user/include/host/loongarch64/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/loongarch64/host-signal.h
++++ b/linux-user/include/host/loongarch64/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef LOONGARCH64_HOST_SIGNAL_H
+ #define LOONGARCH64_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.__pc;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.__pc = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     const uint32_t *pinsn = (const uint32_t *)host_signal_pc(uc);
+     uint32_t insn = pinsn[0];
+diff --git a/linux-user/include/host/mips/host-signal.h b/linux-user/include/host/mips/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/mips/host-signal.h
++++ b/linux-user/include/host/mips/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef MIPS_HOST_SIGNAL_H
+ #define MIPS_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.pc;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.pc = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+@@ -XXX,XX +XXX,XX @@ static inline void *host_signal_mask(ucontext_t *uc)
+ #error "Unsupported encoding"
+ #endif
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     uint32_t insn = *(uint32_t *)host_signal_pc(uc);
+diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/ppc/host-signal.h
++++ b/linux-user/include/host/ppc/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef PPC_HOST_SIGNAL_H
+ #define PPC_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.regs->nip;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.regs->nip = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.regs->trap != 0x400
+         && (uc->uc_mcontext.regs->dsisr & 0x02000000);
+diff --git a/linux-user/include/host/riscv/host-signal.h b/linux-user/include/host/riscv/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/riscv/host-signal.h
++++ b/linux-user/include/host/riscv/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef RISCV_HOST_SIGNAL_H
+ #define RISCV_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.__gregs[REG_PC];
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.__gregs[REG_PC] = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     /*
+      * Detect store by reading the instruction at the program counter.
+diff --git a/linux-user/include/host/s390/host-signal.h b/linux-user/include/host/s390/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/s390/host-signal.h
++++ b/linux-user/include/host/s390/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef S390_HOST_SIGNAL_H
+ #define S390_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.psw.addr;
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.psw.addr = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     uint16_t *pinsn = (uint16_t *)host_signal_pc(uc);
+diff --git a/linux-user/include/host/sparc/host-signal.h b/linux-user/include/host/sparc/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/sparc/host-signal.h
++++ b/linux-user/include/host/sparc/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef SPARC_HOST_SIGNAL_H
+ #define SPARC_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+ #ifdef __arch64__
+     return uc->uc_mcontext.mc_gregs[MC_PC];
+@@ -XXX,XX +XXX,XX @@ static inline uintptr_t host_signal_pc(ucontext_t *uc)
+ #endif
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+ #ifdef __arch64__
+     uc->uc_mcontext.mc_gregs[MC_PC] = pc;
+@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+ #endif
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     uint32_t insn = *(uint32_t *)host_signal_pc(uc);
+diff --git a/linux-user/include/host/x86_64/host-signal.h b/linux-user/include/host/x86_64/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/x86_64/host-signal.h
++++ b/linux-user/include/host/x86_64/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef X86_64_HOST_SIGNAL_H
+ #define X86_64_HOST_SIGNAL_H
+-static inline uintptr_t host_signal_pc(ucontext_t *uc)
++/* The third argument to a SA_SIGINFO handler is ucontext_t. */
++typedef ucontext_t host_sigcontext;
++
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.gregs[REG_RIP];
+ }
+-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+ {
+     uc->uc_mcontext.gregs[REG_RIP] = pc;
+ }
+-static inline void *host_signal_mask(ucontext_t *uc)
++static inline void *host_signal_mask(host_sigcontext *uc)
+ {
+     return &uc->uc_sigmask;
+ }
+-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+ {
+     return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
+         && (uc->uc_mcontext.gregs[REG_ERR] & 0x2);
+diff --git a/linux-user/signal.c b/linux-user/signal.c
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/signal.c
++++ b/linux-user/signal.c
+@@ -XXX,XX +XXX,XX @@ void queue_signal(CPUArchState *env, int sig, int si_type,
+ /* Adjust the signal context to rewind out of safe-syscall if we're in it */
+ static inline void rewind_if_in_safe_syscall(void *puc)
+ {
+-    ucontext_t *uc = (ucontext_t *)puc;
++    host_sigcontext *uc = (host_sigcontext *)puc;
+     uintptr_t pcreg = host_signal_pc(uc);
+     if (pcreg > (uintptr_t)safe_syscall_start
+@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
+     CPUState *cpu = env_cpu(env);
+     TaskState *ts = cpu->opaque;
+     target_siginfo_t tinfo;
+-    ucontext_t *uc = puc;
++    host_sigcontext *uc = puc;
+     struct emulated_sigtable *k;
+     int guest_sig;
+     uintptr_t pc = 0;
+--
+.25.1

-[PULL 2/3] accel/tcg: split tcg_start_vcpu_thread
+[PULL 04/34] linux-user: Move sparc/host-signal.h to sparc64/host-signal.h
-From: Claudio Fontana <cfontana@suse.de>
+We do not support sparc32 as a host, so there's no point in
 sparc64 redirecting to sparc.
-after the initial split into 3 tcg variants, we proceed to also
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 split tcg_start_vcpu_thread.
 We actually split it in 2 this time, since the icount variant
 just uses the round robin function.
 Suggested-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Message-Id: <20201015143217.29337-3-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/tcg-cpus-mttcg.h  | 21 --------------
+ linux-user/include/host/sparc/host-signal.h   | 71 -------------------
- accel/tcg/tcg-cpus-rr.h     |  3 +-
+ linux-user/include/host/sparc64/host-signal.h | 64 ++++++++++++++++-
- accel/tcg/tcg-cpus.h        |  1 -
+files changed, 63 insertions(+), 72 deletions(-)
- accel/tcg/tcg-all.c         |  5 ++++
+ delete mode 100644 linux-user/include/host/sparc/host-signal.h
  accel/tcg/tcg-cpus-icount.c |  2 +-
  accel/tcg/tcg-cpus-mttcg.c  | 29 +++++++++++++++++--
  accel/tcg/tcg-cpus-rr.c     | 39 +++++++++++++++++++++++--
  accel/tcg/tcg-cpus.c        | 58 -------------------------------------
 files changed, 71 insertions(+), 87 deletions(-)
  delete mode 100644 accel/tcg/tcg-cpus-mttcg.h
-diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
+diff --git a/linux-user/include/host/sparc/host-signal.h b/linux-user/include/host/sparc/host-signal.h
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
---- a/accel/tcg/tcg-cpus-mttcg.h
+--- a/linux-user/include/host/sparc/host-signal.h
 +++ /dev/null
 @@ -XXX,XX +XXX,XX @@
 -/*
-- * QEMU TCG Multi Threaded vCPUs implementation
+- * host-signal.h: signal info dependent on the host architecture
 - *
-- * Copyright 2020 SUSE LLC
+- * Copyright (c) 2003-2005 Fabrice Bellard
 - * Copyright (c) 2021 Linaro Limited
 - *
-- * This work is licensed under the terms of the GNU GPL, version 2 or later.
+- * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 - * See the COPYING file in the top-level directory.
 - */
 -
--#ifndef TCG_CPUS_MTTCG_H
+-#ifndef SPARC_HOST_SIGNAL_H
--#define TCG_CPUS_MTTCG_H
+-#define SPARC_HOST_SIGNAL_H
 -
--/*
+-/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
-- * In the multi-threaded case each vCPU has its own thread. The TLS
+-typedef ucontext_t host_sigcontext;
 - * variable current_cpu can be used deep in the code to find the
 - * current CPUState for a given thread.
 - */
 -
--void *tcg_cpu_thread_fn(void *arg);
+-static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 -{
 -#ifdef __arch64__
 -    return uc->uc_mcontext.mc_gregs[MC_PC];
 -#else
 -    return uc->uc_mcontext.gregs[REG_PC];
 -#endif
 -}
 -
--#endif /* TCG_CPUS_MTTCG_H */
+-static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
-diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
+-{
 -#ifdef __arch64__
 -    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
 -#else
 -    uc->uc_mcontext.gregs[REG_PC] = pc;
 -#endif
 -}
 -
 -static inline void *host_signal_mask(host_sigcontext *uc)
 -{
 -    return &uc->uc_sigmask;
 -}
 -
 -static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 -{
 -    uint32_t insn = *(uint32_t *)host_signal_pc(uc);
 -
 -    if ((insn >> 30) == 3) {
 -        switch ((insn >> 19) & 0x3f) {
 -        case 0x05: /* stb */
 -        case 0x15: /* stba */
 -        case 0x06: /* sth */
 -        case 0x16: /* stha */
 -        case 0x04: /* st */
 -        case 0x14: /* sta */
 -        case 0x07: /* std */
 -        case 0x17: /* stda */
 -        case 0x0e: /* stx */
 -        case 0x1e: /* stxa */
 -        case 0x24: /* stf */
 -        case 0x34: /* stfa */
 -        case 0x27: /* stdf */
 -        case 0x37: /* stdfa */
 -        case 0x26: /* stqf */
 -        case 0x36: /* stqfa */
 -        case 0x25: /* stfsr */
 -        case 0x3c: /* casa */
 -        case 0x3e: /* casxa */
 -            return true;
 -        }
 -    }
 -    return false;
 -}
 -
 -#endif
 diff --git a/linux-user/include/host/sparc64/host-signal.h b/linux-user/include/host/sparc64/host-signal.h
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-cpus-rr.h
+--- a/linux-user/include/host/sparc64/host-signal.h
-+++ b/accel/tcg/tcg-cpus-rr.h
++++ b/linux-user/include/host/sparc64/host-signal.h
-@@ -XXX,XX +XXX,XX @@
+@@ -1 +1,63 @@
- /* Kick all RR vCPUs. */
+-#include "../sparc/host-signal.h"
- void qemu_cpu_kick_rr_cpus(CPUState *unused);
++/*
++ * host-signal.h: signal info dependent on the host architecture
--void *tcg_rr_cpu_thread_fn(void *arg);
++ *
-+/* start the round robin vcpu thread */
++ * Copyright (c) 2003-2005 Fabrice Bellard
-+void rr_start_vcpu_thread(CPUState *cpu);
++ * Copyright (c) 2021 Linaro Limited
++ *
- #endif /* TCG_CPUS_RR_H */
++ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
-diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
++ * See the COPYING file in the top-level directory.
-index XXXXXXX..XXXXXXX 100644
++ */
 --- a/accel/tcg/tcg-cpus.h
 +++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
  extern const CpusAccel tcg_cpus_icount;
  extern const CpusAccel tcg_cpus_rr;
 -void tcg_start_vcpu_thread(CPUState *cpu);
  void qemu_tcg_destroy_vcpu(CPUState *cpu);
  int tcg_cpu_exec(CPUState *cpu);
  void tcg_handle_interrupt(CPUState *cpu, int mask);
 diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-all.c
 +++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
      tcg_exec_init(s->tb_size * 1024 * 1024);
      mttcg_enabled = s->mttcg_enabled;
 +    /*
 +     * Initialize TCG regions
 +     */
 +    tcg_region_init();
 +
-     if (mttcg_enabled) {
++#ifndef SPARC64_HOST_SIGNAL_H
-         cpus_register_accel(&tcg_cpus_mttcg);
++#define SPARC64_HOST_SIGNAL_H
-     } else if (icount_enabled()) {
++
-diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
++/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
-index XXXXXXX..XXXXXXX 100644
++typedef ucontext_t host_sigcontext;
---- a/accel/tcg/tcg-cpus-icount.c
++
-+++ b/accel/tcg/tcg-cpus-icount.c
++static inline uintptr_t host_signal_pc(host_sigcontext *uc)
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
  }
  const CpusAccel tcg_cpus_icount = {
 -    .create_vcpu_thread = tcg_start_vcpu_thread,
 +    .create_vcpu_thread = rr_start_vcpu_thread,
      .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
      .handle_interrupt = icount_handle_interrupt,
 diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-mttcg.c
 +++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
  #include "hw/boards.h"
  #include "tcg-cpus.h"
 -#include "tcg-cpus-mttcg.h"
  /*
   * In the multi-threaded case each vCPU has its own thread. The TLS
@@ -XXX,XX +XXX,XX @@
   * current CPUState for a given thread.
   */
 -void *tcg_cpu_thread_fn(void *arg)
 +static void *tcg_cpu_thread_fn(void *arg)
  {
      CPUState *cpu = arg;
@@ -XXX,XX +XXX,XX @@ static void mttcg_kick_vcpu_thread(CPUState *cpu)
      cpu_exit(cpu);
  }
 +static void mttcg_start_vcpu_thread(CPUState *cpu)
 +{
-+    char thread_name[VCPU_THREAD_NAME_SIZE];
++    return uc->uc_mcontext.mc_gregs[MC_PC];
 +
 +    g_assert(tcg_enabled());
 +
 +    parallel_cpus = (current_machine->smp.max_cpus > 1);
 +
 +    cpu->thread = g_malloc0(sizeof(QemuThread));
 +    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
 +    qemu_cond_init(cpu->halt_cond);
 +
 +    /* create a thread per vCPU with TCG (MTTCG) */
 +    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
 +             cpu->cpu_index);
 +
 +    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
 +                       cpu, QEMU_THREAD_JOINABLE);
 +
 +#ifdef _WIN32
 +    cpu->hThread = qemu_thread_get_handle(cpu->thread);
 +#endif
 +}
 +
- const CpusAccel tcg_cpus_mttcg = {
++static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 -    .create_vcpu_thread = tcg_start_vcpu_thread,
 +    .create_vcpu_thread = mttcg_start_vcpu_thread,
      .kick_vcpu_thread = mttcg_kick_vcpu_thread,
      .handle_interrupt = tcg_handle_interrupt,
 diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-rr.c
 +++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
   * elsewhere.
   */
 -void *tcg_rr_cpu_thread_fn(void *arg)
 +static void *tcg_rr_cpu_thread_fn(void *arg)
  {
      CPUState *cpu = arg;
@@ -XXX,XX +XXX,XX @@ void *tcg_rr_cpu_thread_fn(void *arg)
      return NULL;
  }
 +void rr_start_vcpu_thread(CPUState *cpu)
 +{
-+    char thread_name[VCPU_THREAD_NAME_SIZE];
++    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
 +    static QemuCond *single_tcg_halt_cond;
 +    static QemuThread *single_tcg_cpu_thread;
 +
 +    g_assert(tcg_enabled());
 +    parallel_cpus = false;
 +
 +    if (!single_tcg_cpu_thread) {
 +        cpu->thread = g_malloc0(sizeof(QemuThread));
 +        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
 +        qemu_cond_init(cpu->halt_cond);
 +
 +        /* share a single thread for all cpus with TCG */
 +        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
 +        qemu_thread_create(cpu->thread, thread_name,
 +                           tcg_rr_cpu_thread_fn,
 +                           cpu, QEMU_THREAD_JOINABLE);
 +
 +        single_tcg_halt_cond = cpu->halt_cond;
 +        single_tcg_cpu_thread = cpu->thread;
 +#ifdef _WIN32
 +        cpu->hThread = qemu_thread_get_handle(cpu->thread);
 +#endif
 +    } else {
 +        /* we share the thread */
 +        cpu->thread = single_tcg_cpu_thread;
 +        cpu->halt_cond = single_tcg_halt_cond;
 +        cpu->thread_id = first_cpu->thread_id;
 +        cpu->can_do_io = 1;
 +        cpu->created = true;
 +    }
 +}
 +
- const CpusAccel tcg_cpus_rr = {
++static inline void *host_signal_mask(host_sigcontext *uc)
--    .create_vcpu_thread = tcg_start_vcpu_thread,
++{
-+    .create_vcpu_thread = rr_start_vcpu_thread,
++    return &uc->uc_sigmask;
-     .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
++}
++
-     .handle_interrupt = tcg_handle_interrupt,
++static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
-diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
++{
-index XXXXXXX..XXXXXXX 100644
++    uint32_t insn = *(uint32_t *)host_signal_pc(uc);
---- a/accel/tcg/tcg-cpus.c
++
-+++ b/accel/tcg/tcg-cpus.c
++    if ((insn >> 30) == 3) {
-@@ -XXX,XX +XXX,XX @@
++        switch ((insn >> 19) & 0x3f) {
- #include "hw/boards.h"
++        case 0x05: /* stb */
++        case 0x15: /* stba */
- #include "tcg-cpus.h"
++        case 0x06: /* sth */
--#include "tcg-cpus-mttcg.h"
++        case 0x16: /* stha */
--#include "tcg-cpus-rr.h"
++        case 0x04: /* st */
++        case 0x14: /* sta */
- /* common functionality among all TCG variants */
++        case 0x07: /* std */
++        case 0x17: /* stda */
--void tcg_start_vcpu_thread(CPUState *cpu)
++        case 0x0e: /* stx */
--{
++        case 0x1e: /* stxa */
--    char thread_name[VCPU_THREAD_NAME_SIZE];
++        case 0x24: /* stf */
--    static QemuCond *single_tcg_halt_cond;
++        case 0x34: /* stfa */
--    static QemuThread *single_tcg_cpu_thread;
++        case 0x27: /* stdf */
--    static int tcg_region_inited;
++        case 0x37: /* stdfa */
--
++        case 0x26: /* stqf */
--    assert(tcg_enabled());
++        case 0x36: /* stqfa */
--    /*
++        case 0x25: /* stfsr */
--     * Initialize TCG regions--once. Now is a good time, because:
++        case 0x3c: /* casa */
--     * (1) TCG's init context, prologue and target globals have been set up.
++        case 0x3e: /* casxa */
--     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
++            return true;
--     *     -accel flag is processed, so the check doesn't work then).
++        }
--     */
++    }
--    if (!tcg_region_inited) {
++    return false;
--        tcg_region_inited = 1;
++}
--        tcg_region_init();
++
--        parallel_cpus = qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1;
++#endif
 -    }
 -
 -    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
 -        cpu->thread = g_malloc0(sizeof(QemuThread));
 -        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
 -        qemu_cond_init(cpu->halt_cond);
 -
 -        if (qemu_tcg_mttcg_enabled()) {
 -            /* create a thread per vCPU with TCG (MTTCG) */
 -            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
 -                 cpu->cpu_index);
 -
 -            qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
 -                               cpu, QEMU_THREAD_JOINABLE);
 -
 -        } else {
 -            /* share a single thread for all cpus with TCG */
 -            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
 -            qemu_thread_create(cpu->thread, thread_name,
 -                               tcg_rr_cpu_thread_fn,
 -                               cpu, QEMU_THREAD_JOINABLE);
 -
 -            single_tcg_halt_cond = cpu->halt_cond;
 -            single_tcg_cpu_thread = cpu->thread;
 -        }
 -#ifdef _WIN32
 -        cpu->hThread = qemu_thread_get_handle(cpu->thread);
 -#endif
 -    } else {
 -        /* For non-MTTCG cases we share the thread */
 -        cpu->thread = single_tcg_cpu_thread;
 -        cpu->halt_cond = single_tcg_halt_cond;
 -        cpu->thread_id = first_cpu->thread_id;
 -        cpu->can_do_io = 1;
 -        cpu->created = true;
 -    }
 -}
 -
  void qemu_tcg_destroy_vcpu(CPUState *cpu)
  {
      cpu_thread_signal_destroyed(cpu);
 --
 .25.1

-New patch
+[PULL 05/34] linux-user/include/host/sparc64: Fix host_sigcontext
+Sparc64 is unique on linux in *not* passing ucontext_t as
+the third argument to a SA_SIGINFO handler.  It passes the
+old struct sigcontext instead.
+Set both pc and npc in host_signal_set_pc.
+Fixes: 8b5bd461935b ("linux-user/host/sparc: Populate host_signal.h")
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ linux-user/include/host/sparc64/host-signal.h | 17 +++++++++--------
+file changed, 9 insertions(+), 8 deletions(-)
+diff --git a/linux-user/include/host/sparc64/host-signal.h b/linux-user/include/host/sparc64/host-signal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/linux-user/include/host/sparc64/host-signal.h
++++ b/linux-user/include/host/sparc64/host-signal.h
+@@ -XXX,XX +XXX,XX @@
+ #ifndef SPARC64_HOST_SIGNAL_H
+ #define SPARC64_HOST_SIGNAL_H
+-/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
+-typedef ucontext_t host_sigcontext;
++/* The third argument to a SA_SIGINFO handler is struct sigcontext.  */
++typedef struct sigcontext host_sigcontext;
+-static inline uintptr_t host_signal_pc(host_sigcontext *uc)
++static inline uintptr_t host_signal_pc(host_sigcontext *sc)
+ {
+-    return uc->uc_mcontext.mc_gregs[MC_PC];
++    return sc->sigc_regs.tpc;
+ }
+-static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
++static inline void host_signal_set_pc(host_sigcontext *sc, uintptr_t pc)
+ {
+-    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
++    sc->sigc_regs.tpc = pc;
++    sc->sigc_regs.tnpc = pc + 4;
+ }
+-static inline void *host_signal_mask(host_sigcontext *uc)
++static inline void *host_signal_mask(host_sigcontext *sc)
+ {
+-    return &uc->uc_sigmask;
++    return &sc->sigc_mask;
+ }
+ static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+--
+.25.1

-New patch
+[PULL 06/34] accel/tcg: Optimize jump cache flush during tlb range flush
+From: Idan Horowitz <idan.horowitz@gmail.com>
+When the length of the range is large enough, clearing the whole cache is
+faster than iterating over the (possibly extremely large) set of pages
+contained in the range.
+This mimics the pre-existing similar optimization done on the flush of the
+tlb itself.
+Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
+Message-Id: <20220110164754.1066025-1-idan.horowitz@gmail.com>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ accel/tcg/cputlb.c | 9 +++++++++
+file changed, 9 insertions(+)
+diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/cputlb.c
++++ b/accel/tcg/cputlb.c
+@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
+     }
+     qemu_spin_unlock(&env_tlb(env)->c.lock);
++    /*
++     * If the length is larger than the jump cache size, then it will take
++     * longer to clear each entry individually than it will to clear it all.
++     */
++    if (d.len >= (TARGET_PAGE_SIZE * TB_JMP_CACHE_SIZE)) {
++        cpu_tb_jmp_cache_clear(cpu);
++        return;
++    }
++
+     for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) {
+         tb_flush_jmp_cache(cpu, d.addr + i);
+     }
+--
+.25.1

-New patch
+[PULL 07/34] softmmu/cpus: Check if the cpu work list is empty atomically
+From: Idan Horowitz <idan.horowitz@gmail.com>
+Instead of taking the lock of the cpu work list in order to check if it's
+empty, we can just read the head pointer atomically. This decreases
+cpu_work_list_empty's share from 5% to 1.3% in a profile of icount-enabled
+aarch64-softmmu.
+Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
+Message-Id: <20220114004358.299534-1-idan.horowitz@gmail.com>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ softmmu/cpus.c | 7 +------
+file changed, 1 insertion(+), 6 deletions(-)
+diff --git a/softmmu/cpus.c b/softmmu/cpus.c
+index XXXXXXX..XXXXXXX 100644
+--- a/softmmu/cpus.c
++++ b/softmmu/cpus.c
+@@ -XXX,XX +XXX,XX @@ bool cpu_is_stopped(CPUState *cpu)
+ bool cpu_work_list_empty(CPUState *cpu)
+ {
+-    bool ret;
+-
+-    qemu_mutex_lock(&cpu->work_mutex);
+-    ret = QSIMPLEQ_EMPTY(&cpu->work_list);
+-    qemu_mutex_unlock(&cpu->work_mutex);
+-    return ret;
++    return QSIMPLEQ_EMPTY_ATOMIC(&cpu->work_list);
+ }
+ bool cpu_thread_is_idle(CPUState *cpu)
+--
+.25.1

-New patch
+[PULL 08/34] replay: use CF_NOIRQ for special exception-replaying TB
+From: Pavel Dovgalyuk <pavel.dovgalyuk@ispras.ru>
+Commit aff0e204cb1f1c036a496c94c15f5dfafcd9b4b4 introduced CF_NOIRQ usage,
+but one case was forgotten. Record/replay uses one special TB which is not
+really executed, but is used to cause a correct exception in replay mode.
+This patch adds the CF_NOIRQ flag for such a block.
+Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <164362834054.1754532.7678416881159817273.stgit@pasha-ThinkPad-X280>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ accel/tcg/cpu-exec.c | 3 ++-
+file changed, 2 insertions(+), 1 deletion(-)
+diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
+index XXXXXXX..XXXXXXX 100644
+--- a/accel/tcg/cpu-exec.c
++++ b/accel/tcg/cpu-exec.c
+@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
+         if (replay_has_exception()
+             && cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra == 0) {
+             /* Execute just one insn to trigger exception pending in the log */
+-            cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT) | 1;
++            cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT)
++                | CF_NOIRQ | 1;
+         }
+ #endif
+         return false;
+--
+.25.1

-New patch
+[PULL 09/34] tcg/loongarch64: Fix fallout from recent MO_Q renaming
+From: WANG Xuerui <git@xen0n.name>
+Apparently we were left behind; just renaming MO_Q to MO_UQ is enough.
+Fixes: fc313c64345453c7 ("exec/memop: Adding signedness to quad definitions")
+Signed-off-by: WANG Xuerui <git@xen0n.name>
+Message-Id: <20220206162106.1092364-1-i.qemu@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target.c.inc | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_indexed(TCGContext *s, TCGReg rd, TCGReg rj,
+     case MO_SL:
+         tcg_out_opc_ldx_w(s, rd, rj, rk);
+         break;
+-    case MO_Q:
++    case MO_UQ:
+         tcg_out_opc_ldx_d(s, rd, rj, rk);
+         break;
+     default:
+--
+.25.1

-New patch
+[PULL 10/34] tcg/i386: Support raising sigbus for user-only
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/i386/tcg-target.h     |   2 -
+ tcg/i386/tcg-target.c.inc | 103 ++++++++++++++++++++++++++++++++++++--
+files changed, 98 insertions(+), 7 deletions(-)
+diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/i386/tcg-target.h
++++ b/tcg/i386/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+ #define TCG_TARGET_HAS_MEMORY_BSWAP  have_movbe
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_NEED_POOL_LABELS
+ #endif
+diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/i386/tcg-target.c.inc
++++ b/tcg/i386/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+  * THE SOFTWARE.
+  */
++#include "../tcg-ldst.c.inc"
+ #include "../tcg-pool.c.inc"
+ #ifdef CONFIG_DEBUG_TCG
+@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
+ #define OPC_VZEROUPPER  (0x77 | P_EXT)
+ #define OPC_XCHG_ax_r32    (0x90)
+-#define OPC_GRP3_Ev    (0xf7)
+-#define OPC_GRP5    (0xff)
++#define OPC_GRP3_Eb     (0xf6)
++#define OPC_GRP3_Ev     (0xf7)
++#define OPC_GRP5        (0xff)
+ #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
+ /* Group 1 opcode extensions for 0x80-0x83.
+@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
+ #define SHIFT_SAR 7
+ /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
++#define EXT3_TESTi 0
+ #define EXT3_NOT   2
+ #define EXT3_NEG   3
+ #define EXT3_MUL   4
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_nopn(TCGContext *s, int n)
+ }
+ #if defined(CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+  *                                     int mmu_idx, uintptr_t ra)
+  */
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+     return true;
+ }
+-#elif TCG_TARGET_REG_BITS == 32
++#else
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
++                                   TCGReg addrhi, unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *label;
++
++    /*
++     * We are expecting a_bits to max out at 7, so we can usually use testb.
++     * For i686, we have to use testl for %esi/%edi.
++     */
++    if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
++        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
++        tcg_out8(s, a_mask);
++    } else {
++        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
++        tcg_out32(s, a_mask);
++    }
++
++    /* jne slow_path */
++    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
++
++    label = new_ldst_label(s);
++    label->is_ld = is_ld;
++    label->addrlo_reg = addrlo;
++    label->addrhi_reg = addrhi;
++    label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
++    label->label_ptr[0] = s->code_ptr;
++
++    s->code_ptr += 4;
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    /* resolve label address */
++    tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
++
++    if (TCG_TARGET_REG_BITS == 32) {
++        int ofs = 0;
++
++        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
++        ofs += 4;
++
++        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
++        ofs += 4;
++        if (TARGET_LONG_BITS == 64) {
++            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
++            ofs += 4;
++        }
++
++        tcg_out_pushi(s, (uintptr_t)l->raddr);
++    } else {
++        tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
++                    l->addrlo_reg);
++        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
++
++        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
++        tcg_out_push(s, TCG_REG_RAX);
++    }
++
++    /* "Tail call" to the helper, with the return address back inline. */
++    tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
++                                  : helper_unaligned_st));
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++#if TCG_TARGET_REG_BITS == 32
+ # define x86_guest_base_seg     0
+ # define x86_guest_base_index   -1
+ # define x86_guest_base_offset  guest_base
+@@ -XXX,XX +XXX,XX @@ static inline int setup_guest_base_seg(void)
+     return 0;
+ }
+ # endif
++#endif
+ #endif /* SOFTMMU */
+ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+ #if defined(CONFIG_SOFTMMU)
+     int mem_index;
+     tcg_insn_unit *label_ptr[2];
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
+                         s->code_ptr, label_ptr);
+ #else
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
++    }
++
+     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
+                            x86_guest_base_offset, x86_guest_base_seg,
+                            is64, opc);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+ #if defined(CONFIG_SOFTMMU)
+     int mem_index;
+     tcg_insn_unit *label_ptr[2];
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
+                         s->code_ptr, label_ptr);
+ #else
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
++    }
++
+     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
+                            x86_guest_base_offset, x86_guest_base_seg, opc);
+ #endif
+--
+.25.1

-[PULL 3/3] accel/tcg: rename tcg-cpus functions to match module name
+[PULL 11/34] tcg/aarch64: Support raising sigbus for user-only
-From: Claudio Fontana <cfontana@suse.de>
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20201015143217.29337-4-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/tcg-cpus-icount.h |  6 +--
+ tcg/aarch64/tcg-target.h     |  2 -
- accel/tcg/tcg-cpus-rr.h     |  2 +-
+ tcg/aarch64/tcg-target.c.inc | 91 +++++++++++++++++++++++++++++-------
- accel/tcg/tcg-cpus.h        |  6 +--
+files changed, 74 insertions(+), 19 deletions(-)
  accel/tcg/tcg-cpus-icount.c | 24 ++++++------
  accel/tcg/tcg-cpus-mttcg.c  | 10 ++---
  accel/tcg/tcg-cpus-rr.c     | 74 ++++++++++++++++++-------------------
  accel/tcg/tcg-cpus.c        |  6 +--
 files changed, 64 insertions(+), 64 deletions(-)
-diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
+diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-cpus-icount.h
+--- a/tcg/aarch64/tcg-target.h
-+++ b/accel/tcg/tcg-cpus-icount.h
++++ b/tcg/aarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
  void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 -#ifdef CONFIG_SOFTMMU
  #define TCG_TARGET_NEED_LDST_LABELS
 -#endif
  #define TCG_TARGET_NEED_POOL_LABELS
  #endif /* AARCH64_TCG_TARGET_H */
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
 @@ -XXX,XX +XXX,XX @@
- #ifndef TCG_CPUS_ICOUNT_H
+  * See the COPYING file in the top-level directory for details.
- #define TCG_CPUS_ICOUNT_H
+  */
--void handle_icount_deadline(void);
++#include "../tcg-ldst.c.inc"
--void prepare_icount_for_run(CPUState *cpu);
+ #include "../tcg-pool.c.inc"
--void process_icount_data(CPUState *cpu);
+ #include "qemu/bitops.h"
-+void icount_handle_deadline(void);
-+void icount_prepare_for_run(CPUState *cpu);
+@@ -XXX,XX +XXX,XX @@ typedef enum {
-+void icount_process_data(CPUState *cpu);
+     I3404_ANDI      = 0x12000000,
+     I3404_ORRI      = 0x32000000,
- #endif /* TCG_CPUS_ICOUNT_H */
+     I3404_EORI      = 0x52000000,
-diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
++    I3404_ANDSI     = 0x72000000,
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-cpus-rr.h
+     /* Move wide immediate instructions.  */
-+++ b/accel/tcg/tcg-cpus-rr.h
+     I3405_MOVN      = 0x12800000,
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
- #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+     if (offset == sextract64(offset, 0, 26)) {
+         tcg_out_insn(s, 3206, B, offset);
- /* Kick all RR vCPUs. */
+     } else {
--void qemu_cpu_kick_rr_cpus(CPUState *unused);
+-        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
-+void rr_kick_vcpu_thread(CPUState *unused);
+-        tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
++        /* Choose X9 as a call-clobbered non-LR temporary. */
- /* start the round robin vcpu thread */
++        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X9, (intptr_t)target);
- void rr_start_vcpu_thread(CPUState *cpu);
++        tcg_out_insn(s, 3207, BR, TCG_REG_X9);
 diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus.h
 +++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
  extern const CpusAccel tcg_cpus_icount;
  extern const CpusAccel tcg_cpus_rr;
 -void qemu_tcg_destroy_vcpu(CPUState *cpu);
 -int tcg_cpu_exec(CPUState *cpu);
 -void tcg_handle_interrupt(CPUState *cpu, int mask);
 +void tcg_cpus_destroy(CPUState *cpu);
 +int tcg_cpus_exec(CPUState *cpu);
 +void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
  #endif /* TCG_CPUS_H */
 diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-icount.c
 +++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg-cpus-icount.h"
  #include "tcg-cpus-rr.h"
 -static int64_t tcg_get_icount_limit(void)
 +static int64_t icount_get_limit(void)
  {
      int64_t deadline;
@@ -XXX,XX +XXX,XX @@ static int64_t tcg_get_icount_limit(void)
      }
  }
--static void notify_aio_contexts(void)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
 +static void icount_notify_aio_contexts(void)
  {
      /* Wake up other AioContexts.  */
      qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
      qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
  }
 -void handle_icount_deadline(void)
 +void icount_handle_deadline(void)
  {
      assert(qemu_in_vcpu_thread());
      int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                                    QEMU_TIMER_ATTR_ALL);
      if (deadline == 0) {
 -        notify_aio_contexts();
 +        icount_notify_aio_contexts();
      }
  }
--void prepare_icount_for_run(CPUState *cpu)
+-#ifdef CONFIG_SOFTMMU
-+void icount_prepare_for_run(CPUState *cpu)
+-#include "../tcg-ldst.c.inc"
- {
++static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-     int insns_left;
++{
++    ptrdiff_t offset = tcg_pcrel_diff(s, target);
-     /*
++    tcg_debug_assert(offset == sextract64(offset, 0, 21));
--     * These should always be cleared by process_icount_data after
++    tcg_out_insn(s, 3406, ADR, rd, offset);
-+     * These should always be cleared by icount_process_data after
++}
-      * each vCPU execution. However u16.high can be raised
--     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
++#ifdef CONFIG_SOFTMMU
-+     * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
+ /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
-      */
+  *                                     MemOpIdx oi, uintptr_t ra)
-     g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
+  */
-     g_assert(cpu->icount_extra == 0);
+@@ -XXX,XX +XXX,XX @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
+ #endif
--    cpu->icount_budget = tcg_get_icount_limit();
+ };
-+    cpu->icount_budget = icount_get_limit();
-     insns_left = MIN(0xffff, cpu->icount_budget);
+-static inline void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-     cpu_neg(cpu)->icount_decr.u16.low = insns_left;
+-{
-     cpu->icount_extra = cpu->icount_budget - insns_left;
+-    ptrdiff_t offset = tcg_pcrel_diff(s, target);
-@@ -XXX,XX +XXX,XX @@ void prepare_icount_for_run(CPUState *cpu)
+-    tcg_debug_assert(offset == sextract64(offset, 0, 21));
-     replay_mutex_lock();
+-    tcg_out_insn(s, 3406, ADR, rd, offset);
+-}
-     if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
+-
--        notify_aio_contexts();
+ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
-+        icount_notify_aio_contexts();
+ {
-     }
+     MemOpIdx oi = lb->oi;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, MemOp opc,
      tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
  }
--void process_icount_data(CPUState *cpu)
++#else
-+void icount_process_data(CPUState *cpu)
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
- {
++                                   unsigned a_bits)
-     /* Account for executed instructions */
++{
-     icount_update(cpu);
++    unsigned a_mask = (1 << a_bits) - 1;
-@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
++    TCGLabelQemuLdst *label = new_ldst_label(s);
- {
++
-     int old_mask = cpu->interrupt_request;
++    label->is_ld = is_ld;
++    label->addrlo_reg = addr_reg;
--    tcg_handle_interrupt(cpu, mask);
++
-+    tcg_cpus_handle_interrupt(cpu, mask);
++    /* tst addr, #mask */
-     if (qemu_cpu_is_self(cpu) &&
++    tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);
-         !cpu->can_do_io
++
-         && (mask & ~old_mask) != 0) {
++    label->label_ptr[0] = s->code_ptr;
-@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
++
++    /* b.ne slow_path */
- const CpusAccel tcg_cpus_icount = {
++    tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
-     .create_vcpu_thread = rr_start_vcpu_thread,
++
--    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
++    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
-+    .kick_vcpu_thread = rr_kick_vcpu_thread,
++}
++
-     .handle_interrupt = icount_handle_interrupt,
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
-     .get_virtual_clock = icount_get,
++{
-diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
++    if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-index XXXXXXX..XXXXXXX 100644
++        return false;
---- a/accel/tcg/tcg-cpus-mttcg.c
++    }
-+++ b/accel/tcg/tcg-cpus-mttcg.c
++
-@@ -XXX,XX +XXX,XX @@
++    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_X1, l->addrlo_reg);
-  * current CPUState for a given thread.
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-  */
++
++    /* "Tail call" to the helper, with the return address back inline. */
--static void *tcg_cpu_thread_fn(void *arg)
++    tcg_out_adr(s, TCG_REG_LR, l->raddr);
-+static void *mttcg_cpu_thread_fn(void *arg)
++    tcg_out_goto_long(s, (const void *)(l->is_ld ? helper_unaligned_ld
- {
++                                        : helper_unaligned_st));
-     CPUState *cpu = arg;
++    return true;
++}
-@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
++
-         if (cpu_can_run(cpu)) {
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-             int r;
++{
-             qemu_mutex_unlock_iothread();
++    return tcg_out_fail_alignment(s, l);
--            r = tcg_cpu_exec(cpu);
++}
-+            r = tcg_cpus_exec(cpu);
++
-             qemu_mutex_lock_iothread();
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-             switch (r) {
++{
-             case EXCP_DEBUG:
++    return tcg_out_fail_alignment(s, l);
-@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
++}
-         qemu_wait_io_event(cpu);
+ #endif /* CONFIG_SOFTMMU */
-     } while (!cpu->unplug || cpu_can_run(cpu));
+ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
--    qemu_tcg_destroy_vcpu(cpu);
+                                    TCGReg data_r, TCGReg addr_r,
-+    tcg_cpus_destroy(cpu);
+                                    TCGType otype, TCGReg off_r)
-     qemu_mutex_unlock_iothread();
+ {
-     rcu_unregister_thread();
+-    /* Byte swapping is left to middle-end expansion. */
-     return NULL;
+-    tcg_debug_assert((memop & MO_BSWAP) == 0);
-@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
+-
-     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
+     switch (memop & MO_SSIZE) {
-              cpu->cpu_index);
+     case MO_UB:
+         tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
--    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
-+    qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
+                                    TCGReg data_r, TCGReg addr_r,
-                        cpu, QEMU_THREAD_JOINABLE);
+                                    TCGType otype, TCGReg off_r)
+ {
- #ifdef _WIN32
+-    /* Byte swapping is left to middle-end expansion. */
-@@ -XXX,XX +XXX,XX @@ const CpusAccel tcg_cpus_mttcg = {
+-    tcg_debug_assert((memop & MO_BSWAP) == 0);
-     .create_vcpu_thread = mttcg_start_vcpu_thread,
+-
-     .kick_vcpu_thread = mttcg_kick_vcpu_thread,
+     switch (memop & MO_SIZE) {
+     case MO_8:
--    .handle_interrupt = tcg_handle_interrupt,
+         tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
-+    .handle_interrupt = tcg_cpus_handle_interrupt,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
- };
+ {
-diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
+     MemOp memop = get_memop(oi);
-index XXXXXXX..XXXXXXX 100644
+     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
---- a/accel/tcg/tcg-cpus-rr.c
++
-+++ b/accel/tcg/tcg-cpus-rr.c
++    /* Byte swapping is left to middle-end expansion. */
-@@ -XXX,XX +XXX,XX @@
++    tcg_debug_assert((memop & MO_BSWAP) == 0);
- #include "tcg-cpus-icount.h"
++
+ #ifdef CONFIG_SOFTMMU
- /* Kick all RR vCPUs */
+     unsigned mem_index = get_mmuidx(oi);
--void qemu_cpu_kick_rr_cpus(CPUState *unused)
+     tcg_insn_unit *label_ptr;
-+void rr_kick_vcpu_thread(CPUState *unused)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
- {
+     add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
-     CPUState *cpu;
+                         s->code_ptr, label_ptr);
+ #else /* !CONFIG_SOFTMMU */
-@@ -XXX,XX +XXX,XX @@ void qemu_cpu_kick_rr_cpus(CPUState *unused)
++    unsigned a_bits = get_alignment_bits(memop);
-  * idleness is complete.
++    if (a_bits) {
-  */
++        tcg_out_test_alignment(s, true, addr_reg, a_bits);
++    }
--static QEMUTimer *tcg_kick_vcpu_timer;
+     if (USE_GUEST_BASE) {
--static CPUState *tcg_current_rr_cpu;
+         tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
-+static QEMUTimer *rr_kick_vcpu_timer;
+                                TCG_REG_GUEST_BASE, otype, addr_reg);
-+static CPUState *rr_current_cpu;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
+ {
- #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+     MemOp memop = get_memop(oi);
+     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
--static inline int64_t qemu_tcg_next_kick(void)
++
-+static inline int64_t rr_next_kick_time(void)
++    /* Byte swapping is left to middle-end expansion. */
- {
++    tcg_debug_assert((memop & MO_BSWAP) == 0);
-     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
++
- }
+ #ifdef CONFIG_SOFTMMU
+     unsigned mem_index = get_mmuidx(oi);
- /* Kick the currently round-robin scheduled vCPU to next */
+     tcg_insn_unit *label_ptr;
--static void qemu_cpu_kick_rr_next_cpu(void)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-+static void rr_kick_next_cpu(void)
+     add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
- {
+                         data_reg, addr_reg, s->code_ptr, label_ptr);
-     CPUState *cpu;
+ #else /* !CONFIG_SOFTMMU */
-     do {
++    unsigned a_bits = get_alignment_bits(memop);
--        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
++    if (a_bits) {
-+        cpu = qatomic_mb_read(&rr_current_cpu);
++        tcg_out_test_alignment(s, false, addr_reg, a_bits);
-         if (cpu) {
++    }
-             cpu_exit(cpu);
+     if (USE_GUEST_BASE) {
-         }
+         tcg_out_qemu_st_direct(s, memop, data_reg,
--    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
+                                TCG_REG_GUEST_BASE, otype, addr_reg);
 +    } while (cpu != qatomic_mb_read(&rr_current_cpu));
  }
 -static void kick_tcg_thread(void *opaque)
 +static void rr_kick_thread(void *opaque)
  {
 -    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 -    qemu_cpu_kick_rr_next_cpu();
 +    timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
 +    rr_kick_next_cpu();
  }
 -static void start_tcg_kick_timer(void)
 +static void rr_start_kick_timer(void)
  {
 -    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 -        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 -                                           kick_tcg_thread, NULL);
 +    if (!rr_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 +        rr_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 +                                           rr_kick_thread, NULL);
      }
 -    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 -        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 +    if (rr_kick_vcpu_timer && !timer_pending(rr_kick_vcpu_timer)) {
 +        timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
      }
  }
 -static void stop_tcg_kick_timer(void)
 +static void rr_stop_kick_timer(void)
  {
 -    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 -        timer_del(tcg_kick_vcpu_timer);
 +    if (rr_kick_vcpu_timer && timer_pending(rr_kick_vcpu_timer)) {
 +        timer_del(rr_kick_vcpu_timer);
      }
  }
 -static void qemu_tcg_rr_wait_io_event(void)
 +static void rr_wait_io_event(void)
  {
      CPUState *cpu;
      while (all_cpu_threads_idle()) {
 -        stop_tcg_kick_timer();
 +        rr_stop_kick_timer();
          qemu_cond_wait_iothread(first_cpu->halt_cond);
      }
 -    start_tcg_kick_timer();
 +    rr_start_kick_timer();
      CPU_FOREACH(cpu) {
          qemu_wait_io_event_common(cpu);
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_rr_wait_io_event(void)
   * Destroy any remaining vCPUs which have been unplugged and have
   * finished running
   */
 -static void deal_with_unplugged_cpus(void)
 +static void rr_deal_with_unplugged_cpus(void)
  {
      CPUState *cpu;
      CPU_FOREACH(cpu) {
          if (cpu->unplug && !cpu_can_run(cpu)) {
 -            qemu_tcg_destroy_vcpu(cpu);
 +            tcg_cpus_destroy(cpu);
              break;
          }
      }
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
   * elsewhere.
   */
 -static void *tcg_rr_cpu_thread_fn(void *arg)
 +static void *rr_cpu_thread_fn(void *arg)
  {
      CPUState *cpu = arg;
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
          }
      }
 -    start_tcg_kick_timer();
 +    rr_start_kick_timer();
      cpu = first_cpu;
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
               * Run the timers here.  This is much more efficient than
               * waking up the I/O thread and waiting for completion.
               */
 -            handle_icount_deadline();
 +            icount_handle_deadline();
          }
          replay_mutex_unlock();
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
          while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
 -            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
 +            qatomic_mb_set(&rr_current_cpu, cpu);
              current_cpu = cpu;
              qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
                  qemu_mutex_unlock_iothread();
                  if (icount_enabled()) {
 -                    prepare_icount_for_run(cpu);
 +                    icount_prepare_for_run(cpu);
                  }
 -                r = tcg_cpu_exec(cpu);
 +                r = tcg_cpus_exec(cpu);
                  if (icount_enabled()) {
 -                    process_icount_data(cpu);
 +                    icount_process_data(cpu);
                  }
                  qemu_mutex_lock_iothread();
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
          } /* while (cpu && !cpu->exit_request).. */
          /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
 -        qatomic_set(&tcg_current_rr_cpu, NULL);
 +        qatomic_set(&rr_current_cpu, NULL);
          if (cpu && cpu->exit_request) {
              qatomic_mb_set(&cpu->exit_request, 0);
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
              qemu_notify_event();
          }
 -        qemu_tcg_rr_wait_io_event();
 -        deal_with_unplugged_cpus();
 +        rr_wait_io_event();
 +        rr_deal_with_unplugged_cpus();
      }
      rcu_unregister_thread();
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
          /* share a single thread for all cpus with TCG */
          snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
          qemu_thread_create(cpu->thread, thread_name,
 -                           tcg_rr_cpu_thread_fn,
 +                           rr_cpu_thread_fn,
                             cpu, QEMU_THREAD_JOINABLE);
          single_tcg_halt_cond = cpu->halt_cond;
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
  const CpusAccel tcg_cpus_rr = {
      .create_vcpu_thread = rr_start_vcpu_thread,
 -    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 +    .kick_vcpu_thread = rr_kick_vcpu_thread,
 -    .handle_interrupt = tcg_handle_interrupt,
 +    .handle_interrupt = tcg_cpus_handle_interrupt,
  };
 diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus.c
 +++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
  /* common functionality among all TCG variants */
 -void qemu_tcg_destroy_vcpu(CPUState *cpu)
 +void tcg_cpus_destroy(CPUState *cpu)
  {
      cpu_thread_signal_destroyed(cpu);
  }
 -int tcg_cpu_exec(CPUState *cpu)
 +int tcg_cpus_exec(CPUState *cpu)
  {
      int ret;
  #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ int tcg_cpu_exec(CPUState *cpu)
  }
  /* mask must never be zero, except for A20 change call */
 -void tcg_handle_interrupt(CPUState *cpu, int mask)
 +void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
  {
      g_assert(qemu_mutex_iothread_locked());
 --
 .25.1

-New patch
+[PULL 12/34] tcg/ppc: Support raising sigbus for user-only
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/ppc/tcg-target.h     |  2 -
+ tcg/ppc/tcg-target.c.inc | 98 ++++++++++++++++++++++++++++++++++++----
+files changed, 90 insertions(+), 10 deletions(-)
+diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/ppc/tcg-target.h
++++ b/tcg/ppc/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ #define TCG_TARGET_DEFAULT_MO (0)
+ #define TCG_TARGET_HAS_MEMORY_BSWAP     1
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_NEED_POOL_LABELS
+ #endif
+diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/ppc/tcg-target.c.inc
++++ b/tcg/ppc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+ #include "elf.h"
+ #include "../tcg-pool.c.inc"
++#include "../tcg-ldst.c.inc"
+ /*
+  * Standardize on the _CALL_FOO symbols used by GCC:
+@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+     }
+ }
+-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
++static void tcg_out_call_int(TCGContext *s, int lk,
++                             const tcg_insn_unit *target)
+ {
+ #ifdef _CALL_AIX
+     /* Look through the descriptor.  If the branch is in range, and we
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
+     if (in_range_b(diff) && toc == (uint32_t)toc) {
+         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, toc);
+-        tcg_out_b(s, LK, tgt);
++        tcg_out_b(s, lk, tgt);
+     } else {
+         /* Fold the low bits of the constant into the addresses below.  */
+         intptr_t arg = (intptr_t)target;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
+         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_TMP1, ofs);
+         tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
+         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_REG_TMP1, ofs + SZP);
+-        tcg_out32(s, BCCTR | BO_ALWAYS | LK);
++        tcg_out32(s, BCCTR | BO_ALWAYS | lk);
+     }
+ #elif defined(_CALL_ELF) && _CALL_ELF == 2
+     intptr_t diff;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
+     diff = tcg_pcrel_diff(s, target);
+     if (in_range_b(diff)) {
+-        tcg_out_b(s, LK, target);
++        tcg_out_b(s, lk, target);
+     } else {
+         tcg_out32(s, MTSPR | RS(TCG_REG_R12) | CTR);
+-        tcg_out32(s, BCCTR | BO_ALWAYS | LK);
++        tcg_out32(s, BCCTR | BO_ALWAYS | lk);
+     }
+ #else
+-    tcg_out_b(s, LK, target);
++    tcg_out_b(s, lk, target);
+ #endif
+ }
++static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
++{
++    tcg_out_call_int(s, LK, target);
++}
++
+ static const uint32_t qemu_ldx_opc[(MO_SSIZE + MO_BSWAP) + 1] = {
+     [MO_UB] = LBZX,
+     [MO_UW] = LHZX,
+@@ -XXX,XX +XXX,XX @@ static const uint32_t qemu_exts_opc[4] = {
+ };
+ #if defined (CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
+  *                                 int mmu_idx, uintptr_t ra)
+  */
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     tcg_out_b(s, 0, lb->raddr);
+     return true;
+ }
++#else
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
++                                   TCGReg addrhi, unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *label = new_ldst_label(s);
++
++    label->is_ld = is_ld;
++    label->addrlo_reg = addrlo;
++    label->addrhi_reg = addrhi;
++
++    /* We are expecting a_bits to max out at 7, much lower than ANDI. */
++    tcg_debug_assert(a_bits < 16);
++    tcg_out32(s, ANDI | SAI(addrlo, TCG_REG_R0, a_mask));
++
++    label->label_ptr[0] = s->code_ptr;
++    tcg_out32(s, BC | BI(0, CR_EQ) | BO_COND_FALSE | LK);
++
++    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    if (!reloc_pc14(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
++        return false;
++    }
++
++    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
++        TCGReg arg = TCG_REG_R4;
++#ifdef TCG_TARGET_CALL_ALIGN_ARGS
++        arg |= 1;
++#endif
++        if (l->addrlo_reg != arg) {
++            tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
++            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
++        } else if (l->addrhi_reg != arg + 1) {
++            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
++            tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
++        } else {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, arg);
++            tcg_out_mov(s, TCG_TYPE_I32, arg, arg + 1);
++            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, TCG_REG_R0);
++        }
++    } else {
++        tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R4, l->addrlo_reg);
++    }
++    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, TCG_AREG0);
++
++    /* "Tail call" to the helper, with the return address back inline. */
++    tcg_out_call_int(s, 0, (const void *)(l->is_ld ? helper_unaligned_ld
++                                          : helper_unaligned_st));
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
+ #endif /* SOFTMMU */
+ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+ #ifdef CONFIG_SOFTMMU
+     int mem_index;
+     tcg_insn_unit *label_ptr;
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+     rbase = TCG_REG_R3;
+ #else  /* !CONFIG_SOFTMMU */
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
++    }
+     rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
+     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+ #ifdef CONFIG_SOFTMMU
+     int mem_index;
+     tcg_insn_unit *label_ptr;
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+     rbase = TCG_REG_R3;
+ #else  /* !CONFIG_SOFTMMU */
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
++    }
+     rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
+     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
+--
+.25.1

-New patch
+[PULL 13/34] tcg/riscv: Support raising sigbus for user-only
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/riscv/tcg-target.h     |  2 --
+ tcg/riscv/tcg-target.c.inc | 63 ++++++++++++++++++++++++++++++++++++--
+files changed, 61 insertions(+), 4 deletions(-)
+diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/riscv/tcg-target.h
++++ b/tcg/riscv/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ #define TCG_TARGET_DEFAULT_MO (0)
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_NEED_POOL_LABELS
+ #define TCG_TARGET_HAS_MEMORY_BSWAP 0
+diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/riscv/tcg-target.c.inc
++++ b/tcg/riscv/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+  * THE SOFTWARE.
+  */
++#include "../tcg-ldst.c.inc"
+ #include "../tcg-pool.c.inc"
+ #ifdef CONFIG_DEBUG_TCG
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
+  */
+ #if defined(CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+  *                                     MemOpIdx oi, uintptr_t ra)
+  */
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+     tcg_out_goto(s, l->raddr);
+     return true;
+ }
++#else
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
++                                   unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *l = new_ldst_label(s);
++
++    l->is_ld = is_ld;
++    l->addrlo_reg = addr_reg;
++
++    /* We are expecting a_bits to max out at 7, so we can always use andi. */
++    tcg_debug_assert(a_bits < 12);
++    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask);
++
++    l->label_ptr[0] = s->code_ptr;
++    tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP1, TCG_REG_ZERO, 0);
++
++    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    /* resolve label address */
++    if (!reloc_sbimm12(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
++        return false;
++    }
++
++    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
++
++    /* tail call, with the return address back inline. */
++    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr);
++    tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld
++                                       : helper_unaligned_st), true);
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
+ #endif /* CONFIG_SOFTMMU */
+ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[1];
++#else
++    unsigned a_bits;
+ #endif
+     TCGReg base = TCG_REG_TMP0;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+         tcg_out_ext32u(s, base, addr_regl);
+         addr_regl = base;
+     }
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addr_regl, a_bits);
++    }
+     if (guest_base != 0) {
+         tcg_out_opc_reg(s, OPC_ADD, base, TCG_GUEST_BASE_REG, addr_regl);
+     }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[1];
++#else
++    unsigned a_bits;
+ #endif
+     TCGReg base = TCG_REG_TMP0;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+         tcg_out_ext32u(s, base, addr_regl);
+         addr_regl = base;
+     }
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addr_regl, a_bits);
++    }
+     if (guest_base != 0) {
+         tcg_out_opc_reg(s, OPC_ADD, base, TCG_GUEST_BASE_REG, addr_regl);
+     }
+--
+2.25.1

-New patch
+[PULL 14/34] tcg/s390x: Support raising sigbus for user-only
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/s390x/tcg-target.h     |  2 --
+ tcg/s390x/tcg-target.c.inc | 59 ++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 57 insertions(+), 4 deletions(-)
+diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/s390x/tcg-target.h
++++ b/tcg/s390x/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+     /* no need to flush icache explicitly */
+ }
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_NEED_POOL_LABELS
+ #endif
+diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/s390x/tcg-target.c.inc
++++ b/tcg/s390x/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+ #error "unsupported code generation mode"
+ #endif
++#include "../tcg-ldst.c.inc"
+ #include "../tcg-pool.c.inc"
+ #include "elf.h"
+@@ -XXX,XX +XXX,XX @@ typedef enum S390Opcode {
+     RI_OIHL     = 0xa509,
+     RI_OILH     = 0xa50a,
+     RI_OILL     = 0xa50b,
++    RI_TMLL     = 0xa701,
+     RIE_CGIJ    = 0xec7c,
+     RIE_CGRJ    = 0xec64,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data,
+ }
+ #if defined(CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ /* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
+ QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
+ QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19));
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     return true;
+ }
+ #else
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld,
++                                   TCGReg addrlo, unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *l = new_ldst_label(s);
++
++    l->is_ld = is_ld;
++    l->addrlo_reg = addrlo;
++
++    /* We are expecting a_bits to max out at 7, much lower than TMLL. */
++    tcg_debug_assert(a_bits < 16);
++    tcg_out_insn(s, RI, TMLL, addrlo, a_mask);
++
++    tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */
++    l->label_ptr[0] = s->code_ptr;
++    s->code_ptr += 1;
++
++    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    if (!patch_reloc(l->label_ptr[0], R_390_PC16DBL,
++                     (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 2)) {
++        return false;
++    }
++
++    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, l->addrlo_reg);
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_AREG0);
++
++    /* "Tail call" to the helper, with the return address back inline. */
++    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R14, (uintptr_t)l->raddr);
++    tgen_gotoi(s, S390_CC_ALWAYS, (const void *)(l->is_ld ? helper_unaligned_ld
++                                                 : helper_unaligned_st));
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
+ static void tcg_prepare_user_ldst(TCGContext *s, TCGReg *addr_reg,
+                                   TCGReg *index_reg, tcg_target_long *disp)
+ {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
+ #else
+     TCGReg index_reg;
+     tcg_target_long disp;
++    unsigned a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addr_reg, a_bits);
++    }
+     tcg_prepare_user_ldst(s, &addr_reg, &index_reg, &disp);
+     tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, index_reg, disp);
+ #endif
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
+ #else
+     TCGReg index_reg;
+     tcg_target_long disp;
++    unsigned a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addr_reg, a_bits);
++    }
+     tcg_prepare_user_ldst(s, &addr_reg, &index_reg, &disp);
+     tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, index_reg, disp);
+ #endif
+--
+2.25.1

-New patch
+[PULL 15/34] tcg/tci: Support raising sigbus for user-only
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/tci.c | 20 ++++++++++++++------
+ 1 file changed, 14 insertions(+), 6 deletions(-)
+diff --git a/tcg/tci.c b/tcg/tci.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/tci.c
++++ b/tcg/tci.c
+@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
+ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
+                             MemOpIdx oi, const void *tb_ptr)
+ {
+-    MemOp mop = get_memop(oi) & (MO_BSWAP | MO_SSIZE);
++    MemOp mop = get_memop(oi);
+     uintptr_t ra = (uintptr_t)tb_ptr;
+ #ifdef CONFIG_SOFTMMU
+-    switch (mop) {
++    switch (mop & (MO_BSWAP | MO_SSIZE)) {
+     case MO_UB:
+         return helper_ret_ldub_mmu(env, taddr, oi, ra);
+     case MO_SB:
+@@ -XXX,XX +XXX,XX @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
+     }
+ #else
+     void *haddr = g2h(env_cpu(env), taddr);
++    unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
+     uint64_t ret;
+     set_helper_retaddr(ra);
+-    switch (mop) {
++    if (taddr & a_mask) {
++        helper_unaligned_ld(env, taddr);
++    }
++    switch (mop & (MO_BSWAP | MO_SSIZE)) {
+     case MO_UB:
+         ret = ldub_p(haddr);
+         break;
+@@ -XXX,XX +XXX,XX @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
+ static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
+                         MemOpIdx oi, const void *tb_ptr)
+ {
+-    MemOp mop = get_memop(oi) & (MO_BSWAP | MO_SSIZE);
++    MemOp mop = get_memop(oi);
+     uintptr_t ra = (uintptr_t)tb_ptr;
+ #ifdef CONFIG_SOFTMMU
+-    switch (mop) {
++    switch (mop & (MO_BSWAP | MO_SIZE)) {
+     case MO_UB:
+         helper_ret_stb_mmu(env, taddr, val, oi, ra);
+         break;
+@@ -XXX,XX +XXX,XX @@ static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
+     }
+ #else
+     void *haddr = g2h(env_cpu(env), taddr);
++    unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
+     set_helper_retaddr(ra);
+-    switch (mop) {
++    if (taddr & a_mask) {
++        helper_unaligned_st(env, taddr);
++    }
++    switch (mop & (MO_BSWAP | MO_SIZE)) {
+     case MO_UB:
+         stb_p(haddr, val);
+         break;
+--
+2.25.1

-New patch
+[PULL 16/34] tcg/loongarch64: Support raising sigbus for user-only
+From: WANG Xuerui <git@xen0n.name>
+Signed-off-by: WANG Xuerui <git@xen0n.name>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <20220106134238.3936163-1-git@xen0n.name>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/loongarch64/tcg-target.h     |  2 -
+ tcg/loongarch64/tcg-target.c.inc | 71 +++++++++++++++++++++++++++++++-
+ 2 files changed, 69 insertions(+), 4 deletions(-)
+diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.h
++++ b/tcg/loongarch64/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ #define TCG_TARGET_DEFAULT_MO (0)
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_HAS_MEMORY_BSWAP 0
+diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/loongarch64/tcg-target.c.inc
++++ b/tcg/loongarch64/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+  * THE SOFTWARE.
+  */
++#include "../tcg-ldst.c.inc"
++
+ #ifdef CONFIG_DEBUG_TCG
+ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
+     "zero",
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
+  */
+ #if defined(CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ /*
+  * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+  *                                     MemOpIdx oi, uintptr_t ra)
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+     return tcg_out_goto(s, l->raddr);
+ }
++#else
++
++/*
++ * Alignment helpers for user-mode emulation
++ */
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
++                                   unsigned a_bits)
++{
++    TCGLabelQemuLdst *l = new_ldst_label(s);
++
++    l->is_ld = is_ld;
++    l->addrlo_reg = addr_reg;
++
++    /*
++     * Without micro-architecture details, we don't know which of bstrpick or
++     * andi is faster, so use bstrpick as it's not constrained by imm field
++     * width. (Not to say alignments >= 2^12 are going to happen any time
++     * soon, though)
++     */
++    tcg_out_opc_bstrpick_d(s, TCG_REG_TMP1, addr_reg, 0, a_bits - 1);
++
++    l->label_ptr[0] = s->code_ptr;
++    tcg_out_opc_bne(s, TCG_REG_TMP1, TCG_REG_ZERO, 0);
++
++    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    /* resolve label address */
++    if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
++        return false;
++    }
++
++    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
++
++    /* tail call, with the return address back inline. */
++    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr);
++    tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld
++                                       : helper_unaligned_st), true);
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
+ #endif /* CONFIG_SOFTMMU */
+ /*
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[1];
++#else
++    unsigned a_bits;
+ #endif
+     TCGReg base;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
+                         data_regl, addr_regl,
+                         s->code_ptr, label_ptr);
+ #else
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addr_regl, a_bits);
++    }
+     base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
+     TCGReg guest_base_reg = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
+     tcg_out_qemu_ld_indexed(s, data_regl, base, guest_base_reg, opc, type);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[1];
++#else
++    unsigned a_bits;
+ #endif
+     TCGReg base;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
+                         data_regl, addr_regl,
+                         s->code_ptr, label_ptr);
+ #else
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addr_regl, a_bits);
++    }
+     base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
+     TCGReg guest_base_reg = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
+     tcg_out_qemu_st_indexed(s, data_regl, base, guest_base_reg, opc);
+--
+2.25.1

-New patch
+[PULL 17/34] tcg/arm: Drop support for armv4 and armv5 hosts
+Support for unaligned accesses is difficult for pre-v6 hosts.
+While Debian still builds for armv4, we cannot use a compile-time
+test, so test the architecture at runtime and error out.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 5 +++++
+ 1 file changed, 5 insertions(+)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s)
+         if (pl != NULL && pl[0] == 'v' && pl[1] >= '4' && pl[1] <= '9') {
+             arm_arch = pl[1] - '0';
+         }
++
++        if (arm_arch < 6) {
++            error_report("TCG: ARMv%d is unsupported; exiting", arm_arch);
++            exit(EXIT_FAILURE);
++        }
+     }
+     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
+--
+2.25.1

-New patch
+[PULL 18/34] tcg/arm: Remove use_armv5t_instructions
+This is now always true, since we require armv6.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |  3 +--
+ tcg/arm/tcg-target.c.inc | 35 ++++++-----------------------------
+ 2 files changed, 7 insertions(+), 31 deletions(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@
+ extern int arm_arch;
+-#define use_armv5t_instructions (__ARM_ARCH >= 5 || arm_arch >= 5)
+ #define use_armv6_instructions  (__ARM_ARCH >= 6 || arm_arch >= 6)
+ #define use_armv7_instructions  (__ARM_ARCH >= 7 || arm_arch >= 7)
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ #define TCG_TARGET_HAS_eqv_i32          0
+ #define TCG_TARGET_HAS_nand_i32         0
+ #define TCG_TARGET_HAS_nor_i32          0
+-#define TCG_TARGET_HAS_clz_i32          use_armv5t_instructions
++#define TCG_TARGET_HAS_clz_i32          1
+ #define TCG_TARGET_HAS_ctz_i32          use_armv7_instructions
+ #define TCG_TARGET_HAS_ctpop_i32        0
+ #define TCG_TARGET_HAS_deposit_i32      use_armv7_instructions
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_b_reg(TCGContext *s, ARMCond cond, TCGReg rn)
+      * Unless the C portion of QEMU is compiled as thumb, we don't need
+      * true BX semantics; merely a branch to an address held in a register.
+      */
+-    if (use_armv5t_instructions) {
+-        tcg_out_bx_reg(s, cond, rn);
+-    } else {
+-        tcg_out_mov_reg(s, cond, TCG_REG_PC, rn);
+-    }
++    tcg_out_bx_reg(s, cond, rn);
+ }
+ static void tcg_out_dat_imm(TCGContext *s, ARMCond cond, ARMInsn opc,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto(TCGContext *s, ARMCond cond, const tcg_insn_unit *addr)
+     }
+     /* LDR is interworking from v5t. */
+-    if (arm_mode || use_armv5t_instructions) {
+-        tcg_out_movi_pool(s, cond, TCG_REG_PC, addri);
+-        return;
+-    }
+-
+-    /* else v4t */
+-    tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addri);
+-    tcg_out_bx_reg(s, COND_AL, TCG_REG_TMP);
++    tcg_out_movi_pool(s, cond, TCG_REG_PC, addri);
+ }
+ /*
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *addr)
+     if (disp - 8 < 0x02000000 && disp - 8 >= -0x02000000) {
+         if (arm_mode) {
+             tcg_out_bl_imm(s, COND_AL, disp);
+-            return;
+-        }
+-        if (use_armv5t_instructions) {
++        } else {
+             tcg_out_blx_imm(s, disp);
+-            return;
+         }
++        return;
+     }
+-    if (use_armv5t_instructions) {
+-        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addri);
+-        tcg_out_blx_reg(s, COND_AL, TCG_REG_TMP);
+-    } else if (arm_mode) {
+-        /* ??? Know that movi_pool emits exactly 1 insn.  */
+-        tcg_out_mov_reg(s, COND_AL, TCG_REG_R14, TCG_REG_PC);
+-        tcg_out_movi_pool(s, COND_AL, TCG_REG_PC, addri);
+-    } else {
+-        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addri);
+-        tcg_out_mov_reg(s, COND_AL, TCG_REG_R14, TCG_REG_PC);
+-        tcg_out_bx_reg(s, COND_AL, TCG_REG_TMP);
+-    }
++    tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addri);
++    tcg_out_blx_reg(s, COND_AL, TCG_REG_TMP);
+ }
+ static void tcg_out_goto_label(TCGContext *s, ARMCond cond, TCGLabel *l)
+--
+2.25.1

-New patch
+[PULL 19/34] tcg/arm: Remove use_armv6_instructions
+This is now always true, since we require armv6.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |   1 -
+ tcg/arm/tcg-target.c.inc | 192 ++++++---------------------------------
+ 2 files changed, 27 insertions(+), 166 deletions(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@
+ extern int arm_arch;
+-#define use_armv6_instructions  (__ARM_ARCH >= 6 || arm_arch >= 6)
+ #define use_armv7_instructions  (__ARM_ARCH >= 7 || arm_arch >= 7)
+ #undef TCG_TARGET_STACK_GROWSUP
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_dat_rIN(TCGContext *s, ARMCond cond, ARMInsn opc,
+ static void tcg_out_mul32(TCGContext *s, ARMCond cond, TCGReg rd,
+                           TCGReg rn, TCGReg rm)
+ {
+-    /* if ArchVersion() < 6 && d == n then UNPREDICTABLE;  */
+-    if (!use_armv6_instructions && rd == rn) {
+-        if (rd == rm) {
+-            /* rd == rn == rm; copy an input to tmp first.  */
+-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
+-            rm = rn = TCG_REG_TMP;
+-        } else {
+-            rn = rm;
+-            rm = rd;
+-        }
+-    }
+     /* mul */
+     tcg_out32(s, (cond << 28) | 0x90 | (rd << 16) | (rm << 8) | rn);
+ }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_mul32(TCGContext *s, ARMCond cond, TCGReg rd,
+ static void tcg_out_umull32(TCGContext *s, ARMCond cond, TCGReg rd0,
+                             TCGReg rd1, TCGReg rn, TCGReg rm)
+ {
+-    /* if ArchVersion() < 6 && (dHi == n || dLo == n) then UNPREDICTABLE;  */
+-    if (!use_armv6_instructions && (rd0 == rn || rd1 == rn)) {
+-        if (rd0 == rm || rd1 == rm) {
+-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
+-            rn = TCG_REG_TMP;
+-        } else {
+-            TCGReg t = rn;
+-            rn = rm;
+-            rm = t;
+-        }
+-    }
+     /* umull */
+     tcg_out32(s, (cond << 28) | 0x00800090 |
+               (rd1 << 16) | (rd0 << 12) | (rm << 8) | rn);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_umull32(TCGContext *s, ARMCond cond, TCGReg rd0,
+ static void tcg_out_smull32(TCGContext *s, ARMCond cond, TCGReg rd0,
+                             TCGReg rd1, TCGReg rn, TCGReg rm)
+ {
+-    /* if ArchVersion() < 6 && (dHi == n || dLo == n) then UNPREDICTABLE;  */
+-    if (!use_armv6_instructions && (rd0 == rn || rd1 == rn)) {
+-        if (rd0 == rm || rd1 == rm) {
+-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
+-            rn = TCG_REG_TMP;
+-        } else {
+-            TCGReg t = rn;
+-            rn = rm;
+-            rm = t;
+-        }
+-    }
+     /* smull */
+     tcg_out32(s, (cond << 28) | 0x00c00090 |
+               (rd1 << 16) | (rd0 << 12) | (rm << 8) | rn);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_udiv(TCGContext *s, ARMCond cond,
+ static void tcg_out_ext8s(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
+ {
+-    if (use_armv6_instructions) {
+-        /* sxtb */
+-        tcg_out32(s, 0x06af0070 | (cond << 28) | (rd << 12) | rn);
+-    } else {
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rn, SHIFT_IMM_LSL(24));
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rd, SHIFT_IMM_ASR(24));
+-    }
++    /* sxtb */
++    tcg_out32(s, 0x06af0070 | (cond << 28) | (rd << 12) | rn);
+ }
+ static void __attribute__((unused))
+@@ -XXX,XX +XXX,XX @@ tcg_out_ext8u(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
+ static void tcg_out_ext16s(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
+ {
+-    if (use_armv6_instructions) {
+-        /* sxth */
+-        tcg_out32(s, 0x06bf0070 | (cond << 28) | (rd << 12) | rn);
+-    } else {
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rn, SHIFT_IMM_LSL(16));
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rd, SHIFT_IMM_ASR(16));
+-    }
++    /* sxth */
++    tcg_out32(s, 0x06bf0070 | (cond << 28) | (rd << 12) | rn);
+ }
+ static void tcg_out_ext16u(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
+ {
+-    if (use_armv6_instructions) {
+-        /* uxth */
+-        tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rn);
+-    } else {
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rn, SHIFT_IMM_LSL(16));
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rd, SHIFT_IMM_LSR(16));
+-    }
++    /* uxth */
++    tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rn);
+ }
+ static void tcg_out_bswap16(TCGContext *s, ARMCond cond,
+                             TCGReg rd, TCGReg rn, int flags)
+ {
+-    if (use_armv6_instructions) {
+-        if (flags & TCG_BSWAP_OS) {
+-            /* revsh */
+-            tcg_out32(s, 0x06ff0fb0 | (cond << 28) | (rd << 12) | rn);
+-            return;
+-        }
+-
+-        /* rev16 */
+-        tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
+-        if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
+-            /* uxth */
+-            tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rd);
+-        }
++    if (flags & TCG_BSWAP_OS) {
++        /* revsh */
++        tcg_out32(s, 0x06ff0fb0 | (cond << 28) | (rd << 12) | rn);
+         return;
+     }
+-    if (flags == 0) {
+-        /*
+-         * For stores, no input or output extension:
+-         *                              rn  = xxAB
+-         * lsr tmp, rn, #8              tmp = 0xxA
+-         * and tmp, tmp, #0xff          tmp = 000A
+-         * orr rd, tmp, rn, lsl #8      rd  = xABA
+-         */
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        TCG_REG_TMP, 0, rn, SHIFT_IMM_LSR(8));
+-        tcg_out_dat_imm(s, cond, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, 0xff);
+-        tcg_out_dat_reg(s, cond, ARITH_ORR,
+-                        rd, TCG_REG_TMP, rn, SHIFT_IMM_LSL(8));
+-        return;
++    /* rev16 */
++    tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
++    if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
++        /* uxth */
++        tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rd);
+     }
+-
+-    /*
+-     * Byte swap, leaving the result at the top of the register.
+-     * We will then shift down, zero or sign-extending.
+-     */
+-    if (flags & TCG_BSWAP_IZ) {
+-        /*
+-         *                              rn  = 00AB
+-         * ror tmp, rn, #8              tmp = B00A
+-         * orr tmp, tmp, tmp, lsl #16   tmp = BA00
+-         */
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        TCG_REG_TMP, 0, rn, SHIFT_IMM_ROR(8));
+-        tcg_out_dat_reg(s, cond, ARITH_ORR,
+-                        TCG_REG_TMP, TCG_REG_TMP, TCG_REG_TMP,
+-                        SHIFT_IMM_LSL(16));
+-    } else {
+-        /*
+-         *                              rn  = xxAB
+-         * and tmp, rn, #0xff00         tmp = 00A0
+-         * lsl tmp, tmp, #8             tmp = 0A00
+-         * orr tmp, tmp, rn, lsl #24    tmp = BA00
+-         */
+-        tcg_out_dat_rI(s, cond, ARITH_AND, TCG_REG_TMP, rn, 0xff00, 1);
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        TCG_REG_TMP, 0, TCG_REG_TMP, SHIFT_IMM_LSL(8));
+-        tcg_out_dat_reg(s, cond, ARITH_ORR,
+-                        TCG_REG_TMP, TCG_REG_TMP, rn, SHIFT_IMM_LSL(24));
+-    }
+-    tcg_out_dat_reg(s, cond, ARITH_MOV, rd, 0, TCG_REG_TMP,
+-                    (flags & TCG_BSWAP_OS
+-                     ? SHIFT_IMM_ASR(8) : SHIFT_IMM_LSR(8)));
+ }
+ static void tcg_out_bswap32(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
+ {
+-    if (use_armv6_instructions) {
+-        /* rev */
+-        tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
+-    } else {
+-        tcg_out_dat_reg(s, cond, ARITH_EOR,
+-                        TCG_REG_TMP, rn, rn, SHIFT_IMM_ROR(16));
+-        tcg_out_dat_imm(s, cond, ARITH_BIC,
+-                        TCG_REG_TMP, TCG_REG_TMP, 0xff | 0x800);
+-        tcg_out_dat_reg(s, cond, ARITH_MOV,
+-                        rd, 0, rn, SHIFT_IMM_ROR(8));
+-        tcg_out_dat_reg(s, cond, ARITH_EOR,
+-                        rd, rd, TCG_REG_TMP, SHIFT_IMM_LSR(8));
+-    }
++    /* rev */
++    tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
+ }
+ static void tcg_out_deposit(TCGContext *s, ARMCond cond, TCGReg rd,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
+ {
+     if (use_armv7_instructions) {
+         tcg_out32(s, INSN_DMB_ISH);
+-    } else if (use_armv6_instructions) {
++    } else {
+         tcg_out32(s, INSN_DMB_MCR);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
+     if (argreg & 1) {
+         argreg++;
+     }
+-    if (use_armv6_instructions && argreg >= 4
+-        && (arglo & 1) == 0 && arghi == arglo + 1) {
++    if (argreg >= 4 && (arglo & 1) == 0 && arghi == arglo + 1) {
+         tcg_out_strd_8(s, COND_AL, arglo,
+                        TCG_REG_CALL_STACK, (argreg - 4) * 4);
+         return argreg + 2;
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+     int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
+                    : offsetof(CPUTLBEntry, addr_write));
+     int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+-    int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
+-    int table_off = fast_off + offsetof(CPUTLBDescFast, table);
+     unsigned s_bits = opc & MO_SIZE;
+     unsigned a_bits = get_alignment_bits(opc);
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+     }
+     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
+-    if (use_armv6_instructions) {
+-        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
+-    } else {
+-        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
+-        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
+-    }
++    tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
+     /* Extract the tlb index from the address into R0.  */
+     tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+      * Load the tlb comparator into R2/R3 and the fast path addend into R1.
+      */
+     if (cmp_off == 0) {
+-        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
++        if (TARGET_LONG_BITS == 64) {
+             tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
+         } else {
+             tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+     } else {
+         tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
+                         TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
+-        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
++        if (TARGET_LONG_BITS == 64) {
+             tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
+         } else {
+             tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
+         }
+     }
+-    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
+-        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
+-    }
+     /* Load the tlb addend.  */
+     tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     TCGReg argreg, datalo, datahi;
+     MemOpIdx oi = lb->oi;
+     MemOp opc = get_memop(oi);
+-    void *func;
+     if (!reloc_pc24(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+         return false;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     argreg = tcg_out_arg_imm32(s, argreg, oi);
+     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
+-    /* For armv6 we can use the canonical unsigned helpers and minimize
+-       icache usage.  For pre-armv6, use the signed helpers since we do
+-       not have a single insn sign-extend.  */
+-    if (use_armv6_instructions) {
+-        func = qemu_ld_helpers[opc & MO_SIZE];
+-    } else {
+-        func = qemu_ld_helpers[opc & MO_SSIZE];
+-        if (opc & MO_SIGN) {
+-            opc = MO_UL;
+-        }
+-    }
+-    tcg_out_call(s, func);
++    /* Use the canonical unsigned helpers and minimize icache usage. */
++    tcg_out_call(s, qemu_ld_helpers[opc & MO_SIZE]);
+     datalo = lb->datalo_reg;
+     datahi = lb->datahi_reg;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+         break;
+     case MO_UQ:
+         /* Avoid ldrd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU && use_armv6_instructions
++        if (USING_SOFTMMU
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+         } else if (datalo != addend) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         break;
+     case MO_UQ:
+         /* Avoid ldrd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU && use_armv6_instructions
++        if (USING_SOFTMMU
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
+         } else if (datalo == addrlo) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
+         break;
+     case MO_64:
+         /* Avoid strd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU && use_armv6_instructions
++        if (USING_SOFTMMU
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
+         } else {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         break;
+     case MO_64:
+         /* Avoid strd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU && use_armv6_instructions
++        if (USING_SOFTMMU
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
+         } else {
+--
+2.25.1

-New patch
+[PULL 20/34] tcg/arm: Check alignment for ldrd and strd
+We will shortly allow the use of unaligned memory accesses,
+but LDRD and STRD still require proper alignment.  Use
+get_alignment_bits to verify that, and remove USING_SOFTMMU.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 23 ++++++++---------------
+ 1 file changed, 8 insertions(+), 15 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ bool use_idiv_instructions;
+ bool use_neon_instructions;
+ #endif
+-/* ??? Ought to think about changing CONFIG_SOFTMMU to always defined.  */
+-#ifdef CONFIG_SOFTMMU
+-# define USING_SOFTMMU 1
+-#else
+-# define USING_SOFTMMU 0
+-#endif
+-
+ #ifdef CONFIG_DEBUG_TCG
+ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
+     "%r0",  "%r1",  "%r2",  "%r3",  "%r4",  "%r5",  "%r6",  "%r7",
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
+         break;
+     case MO_UQ:
+-        /* Avoid ldrd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU
++        /* LDRD requires alignment; double-check that. */
++        if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+         } else if (datalo != addend) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
+         break;
+     case MO_UQ:
+-        /* Avoid ldrd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU
++        /* LDRD requires alignment; double-check that. */
++        if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
+         } else if (datalo == addrlo) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
+         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
+         break;
+     case MO_64:
+-        /* Avoid strd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU
++        /* STRD requires alignment; double-check that. */
++        if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
+         } else {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
+         break;
+     case MO_64:
+-        /* Avoid strd for user-only emulation, to handle unaligned.  */
+-        if (USING_SOFTMMU
++        /* STRD requires alignment; double-check that. */
++        if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
+         } else {
+--
+2.25.1

-New patch
+[PULL 21/34] tcg/arm: Support unaligned access for softmmu
+From armv6, the architecture supports unaligned accesses.
+All we need to do is perform the correct alignment check
+in tcg_out_tlb_read.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 41 ++++++++++++++++++++--------------------
+ 1 file changed, 21 insertions(+), 20 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+     int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
+                    : offsetof(CPUTLBEntry, addr_write));
+     int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+-    unsigned s_bits = opc & MO_SIZE;
+-    unsigned a_bits = get_alignment_bits(opc);
+-
+-    /*
+-     * We don't support inline unaligned acceses, but we can easily
+-     * support overalignment checks.
+-     */
+-    if (a_bits < s_bits) {
+-        a_bits = s_bits;
+-    }
++    unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
++    unsigned a_mask = (1 << get_alignment_bits(opc)) - 1;
++    TCGReg t_addr;
+     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
+     tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
+@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
+     /*
+      * Check alignment, check comparators.
+-     * Do this in no more than 3 insns.  Use MOVW for v7, if possible,
++     * Do this in 2-4 insns.  Use MOVW for v7, if possible,
+      * to reduce the number of sequential conditional instructions.
+      * Almost all guests have at least 4k pages, which means that we need
+      * to clear at least 9 bits even for an 8-byte memory, which means it
+      * isn't worth checking for an immediate operand for BIC.
++     *
++     * For unaligned accesses, test the page of the last unit of alignment.
++     * This leaves the least significant alignment bits unchanged, and of
++     * course must be zero.
+      */
++    t_addr = addrlo;
++    if (a_mask < s_mask) {
++        t_addr = TCG_REG_R0;
++        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr,
++                        addrlo, s_mask - a_mask);
++    }
+     if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) {
+-        tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
+-
+-        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, mask);
++        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask));
+         tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
+-                        addrlo, TCG_REG_TMP, 0);
++                        t_addr, TCG_REG_TMP, 0);
+         tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
+     } else {
+-        if (a_bits) {
+-            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
+-                            (1 << a_bits) - 1);
++        if (a_mask) {
++            tcg_debug_assert(a_mask <= 0xff);
++            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
+         }
+-        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
++        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr,
+                         SHIFT_IMM_LSR(TARGET_PAGE_BITS));
+-        tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
++        tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP,
+                         0, TCG_REG_R2, TCG_REG_TMP,
+                         SHIFT_IMM_LSL(TARGET_PAGE_BITS));
+     }
+--
+2.25.1

-New patch
+[PULL 22/34] tcg/arm: Reserve a register for guest_base
+Reserve a register for the guest_base using aarch64 for reference.
+By doing so, we do not have to recompute it for every memory load.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.c.inc | 39 ++++++++++++++++++++++++++++-----------
+ 1 file changed, 28 insertions(+), 11 deletions(-)
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[2] = {
+ #define TCG_REG_TMP  TCG_REG_R12
+ #define TCG_VEC_TMP  TCG_REG_Q15
++#ifndef CONFIG_SOFTMMU
++#define TCG_REG_GUEST_BASE  TCG_REG_R11
++#endif
+ typedef enum {
+     COND_EQ = 0x0,
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+                                   TCGReg datalo, TCGReg datahi,
+-                                  TCGReg addrlo, TCGReg addend)
++                                  TCGReg addrlo, TCGReg addend,
++                                  bool scratch_addend)
+ {
+     /* Byte swapping is left to middle-end expansion. */
+     tcg_debug_assert((opc & MO_BSWAP) == 0);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+         if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+-        } else if (datalo != addend) {
++        } else if (scratch_addend) {
+             tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
+             tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
+         } else {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+     label_ptr = s->code_ptr;
+     tcg_out_bl_imm(s, COND_NE, 0);
+-    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
++    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend, true);
+     add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                         s->code_ptr, label_ptr);
+ #else /* !CONFIG_SOFTMMU */
+     if (guest_base) {
+-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
+-        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, TCG_REG_TMP);
++        tcg_out_qemu_ld_index(s, opc, datalo, datahi,
++                              addrlo, TCG_REG_GUEST_BASE, false);
+     } else {
+         tcg_out_qemu_ld_direct(s, opc, datalo, datahi, addrlo);
+     }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
+                                   TCGReg datalo, TCGReg datahi,
+-                                  TCGReg addrlo, TCGReg addend)
++                                  TCGReg addrlo, TCGReg addend,
++                                  bool scratch_addend)
+ {
+     /* Byte swapping is left to middle-end expansion. */
+     tcg_debug_assert((opc & MO_BSWAP) == 0);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
+         if (get_alignment_bits(opc) >= MO_64
+             && (datalo & 1) == 0 && datahi == datalo + 1) {
+             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
+-        } else {
++        } else if (scratch_addend) {
+             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
+             tcg_out_st32_12(s, cond, datahi, addend, 4);
++        } else {
++            tcg_out_dat_reg(s, cond, ARITH_ADD, TCG_REG_TMP,
++                            addend, addrlo, SHIFT_IMM_LSL(0));
++            tcg_out_st32_12(s, cond, datalo, TCG_REG_TMP, 0);
++            tcg_out_st32_12(s, cond, datahi, TCG_REG_TMP, 4);
+         }
+         break;
+     default:
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+     mem_index = get_mmuidx(oi);
+     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 0);
+-    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
++    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi,
++                          addrlo, addend, true);
+     /* The conditional call must come last, as we're going to return here.  */
+     label_ptr = s->code_ptr;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+                         s->code_ptr, label_ptr);
+ #else /* !CONFIG_SOFTMMU */
+     if (guest_base) {
+-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
+-        tcg_out_qemu_st_index(s, COND_AL, opc, datalo,
+-                              datahi, addrlo, TCG_REG_TMP);
++        tcg_out_qemu_st_index(s, COND_AL, opc, datalo, datahi,
++                              addrlo, TCG_REG_GUEST_BASE, false);
+     } else {
+         tcg_out_qemu_st_direct(s, opc, datalo, datahi, addrlo);
+     }
+@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
+     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
++#ifndef CONFIG_SOFTMMU
++    if (guest_base) {
++        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
++        tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
++    }
++#endif
++
+     tcg_out_b_reg(s, COND_AL, tcg_target_call_iarg_regs[1]);
+     /*
+--
+2.25.1

-New patch
+[PULL 23/34] tcg/arm: Support raising sigbus for user-only
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/arm/tcg-target.h     |  2 -
+ tcg/arm/tcg-target.c.inc | 83 +++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 81 insertions(+), 4 deletions(-)
+diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.h
++++ b/tcg/arm/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
+ /* not defined -- call should be eliminated at compile time */
+ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #define TCG_TARGET_NEED_POOL_LABELS
+ #endif
+diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/arm/tcg-target.c.inc
++++ b/tcg/arm/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+  */
+ #include "elf.h"
++#include "../tcg-ldst.c.inc"
+ #include "../tcg-pool.c.inc"
+ int arm_arch = __ARM_ARCH;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
+ }
+ #ifdef CONFIG_SOFTMMU
+-#include "../tcg-ldst.c.inc"
+-
+ /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+  *                                     int mmu_idx, uintptr_t ra)
+  */
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & MO_SIZE]);
+     return true;
+ }
++#else
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
++                                   TCGReg addrhi, unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *label = new_ldst_label(s);
++
++    label->is_ld = is_ld;
++    label->addrlo_reg = addrlo;
++    label->addrhi_reg = addrhi;
++
++    /* We are expecting a_bits to max out at 7, and can easily support 8. */
++    tcg_debug_assert(a_mask <= 0xff);
++    /* tst addr, #mask */
++    tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
++
++    /* blne slow_path */
++    label->label_ptr[0] = s->code_ptr;
++    tcg_out_bl_imm(s, COND_NE, 0);
++
++    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    if (!reloc_pc24(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
++        return false;
++    }
++
++    if (TARGET_LONG_BITS == 64) {
++        /* 64-bit target address is aligned into R2:R3. */
++        if (l->addrhi_reg != TCG_REG_R2) {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
++        } else if (l->addrlo_reg != TCG_REG_R3) {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
++        } else {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
++        }
++    } else {
++        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
++    }
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_AREG0);
++
++    /*
++     * Tail call to the helper, with the return address back inline,
++     * just for the clarity of the debugging traceback -- the helper
++     * cannot return.  We have used BLNE to arrive here, so LR is
++     * already set.
++     */
++    tcg_out_goto(s, COND_AL, (const void *)
++                 (l->is_ld ? helper_unaligned_ld : helper_unaligned_st));
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
+ #endif /* SOFTMMU */
+ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+     int mem_index;
+     TCGReg addend;
+     tcg_insn_unit *label_ptr;
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+     add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                         s->code_ptr, label_ptr);
+ #else /* !CONFIG_SOFTMMU */
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
++    }
+     if (guest_base) {
+         tcg_out_qemu_ld_index(s, opc, datalo, datahi,
+                               addrlo, TCG_REG_GUEST_BASE, false);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+     int mem_index;
+     TCGReg addend;
+     tcg_insn_unit *label_ptr;
++#else
++    unsigned a_bits;
+ #endif
+     datalo = *args++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+     add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                         s->code_ptr, label_ptr);
+ #else /* !CONFIG_SOFTMMU */
++    a_bits = get_alignment_bits(opc);
++    if (a_bits) {
++        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
++    }
+     if (guest_base) {
+         tcg_out_qemu_st_index(s, COND_AL, opc, datalo, datahi,
+                               addrlo, TCG_REG_GUEST_BASE, false);
+--
+2.25.1

-New patch
+[PULL 24/34] tcg/mips: Support unaligned access for user-only
+This is kinda sorta the opposite of the other tcg hosts, where
+we get (normal) alignment checks for free with host SIGBUS and
+need to add code to support unaligned accesses.
+Fortunately, the ISA contains pairs of instructions that are
+used to implement unaligned memory accesses.  Use them.
+Tested-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
+Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/mips/tcg-target.h     |   2 -
+ tcg/mips/tcg-target.c.inc | 334 +++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 328 insertions(+), 8 deletions(-)
+diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/mips/tcg-target.h
++++ b/tcg/mips/tcg-target.h
+@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
+ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t)
+     QEMU_ERROR("code path is reachable");
+-#ifdef CONFIG_SOFTMMU
+ #define TCG_TARGET_NEED_LDST_LABELS
+-#endif
+ #endif
+diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/mips/tcg-target.c.inc
++++ b/tcg/mips/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@
+  * THE SOFTWARE.
+  */
++#include "../tcg-ldst.c.inc"
++
+ #ifdef HOST_WORDS_BIGENDIAN
+ # define MIPS_BE  1
+ #else
+@@ -XXX,XX +XXX,XX @@ typedef enum {
+     OPC_ORI      = 015 << 26,
+     OPC_XORI     = 016 << 26,
+     OPC_LUI      = 017 << 26,
++    OPC_BNEL     = 025 << 26,
++    OPC_BNEZALC_R6 = 030 << 26,
+     OPC_DADDIU   = 031 << 26,
++    OPC_LDL      = 032 << 26,
++    OPC_LDR      = 033 << 26,
+     OPC_LB       = 040 << 26,
+     OPC_LH       = 041 << 26,
++    OPC_LWL      = 042 << 26,
+     OPC_LW       = 043 << 26,
+     OPC_LBU      = 044 << 26,
+     OPC_LHU      = 045 << 26,
++    OPC_LWR      = 046 << 26,
+     OPC_LWU      = 047 << 26,
+     OPC_SB       = 050 << 26,
+     OPC_SH       = 051 << 26,
++    OPC_SWL      = 052 << 26,
+     OPC_SW       = 053 << 26,
++    OPC_SDL      = 054 << 26,
++    OPC_SDR      = 055 << 26,
++    OPC_SWR      = 056 << 26,
+     OPC_LD       = 067 << 26,
+     OPC_SD       = 077 << 26,
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
+ }
+ #if defined(CONFIG_SOFTMMU)
+-#include "../tcg-ldst.c.inc"
+-
+ static void * const qemu_ld_helpers[(MO_SSIZE | MO_BSWAP) + 1] = {
+     [MO_UB]   = helper_ret_ldub_mmu,
+     [MO_SB]   = helper_ret_ldsb_mmu,
+@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+     return true;
+ }
+-#endif
++
++#else
++
++static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
++                                   TCGReg addrhi, unsigned a_bits)
++{
++    unsigned a_mask = (1 << a_bits) - 1;
++    TCGLabelQemuLdst *l = new_ldst_label(s);
++
++    l->is_ld = is_ld;
++    l->addrlo_reg = addrlo;
++    l->addrhi_reg = addrhi;
++
++    /* We are expecting a_bits to max out at 7, much lower than ANDI. */
++    tcg_debug_assert(a_bits < 16);
++    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, addrlo, a_mask);
++
++    l->label_ptr[0] = s->code_ptr;
++    if (use_mips32r6_instructions) {
++        tcg_out_opc_br(s, OPC_BNEZALC_R6, TCG_REG_ZERO, TCG_TMP0);
++    } else {
++        tcg_out_opc_br(s, OPC_BNEL, TCG_TMP0, TCG_REG_ZERO);
++        tcg_out_nop(s);
++    }
++
++    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
++}
++
++static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    void *target;
++
++    if (!reloc_pc16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
++        return false;
++    }
++
++    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
++        /* A0 is env, A1 is skipped, A2:A3 is the uint64_t address. */
++        TCGReg a2 = MIPS_BE ? l->addrhi_reg : l->addrlo_reg;
++        TCGReg a3 = MIPS_BE ? l->addrlo_reg : l->addrhi_reg;
++
++        if (a3 != TCG_REG_A2) {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3);
++        } else if (a2 != TCG_REG_A3) {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2);
++        } else {
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A2);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, TCG_REG_A3);
++            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, TCG_TMP0);
++        }
++    } else {
++        tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
++    }
++    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
++
++    /*
++     * Tail call to the helper, with the return address back inline.
++     * We have arrived here via BNEL, so $31 is already set.
++     */
++    target = (l->is_ld ? helper_unaligned_ld : helper_unaligned_st);
++    tcg_out_call_int(s, target, true);
++    return true;
++}
++
++static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++
++static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
++{
++    return tcg_out_fail_alignment(s, l);
++}
++#endif /* SOFTMMU */
+ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
+                                    TCGReg base, MemOp opc, bool is_64)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
+     }
+ }
++static void __attribute__((unused))
++tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
++                                    TCGReg base, MemOp opc, bool is_64)
++{
++    const MIPSInsn lw1 = MIPS_BE ? OPC_LWL : OPC_LWR;
++    const MIPSInsn lw2 = MIPS_BE ? OPC_LWR : OPC_LWL;
++    const MIPSInsn ld1 = MIPS_BE ? OPC_LDL : OPC_LDR;
++    const MIPSInsn ld2 = MIPS_BE ? OPC_LDR : OPC_LDL;
++
++    bool sgn = (opc & MO_SIGN);
++
++    switch (opc & (MO_SSIZE | MO_BSWAP)) {
++    case MO_SW | MO_BE:
++    case MO_UW | MO_BE:
++        tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP0, base, 0);
++        tcg_out_opc_imm(s, OPC_LBU, lo, base, 1);
++        if (use_mips32r2_instructions) {
++            tcg_out_opc_bf(s, OPC_INS, lo, TCG_TMP0, 31, 8);
++        } else {
++            tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
++            tcg_out_opc_reg(s, OPC_OR, lo, TCG_TMP0, TCG_TMP1);
++        }
++        break;
++
++    case MO_SW | MO_LE:
++    case MO_UW | MO_LE:
++        if (use_mips32r2_instructions && lo != base) {
++            tcg_out_opc_imm(s, OPC_LBU, lo, base, 0);
++            tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP0, base, 1);
++            tcg_out_opc_bf(s, OPC_INS, lo, TCG_TMP0, 31, 8);
++        } else {
++            tcg_out_opc_imm(s, OPC_LBU, TCG_TMP0, base, 0);
++            tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP1, base, 1);
++            tcg_out_opc_sa(s, OPC_SLL, TCG_TMP1, TCG_TMP1, 8);
++            tcg_out_opc_reg(s, OPC_OR, lo, TCG_TMP0, TCG_TMP1);
++        }
++        break;
++
++    case MO_SL:
++    case MO_UL:
++        tcg_out_opc_imm(s, lw1, lo, base, 0);
++        tcg_out_opc_imm(s, lw2, lo, base, 3);
++        if (TCG_TARGET_REG_BITS == 64 && is_64 && !sgn) {
++            tcg_out_ext32u(s, lo, lo);
++        }
++        break;
++
++    case MO_UL | MO_BSWAP:
++    case MO_SL | MO_BSWAP:
++        if (use_mips32r2_instructions) {
++            tcg_out_opc_imm(s, lw1, lo, base, 0);
++            tcg_out_opc_imm(s, lw2, lo, base, 3);
++            tcg_out_bswap32(s, lo, lo,
++                            TCG_TARGET_REG_BITS == 64 && is_64
++                            ? (sgn ? TCG_BSWAP_OS : TCG_BSWAP_OZ) : 0);
++        } else {
++            const tcg_insn_unit *subr =
++                (TCG_TARGET_REG_BITS == 64 && is_64 && !sgn
++                 ? bswap32u_addr : bswap32_addr);
++
++            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0);
++            tcg_out_bswap_subr(s, subr);
++            /* delay slot */
++            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 3);
++            tcg_out_mov(s, is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32, lo, TCG_TMP3);
++        }
++        break;
++
++    case MO_UQ:
++        if (TCG_TARGET_REG_BITS == 64) {
++            tcg_out_opc_imm(s, ld1, lo, base, 0);
++            tcg_out_opc_imm(s, ld2, lo, base, 7);
++        } else {
++            tcg_out_opc_imm(s, lw1, MIPS_BE ? hi : lo, base, 0 + 0);
++            tcg_out_opc_imm(s, lw2, MIPS_BE ? hi : lo, base, 0 + 3);
++            tcg_out_opc_imm(s, lw1, MIPS_BE ? lo : hi, base, 4 + 0);
++            tcg_out_opc_imm(s, lw2, MIPS_BE ? lo : hi, base, 4 + 3);
++        }
++        break;
++
++    case MO_UQ | MO_BSWAP:
++        if (TCG_TARGET_REG_BITS == 64) {
++            if (use_mips32r2_instructions) {
++                tcg_out_opc_imm(s, ld1, lo, base, 0);
++                tcg_out_opc_imm(s, ld2, lo, base, 7);
++                tcg_out_bswap64(s, lo, lo);
++            } else {
++                tcg_out_opc_imm(s, ld1, TCG_TMP0, base, 0);
++                tcg_out_bswap_subr(s, bswap64_addr);
++                /* delay slot */
++                tcg_out_opc_imm(s, ld2, TCG_TMP0, base, 7);
++                tcg_out_mov(s, TCG_TYPE_I64, lo, TCG_TMP3);
++            }
++        } else if (use_mips32r2_instructions) {
++            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0 + 0);
++            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 0 + 3);
++            tcg_out_opc_imm(s, lw1, TCG_TMP1, base, 4 + 0);
++            tcg_out_opc_imm(s, lw2, TCG_TMP1, base, 4 + 3);
++            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, TCG_TMP0);
++            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, TCG_TMP1);
++            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? lo : hi, TCG_TMP0, 16);
++            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? hi : lo, TCG_TMP1, 16);
++        } else {
++            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0 + 0);
++            tcg_out_bswap_subr(s, bswap32_addr);
++            /* delay slot */
++            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 0 + 3);
++            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 4 + 0);
++            tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? lo : hi, TCG_TMP3);
++            tcg_out_bswap_subr(s, bswap32_addr);
++            /* delay slot */
++            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 4 + 3);
++            tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? hi : lo, TCG_TMP3);
++        }
++        break;
++
++    default:
++        g_assert_not_reached();
++    }
++}
++
+ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+ {
+     TCGReg addr_regl, addr_regh __attribute__((unused));
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[2];
++#else
++    unsigned a_bits, s_bits;
+ #endif
+     TCGReg base = TCG_REG_A0;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+     } else {
+         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
+     }
+-    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
++    a_bits = get_alignment_bits(opc);
++    s_bits = opc & MO_SIZE;
++    /*
++     * R6 removes the left/right instructions but requires the
++     * system to support misaligned memory accesses.
++     */
++    if (use_mips32r6_instructions) {
++        if (a_bits) {
++            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
++        }
++        tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
++    } else {
++        if (a_bits && a_bits != s_bits) {
++            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
++        }
++        if (a_bits >= s_bits) {
++            tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
++        } else {
++            tcg_out_qemu_ld_unalign(s, data_regl, data_regh, base, opc, is_64);
++        }
++    }
+ #endif
+ }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
+     }
+ }
++static void __attribute__((unused))
++tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
++                                    TCGReg base, MemOp opc)
++{
++    const MIPSInsn sw1 = MIPS_BE ? OPC_SWL : OPC_SWR;
++    const MIPSInsn sw2 = MIPS_BE ? OPC_SWR : OPC_SWL;
++    const MIPSInsn sd1 = MIPS_BE ? OPC_SDL : OPC_SDR;
++    const MIPSInsn sd2 = MIPS_BE ? OPC_SDR : OPC_SDL;
++
++    /* Don't clutter the code below with checks to avoid bswapping ZERO.  */
++    if ((lo | hi) == 0) {
++        opc &= ~MO_BSWAP;
++    }
++
++    switch (opc & (MO_SIZE | MO_BSWAP)) {
++    case MO_16 | MO_BE:
++        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, lo, 8);
++        tcg_out_opc_imm(s, OPC_SB, TCG_TMP0, base, 0);
++        tcg_out_opc_imm(s, OPC_SB, lo, base, 1);
++        break;
++
++    case MO_16 | MO_LE:
++        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, lo, 8);
++        tcg_out_opc_imm(s, OPC_SB, lo, base, 0);
++        tcg_out_opc_imm(s, OPC_SB, TCG_TMP0, base, 1);
++        break;
++
++    case MO_32 | MO_BSWAP:
++        tcg_out_bswap32(s, TCG_TMP3, lo, 0);
++        lo = TCG_TMP3;
++        /* fall through */
++    case MO_32:
++        tcg_out_opc_imm(s, sw1, lo, base, 0);
++        tcg_out_opc_imm(s, sw2, lo, base, 3);
++        break;
++
++    case MO_64 | MO_BSWAP:
++        if (TCG_TARGET_REG_BITS == 64) {
++            tcg_out_bswap64(s, TCG_TMP3, lo);
++            lo = TCG_TMP3;
++        } else if (use_mips32r2_instructions) {
++            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, MIPS_BE ? hi : lo);
++            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, MIPS_BE ? lo : hi);
++            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP0, TCG_TMP0, 16);
++            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP1, TCG_TMP1, 16);
++            hi = MIPS_BE ? TCG_TMP0 : TCG_TMP1;
++            lo = MIPS_BE ? TCG_TMP1 : TCG_TMP0;
++        } else {
++            tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? lo : hi, 0);
++            tcg_out_opc_imm(s, sw1, TCG_TMP3, base, 0 + 0);
++            tcg_out_opc_imm(s, sw2, TCG_TMP3, base, 0 + 3);
++            tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? hi : lo, 0);
++            tcg_out_opc_imm(s, sw1, TCG_TMP3, base, 4 + 0);
++            tcg_out_opc_imm(s, sw2, TCG_TMP3, base, 4 + 3);
++            break;
++        }
++        /* fall through */
++    case MO_64:
++        if (TCG_TARGET_REG_BITS == 64) {
++            tcg_out_opc_imm(s, sd1, lo, base, 0);
++            tcg_out_opc_imm(s, sd2, lo, base, 7);
++        } else {
++            tcg_out_opc_imm(s, sw1, MIPS_BE ? hi : lo, base, 0 + 0);
++            tcg_out_opc_imm(s, sw2, MIPS_BE ? hi : lo, base, 0 + 3);
++            tcg_out_opc_imm(s, sw1, MIPS_BE ? lo : hi, base, 4 + 0);
++            tcg_out_opc_imm(s, sw2, MIPS_BE ? lo : hi, base, 4 + 3);
++        }
++        break;
++
++    default:
++        tcg_abort();
++    }
++}
+ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+ {
+     TCGReg addr_regl, addr_regh __attribute__((unused));
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+     MemOp opc;
+ #if defined(CONFIG_SOFTMMU)
+     tcg_insn_unit *label_ptr[2];
++#else
++    unsigned a_bits, s_bits;
+ #endif
+     TCGReg base = TCG_REG_A0;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+                         data_regl, data_regh, addr_regl, addr_regh,
+                         s->code_ptr, label_ptr);
+ #else
+-    base = TCG_REG_A0;
+     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+         tcg_out_ext32u(s, base, addr_regl);
+         addr_regl = base;
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+     } else {
+         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
+     }
+-    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
++    a_bits = get_alignment_bits(opc);
++    s_bits = opc & MO_SIZE;
++    /*
++     * R6 removes the left/right instructions but requires the
++     * system to support misaligned memory accesses.
++     */
++    if (use_mips32r6_instructions) {
++        if (a_bits) {
++            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
++        }
++        tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
++    } else {
++        if (a_bits && a_bits != s_bits) {
++            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
++        }
++        if (a_bits >= s_bits) {
++            tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
++        } else {
++            tcg_out_qemu_st_unalign(s, data_regl, data_regh, base, opc);
++        }
++    }
+ #endif
+ }
+--
+2.25.1

-New patch
+[PULL 25/34] tcg/mips: Support unaligned access for softmmu
+We can use the routines just added for user-only to emit
 unaligned accesses in softmmu mode too.
 Tested-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
 Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/mips/tcg-target.c.inc | 91 ++++++++++++++++++++++-----------------
  1 file changed, 51 insertions(+), 40 deletions(-)
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
                               tcg_insn_unit *label_ptr[2], bool is_load)
  {
      MemOp opc = get_memop(oi);
 -    unsigned s_bits = opc & MO_SIZE;
      unsigned a_bits = get_alignment_bits(opc);
 +    unsigned s_bits = opc & MO_SIZE;
 +    unsigned a_mask = (1 << a_bits) - 1;
 +    unsigned s_mask = (1 << s_bits) - 1;
      int mem_index = get_mmuidx(oi);
      int fast_off = TLB_MASK_TABLE_OFS(mem_index);
      int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
      int add_off = offsetof(CPUTLBEntry, addend);
      int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
                     : offsetof(CPUTLBEntry, addr_write));
 -    target_ulong mask;
 +    target_ulong tlb_mask;
      /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
      /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3.  */
      tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
 -    /* We don't currently support unaligned accesses.
 -       We could do so with mips32r6.  */
 -    if (a_bits < s_bits) {
 -        a_bits = s_bits;
 -    }
 -
 -    /* Mask the page bits, keeping the alignment bits to compare against.  */
 -    mask = (target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
 -
      /* Load the (low-half) tlb comparator.  */
      if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
 -        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
 -        tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, mask);
 +        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
      } else {
          tcg_out_ldst(s, (TARGET_LONG_BITS == 64 ? OPC_LD
                           : TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
                       TCG_TMP0, TCG_TMP3, cmp_off);
 -        tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, mask);
 -        /* No second compare is required here;
 -           load the tlb addend for the fast path.  */
 -        tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
      }
      /* Zero extend a 32-bit guest address for a 64-bit host. */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
          tcg_out_ext32u(s, base, addrl);
          addrl = base;
      }
 -    tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
 +
 +    /*
 +     * Mask the page bits, keeping the alignment bits to compare against.
 +     * For unaligned accesses, compare against the end of the access to
 +     * verify that it does not cross a page boundary.
 +     */
 +    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
 +    tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, tlb_mask);
 +    if (a_mask >= s_mask) {
 +        tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
 +    } else {
 +        tcg_out_opc_imm(s, ALIAS_PADDI, TCG_TMP2, addrl, s_mask - a_mask);
 +        tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2);
 +    }
 +
 +    if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
 +        /* Load the tlb addend for the fast path.  */
 +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
 +    }
      label_ptr[0] = s->code_ptr;
      tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
      /* Load and test the high half tlb comparator.  */
      if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
          /* delay slot */
 -        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
 +        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
          /* Load the tlb addend for the fast path.  */
          tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
      }
  }
 -static void __attribute__((unused))
 -tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
 +static void tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
                                      TCGReg base, MemOp opc, bool is_64)
  {
      const MIPSInsn lw1 = MIPS_BE ? OPC_LWL : OPC_LWR;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
  #if defined(CONFIG_SOFTMMU)
      tcg_insn_unit *label_ptr[2];
  #else
 -    unsigned a_bits, s_bits;
  #endif
 +    unsigned a_bits, s_bits;
      TCGReg base = TCG_REG_A0;
      data_regl = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
      addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
      oi = *args++;
      opc = get_memop(oi);
 +    a_bits = get_alignment_bits(opc);
 +    s_bits = opc & MO_SIZE;
 +    /*
 +     * R6 removes the left/right instructions but requires the
 +     * system to support misaligned memory accesses.
 +     */
  #if defined(CONFIG_SOFTMMU)
      tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 1);
 -    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
 +    if (use_mips32r6_instructions || a_bits >= s_bits) {
 +        tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
 +    } else {
 +        tcg_out_qemu_ld_unalign(s, data_regl, data_regh, base, opc, is_64);
 +    }
      add_qemu_ldst_label(s, 1, oi,
                          (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                          data_regl, data_regh, addr_regl, addr_regh,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
      } else {
          tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
      }
 -    a_bits = get_alignment_bits(opc);
 -    s_bits = opc & MO_SIZE;
 -    /*
 -     * R6 removes the left/right instructions but requires the
 -     * system to support misaligned memory accesses.
 -     */
      if (use_mips32r6_instructions) {
          if (a_bits) {
              tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
      }
  }
 -static void __attribute__((unused))
 -tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
 +static void tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
                                      TCGReg base, MemOp opc)
  {
      const MIPSInsn sw1 = MIPS_BE ? OPC_SWL : OPC_SWR;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
      MemOp opc;
  #if defined(CONFIG_SOFTMMU)
      tcg_insn_unit *label_ptr[2];
 -#else
 -    unsigned a_bits, s_bits;
  #endif
 +    unsigned a_bits, s_bits;
      TCGReg base = TCG_REG_A0;
      data_regl = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
      addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
      oi = *args++;
      opc = get_memop(oi);
 +    a_bits = get_alignment_bits(opc);
 +    s_bits = opc & MO_SIZE;
 +    /*
 +     * R6 removes the left/right instructions but requires the
 +     * system to support misaligned memory accesses.
 +     */
  #if defined(CONFIG_SOFTMMU)
      tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 0);
 -    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
 +    if (use_mips32r6_instructions || a_bits >= s_bits) {
 +        tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
 +    } else {
 +        tcg_out_qemu_st_unalign(s, data_regl, data_regh, base, opc);
 +    }
      add_qemu_ldst_label(s, 0, oi,
                          (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                          data_regl, data_regh, addr_regl, addr_regh,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
      } else {
          tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
      }
 -    a_bits = get_alignment_bits(opc);
 -    s_bits = opc & MO_SIZE;
 -    /*
 -     * R6 removes the left/right instructions but requires the
 -     * system to support misaligned memory accesses.
 -     */
      if (use_mips32r6_instructions) {
          if (a_bits) {
              tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
 --
 .25.1

-New patch
+[PULL 26/34] tcg/sparc: Use tcg_out_movi_imm13 in tcg_out_addsub2_i64
+When BH is constant, it is constrained to 11 bits for use in MOVCC.
+For the cases in which we must load the constant BH into a register,
+we do not need the full logic of tcg_out_movi; we can use the simpler
+function for emitting a 13 bit constant.
+This eliminates the only case in which TCG_REG_T2 was passed to
+tcg_out_movi, which will shortly become invalid.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 10 +++++++---
+file changed, 7 insertions(+), 3 deletions(-)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
+     if (use_vis3_instructions && !is_sub) {
+         /* Note that ADDXC doesn't accept immediates.  */
+         if (bhconst && bh != 0) {
+-           tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh);
++           tcg_out_movi_imm13(s, TCG_REG_T2, bh);
+            bh = TCG_REG_T2;
+         }
+         tcg_out_arith(s, rh, ah, bh, ARITH_ADDXC);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
+         tcg_out_movcc(s, TCG_COND_GEU, MOVCC_XCC, rh, ah, 0);
+     }
+     } else {
+-        /* Otherwise adjust BH as if there is carry into T2 ... */
++        /*
++         * Otherwise adjust BH as if there is carry into T2.
++         * Note that constant BH is constrained to 11 bits for the MOVCC,
++         * so the adjustment fits 12 bits.
++         */
+         if (bhconst) {
+-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh + (is_sub ? -1 : 1));
++            tcg_out_movi_imm13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1));
+         } else {
+             tcg_out_arithi(s, TCG_REG_T2, bh, 1,
+                            is_sub ? ARITH_SUB : ARITH_ADD);
+--
+.25.1

-New patch
+[PULL 27/34] tcg/sparc: Split out tcg_out_movi_imm32
+Handle 32-bit constants with a separate function, so that
+tcg_out_movi_int does not need to recurse.  This slightly
+rearranges the order of tests for small constants, but
+produces the same output.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 36 +++++++++++++++++++++---------------
+file changed, 21 insertions(+), 15 deletions(-)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_imm13(TCGContext *s, TCGReg ret, int32_t arg)
+     tcg_out_arithi(s, ret, TCG_REG_G0, arg, ARITH_OR);
+ }
++static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, int32_t arg)
++{
++    if (check_fit_i32(arg, 13)) {
++        /* A 13-bit constant sign-extended to 64-bits.  */
++        tcg_out_movi_imm13(s, ret, arg);
++    } else {
++        /* A 32-bit constant zero-extended to 64 bits.  */
++        tcg_out_sethi(s, ret, arg);
++        if (arg & 0x3ff) {
++            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
++        }
++    }
++}
++
+ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+                              tcg_target_long arg, bool in_prologue)
+ {
+     tcg_target_long hi, lo = (int32_t)arg;
+     tcg_target_long test, lsb;
+-    /* Make sure we test 32-bit constants for imm13 properly.  */
+-    if (type == TCG_TYPE_I32) {
+-        arg = lo;
++    /* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
++    if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
++        tcg_out_movi_imm32(s, ret, arg);
++        return;
+     }
+     /* A 13-bit constant sign-extended to 64-bits.  */
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+         }
+     }
+-    /* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
+-    if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
+-        tcg_out_sethi(s, ret, arg);
+-        if (arg & 0x3ff) {
+-            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
+-        }
+-        return;
+-    }
+-
+     /* A 32-bit constant sign-extended to 64-bits.  */
+     if (arg == lo) {
+         tcg_out_sethi(s, ret, ~arg);
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
+     if (check_fit_i32(lo, 13)) {
+         hi = (arg - lo) >> 32;
+-        tcg_out_movi(s, TCG_TYPE_I32, ret, hi);
++        tcg_out_movi_imm32(s, ret, hi);
+         tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
+         tcg_out_arithi(s, ret, ret, lo, ARITH_ADD);
+     } else {
+         hi = arg >> 32;
+-        tcg_out_movi(s, TCG_TYPE_I32, ret, hi);
+-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T2, lo);
++        tcg_out_movi_imm32(s, ret, hi);
++        tcg_out_movi_imm32(s, TCG_REG_T2, lo);
+         tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
+         tcg_out_arith(s, ret, ret, TCG_REG_T2, ARITH_OR);
+     }
+--
+.25.1

-[PULL 1/3] accel/tcg: split CpusAccel into three TCG variants
+[PULL 28/34] tcg/sparc: Add scratch argument to tcg_out_movi_int
-From: Claudio Fontana <cfontana@suse.de>
+This will allow us to control exactly what scratch register is
 used for loading the constant.
-split up the CpusAccel tcg_cpus into three TCG variants:
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 tcg_cpus_rr (single threaded, round robin cpus)
 tcg_cpus_icount (same as rr, but with instruction counting enabled)
 tcg_cpus_mttcg (multi-threaded cpus)
 Suggested-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20201015143217.29337-2-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/tcg-cpus-icount.h |  17 ++
+ tcg/sparc/tcg-target.c.inc | 15 +++++++++------
- accel/tcg/tcg-cpus-mttcg.h  |  21 ++
+file changed, 9 insertions(+), 6 deletions(-)
  accel/tcg/tcg-cpus-rr.h     |  20 ++
  accel/tcg/tcg-cpus.h        |  13 +-
  accel/tcg/tcg-all.c         |   8 +-
  accel/tcg/tcg-cpus-icount.c | 147 +++++++++++
  accel/tcg/tcg-cpus-mttcg.c  | 117 +++++++++
  accel/tcg/tcg-cpus-rr.c     | 270 ++++++++++++++++++++
  accel/tcg/tcg-cpus.c        | 484 ++----------------------------------
  softmmu/icount.c            |   2 +-
  accel/tcg/meson.build       |   9 +-
 files changed, 646 insertions(+), 462 deletions(-)
  create mode 100644 accel/tcg/tcg-cpus-icount.h
  create mode 100644 accel/tcg/tcg-cpus-mttcg.h
  create mode 100644 accel/tcg/tcg-cpus-rr.h
  create mode 100644 accel/tcg/tcg-cpus-icount.c
  create mode 100644 accel/tcg/tcg-cpus-mttcg.c
  create mode 100644 accel/tcg/tcg-cpus-rr.c
-diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-cpus-icount.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Single Threaded vCPUs implementation using instruction counting
 + *
 + * Copyright 2020 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef TCG_CPUS_ICOUNT_H
 +#define TCG_CPUS_ICOUNT_H
 +
 +void handle_icount_deadline(void);
 +void prepare_icount_for_run(CPUState *cpu);
 +void process_icount_data(CPUState *cpu);
 +
 +#endif /* TCG_CPUS_ICOUNT_H */
 diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-cpus-mttcg.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Multi Threaded vCPUs implementation
 + *
 + * Copyright 2020 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef TCG_CPUS_MTTCG_H
 +#define TCG_CPUS_MTTCG_H
 +
 +/*
 + * In the multi-threaded case each vCPU has its own thread. The TLS
 + * variable current_cpu can be used deep in the code to find the
 + * current CPUState for a given thread.
 + */
 +
 +void *tcg_cpu_thread_fn(void *arg);
 +
 +#endif /* TCG_CPUS_MTTCG_H */
 diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Single Threaded vCPUs implementation
 + *
 + * Copyright 2020 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef TCG_CPUS_RR_H
 +#define TCG_CPUS_RR_H
 +
 +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 +
 +/* Kick all RR vCPUs. */
 +void qemu_cpu_kick_rr_cpus(CPUState *unused);
 +
 +void *tcg_rr_cpu_thread_fn(void *arg);
 +
 +#endif /* TCG_CPUS_RR_H */
 diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-cpus.h
+--- a/tcg/sparc/tcg-target.c.inc
-+++ b/accel/tcg/tcg-cpus.h
++++ b/tcg/sparc/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, int32_t arg)
  /*
 - * Accelerator CPUS Interface
 + * QEMU TCG vCPU common functionality
 + *
 + * Functionality common to all TCG vcpu variants: mttcg, rr and icount.
   *
   * Copyright 2020 SUSE LLC
   *
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
 -extern const CpusAccel tcg_cpus;
 +extern const CpusAccel tcg_cpus_mttcg;
 +extern const CpusAccel tcg_cpus_icount;
 +extern const CpusAccel tcg_cpus_rr;
 +
 +void tcg_start_vcpu_thread(CPUState *cpu);
 +void qemu_tcg_destroy_vcpu(CPUState *cpu);
 +int tcg_cpu_exec(CPUState *cpu);
 +void tcg_handle_interrupt(CPUState *cpu, int mask);
  #endif /* TCG_CPUS_H */
 diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-all.c
 +++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
      tcg_exec_init(s->tb_size * 1024 * 1024);
      mttcg_enabled = s->mttcg_enabled;
 -    cpus_register_accel(&tcg_cpus);
 +    if (mttcg_enabled) {
 +        cpus_register_accel(&tcg_cpus_mttcg);
 +    } else if (icount_enabled()) {
 +        cpus_register_accel(&tcg_cpus_icount);
 +    } else {
 +        cpus_register_accel(&tcg_cpus_rr);
 +    }
      return 0;
  }
-diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
+ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
-new file mode 100644
+-                             tcg_target_long arg, bool in_prologue)
-index XXXXXXX..XXXXXXX
++                             tcg_target_long arg, bool in_prologue,
---- /dev/null
++                             TCGReg scratch)
 +++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Single Threaded vCPUs implementation using instruction counting
 + *
 + * Copyright (c) 2003-2008 Fabrice Bellard
 + * Copyright (c) 2014 Red Hat Inc.
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu-common.h"
 +#include "sysemu/tcg.h"
 +#include "sysemu/replay.h"
 +#include "qemu/main-loop.h"
 +#include "qemu/guest-random.h"
 +#include "exec/exec-all.h"
 +#include "hw/boards.h"
 +
 +#include "tcg-cpus.h"
 +#include "tcg-cpus-icount.h"
 +#include "tcg-cpus-rr.h"
 +
 +static int64_t tcg_get_icount_limit(void)
 +{
 +    int64_t deadline;
 +
 +    if (replay_mode != REPLAY_MODE_PLAY) {
 +        /*
 +         * Include all the timers, because they may need attention.
 +         * Too long CPU execution may create unnecessary delay in UI.
 +         */
 +        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 +                                              QEMU_TIMER_ATTR_ALL);
 +        /* Check realtime timers, because they help with input processing */
 +        deadline = qemu_soonest_timeout(deadline,
 +                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
 +                                           QEMU_TIMER_ATTR_ALL));
 +
 +        /*
 +         * Maintain prior (possibly buggy) behaviour where if no deadline
 +         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
 +         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
 +         * nanoseconds.
 +         */
 +        if ((deadline < 0) || (deadline > INT32_MAX)) {
 +            deadline = INT32_MAX;
 +        }
 +
 +        return icount_round(deadline);
 +    } else {
 +        return replay_get_instructions();
 +    }
 +}
 +
 +static void notify_aio_contexts(void)
 +{
 +    /* Wake up other AioContexts.  */
 +    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 +    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 +}
 +
 +void handle_icount_deadline(void)
 +{
 +    assert(qemu_in_vcpu_thread());
 +    int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 +                                                  QEMU_TIMER_ATTR_ALL);
 +
 +    if (deadline == 0) {
 +        notify_aio_contexts();
 +    }
 +}
 +
 +void prepare_icount_for_run(CPUState *cpu)
 +{
 +    int insns_left;
 +
 +    /*
 +     * These should always be cleared by process_icount_data after
 +     * each vCPU execution. However u16.high can be raised
 +     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
 +     */
 +    g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
 +    g_assert(cpu->icount_extra == 0);
 +
 +    cpu->icount_budget = tcg_get_icount_limit();
 +    insns_left = MIN(0xffff, cpu->icount_budget);
 +    cpu_neg(cpu)->icount_decr.u16.low = insns_left;
 +    cpu->icount_extra = cpu->icount_budget - insns_left;
 +
 +    replay_mutex_lock();
 +
 +    if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
 +        notify_aio_contexts();
 +    }
 +}
 +
 +void process_icount_data(CPUState *cpu)
 +{
 +    /* Account for executed instructions */
 +    icount_update(cpu);
 +
 +    /* Reset the counters */
 +    cpu_neg(cpu)->icount_decr.u16.low = 0;
 +    cpu->icount_extra = 0;
 +    cpu->icount_budget = 0;
 +
 +    replay_account_executed_instructions();
 +
 +    replay_mutex_unlock();
 +}
 +
 +static void icount_handle_interrupt(CPUState *cpu, int mask)
 +{
 +    int old_mask = cpu->interrupt_request;
 +
 +    tcg_handle_interrupt(cpu, mask);
 +    if (qemu_cpu_is_self(cpu) &&
 +        !cpu->can_do_io
 +        && (mask & ~old_mask) != 0) {
 +        cpu_abort(cpu, "Raised interrupt while not in I/O function");
 +    }
 +}
 +
 +const CpusAccel tcg_cpus_icount = {
 +    .create_vcpu_thread = tcg_start_vcpu_thread,
 +    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 +
 +    .handle_interrupt = icount_handle_interrupt,
 +    .get_virtual_clock = icount_get,
 +    .get_elapsed_ticks = icount_get,
 +};
 diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Multi Threaded vCPUs implementation
 + *
 + * Copyright (c) 2003-2008 Fabrice Bellard
 + * Copyright (c) 2014 Red Hat Inc.
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu-common.h"
 +#include "sysemu/tcg.h"
 +#include "sysemu/replay.h"
 +#include "qemu/main-loop.h"
 +#include "qemu/guest-random.h"
 +#include "exec/exec-all.h"
 +#include "hw/boards.h"
 +
 +#include "tcg-cpus.h"
 +#include "tcg-cpus-mttcg.h"
 +
 +/*
 + * In the multi-threaded case each vCPU has its own thread. The TLS
 + * variable current_cpu can be used deep in the code to find the
 + * current CPUState for a given thread.
 + */
 +
 +void *tcg_cpu_thread_fn(void *arg)
 +{
 +    CPUState *cpu = arg;
 +
 +    assert(tcg_enabled());
 +    g_assert(!icount_enabled());
 +
 +    rcu_register_thread();
 +    tcg_register_thread();
 +
 +    qemu_mutex_lock_iothread();
 +    qemu_thread_get_self(cpu->thread);
 +
 +    cpu->thread_id = qemu_get_thread_id();
 +    cpu->can_do_io = 1;
 +    current_cpu = cpu;
 +    cpu_thread_signal_created(cpu);
 +    qemu_guest_random_seed_thread_part2(cpu->random_seed);
 +
 +    /* process any pending work */
 +    cpu->exit_request = 1;
 +
 +    do {
 +        if (cpu_can_run(cpu)) {
 +            int r;
 +            qemu_mutex_unlock_iothread();
 +            r = tcg_cpu_exec(cpu);
 +            qemu_mutex_lock_iothread();
 +            switch (r) {
 +            case EXCP_DEBUG:
 +                cpu_handle_guest_debug(cpu);
 +                break;
 +            case EXCP_HALTED:
 +                /*
 +                 * during start-up the vCPU is reset and the thread is
 +                 * kicked several times. If we don't ensure we go back
 +                 * to sleep in the halted state we won't cleanly
 +                 * start-up when the vCPU is enabled.
 +                 *
 +                 * cpu->halted should ensure we sleep in wait_io_event
 +                 */
 +                g_assert(cpu->halted);
 +                break;
 +            case EXCP_ATOMIC:
 +                qemu_mutex_unlock_iothread();
 +                cpu_exec_step_atomic(cpu);
 +                qemu_mutex_lock_iothread();
 +            default:
 +                /* Ignore everything else? */
 +                break;
 +            }
 +        }
 +
 +        qatomic_mb_set(&cpu->exit_request, 0);
 +        qemu_wait_io_event(cpu);
 +    } while (!cpu->unplug || cpu_can_run(cpu));
 +
 +    qemu_tcg_destroy_vcpu(cpu);
 +    qemu_mutex_unlock_iothread();
 +    rcu_unregister_thread();
 +    return NULL;
 +}
 +
 +static void mttcg_kick_vcpu_thread(CPUState *cpu)
 +{
 +    cpu_exit(cpu);
 +}
 +
 +const CpusAccel tcg_cpus_mttcg = {
 +    .create_vcpu_thread = tcg_start_vcpu_thread,
 +    .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 +
 +    .handle_interrupt = tcg_handle_interrupt,
 +};
 diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Single Threaded vCPUs implementation
 + *
 + * Copyright (c) 2003-2008 Fabrice Bellard
 + * Copyright (c) 2014 Red Hat Inc.
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu-common.h"
 +#include "sysemu/tcg.h"
 +#include "sysemu/replay.h"
 +#include "qemu/main-loop.h"
 +#include "qemu/guest-random.h"
 +#include "exec/exec-all.h"
 +#include "hw/boards.h"
 +
 +#include "tcg-cpus.h"
 +#include "tcg-cpus-rr.h"
 +#include "tcg-cpus-icount.h"
 +
 +/* Kick all RR vCPUs */
 +void qemu_cpu_kick_rr_cpus(CPUState *unused)
 +{
 +    CPUState *cpu;
 +
 +    CPU_FOREACH(cpu) {
 +        cpu_exit(cpu);
 +    };
 +}
 +
 +/*
 + * TCG vCPU kick timer
 + *
 + * The kick timer is responsible for moving single threaded vCPU
 + * emulation on to the next vCPU. If more than one vCPU is running a
 + * timer event will force a cpu->exit so the next vCPU can get
 + * scheduled.
 + *
 + * The timer is removed if all vCPUs are idle and restarted again once
 + * idleness is complete.
 + */
 +
 +static QEMUTimer *tcg_kick_vcpu_timer;
 +static CPUState *tcg_current_rr_cpu;
 +
 +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 +
 +static inline int64_t qemu_tcg_next_kick(void)
 +{
 +    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 +}
 +
 +/* Kick the currently round-robin scheduled vCPU to next */
 +static void qemu_cpu_kick_rr_next_cpu(void)
 +{
 +    CPUState *cpu;
 +    do {
 +        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
 +        if (cpu) {
 +            cpu_exit(cpu);
 +        }
 +    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
 +}
 +
 +static void kick_tcg_thread(void *opaque)
 +{
 +    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 +    qemu_cpu_kick_rr_next_cpu();
 +}
 +
 +static void start_tcg_kick_timer(void)
 +{
 +    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 +        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 +                                           kick_tcg_thread, NULL);
 +    }
 +    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 +        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 +    }
 +}
 +
 +static void stop_tcg_kick_timer(void)
 +{
 +    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 +        timer_del(tcg_kick_vcpu_timer);
 +    }
 +}
 +
 +static void qemu_tcg_rr_wait_io_event(void)
 +{
 +    CPUState *cpu;
 +
 +    while (all_cpu_threads_idle()) {
 +        stop_tcg_kick_timer();
 +        qemu_cond_wait_iothread(first_cpu->halt_cond);
 +    }
 +
 +    start_tcg_kick_timer();
 +
 +    CPU_FOREACH(cpu) {
 +        qemu_wait_io_event_common(cpu);
 +    }
 +}
 +
 +/*
 + * Destroy any remaining vCPUs which have been unplugged and have
 + * finished running
 + */
 +static void deal_with_unplugged_cpus(void)
 +{
 +    CPUState *cpu;
 +
 +    CPU_FOREACH(cpu) {
 +        if (cpu->unplug && !cpu_can_run(cpu)) {
 +            qemu_tcg_destroy_vcpu(cpu);
 +            break;
 +        }
 +    }
 +}
 +
 +/*
 + * In the single-threaded case each vCPU is simulated in turn. If
 + * there is more than a single vCPU we create a simple timer to kick
 + * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 + * This is done explicitly rather than relying on side-effects
 + * elsewhere.
 + */
 +
 +void *tcg_rr_cpu_thread_fn(void *arg)
 +{
 +    CPUState *cpu = arg;
 +
 +    assert(tcg_enabled());
 +    rcu_register_thread();
 +    tcg_register_thread();
 +
 +    qemu_mutex_lock_iothread();
 +    qemu_thread_get_self(cpu->thread);
 +
 +    cpu->thread_id = qemu_get_thread_id();
 +    cpu->can_do_io = 1;
 +    cpu_thread_signal_created(cpu);
 +    qemu_guest_random_seed_thread_part2(cpu->random_seed);
 +
 +    /* wait for initial kick-off after machine start */
 +    while (first_cpu->stopped) {
 +        qemu_cond_wait_iothread(first_cpu->halt_cond);
 +
 +        /* process any pending work */
 +        CPU_FOREACH(cpu) {
 +            current_cpu = cpu;
 +            qemu_wait_io_event_common(cpu);
 +        }
 +    }
 +
 +    start_tcg_kick_timer();
 +
 +    cpu = first_cpu;
 +
 +    /* process any pending work */
 +    cpu->exit_request = 1;
 +
 +    while (1) {
 +        qemu_mutex_unlock_iothread();
 +        replay_mutex_lock();
 +        qemu_mutex_lock_iothread();
 +
 +        if (icount_enabled()) {
 +            /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
 +            icount_account_warp_timer();
 +            /*
 +             * Run the timers here.  This is much more efficient than
 +             * waking up the I/O thread and waiting for completion.
 +             */
 +            handle_icount_deadline();
 +        }
 +
 +        replay_mutex_unlock();
 +
 +        if (!cpu) {
 +            cpu = first_cpu;
 +        }
 +
 +        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
 +
 +            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
 +            current_cpu = cpu;
 +
 +            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
 +                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
 +
 +            if (cpu_can_run(cpu)) {
 +                int r;
 +
 +                qemu_mutex_unlock_iothread();
 +                if (icount_enabled()) {
 +                    prepare_icount_for_run(cpu);
 +                }
 +                r = tcg_cpu_exec(cpu);
 +                if (icount_enabled()) {
 +                    process_icount_data(cpu);
 +                }
 +                qemu_mutex_lock_iothread();
 +
 +                if (r == EXCP_DEBUG) {
 +                    cpu_handle_guest_debug(cpu);
 +                    break;
 +                } else if (r == EXCP_ATOMIC) {
 +                    qemu_mutex_unlock_iothread();
 +                    cpu_exec_step_atomic(cpu);
 +                    qemu_mutex_lock_iothread();
 +                    break;
 +                }
 +            } else if (cpu->stop) {
 +                if (cpu->unplug) {
 +                    cpu = CPU_NEXT(cpu);
 +                }
 +                break;
 +            }
 +
 +            cpu = CPU_NEXT(cpu);
 +        } /* while (cpu && !cpu->exit_request).. */
 +
 +        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
 +        qatomic_set(&tcg_current_rr_cpu, NULL);
 +
 +        if (cpu && cpu->exit_request) {
 +            qatomic_mb_set(&cpu->exit_request, 0);
 +        }
 +
 +        if (icount_enabled() && all_cpu_threads_idle()) {
 +            /*
 +             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
 +             * in the main_loop, wake it up in order to start the warp timer.
 +             */
 +            qemu_notify_event();
 +        }
 +
 +        qemu_tcg_rr_wait_io_event();
 +        deal_with_unplugged_cpus();
 +    }
 +
 +    rcu_unregister_thread();
 +    return NULL;
 +}
 +
 +const CpusAccel tcg_cpus_rr = {
 +    .create_vcpu_thread = tcg_start_vcpu_thread,
 +    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 +
 +    .handle_interrupt = tcg_handle_interrupt,
 +};
 diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus.c
 +++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
  /*
 - * QEMU System Emulator
 + * QEMU TCG vCPU common functionality
 + *
 + * Functionality common to all TCG vCPU variants: mttcg, rr and icount.
   *
   * Copyright (c) 2003-2008 Fabrice Bellard
   * Copyright (c) 2014 Red Hat Inc.
@@ -XXX,XX +XXX,XX @@
  #include "hw/boards.h"
  #include "tcg-cpus.h"
 +#include "tcg-cpus-mttcg.h"
 +#include "tcg-cpus-rr.h"
 -/* Kick all RR vCPUs */
 -static void qemu_cpu_kick_rr_cpus(void)
 -{
 -    CPUState *cpu;
 +/* common functionality among all TCG variants */
 -    CPU_FOREACH(cpu) {
 -        cpu_exit(cpu);
 -    };
 -}
 -
 -static void tcg_kick_vcpu_thread(CPUState *cpu)
 -{
 -    if (qemu_tcg_mttcg_enabled()) {
 -        cpu_exit(cpu);
 -    } else {
 -        qemu_cpu_kick_rr_cpus();
 -    }
 -}
 -
 -/*
 - * TCG vCPU kick timer
 - *
 - * The kick timer is responsible for moving single threaded vCPU
 - * emulation on to the next vCPU. If more than one vCPU is running a
 - * timer event with force a cpu->exit so the next vCPU can get
 - * scheduled.
 - *
 - * The timer is removed if all vCPUs are idle and restarted again once
 - * idleness is complete.
 - */
 -
 -static QEMUTimer *tcg_kick_vcpu_timer;
 -static CPUState *tcg_current_rr_cpu;
 -
 -#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 -
 -static inline int64_t qemu_tcg_next_kick(void)
 -{
 -    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 -}
 -
 -/* Kick the currently round-robin scheduled vCPU to next */
 -static void qemu_cpu_kick_rr_next_cpu(void)
 -{
 -    CPUState *cpu;
 -    do {
 -        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
 -        if (cpu) {
 -            cpu_exit(cpu);
 -        }
 -    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
 -}
 -
 -static void kick_tcg_thread(void *opaque)
 -{
 -    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 -    qemu_cpu_kick_rr_next_cpu();
 -}
 -
 -static void start_tcg_kick_timer(void)
 -{
 -    assert(!mttcg_enabled);
 -    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 -        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 -                                           kick_tcg_thread, NULL);
 -    }
 -    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 -        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 -    }
 -}
 -
 -static void stop_tcg_kick_timer(void)
 -{
 -    assert(!mttcg_enabled);
 -    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 -        timer_del(tcg_kick_vcpu_timer);
 -    }
 -}
 -
 -static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 -{
 -}
 -
 -static void qemu_tcg_rr_wait_io_event(void)
 -{
 -    CPUState *cpu;
 -
 -    while (all_cpu_threads_idle()) {
 -        stop_tcg_kick_timer();
 -        qemu_cond_wait_iothread(first_cpu->halt_cond);
 -    }
 -
 -    start_tcg_kick_timer();
 -
 -    CPU_FOREACH(cpu) {
 -        qemu_wait_io_event_common(cpu);
 -    }
 -}
 -
 -static int64_t tcg_get_icount_limit(void)
 -{
 -    int64_t deadline;
 -
 -    if (replay_mode != REPLAY_MODE_PLAY) {
 -        /*
 -         * Include all the timers, because they may need an attention.
 -         * Too long CPU execution may create unnecessary delay in UI.
 -         */
 -        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 -                                              QEMU_TIMER_ATTR_ALL);
 -        /* Check realtime timers, because they help with input processing */
 -        deadline = qemu_soonest_timeout(deadline,
 -                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
 -                                           QEMU_TIMER_ATTR_ALL));
 -
 -        /*
 -         * Maintain prior (possibly buggy) behaviour where if no deadline
 -         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
 -         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
 -         * nanoseconds.
 -         */
 -        if ((deadline < 0) || (deadline > INT32_MAX)) {
 -            deadline = INT32_MAX;
 -        }
 -
 -        return icount_round(deadline);
 -    } else {
 -        return replay_get_instructions();
 -    }
 -}
 -
 -static void notify_aio_contexts(void)
 -{
 -    /* Wake up other AioContexts.  */
 -    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 -    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 -}
 -
 -static void handle_icount_deadline(void)
 -{
 -    assert(qemu_in_vcpu_thread());
 -    if (icount_enabled()) {
 -        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 -                                                      QEMU_TIMER_ATTR_ALL);
 -
 -        if (deadline == 0) {
 -            notify_aio_contexts();
 -        }
 -    }
 -}
 -
 -static void prepare_icount_for_run(CPUState *cpu)
 -{
 -    if (icount_enabled()) {
 -        int insns_left;
 -
 -        /*
 -         * These should always be cleared by process_icount_data after
 -         * each vCPU execution. However u16.high can be raised
 -         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
 -         */
 -        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
 -        g_assert(cpu->icount_extra == 0);
 -
 -        cpu->icount_budget = tcg_get_icount_limit();
 -        insns_left = MIN(0xffff, cpu->icount_budget);
 -        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
 -        cpu->icount_extra = cpu->icount_budget - insns_left;
 -
 -        replay_mutex_lock();
 -
 -        if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
 -            notify_aio_contexts();
 -        }
 -    }
 -}
 -
 -static void process_icount_data(CPUState *cpu)
 -{
 -    if (icount_enabled()) {
 -        /* Account for executed instructions */
 -        icount_update(cpu);
 -
 -        /* Reset the counters */
 -        cpu_neg(cpu)->icount_decr.u16.low = 0;
 -        cpu->icount_extra = 0;
 -        cpu->icount_budget = 0;
 -
 -        replay_account_executed_instructions();
 -
 -        replay_mutex_unlock();
 -    }
 -}
 -
 -static int tcg_cpu_exec(CPUState *cpu)
 -{
 -    int ret;
 -#ifdef CONFIG_PROFILER
 -    int64_t ti;
 -#endif
 -
 -    assert(tcg_enabled());
 -#ifdef CONFIG_PROFILER
 -    ti = profile_getclock();
 -#endif
 -    cpu_exec_start(cpu);
 -    ret = cpu_exec(cpu);
 -    cpu_exec_end(cpu);
 -#ifdef CONFIG_PROFILER
 -    qatomic_set(&tcg_ctx->prof.cpu_exec_time,
 -                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
 -#endif
 -    return ret;
 -}
 -
 -/*
 - * Destroy any remaining vCPUs which have been unplugged and have
 - * finished running
 - */
 -static void deal_with_unplugged_cpus(void)
 -{
 -    CPUState *cpu;
 -
 -    CPU_FOREACH(cpu) {
 -        if (cpu->unplug && !cpu_can_run(cpu)) {
 -            qemu_tcg_destroy_vcpu(cpu);
 -            cpu_thread_signal_destroyed(cpu);
 -            break;
 -        }
 -    }
 -}
 -
 -/*
 - * Single-threaded TCG
 - *
 - * In the single-threaded case each vCPU is simulated in turn. If
 - * there is more than a single vCPU we create a simple timer to kick
 - * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 - * This is done explicitly rather than relying on side-effects
 - * elsewhere.
 - */
 -
 -static void *tcg_rr_cpu_thread_fn(void *arg)
 -{
 -    CPUState *cpu = arg;
 -
 -    assert(tcg_enabled());
 -    rcu_register_thread();
 -    tcg_register_thread();
 -
 -    qemu_mutex_lock_iothread();
 -    qemu_thread_get_self(cpu->thread);
 -
 -    cpu->thread_id = qemu_get_thread_id();
 -    cpu->can_do_io = 1;
 -    cpu_thread_signal_created(cpu);
 -    qemu_guest_random_seed_thread_part2(cpu->random_seed);
 -
 -    /* wait for initial kick-off after machine start */
 -    while (first_cpu->stopped) {
 -        qemu_cond_wait_iothread(first_cpu->halt_cond);
 -
 -        /* process any pending work */
 -        CPU_FOREACH(cpu) {
 -            current_cpu = cpu;
 -            qemu_wait_io_event_common(cpu);
 -        }
 -    }
 -
 -    start_tcg_kick_timer();
 -
 -    cpu = first_cpu;
 -
 -    /* process any pending work */
 -    cpu->exit_request = 1;
 -
 -    while (1) {
 -        qemu_mutex_unlock_iothread();
 -        replay_mutex_lock();
 -        qemu_mutex_lock_iothread();
 -        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
 -        icount_account_warp_timer();
 -
 -        /*
 -         * Run the timers here.  This is much more efficient than
 -         * waking up the I/O thread and waiting for completion.
 -         */
 -        handle_icount_deadline();
 -
 -        replay_mutex_unlock();
 -
 -        if (!cpu) {
 -            cpu = first_cpu;
 -        }
 -
 -        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
 -
 -            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
 -            current_cpu = cpu;
 -
 -            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
 -                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
 -
 -            if (cpu_can_run(cpu)) {
 -                int r;
 -
 -                qemu_mutex_unlock_iothread();
 -                prepare_icount_for_run(cpu);
 -
 -                r = tcg_cpu_exec(cpu);
 -
 -                process_icount_data(cpu);
 -                qemu_mutex_lock_iothread();
 -
 -                if (r == EXCP_DEBUG) {
 -                    cpu_handle_guest_debug(cpu);
 -                    break;
 -                } else if (r == EXCP_ATOMIC) {
 -                    qemu_mutex_unlock_iothread();
 -                    cpu_exec_step_atomic(cpu);
 -                    qemu_mutex_lock_iothread();
 -                    break;
 -                }
 -            } else if (cpu->stop) {
 -                if (cpu->unplug) {
 -                    cpu = CPU_NEXT(cpu);
 -                }
 -                break;
 -            }
 -
 -            cpu = CPU_NEXT(cpu);
 -        } /* while (cpu && !cpu->exit_request).. */
 -
 -        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
 -        qatomic_set(&tcg_current_rr_cpu, NULL);
 -
 -        if (cpu && cpu->exit_request) {
 -            qatomic_mb_set(&cpu->exit_request, 0);
 -        }
 -
 -        if (icount_enabled() && all_cpu_threads_idle()) {
 -            /*
 -             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
 -             * in the main_loop, wake it up in order to start the warp timer.
 -             */
 -            qemu_notify_event();
 -        }
 -
 -        qemu_tcg_rr_wait_io_event();
 -        deal_with_unplugged_cpus();
 -    }
 -
 -    rcu_unregister_thread();
 -    return NULL;
 -}
 -
 -/*
 - * Multi-threaded TCG
 - *
 - * In the multi-threaded case each vCPU has its own thread. The TLS
 - * variable current_cpu can be used deep in the code to find the
 - * current CPUState for a given thread.
 - */
 -
 -static void *tcg_cpu_thread_fn(void *arg)
 -{
 -    CPUState *cpu = arg;
 -
 -    assert(tcg_enabled());
 -    g_assert(!icount_enabled());
 -
 -    rcu_register_thread();
 -    tcg_register_thread();
 -
 -    qemu_mutex_lock_iothread();
 -    qemu_thread_get_self(cpu->thread);
 -
 -    cpu->thread_id = qemu_get_thread_id();
 -    cpu->can_do_io = 1;
 -    current_cpu = cpu;
 -    cpu_thread_signal_created(cpu);
 -    qemu_guest_random_seed_thread_part2(cpu->random_seed);
 -
 -    /* process any pending work */
 -    cpu->exit_request = 1;
 -
 -    do {
 -        if (cpu_can_run(cpu)) {
 -            int r;
 -            qemu_mutex_unlock_iothread();
 -            r = tcg_cpu_exec(cpu);
 -            qemu_mutex_lock_iothread();
 -            switch (r) {
 -            case EXCP_DEBUG:
 -                cpu_handle_guest_debug(cpu);
 -                break;
 -            case EXCP_HALTED:
 -                /*
 -                 * during start-up the vCPU is reset and the thread is
 -                 * kicked several times. If we don't ensure we go back
 -                 * to sleep in the halted state we won't cleanly
 -                 * start-up when the vCPU is enabled.
 -                 *
 -                 * cpu->halted should ensure we sleep in wait_io_event
 -                 */
 -                g_assert(cpu->halted);
 -                break;
 -            case EXCP_ATOMIC:
 -                qemu_mutex_unlock_iothread();
 -                cpu_exec_step_atomic(cpu);
 -                qemu_mutex_lock_iothread();
 -            default:
 -                /* Ignore everything else? */
 -                break;
 -            }
 -        }
 -
 -        qatomic_mb_set(&cpu->exit_request, 0);
 -        qemu_wait_io_event(cpu);
 -    } while (!cpu->unplug || cpu_can_run(cpu));
 -
 -    qemu_tcg_destroy_vcpu(cpu);
 -    cpu_thread_signal_destroyed(cpu);
 -    qemu_mutex_unlock_iothread();
 -    rcu_unregister_thread();
 -    return NULL;
 -}
 -
 -static void tcg_start_vcpu_thread(CPUState *cpu)
 +void tcg_start_vcpu_thread(CPUState *cpu)
  {
-     char thread_name[VCPU_THREAD_NAME_SIZE];
+     tcg_target_long hi, lo = (int32_t)arg;
-     static QemuCond *single_tcg_halt_cond;
+     tcg_target_long test, lsb;
-@@ -XXX,XX +XXX,XX @@ static void tcg_start_vcpu_thread(CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
      } else {
          hi = arg >> 32;
          tcg_out_movi_imm32(s, ret, hi);
 -        tcg_out_movi_imm32(s, TCG_REG_T2, lo);
 +        tcg_out_movi_imm32(s, scratch, lo);
          tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
 -        tcg_out_arith(s, ret, ret, TCG_REG_T2, ARITH_OR);
 +        tcg_out_arith(s, ret, ret, scratch, ARITH_OR);
      }
  }
--static int64_t tcg_get_virtual_clock(void)
+ static void tcg_out_movi(TCGContext *s, TCGType type,
-+void qemu_tcg_destroy_vcpu(CPUState *cpu)
+                          TCGReg ret, tcg_target_long arg)
  {
--    if (icount_enabled()) {
+-    tcg_out_movi_int(s, type, ret, arg, false);
--        return icount_get();
++    tcg_debug_assert(ret != TCG_REG_T2);
--    }
++    tcg_out_movi_int(s, type, ret, arg, false, TCG_REG_T2);
 -    return cpu_get_clock();
 +    cpu_thread_signal_destroyed(cpu);
  }
--static int64_t tcg_get_elapsed_ticks(void)
+ static void tcg_out_ldst_rr(TCGContext *s, TCGReg data, TCGReg a1,
-+int tcg_cpu_exec(CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
  {
 -    if (icount_enabled()) {
 -        return icount_get();
 -    }
 -    return cpu_get_ticks();
 +    int ret;
 +#ifdef CONFIG_PROFILER
 +    int64_t ti;
 +#endif
 +    assert(tcg_enabled());
 +#ifdef CONFIG_PROFILER
 +    ti = profile_getclock();
 +#endif
 +    cpu_exec_start(cpu);
 +    ret = cpu_exec(cpu);
 +    cpu_exec_end(cpu);
 +#ifdef CONFIG_PROFILER
 +    qatomic_set(&tcg_ctx->prof.cpu_exec_time,
 +                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
 +#endif
 +    return ret;
  }
  /* mask must never be zero, except for A20 change call */
 -static void tcg_handle_interrupt(CPUState *cpu, int mask)
 +void tcg_handle_interrupt(CPUState *cpu, int mask)
  {
 -    int old_mask;
      g_assert(qemu_mutex_iothread_locked());
 -    old_mask = cpu->interrupt_request;
      cpu->interrupt_request |= mask;
      /*
@@ -XXX,XX +XXX,XX @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
          qemu_cpu_kick(cpu);
      } else {
-         qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
+         uintptr_t desti = (uintptr_t)dest;
--        if (icount_enabled() &&
+         tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
--            !cpu->can_do_io
+-                         desti & ~0xfff, in_prologue);
--            && (mask & ~old_mask) != 0) {
++                         desti & ~0xfff, in_prologue, TCG_REG_O7);
--            cpu_abort(cpu, "Raised interrupt while not in I/O function");
+         tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL);
 -        }
      }
  }
--
+@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
--const CpusAccel tcg_cpus = {
--    .create_vcpu_thread = tcg_start_vcpu_thread,
+ #ifndef CONFIG_SOFTMMU
--    .kick_vcpu_thread = tcg_kick_vcpu_thread,
+     if (guest_base != 0) {
--
+-        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
--    .handle_interrupt = tcg_handle_interrupt,
++        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG,
--
++                         guest_base, true, TCG_REG_T1);
--    .get_virtual_clock = tcg_get_virtual_clock,
+         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
 -    .get_elapsed_ticks = tcg_get_elapsed_ticks,
 -};
 diff --git a/softmmu/icount.c b/softmmu/icount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/icount.c
 +++ b/softmmu/icount.c
@@ -XXX,XX +XXX,XX @@ void icount_start_warp_timer(void)
  void icount_account_warp_timer(void)
  {
 -    if (!icount_enabled() || !icount_sleep) {
 +    if (!icount_sleep) {
          return;
      }
+ #endif
 diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/meson.build
 +++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
  tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl])
  specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 -specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files('tcg-all.c', 'cputlb.c', 'tcg-cpus.c'))
 +specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
 +  'tcg-all.c',
 +  'cputlb.c',
 +  'tcg-cpus.c',
 +  'tcg-cpus-mttcg.c',
 +  'tcg-cpus-icount.c',
 +  'tcg-cpus-rr.c'
 +))
 --
 .25.1

-New patch
+[PULL 29/34] tcg/sparc: Improve code gen for shifted 32-bit constants
+We had code for checking for 13 and 21-bit shifted constants,
+but we can do better and allow 32-bit shifted constants.
+This is still 2 insns shorter than the full 64-bit sequence.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+         return;
+     }
+-    /* A 21-bit constant, shifted.  */
++    /* A 32-bit constant, shifted.  */
+     lsb = ctz64(arg);
+     test = (tcg_target_long)arg >> lsb;
+-    if (check_fit_tl(test, 13)) {
+-        tcg_out_movi_imm13(s, ret, test);
+-        tcg_out_arithi(s, ret, ret, lsb, SHIFT_SLLX);
+-        return;
+-    } else if (lsb > 10 && test == extract64(test, 0, 21)) {
++    if (lsb > 10 && test == extract64(test, 0, 21)) {
+         tcg_out_sethi(s, ret, test << 10);
+         tcg_out_arithi(s, ret, ret, lsb - 10, SHIFT_SLLX);
+         return;
++    } else if (test == (uint32_t)test || test == (int32_t)test) {
++        tcg_out_movi_int(s, TCG_TYPE_I64, ret, test, in_prologue, scratch);
++        tcg_out_arithi(s, ret, ret, lsb, SHIFT_SLLX);
++        return;
+     }
+     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
+--
+.25.1

-New patch
+[PULL 30/34] tcg/sparc: Convert patch_reloc to return bool
+Since 7ecd02a06f8, if patch_reloc fails we restart translation
+with a smaller TB.  SPARC had its function signature changed,
+but not the logic.  Replace assert with return false.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
+     switch (type) {
+     case R_SPARC_WDISP16:
+-        assert(check_fit_ptr(pcrel >> 2, 16));
++        if (!check_fit_ptr(pcrel >> 2, 16)) {
++            return false;
++        }
+         insn &= ~INSN_OFF16(-1);
+         insn |= INSN_OFF16(pcrel);
+         break;
+     case R_SPARC_WDISP19:
+-        assert(check_fit_ptr(pcrel >> 2, 19));
++        if (!check_fit_ptr(pcrel >> 2, 19)) {
++            return false;
++        }
+         insn &= ~INSN_OFF19(-1);
+         insn |= INSN_OFF19(pcrel);
+         break;
+--
+.25.1

-New patch
+[PULL 31/34] tcg/sparc: Use the constant pool for 64-bit constants
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
+         insn &= ~INSN_OFF19(-1);
+         insn |= INSN_OFF19(pcrel);
+         break;
++    case R_SPARC_13:
++        if (!check_fit_ptr(value, 13)) {
++            return false;
++        }
++        insn &= ~INSN_IMM13(-1);
++        insn |= INSN_IMM13(value);
++        break;
+     default:
+         g_assert_not_reached();
+     }
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+         return;
+     }
++    /* Use the constant pool, if possible. */
++    if (!in_prologue && USE_REG_TB) {
++        new_pool_label(s, arg, R_SPARC_13, s->code_ptr,
++                       tcg_tbrel_diff(s, NULL));
++        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(TCG_REG_TB));
++        return;
++    }
++
+     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
+     if (check_fit_i32(lo, 13)) {
+         hi = (arg - lo) >> 32;
+--
+.25.1

-New patch
+[PULL 32/34] tcg/sparc: Add tcg_out_jmpl_const for better tail calls
+Due to mapping changes, we now rarely place the code_gen_buffer
+near the main executable, which means that direct calls will
+now rarely be in range.
+So, always use indirect calls for tail calls, which allows us to
+avoid clobbering %o7, and therefore we need not save and restore it.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/sparc/tcg-target.c.inc | 37 +++++++++++++++++++++++--------------
+ 1 file changed, 23 insertions(+), 14 deletions(-)
+diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/sparc/tcg-target.c.inc
++++ b/tcg/sparc/tcg-target.c.inc
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
+     tcg_out_mov(s, TCG_TYPE_I64, rl, tmp);
+ }
++static void tcg_out_jmpl_const(TCGContext *s, const tcg_insn_unit *dest,
++                               bool in_prologue, bool tail_call)
++{
++    uintptr_t desti = (uintptr_t)dest;
++
++    /* Be careful not to clobber %o7 for a tail call. */
++    tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
++                     desti & ~0xfff, in_prologue,
++                     tail_call ? TCG_REG_G2 : TCG_REG_O7);
++    tcg_out_arithi(s, tail_call ? TCG_REG_G0 : TCG_REG_O7,
++                   TCG_REG_T1, desti & 0xfff, JMPL);
++}
++
+ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
+                                  bool in_prologue)
+ {
+@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
+     if (disp == (int32_t)disp) {
+         tcg_out32(s, CALL | (uint32_t)disp >> 2);
+     } else {
+-        uintptr_t desti = (uintptr_t)dest;
+-        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
+-                         desti & ~0xfff, in_prologue, TCG_REG_O7);
+-        tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL);
++        tcg_out_jmpl_const(s, dest, in_prologue, false);
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
+         /* Set the retaddr operand.  */
+         tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
+-        /* Set the env operand.  */
+-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
+         /* Tail call.  */
+-        tcg_out_call_nodelay(s, qemu_ld_helpers[i], true);
+-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
++        tcg_out_jmpl_const(s, qemu_ld_helpers[i], true, true);
++        /* delay slot -- set the env argument */
++        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
+     }
+     for (i = 0; i < ARRAY_SIZE(qemu_st_helpers); ++i) {
+@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
+         if (ra >= TCG_REG_O6) {
+             tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_O7, TCG_REG_CALL_STACK,
+                        TCG_TARGET_CALL_STACK_OFFSET);
+-            ra = TCG_REG_G1;
++        } else {
++            tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
+         }
+-        tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
+-        /* Set the env operand.  */
+-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
++
+         /* Tail call.  */
+-        tcg_out_call_nodelay(s, qemu_st_helpers[i], true);
+-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
++        tcg_out_jmpl_const(s, qemu_st_helpers[i], true, true);
++        /* delay slot -- set the env argument */
++        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
+     }
+ }
+ #endif
+--
+.25.1

-New patch
+[PULL 33/34] tcg/sparc: Support unaligned access for user-only
+This is roughly the opposite of the other tcg hosts: on sparc
 we get (normal) alignment checks for free with host SIGBUS and
 need to add code to support unaligned accesses.
 This inline code expansion is somewhat large, but it takes quite
 a few instructions to make a function call to a helper anyway.
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/sparc/tcg-target.c.inc | 219 +++++++++++++++++++++++++++++++++++--
  1 file changed, 211 insertions(+), 8 deletions(-)
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.c.inc
 +++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {
  #define ARITH_ADD  (INSN_OP(2) | INSN_OP3(0x00))
  #define ARITH_ADDCC (INSN_OP(2) | INSN_OP3(0x10))
  #define ARITH_AND  (INSN_OP(2) | INSN_OP3(0x01))
 +#define ARITH_ANDCC (INSN_OP(2) | INSN_OP3(0x11))
  #define ARITH_ANDN (INSN_OP(2) | INSN_OP3(0x05))
  #define ARITH_OR   (INSN_OP(2) | INSN_OP3(0x02))
  #define ARITH_ORCC (INSN_OP(2) | INSN_OP3(0x12))
@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
          tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
      }
  }
 +#else
 +static const tcg_insn_unit *qemu_unalign_ld_trampoline;
 +static const tcg_insn_unit *qemu_unalign_st_trampoline;
 +
 +static void build_trampolines(TCGContext *s)
 +{
 +    for (int ld = 0; ld < 2; ++ld) {
 +        void *helper;
 +
 +        while ((uintptr_t)s->code_ptr & 15) {
 +            tcg_out_nop(s);
 +        }
 +
 +        if (ld) {
 +            helper = helper_unaligned_ld;
 +            qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr);
 +        } else {
 +            helper = helper_unaligned_st;
 +            qemu_unalign_st_trampoline = tcg_splitwx_to_rx(s->code_ptr);
 +        }
 +
 +        if (!SPARC64 && TARGET_LONG_BITS == 64) {
 +            /* Install the high part of the address.  */
 +            tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O2, 32, SHIFT_SRLX);
 +        }
 +
 +        /* Tail call.  */
 +        tcg_out_jmpl_const(s, helper, true, true);
 +        /* delay slot -- set the env argument */
 +        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
 +    }
 +}
  #endif
  /* Generate global QEMU prologue and epilogue code */
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
      /* delay slot */
      tcg_out_movi_imm13(s, TCG_REG_O0, 0);
 -#ifdef CONFIG_SOFTMMU
      build_trampolines(s);
 -#endif
  }
  static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
  static const int qemu_ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = {
      [MO_UB]   = LDUB,
      [MO_SB]   = LDSB,
 +    [MO_UB | MO_LE] = LDUB,
 +    [MO_SB | MO_LE] = LDSB,
      [MO_BEUW] = LDUH,
      [MO_BESW] = LDSH,
      [MO_BEUL] = LDUW,
      [MO_BESL] = LDSW,
      [MO_BEUQ] = LDX,
 +    [MO_BESQ] = LDX,
      [MO_LEUW] = LDUH_LE,
      [MO_LESW] = LDSH_LE,
      [MO_LEUL] = LDUW_LE,
      [MO_LESL] = LDSW_LE,
      [MO_LEUQ] = LDX_LE,
 +    [MO_LESQ] = LDX_LE,
  };
  static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
                              MemOpIdx oi, bool is_64)
  {
      MemOp memop = get_memop(oi);
 +    tcg_insn_unit *label_ptr;
 +
  #ifdef CONFIG_SOFTMMU
      unsigned memi = get_mmuidx(oi);
      TCGReg addrz, param;
      const tcg_insn_unit *func;
 -    tcg_insn_unit *label_ptr;
      addrz = tcg_out_tlb_load(s, addr, memi, memop,
                               offsetof(CPUTLBEntry, addr_read));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
      *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
  #else
 +    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
 +    unsigned a_bits = get_alignment_bits(memop);
 +    unsigned s_bits = memop & MO_SIZE;
 +    unsigned t_bits;
 +
      if (SPARC64 && TARGET_LONG_BITS == 32) {
          tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
          addr = TCG_REG_T1;
      }
 -    tcg_out_ldst_rr(s, data, addr,
 -                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
 +
 +    /*
 +     * Normal case: alignment equal to access size.
 +     */
 +    if (a_bits == s_bits) {
 +        tcg_out_ldst_rr(s, data, addr, index,
 +                        qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 +        return;
 +    }
 +
 +    /*
 +     * Test for at least natural alignment, and assume most accesses
 +     * will be aligned -- perform a straight load in the delay slot.
 +     * This is required to preserve atomicity for aligned accesses.
 +     */
 +    t_bits = MAX(a_bits, s_bits);
 +    tcg_debug_assert(t_bits < 13);
 +    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
 +
 +    /* beq,a,pt %icc, label */
 +    label_ptr = s->code_ptr;
 +    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
 +    /* delay slot */
 +    tcg_out_ldst_rr(s, data, addr, index,
                      qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 +
 +    if (a_bits >= s_bits) {
 +        /*
 +         * Overalignment: A successful alignment test will perform the memory
 +         * operation in the delay slot, and failure need only invoke the
 +         * handler for SIGBUS.
 +         */
 +        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
 +        tcg_out_call_nodelay(s, qemu_unalign_ld_trampoline, false);
 +        /* delay slot -- move to low part of argument reg */
 +        tcg_out_mov_delay(s, arg_low, addr);
 +    } else {
 +        /* Underalignment: load by pieces of minimum alignment. */
 +        int ld_opc, a_size, s_size, i;
 +
 +        /*
 +         * Force full address into T1 early; avoids problems with
 +         * overlap between @addr and @data.
 +         */
 +        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
 +
 +        a_size = 1 << a_bits;
 +        s_size = 1 << s_bits;
 +        if ((memop & MO_BSWAP) == MO_BE) {
 +            ld_opc = qemu_ld_opc[a_bits | MO_BE | (memop & MO_SIGN)];
 +            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
 +            ld_opc = qemu_ld_opc[a_bits | MO_BE];
 +            for (i = a_size; i < s_size; i += a_size) {
 +                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
 +                tcg_out_arithi(s, data, data, a_size, SHIFT_SLLX);
 +                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
 +            }
 +        } else if (a_bits == 0) {
 +            ld_opc = LDUB;
 +            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
 +            for (i = a_size; i < s_size; i += a_size) {
 +                if ((memop & MO_SIGN) && i == s_size - a_size) {
 +                    ld_opc = LDSB;
 +                }
 +                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
 +                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
 +                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
 +            }
 +        } else {
 +            ld_opc = qemu_ld_opc[a_bits | MO_LE];
 +            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, ld_opc);
 +            for (i = a_size; i < s_size; i += a_size) {
 +                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
 +                if ((memop & MO_SIGN) && i == s_size - a_size) {
 +                    ld_opc = qemu_ld_opc[a_bits | MO_LE | MO_SIGN];
 +                }
 +                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, ld_opc);
 +                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
 +                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
 +            }
 +        }
 +    }
 +
 +    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
  #endif /* CONFIG_SOFTMMU */
  }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
                              MemOpIdx oi)
  {
      MemOp memop = get_memop(oi);
 +    tcg_insn_unit *label_ptr;
 +
  #ifdef CONFIG_SOFTMMU
      unsigned memi = get_mmuidx(oi);
      TCGReg addrz, param;
      const tcg_insn_unit *func;
 -    tcg_insn_unit *label_ptr;
      addrz = tcg_out_tlb_load(s, addr, memi, memop,
                               offsetof(CPUTLBEntry, addr_write));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
      *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
  #else
 +    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
 +    unsigned a_bits = get_alignment_bits(memop);
 +    unsigned s_bits = memop & MO_SIZE;
 +    unsigned t_bits;
 +
      if (SPARC64 && TARGET_LONG_BITS == 32) {
          tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
          addr = TCG_REG_T1;
      }
 -    tcg_out_ldst_rr(s, data, addr,
 -                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
 +
 +    /*
 +     * Normal case: alignment equal to access size.
 +     */
 +    if (a_bits == s_bits) {
 +        tcg_out_ldst_rr(s, data, addr, index,
 +                        qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 +        return;
 +    }
 +
 +    /*
 +     * Test for at least natural alignment, and assume most accesses
 +     * will be aligned -- perform a straight store in the delay slot.
 +     * This is required to preserve atomicity for aligned accesses.
 +     */
 +    t_bits = MAX(a_bits, s_bits);
 +    tcg_debug_assert(t_bits < 13);
 +    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
 +
 +    /* beq,a,pt %icc, label */
 +    label_ptr = s->code_ptr;
 +    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
 +    /* delay slot */
 +    tcg_out_ldst_rr(s, data, addr, index,
                      qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 +
 +    if (a_bits >= s_bits) {
 +        /*
 +         * Overalignment: A successful alignment test will perform the memory
 +         * operation in the delay slot, and failure need only invoke the
 +         * handler for SIGBUS.
 +         */
 +        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
 +        tcg_out_call_nodelay(s, qemu_unalign_st_trampoline, false);
 +        /* delay slot -- move to low part of argument reg */
 +        tcg_out_mov_delay(s, arg_low, addr);
 +    } else {
 +        /* Underalignment: store by pieces of minimum alignment. */
 +        int st_opc, a_size, s_size, i;
 +
 +        /*
 +         * Force full address into T1 early; avoids problems with
 +         * overlap between @addr and @data.
 +         */
 +        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
 +
 +        a_size = 1 << a_bits;
 +        s_size = 1 << s_bits;
 +        if ((memop & MO_BSWAP) == MO_BE) {
 +            st_opc = qemu_st_opc[a_bits | MO_BE];
 +            for (i = 0; i < s_size; i += a_size) {
 +                TCGReg d = data;
 +                int shift = (s_size - a_size - i) * 8;
 +                if (shift) {
 +                    d = TCG_REG_T2;
 +                    tcg_out_arithi(s, d, data, shift, SHIFT_SRLX);
 +                }
 +                tcg_out_ldst(s, d, TCG_REG_T1, i, st_opc);
 +            }
 +        } else if (a_bits == 0) {
 +            tcg_out_ldst(s, data, TCG_REG_T1, 0, STB);
 +            for (i = 1; i < s_size; i++) {
 +                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
 +                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, STB);
 +            }
 +        } else {
 +            /* Note that ST*A with immediate asi must use indexed address. */
 +            st_opc = qemu_st_opc[a_bits + MO_LE];
 +            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, st_opc);
 +            for (i = a_size; i < s_size; i += a_size) {
 +                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
 +                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
 +                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, st_opc);
 +            }
 +        }
 +    }
 +
 +    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
  #endif /* CONFIG_SOFTMMU */
  }
 --
 .25.1

-New patch
+[PULL 34/34] tests/tcg/multiarch: Add sigbus.c
+A mostly generic test for unaligned access raising SIGBUS.
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tests/tcg/multiarch/sigbus.c | 68 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 68 insertions(+)
+ create mode 100644 tests/tcg/multiarch/sigbus.c
+diff --git a/tests/tcg/multiarch/sigbus.c b/tests/tcg/multiarch/sigbus.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/tests/tcg/multiarch/sigbus.c
+@@ -XXX,XX +XXX,XX @@
++#define _GNU_SOURCE 1
++
++#include <assert.h>
++#include <stdlib.h>
++#include <signal.h>
++#include <endian.h>
++
++
++unsigned long long x = 0x8877665544332211ull;
++void * volatile p = (void *)&x + 1;
++
++void sigbus(int sig, siginfo_t *info, void *uc)
++{
++    assert(sig == SIGBUS);
++    assert(info->si_signo == SIGBUS);
++#ifdef BUS_ADRALN
++    assert(info->si_code == BUS_ADRALN);
++#endif
++    assert(info->si_addr == p);
++    exit(EXIT_SUCCESS);
++}
++
++int main()
++{
++    struct sigaction sa = {
++        .sa_sigaction = sigbus,
++        .sa_flags = SA_SIGINFO
++    };
++    int allow_fail = 0;
++    int tmp;
++
++    tmp = sigaction(SIGBUS, &sa, NULL);
++    assert(tmp == 0);
++
++    /*
++     * Select an operation that's likely to enforce alignment.
++     * On many guests that support unaligned accesses by default,
++     * this is often an atomic operation.
++     */
++#if defined(__aarch64__)
++    asm volatile("ldxr %w0,[%1]" : "=r"(tmp) : "r"(p) : "memory");
++#elif defined(__alpha__)
++    asm volatile("ldl_l %0,0(%1)" : "=r"(tmp) : "r"(p) : "memory");
++#elif defined(__arm__)
++    asm volatile("ldrex %0,[%1]" : "=r"(tmp) : "r"(p) : "memory");
++#elif defined(__powerpc__)
++    asm volatile("lwarx %0,0,%1" : "=r"(tmp) : "r"(p) : "memory");
++#elif defined(__riscv_atomic)
++    asm volatile("lr.w %0,(%1)" : "=r"(tmp) : "r"(p) : "memory");
++#else
++    /* No insn known to fault unaligned -- try for a straight load. */
++    allow_fail = 1;
++    tmp = *(volatile int *)p;
++#endif
++
++    assert(allow_fail);
++
++    /*
++     * We didn't see a signal.
++     * We might as well validate the unaligned load worked.
++     */
++    if (BYTE_ORDER == LITTLE_ENDIAN) {
++        assert(tmp == 0x55443322);
++    } else {
++        assert(tmp == 0x77665544);
++    }
++    return EXIT_SUCCESS;
++}
+--
+.25.1

The following changes since commit 2ecfc0657afa5d29a373271b342f704a1a3c6737:

Merge remote-tracking branch 'remotes/armbru/tags/pull-misc-2020-12-10' into staging (2020-12-10 17:01:05 +0000)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20201210

for you to fetch changes up to 9e2658d62ebc23efe7df43fc0e306f129510d874:

accel/tcg: rename tcg-cpus functions to match module name (2020-12-10 17:44:10 -0600)

----------------------------------------------------------------
Split CpusAccel for tcg variants

----------------------------------------------------------------
Claudio Fontana (3):
      accel/tcg: split CpusAccel into three TCG variants
      accel/tcg: split tcg_start_vcpu_thread
      accel/tcg: rename tcg-cpus functions to match module name

 accel/tcg/tcg-cpus-icount.h |  17 ++
 accel/tcg/tcg-cpus-rr.h     |  21 ++
 accel/tcg/tcg-cpus.h        |  12 +-
 accel/tcg/tcg-all.c         |  13 +-
 accel/tcg/tcg-cpus-icount.c | 147 +++++++++++++
 accel/tcg/tcg-cpus-mttcg.c  | 140 ++++++++++++
 accel/tcg/tcg-cpus-rr.c     | 305 ++++++++++++++++++++++++++
 accel/tcg/tcg-cpus.c        | 506 +-------------------------------------------
 softmmu/icount.c            |   2 +-
 accel/tcg/meson.build       |   9 +-
 10 files changed, 670 insertions(+), 502 deletions(-)
 create mode 100644 accel/tcg/tcg-cpus-icount.h
 create mode 100644 accel/tcg/tcg-cpus-rr.h
 create mode 100644 accel/tcg/tcg-cpus-icount.c
 create mode 100644 accel/tcg/tcg-cpus-mttcg.c
 create mode 100644 accel/tcg/tcg-cpus-rr.c

From: Claudio Fontana <cfontana@suse.de>

Split up the CpusAccel tcg_cpus into three TCG variants:

tcg_cpus_rr (single threaded, round robin cpus)
tcg_cpus_icount (same as rr, but with instruction counting enabled)
tcg_cpus_mttcg (multi-threaded cpus)

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20201015143217.29337-2-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-cpus-icount.h |  17 ++
 accel/tcg/tcg-cpus-mttcg.h  |  21 ++
 accel/tcg/tcg-cpus-rr.h     |  20 ++
 accel/tcg/tcg-cpus.h        |  13 +-
 accel/tcg/tcg-all.c         |   8 +-
 accel/tcg/tcg-cpus-icount.c | 147 +++++++++++
 accel/tcg/tcg-cpus-mttcg.c  | 117 +++++++++
 accel/tcg/tcg-cpus-rr.c     | 270 ++++++++++++++++++++
 accel/tcg/tcg-cpus.c        | 484 ++----------------------------------
 softmmu/icount.c            |   2 +-
 accel/tcg/meson.build       |   9 +-
 11 files changed, 646 insertions(+), 462 deletions(-)
 create mode 100644 accel/tcg/tcg-cpus-icount.h
 create mode 100644 accel/tcg/tcg-cpus-mttcg.h
 create mode 100644 accel/tcg/tcg-cpus-rr.h
 create mode 100644 accel/tcg/tcg-cpus-icount.c
 create mode 100644 accel/tcg/tcg-cpus-mttcg.c
 create mode 100644 accel/tcg/tcg-cpus-rr.c

diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-icount.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Single Threaded vCPUs implementation using instruction counting
+ *
+ * Copyright 2020 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef TCG_CPUS_ICOUNT_H
+#define TCG_CPUS_ICOUNT_H
+
+void handle_icount_deadline(void);
+void prepare_icount_for_run(CPUState *cpu);
+void process_icount_data(CPUState *cpu);
+
+#endif /* TCG_CPUS_ICOUNT_H */
diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-mttcg.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Multi Threaded vCPUs implementation
+ *
+ * Copyright 2020 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef TCG_CPUS_MTTCG_H
+#define TCG_CPUS_MTTCG_H
+
+/*
+ * In the multi-threaded case each vCPU has its own thread. The TLS
+ * variable current_cpu can be used deep in the code to find the
+ * current CPUState for a given thread.
+ */
+
+void *tcg_cpu_thread_fn(void *arg);
+
+#endif /* TCG_CPUS_MTTCG_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Single Threaded vCPUs implementation
+ *
+ * Copyright 2020 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef TCG_CPUS_RR_H
+#define TCG_CPUS_RR_H
+
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+
+/* Kick all RR vCPUs. */
+void qemu_cpu_kick_rr_cpus(CPUState *unused);
+
+void *tcg_rr_cpu_thread_fn(void *arg);
+
+#endif /* TCG_CPUS_RR_H */
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@
 /*
- * Accelerator CPUS Interface
+ * QEMU TCG vCPU common functionality
+ *
+ * Functionality common to all TCG vcpu variants: mttcg, rr and icount.
  *
  * Copyright 2020 SUSE LLC
  *
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel tcg_cpus;
+extern const CpusAccel tcg_cpus_mttcg;
+extern const CpusAccel tcg_cpus_icount;
+extern const CpusAccel tcg_cpus_rr;
+
+void tcg_start_vcpu_thread(CPUState *cpu);
+void qemu_tcg_destroy_vcpu(CPUState *cpu);
+int tcg_cpu_exec(CPUState *cpu);
+void tcg_handle_interrupt(CPUState *cpu, int mask);
 
 #endif /* TCG_CPUS_H */
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
 
     tcg_exec_init(s->tb_size * 1024 * 1024);
     mttcg_enabled = s->mttcg_enabled;
-    cpus_register_accel(&tcg_cpus);
 
+    if (mttcg_enabled) {
+        cpus_register_accel(&tcg_cpus_mttcg);
+    } else if (icount_enabled()) {
+        cpus_register_accel(&tcg_cpus_icount);
+    } else {
+        cpus_register_accel(&tcg_cpus_rr);
+    }
     return 0;
 }
 
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Single Threaded vCPUs implementation using instruction counting
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "sysemu/tcg.h"
+#include "sysemu/replay.h"
+#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
+#include "exec/exec-all.h"
+#include "hw/boards.h"
+
+#include "tcg-cpus.h"
+#include "tcg-cpus-icount.h"
+#include "tcg-cpus-rr.h"
+
+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        /*
+         * Include all the timers, because they may need attention.
+         * Too long CPU execution may create unnecessary delay in UI.
+         */
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                              QEMU_TIMER_ATTR_ALL);
+        /* Check realtime timers, because they help with input processing */
+        deadline = qemu_soonest_timeout(deadline,
+                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
+                                           QEMU_TIMER_ATTR_ALL));
+
+        /*
+         * Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
+static void notify_aio_contexts(void)
+{
+    /* Wake up other AioContexts.  */
+    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
+}
+
+void handle_icount_deadline(void)
+{
+    assert(qemu_in_vcpu_thread());
+    int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                                  QEMU_TIMER_ATTR_ALL);
+
+    if (deadline == 0) {
+        notify_aio_contexts();
+    }
+}
+
+void prepare_icount_for_run(CPUState *cpu)
+{
+    int insns_left;
+
+    /*
+     * These should always be cleared by process_icount_data after
+     * each vCPU execution. However u16.high can be raised
+     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
+     */
+    g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
+    g_assert(cpu->icount_extra == 0);
+
+    cpu->icount_budget = tcg_get_icount_limit();
+    insns_left = MIN(0xffff, cpu->icount_budget);
+    cpu_neg(cpu)->icount_decr.u16.low = insns_left;
+    cpu->icount_extra = cpu->icount_budget - insns_left;
+
+    replay_mutex_lock();
+
+    if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
+        notify_aio_contexts();
+    }
+}
+
+void process_icount_data(CPUState *cpu)
+{
+    /* Account for executed instructions */
+    icount_update(cpu);
+
+    /* Reset the counters */
+    cpu_neg(cpu)->icount_decr.u16.low = 0;
+    cpu->icount_extra = 0;
+    cpu->icount_budget = 0;
+
+    replay_account_executed_instructions();
+
+    replay_mutex_unlock();
+}
+
+static void icount_handle_interrupt(CPUState *cpu, int mask)
+{
+    int old_mask = cpu->interrupt_request;
+
+    tcg_handle_interrupt(cpu, mask);
+    if (qemu_cpu_is_self(cpu) &&
+        !cpu->can_do_io
+        && (mask & ~old_mask) != 0) {
+        cpu_abort(cpu, "Raised interrupt while not in I/O function");
+    }
+}
+
+const CpusAccel tcg_cpus_icount = {
+    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+
+    .handle_interrupt = icount_handle_interrupt,
+    .get_virtual_clock = icount_get,
+    .get_elapsed_ticks = icount_get,
+};
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Multi Threaded vCPUs implementation
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "sysemu/tcg.h"
+#include "sysemu/replay.h"
+#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
+#include "exec/exec-all.h"
+#include "hw/boards.h"
+
+#include "tcg-cpus.h"
+#include "tcg-cpus-mttcg.h"
+
+/*
+ * In the multi-threaded case each vCPU has its own thread. The TLS
+ * variable current_cpu can be used deep in the code to find the
+ * current CPUState for a given thread.
+ */
+
+void *tcg_cpu_thread_fn(void *arg)
+{
+    CPUState *cpu = arg;
+
+    assert(tcg_enabled());
+    g_assert(!icount_enabled());
+
+    rcu_register_thread();
+    tcg_register_thread();
+
+    qemu_mutex_lock_iothread();
+    qemu_thread_get_self(cpu->thread);
+
+    cpu->thread_id = qemu_get_thread_id();
+    cpu->can_do_io = 1;
+    current_cpu = cpu;
+    cpu_thread_signal_created(cpu);
+    qemu_guest_random_seed_thread_part2(cpu->random_seed);
+
+    /* process any pending work */
+    cpu->exit_request = 1;
+
+    do {
+        if (cpu_can_run(cpu)) {
+            int r;
+            qemu_mutex_unlock_iothread();
+            r = tcg_cpu_exec(cpu);
+            qemu_mutex_lock_iothread();
+            switch (r) {
+            case EXCP_DEBUG:
+                cpu_handle_guest_debug(cpu);
+                break;
+            case EXCP_HALTED:
+                /*
+                 * during start-up the vCPU is reset and the thread is
+                 * kicked several times. If we don't ensure we go back
+                 * to sleep in the halted state we won't cleanly
+                 * start-up when the vCPU is enabled.
+                 *
+                 * cpu->halted should ensure we sleep in wait_io_event
+                 */
+                g_assert(cpu->halted);
+                break;
+            case EXCP_ATOMIC:
+                qemu_mutex_unlock_iothread();
+                cpu_exec_step_atomic(cpu);
+                qemu_mutex_lock_iothread();
+            default:
+                /* Ignore everything else? */
+                break;
+            }
+        }
+
+        qatomic_mb_set(&cpu->exit_request, 0);
+        qemu_wait_io_event(cpu);
+    } while (!cpu->unplug || cpu_can_run(cpu));
+
+    qemu_tcg_destroy_vcpu(cpu);
+    qemu_mutex_unlock_iothread();
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void mttcg_kick_vcpu_thread(CPUState *cpu)
+{
+    cpu_exit(cpu);
+}
+
+const CpusAccel tcg_cpus_mttcg = {
+    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .kick_vcpu_thread = mttcg_kick_vcpu_thread,
+
+    .handle_interrupt = tcg_handle_interrupt,
+};
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Single Threaded vCPUs implementation
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "sysemu/tcg.h"
+#include "sysemu/replay.h"
+#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
+#include "exec/exec-all.h"
+#include "hw/boards.h"
+
+#include "tcg-cpus.h"
+#include "tcg-cpus-rr.h"
+#include "tcg-cpus-icount.h"
+
+/*
+ * Kick all RR vCPUs
+ *
+ * The argument is ignored: cpu_exit() is called on every vCPU in the
+ * CPU_FOREACH list, whichever vCPU the caller passed in.
+ */
+void qemu_cpu_kick_rr_cpus(CPUState *unused)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    };
+}
+
+/*
+ * TCG vCPU kick timer
+ *
+ * The kick timer is responsible for moving single threaded vCPU
+ * emulation on to the next vCPU. If more than one vCPU is running, a
+ * timer event will force a cpu_exit() so the next vCPU can get
+ * scheduled.
+ *
+ * The timer is stopped if all vCPUs are idle and restarted again once
+ * idleness is complete.
+ */
+
+static QEMUTimer *tcg_kick_vcpu_timer;
+static CPUState *tcg_current_rr_cpu;
+
+/* Kick period: one tenth of a second, expressed in nanoseconds. */
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+
+/*
+ * Return the QEMU_CLOCK_VIRTUAL time (in ns) at which the next kick is
+ * due: the current time plus TCG_KICK_PERIOD.
+ */
+static inline int64_t qemu_tcg_next_kick(void)
+{
+    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
+}
+
+/*
+ * Kick the currently round-robin scheduled vCPU to next
+ *
+ * tcg_current_rr_cpu is read before and after the kick; the loop repeats
+ * until both reads agree, so a vCPU that became current while we were
+ * kicking the previous one is kicked as well. Nothing is kicked while
+ * tcg_current_rr_cpu is NULL.
+ */
+static void qemu_cpu_kick_rr_next_cpu(void)
+{
+    CPUState *cpu;
+    do {
+        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
+        if (cpu) {
+            cpu_exit(cpu);
+        }
+    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
+}
+
+/*
+ * Callback of tcg_kick_vcpu_timer (installed by start_tcg_kick_timer):
+ * re-arm the timer for the next period, then kick the vCPU that is
+ * currently scheduled. The opaque argument is not used.
+ */
+static void kick_tcg_thread(void *opaque)
+{
+    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    qemu_cpu_kick_rr_next_cpu();
+}
+
+/*
+ * Create the kick timer on first use and arm it if it is not already
+ * pending. The timer is only created when first_cpu has a successor,
+ * i.e. when there is more than one vCPU; otherwise this is a no-op.
+ */
+static void start_tcg_kick_timer(void)
+{
+    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                           kick_tcg_thread, NULL);
+    }
+    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
+        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    }
+}
+
+/* Cancel the kick timer if it has been created and is currently pending. */
+static void stop_tcg_kick_timer(void)
+{
+    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
+        timer_del(tcg_kick_vcpu_timer);
+    }
+}
+
+/*
+ * Sleep while every vCPU thread is idle, then process pending events.
+ *
+ * As long as all_cpu_threads_idle() holds, the kick timer is stopped and
+ * the thread waits on first_cpu->halt_cond. Once woken with work to do,
+ * the kick timer is restarted and qemu_wait_io_event_common() is run for
+ * each vCPU in turn.
+ */
+static void qemu_tcg_rr_wait_io_event(void)
+{
+    CPUState *cpu;
+
+    while (all_cpu_threads_idle()) {
+        stop_tcg_kick_timer();
+        qemu_cond_wait_iothread(first_cpu->halt_cond);
+    }
+
+    start_tcg_kick_timer();
+
+    CPU_FOREACH(cpu) {
+        qemu_wait_io_event_common(cpu);
+    }
+}
+
+/*
+ * Destroy any remaining vCPUs which have been unplugged and have
+ * finished running
+ *
+ * At most one vCPU is destroyed per call: the walk stops at the first
+ * vCPU that is marked unplug and can no longer run.
+ */
+static void deal_with_unplugged_cpus(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu->unplug && !cpu_can_run(cpu)) {
+            qemu_tcg_destroy_vcpu(cpu);
+            break;
+        }
+    }
+}
+
+/*
+ * In the single-threaded case each vCPU is simulated in turn. If
+ * there is more than a single vCPU we create a simple timer to kick
+ * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
+ * This is done explicitly rather than relying on side-effects
+ * elsewhere.
+ *
+ * arg is the CPUState the thread was created for. It is only used for
+ * the thread set-up below; the execution loop itself starts from
+ * first_cpu and walks the vCPU list with CPU_NEXT().
+ */
+
+void *tcg_rr_cpu_thread_fn(void *arg)
+{
+    CPUState *cpu = arg;
+
+    assert(tcg_enabled());
+    rcu_register_thread();
+    tcg_register_thread();
+
+    qemu_mutex_lock_iothread();
+    qemu_thread_get_self(cpu->thread);
+
+    cpu->thread_id = qemu_get_thread_id();
+    cpu->can_do_io = 1;
+    cpu_thread_signal_created(cpu);
+    qemu_guest_random_seed_thread_part2(cpu->random_seed);
+
+    /* wait for initial kick-off after machine start */
+    while (first_cpu->stopped) {
+        qemu_cond_wait_iothread(first_cpu->halt_cond);
+
+        /* process any pending work */
+        CPU_FOREACH(cpu) {
+            current_cpu = cpu;
+            qemu_wait_io_event_common(cpu);
+        }
+    }
+
+    start_tcg_kick_timer();
+
+    cpu = first_cpu;
+
+    /* process any pending work */
+    cpu->exit_request = 1;
+
+    while (1) {
+        /*
+         * The iothread lock is dropped before taking the replay mutex
+         * and re-taken afterwards.
+         */
+        qemu_mutex_unlock_iothread();
+        replay_mutex_lock();
+        qemu_mutex_lock_iothread();
+
+        if (icount_enabled()) {
+            /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
+            icount_account_warp_timer();
+            /*
+             * Run the timers here.  This is much more efficient than
+             * waking up the I/O thread and waiting for completion.
+             */
+            handle_icount_deadline();
+        }
+
+        replay_mutex_unlock();
+
+        /* wrap around to the head of the list after the last vCPU */
+        if (!cpu) {
+            cpu = first_cpu;
+        }
+
+        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
+
+            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
+            current_cpu = cpu;
+
+            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
+                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
+
+            if (cpu_can_run(cpu)) {
+                int r;
+
+                /* guest code runs without the iothread lock held */
+                qemu_mutex_unlock_iothread();
+                if (icount_enabled()) {
+                    prepare_icount_for_run(cpu);
+                }
+                r = tcg_cpu_exec(cpu);
+                if (icount_enabled()) {
+                    process_icount_data(cpu);
+                }
+                qemu_mutex_lock_iothread();
+
+                if (r == EXCP_DEBUG) {
+                    cpu_handle_guest_debug(cpu);
+                    break;
+                } else if (r == EXCP_ATOMIC) {
+                    qemu_mutex_unlock_iothread();
+                    cpu_exec_step_atomic(cpu);
+                    qemu_mutex_lock_iothread();
+                    break;
+                }
+            } else if (cpu->stop) {
+                /* step past an unplugged vCPU before leaving the loop */
+                if (cpu->unplug) {
+                    cpu = CPU_NEXT(cpu);
+                }
+                break;
+            }
+
+            cpu = CPU_NEXT(cpu);
+        } /* while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) */
+
+        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
+        qatomic_set(&tcg_current_rr_cpu, NULL);
+
+        if (cpu && cpu->exit_request) {
+            qatomic_mb_set(&cpu->exit_request, 0);
+        }
+
+        if (icount_enabled() && all_cpu_threads_idle()) {
+            /*
+             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
+             * in the main_loop, wake it up in order to start the warp timer.
+             */
+            qemu_notify_event();
+        }
+
+        qemu_tcg_rr_wait_io_event();
+        deal_with_unplugged_cpus();
+    }
+
+    /*
+     * NOTE(review): the while (1) above has no break at this level, so
+     * the two statements below look unreachable.
+     */
+    rcu_unregister_thread();
+    return NULL;
+}
+
+/*
+ * CpusAccel callbacks for the round-robin (single-threaded) TCG variant.
+ * The kick callback kicks every vCPU, not just the one passed in.
+ */
+const CpusAccel tcg_cpus_rr = {
+    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+
+    .handle_interrupt = tcg_handle_interrupt,
+};
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
 /*
- * QEMU System Emulator
+ * QEMU TCG vCPU common functionality
+ *
+ * Functionality common to all TCG vCPU variants: mttcg, rr and icount.
  *
  * Copyright (c) 2003-2008 Fabrice Bellard
  * Copyright (c) 2014 Red Hat Inc.
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 
 #include "tcg-cpus.h"
+#include "tcg-cpus-mttcg.h"
+#include "tcg-cpus-rr.h"
 
-/* Kick all RR vCPUs */
-static void qemu_cpu_kick_rr_cpus(void)
-{
-    CPUState *cpu;
+/* common functionality among all TCG variants */
 
-    CPU_FOREACH(cpu) {
-        cpu_exit(cpu);
-    };
-}
-
-static void tcg_kick_vcpu_thread(CPUState *cpu)
-{
-    if (qemu_tcg_mttcg_enabled()) {
-        cpu_exit(cpu);
-    } else {
-        qemu_cpu_kick_rr_cpus();
-    }
-}
-
-/*
- * TCG vCPU kick timer
- *
- * The kick timer is responsible for moving single threaded vCPU
- * emulation on to the next vCPU. If more than one vCPU is running a
- * timer event with force a cpu->exit so the next vCPU can get
- * scheduled.
- *
- * The timer is removed if all vCPUs are idle and restarted again once
- * idleness is complete.
- */
-
-static QEMUTimer *tcg_kick_vcpu_timer;
-static CPUState *tcg_current_rr_cpu;
-
-#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
-
-static inline int64_t qemu_tcg_next_kick(void)
-{
-    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
-}
-
-/* Kick the currently round-robin scheduled vCPU to next */
-static void qemu_cpu_kick_rr_next_cpu(void)
-{
-    CPUState *cpu;
-    do {
-        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
-        if (cpu) {
-            cpu_exit(cpu);
-        }
-    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
-}
-
-static void kick_tcg_thread(void *opaque)
-{
-    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
-    qemu_cpu_kick_rr_next_cpu();
-}
-
-static void start_tcg_kick_timer(void)
-{
-    assert(!mttcg_enabled);
-    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
-        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                                           kick_tcg_thread, NULL);
-    }
-    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
-        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
-    }
-}
-
-static void stop_tcg_kick_timer(void)
-{
-    assert(!mttcg_enabled);
-    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
-        timer_del(tcg_kick_vcpu_timer);
-    }
-}
-
-static void qemu_tcg_destroy_vcpu(CPUState *cpu)
-{
-}
-
-static void qemu_tcg_rr_wait_io_event(void)
-{
-    CPUState *cpu;
-
-    while (all_cpu_threads_idle()) {
-        stop_tcg_kick_timer();
-        qemu_cond_wait_iothread(first_cpu->halt_cond);
-    }
-
-    start_tcg_kick_timer();
-
-    CPU_FOREACH(cpu) {
-        qemu_wait_io_event_common(cpu);
-    }
-}
-
-static int64_t tcg_get_icount_limit(void)
-{
-    int64_t deadline;
-
-    if (replay_mode != REPLAY_MODE_PLAY) {
-        /*
-         * Include all the timers, because they may need an attention.
-         * Too long CPU execution may create unnecessary delay in UI.
-         */
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
-                                              QEMU_TIMER_ATTR_ALL);
-        /* Check realtime timers, because they help with input processing */
-        deadline = qemu_soonest_timeout(deadline,
-                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
-                                           QEMU_TIMER_ATTR_ALL));
-
-        /*
-         * Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        return icount_round(deadline);
-    } else {
-        return replay_get_instructions();
-    }
-}
-
-static void notify_aio_contexts(void)
-{
-    /* Wake up other AioContexts.  */
-    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
-    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
-}
-
-static void handle_icount_deadline(void)
-{
-    assert(qemu_in_vcpu_thread());
-    if (icount_enabled()) {
-        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
-                                                      QEMU_TIMER_ATTR_ALL);
-
-        if (deadline == 0) {
-            notify_aio_contexts();
-        }
-    }
-}
-
-static void prepare_icount_for_run(CPUState *cpu)
-{
-    if (icount_enabled()) {
-        int insns_left;
-
-        /*
-         * These should always be cleared by process_icount_data after
-         * each vCPU execution. However u16.high can be raised
-         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
-         */
-        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
-        g_assert(cpu->icount_extra == 0);
-
-        cpu->icount_budget = tcg_get_icount_limit();
-        insns_left = MIN(0xffff, cpu->icount_budget);
-        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
-        cpu->icount_extra = cpu->icount_budget - insns_left;
-
-        replay_mutex_lock();
-
-        if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
-            notify_aio_contexts();
-        }
-    }
-}
-
-static void process_icount_data(CPUState *cpu)
-{
-    if (icount_enabled()) {
-        /* Account for executed instructions */
-        icount_update(cpu);
-
-        /* Reset the counters */
-        cpu_neg(cpu)->icount_decr.u16.low = 0;
-        cpu->icount_extra = 0;
-        cpu->icount_budget = 0;
-
-        replay_account_executed_instructions();
-
-        replay_mutex_unlock();
-    }
-}
-
-static int tcg_cpu_exec(CPUState *cpu)
-{
-    int ret;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
-
-    assert(tcg_enabled());
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
-    cpu_exec_start(cpu);
-    ret = cpu_exec(cpu);
-    cpu_exec_end(cpu);
-#ifdef CONFIG_PROFILER
-    qatomic_set(&tcg_ctx->prof.cpu_exec_time,
-                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
-#endif
-    return ret;
-}
-
-/*
- * Destroy any remaining vCPUs which have been unplugged and have
- * finished running
- */
-static void deal_with_unplugged_cpus(void)
-{
-    CPUState *cpu;
-
-    CPU_FOREACH(cpu) {
-        if (cpu->unplug && !cpu_can_run(cpu)) {
-            qemu_tcg_destroy_vcpu(cpu);
-            cpu_thread_signal_destroyed(cpu);
-            break;
-        }
-    }
-}
-
-/*
- * Single-threaded TCG
- *
- * In the single-threaded case each vCPU is simulated in turn. If
- * there is more than a single vCPU we create a simple timer to kick
- * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
- * This is done explicitly rather than relying on side-effects
- * elsewhere.
- */
-
-static void *tcg_rr_cpu_thread_fn(void *arg)
-{
-    CPUState *cpu = arg;
-
-    assert(tcg_enabled());
-    rcu_register_thread();
-    tcg_register_thread();
-
-    qemu_mutex_lock_iothread();
-    qemu_thread_get_self(cpu->thread);
-
-    cpu->thread_id = qemu_get_thread_id();
-    cpu->can_do_io = 1;
-    cpu_thread_signal_created(cpu);
-    qemu_guest_random_seed_thread_part2(cpu->random_seed);
-
-    /* wait for initial kick-off after machine start */
-    while (first_cpu->stopped) {
-        qemu_cond_wait_iothread(first_cpu->halt_cond);
-
-        /* process any pending work */
-        CPU_FOREACH(cpu) {
-            current_cpu = cpu;
-            qemu_wait_io_event_common(cpu);
-        }
-    }
-
-    start_tcg_kick_timer();
-
-    cpu = first_cpu;
-
-    /* process any pending work */
-    cpu->exit_request = 1;
-
-    while (1) {
-        qemu_mutex_unlock_iothread();
-        replay_mutex_lock();
-        qemu_mutex_lock_iothread();
-        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
-        icount_account_warp_timer();
-
-        /*
-         * Run the timers here.  This is much more efficient than
-         * waking up the I/O thread and waiting for completion.
-         */
-        handle_icount_deadline();
-
-        replay_mutex_unlock();
-
-        if (!cpu) {
-            cpu = first_cpu;
-        }
-
-        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
-
-            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
-            current_cpu = cpu;
-
-            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
-                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
-
-            if (cpu_can_run(cpu)) {
-                int r;
-
-                qemu_mutex_unlock_iothread();
-                prepare_icount_for_run(cpu);
-
-                r = tcg_cpu_exec(cpu);
-
-                process_icount_data(cpu);
-                qemu_mutex_lock_iothread();
-
-                if (r == EXCP_DEBUG) {
-                    cpu_handle_guest_debug(cpu);
-                    break;
-                } else if (r == EXCP_ATOMIC) {
-                    qemu_mutex_unlock_iothread();
-                    cpu_exec_step_atomic(cpu);
-                    qemu_mutex_lock_iothread();
-                    break;
-                }
-            } else if (cpu->stop) {
-                if (cpu->unplug) {
-                    cpu = CPU_NEXT(cpu);
-                }
-                break;
-            }
-
-            cpu = CPU_NEXT(cpu);
-        } /* while (cpu && !cpu->exit_request).. */
-
-        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
-        qatomic_set(&tcg_current_rr_cpu, NULL);
-
-        if (cpu && cpu->exit_request) {
-            qatomic_mb_set(&cpu->exit_request, 0);
-        }
-
-        if (icount_enabled() && all_cpu_threads_idle()) {
-            /*
-             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
-             * in the main_loop, wake it up in order to start the warp timer.
-             */
-            qemu_notify_event();
-        }
-
-        qemu_tcg_rr_wait_io_event();
-        deal_with_unplugged_cpus();
-    }
-
-    rcu_unregister_thread();
-    return NULL;
-}
-
-/*
- * Multi-threaded TCG
- *
- * In the multi-threaded case each vCPU has its own thread. The TLS
- * variable current_cpu can be used deep in the code to find the
- * current CPUState for a given thread.
- */
-
-static void *tcg_cpu_thread_fn(void *arg)
-{
-    CPUState *cpu = arg;
-
-    assert(tcg_enabled());
-    g_assert(!icount_enabled());
-
-    rcu_register_thread();
-    tcg_register_thread();
-
-    qemu_mutex_lock_iothread();
-    qemu_thread_get_self(cpu->thread);
-
-    cpu->thread_id = qemu_get_thread_id();
-    cpu->can_do_io = 1;
-    current_cpu = cpu;
-    cpu_thread_signal_created(cpu);
-    qemu_guest_random_seed_thread_part2(cpu->random_seed);
-
-    /* process any pending work */
-    cpu->exit_request = 1;
-
-    do {
-        if (cpu_can_run(cpu)) {
-            int r;
-            qemu_mutex_unlock_iothread();
-            r = tcg_cpu_exec(cpu);
-            qemu_mutex_lock_iothread();
-            switch (r) {
-            case EXCP_DEBUG:
-                cpu_handle_guest_debug(cpu);
-                break;
-            case EXCP_HALTED:
-                /*
-                 * during start-up the vCPU is reset and the thread is
-                 * kicked several times. If we don't ensure we go back
-                 * to sleep in the halted state we won't cleanly
-                 * start-up when the vCPU is enabled.
-                 *
-                 * cpu->halted should ensure we sleep in wait_io_event
-                 */
-                g_assert(cpu->halted);
-                break;
-            case EXCP_ATOMIC:
-                qemu_mutex_unlock_iothread();
-                cpu_exec_step_atomic(cpu);
-                qemu_mutex_lock_iothread();
-            default:
-                /* Ignore everything else? */
-                break;
-            }
-        }
-
-        qatomic_mb_set(&cpu->exit_request, 0);
-        qemu_wait_io_event(cpu);
-    } while (!cpu->unplug || cpu_can_run(cpu));
-
-    qemu_tcg_destroy_vcpu(cpu);
-    cpu_thread_signal_destroyed(cpu);
-    qemu_mutex_unlock_iothread();
-    rcu_unregister_thread();
-    return NULL;
-}
-
-static void tcg_start_vcpu_thread(CPUState *cpu)
+void tcg_start_vcpu_thread(CPUState *cpu)
 {
     char thread_name[VCPU_THREAD_NAME_SIZE];
     static QemuCond *single_tcg_halt_cond;
@@ -XXX,XX +XXX,XX @@ static void tcg_start_vcpu_thread(CPUState *cpu)
     }
 }
 
-static int64_t tcg_get_virtual_clock(void)
+void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
-    if (icount_enabled()) {
-        return icount_get();
-    }
-    return cpu_get_clock();
+    cpu_thread_signal_destroyed(cpu);
 }
 
-static int64_t tcg_get_elapsed_ticks(void)
+int tcg_cpu_exec(CPUState *cpu)
 {
-    if (icount_enabled()) {
-        return icount_get();
-    }
-    return cpu_get_ticks();
+    int ret;
+#ifdef CONFIG_PROFILER
+    int64_t ti;
+#endif
+    assert(tcg_enabled());
+#ifdef CONFIG_PROFILER
+    ti = profile_getclock();
+#endif
+    cpu_exec_start(cpu);
+    ret = cpu_exec(cpu);
+    cpu_exec_end(cpu);
+#ifdef CONFIG_PROFILER
+    qatomic_set(&tcg_ctx->prof.cpu_exec_time,
+                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
+#endif
+    return ret;
 }
 
 /* mask must never be zero, except for A20 change call */
-static void tcg_handle_interrupt(CPUState *cpu, int mask)
+void tcg_handle_interrupt(CPUState *cpu, int mask)
 {
-    int old_mask;
     g_assert(qemu_mutex_iothread_locked());
 
-    old_mask = cpu->interrupt_request;
     cpu->interrupt_request |= mask;
 
     /*
@@ -XXX,XX +XXX,XX @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
         qemu_cpu_kick(cpu);
     } else {
         qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
-        if (icount_enabled() &&
-            !cpu->can_do_io
-            && (mask & ~old_mask) != 0) {
-            cpu_abort(cpu, "Raised interrupt while not in I/O function");
-        }
     }
 }
-
-const CpusAccel tcg_cpus = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
-    .kick_vcpu_thread = tcg_kick_vcpu_thread,
-
-    .handle_interrupt = tcg_handle_interrupt,
-
-    .get_virtual_clock = tcg_get_virtual_clock,
-    .get_elapsed_ticks = tcg_get_elapsed_ticks,
-};
diff --git a/softmmu/icount.c b/softmmu/icount.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/icount.c
+++ b/softmmu/icount.c
@@ -XXX,XX +XXX,XX @@ void icount_start_warp_timer(void)
 
 void icount_account_warp_timer(void)
 {
-    if (!icount_enabled() || !icount_sleep) {
+    if (!icount_sleep) {
         return;
     }
 
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
 tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl])
 specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 
-specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files('tcg-all.c', 'cputlb.c', 'tcg-cpus.c'))
+specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
+  'tcg-all.c',
+  'cputlb.c',
+  'tcg-cpus.c',
+  'tcg-cpus-mttcg.c',
+  'tcg-cpus-icount.c',
+  'tcg-cpus-rr.c'
+))
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

After the initial split into three TCG variants, we proceed to also
split tcg_start_vcpu_thread.

This time it is split in two only, since the icount variant
simply uses the round-robin function.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20201015143217.29337-3-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-cpus-mttcg.h  | 21 --------------
 accel/tcg/tcg-cpus-rr.h     |  3 +-
 accel/tcg/tcg-cpus.h        |  1 -
 accel/tcg/tcg-all.c         |  5 ++++
 accel/tcg/tcg-cpus-icount.c |  2 +-
 accel/tcg/tcg-cpus-mttcg.c  | 29 +++++++++++++++++--
 accel/tcg/tcg-cpus-rr.c     | 39 +++++++++++++++++++++++--
 accel/tcg/tcg-cpus.c        | 58 -------------------------------------
 8 files changed, 71 insertions(+), 87 deletions(-)
 delete mode 100644 accel/tcg/tcg-cpus-mttcg.h

diff --git a/accel/tcg/tcg-cpus-mttcg.h b/accel/tcg/tcg-cpus-mttcg.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/accel/tcg/tcg-cpus-mttcg.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/*
- * QEMU TCG Multi Threaded vCPUs implementation
- *
- * Copyright 2020 SUSE LLC
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-
-#ifndef TCG_CPUS_MTTCG_H
-#define TCG_CPUS_MTTCG_H
-
-/*
- * In the multi-threaded case each vCPU has its own thread. The TLS
- * variable current_cpu can be used deep in the code to find the
- * current CPUState for a given thread.
- */
-
-void *tcg_cpu_thread_fn(void *arg);
-
-#endif /* TCG_CPUS_MTTCG_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.h
+++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
 /* Kick all RR vCPUs. */
 void qemu_cpu_kick_rr_cpus(CPUState *unused);
 
-void *tcg_rr_cpu_thread_fn(void *arg);
+/* start the round robin vcpu thread */
+void rr_start_vcpu_thread(CPUState *cpu);
 
 #endif /* TCG_CPUS_RR_H */
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
 extern const CpusAccel tcg_cpus_icount;
 extern const CpusAccel tcg_cpus_rr;
 
-void tcg_start_vcpu_thread(CPUState *cpu);
 void qemu_tcg_destroy_vcpu(CPUState *cpu);
 int tcg_cpu_exec(CPUState *cpu);
 void tcg_handle_interrupt(CPUState *cpu, int mask);
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
     tcg_exec_init(s->tb_size * 1024 * 1024);
     mttcg_enabled = s->mttcg_enabled;
 
+    /*
+     * Initialize TCG regions
+     */
+    tcg_region_init();
+
     if (mttcg_enabled) {
         cpus_register_accel(&tcg_cpus_mttcg);
     } else if (icount_enabled()) {
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.c
+++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 }
 
 const CpusAccel tcg_cpus_icount = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = rr_start_vcpu_thread,
     .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 
     .handle_interrupt = icount_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-mttcg.c
+++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 
 #include "tcg-cpus.h"
-#include "tcg-cpus-mttcg.h"
 
 /*
  * In the multi-threaded case each vCPU has its own thread. The TLS
@@ -XXX,XX +XXX,XX @@
  * current CPUState for a given thread.
  */
 
-void *tcg_cpu_thread_fn(void *arg)
+static void *tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void mttcg_kick_vcpu_thread(CPUState *cpu)
     cpu_exit(cpu);
 }
 
+/*
+ * create_vcpu_thread callback for MTTCG: give the vCPU its own thread.
+ *
+ * Allocates and initialises the vCPU's QemuThread and halt condition,
+ * then creates a joinable thread named "CPU <cpu_index>/TCG" running
+ * tcg_cpu_thread_fn with the vCPU as argument. parallel_cpus is set
+ * when the machine's smp.max_cpus is greater than 1.
+ */
+static void mttcg_start_vcpu_thread(CPUState *cpu)
+{
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+
+    g_assert(tcg_enabled());
+
+    parallel_cpus = (current_machine->smp.max_cpus > 1);
+
+    cpu->thread = g_malloc0(sizeof(QemuThread));
+    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+    qemu_cond_init(cpu->halt_cond);
+
+    /* create a thread per vCPU with TCG (MTTCG) */
+    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
+             cpu->cpu_index);
+
+    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
+                       cpu, QEMU_THREAD_JOINABLE);
+
+#ifdef _WIN32
+    cpu->hThread = qemu_thread_get_handle(cpu->thread);
+#endif
+}
+
 const CpusAccel tcg_cpus_mttcg = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = mttcg_start_vcpu_thread,
     .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 
     .handle_interrupt = tcg_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.c
+++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
  * elsewhere.
  */
 
-void *tcg_rr_cpu_thread_fn(void *arg)
+static void *tcg_rr_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ void *tcg_rr_cpu_thread_fn(void *arg)
     return NULL;
 }
 
+/*
+ * create_vcpu_thread callback for the round-robin variants.
+ *
+ * The first call allocates the thread and halt condition, creates the
+ * single joinable "ALL CPUs/TCG" thread running tcg_rr_cpu_thread_fn and
+ * remembers both in function-local statics. Every later call starts no
+ * thread: the vCPU is pointed at the shared thread and halt condition,
+ * takes first_cpu's thread_id and is marked as created right away.
+ * parallel_cpus is always set to false here.
+ */
+void rr_start_vcpu_thread(CPUState *cpu)
+{
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+    static QemuCond *single_tcg_halt_cond;
+    static QemuThread *single_tcg_cpu_thread;
+
+    g_assert(tcg_enabled());
+    parallel_cpus = false;
+
+    if (!single_tcg_cpu_thread) {
+        cpu->thread = g_malloc0(sizeof(QemuThread));
+        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+        qemu_cond_init(cpu->halt_cond);
+
+        /* share a single thread for all cpus with TCG */
+        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
+        qemu_thread_create(cpu->thread, thread_name,
+                           tcg_rr_cpu_thread_fn,
+                           cpu, QEMU_THREAD_JOINABLE);
+
+        single_tcg_halt_cond = cpu->halt_cond;
+        single_tcg_cpu_thread = cpu->thread;
+#ifdef _WIN32
+        cpu->hThread = qemu_thread_get_handle(cpu->thread);
+#endif
+    } else {
+        /* we share the thread */
+        cpu->thread = single_tcg_cpu_thread;
+        cpu->halt_cond = single_tcg_halt_cond;
+        cpu->thread_id = first_cpu->thread_id;
+        cpu->can_do_io = 1;
+        cpu->created = true;
+    }
+}
+
 const CpusAccel tcg_cpus_rr = {
-    .create_vcpu_thread = tcg_start_vcpu_thread,
+    .create_vcpu_thread = rr_start_vcpu_thread,
     .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
 
     .handle_interrupt = tcg_handle_interrupt,
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 
 #include "tcg-cpus.h"
-#include "tcg-cpus-mttcg.h"
-#include "tcg-cpus-rr.h"
 
 /* common functionality among all TCG variants */
 
-void tcg_start_vcpu_thread(CPUState *cpu)
-{
-    char thread_name[VCPU_THREAD_NAME_SIZE];
-    static QemuCond *single_tcg_halt_cond;
-    static QemuThread *single_tcg_cpu_thread;
-    static int tcg_region_inited;
-
-    assert(tcg_enabled());
-    /*
-     * Initialize TCG regions--once. Now is a good time, because:
-     * (1) TCG's init context, prologue and target globals have been set up.
-     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
-     *     -accel flag is processed, so the check doesn't work then).
-     */
-    if (!tcg_region_inited) {
-        tcg_region_inited = 1;
-        tcg_region_init();
-        parallel_cpus = qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1;
-    }
-
-    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
-        cpu->thread = g_malloc0(sizeof(QemuThread));
-        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
-        qemu_cond_init(cpu->halt_cond);
-
-        if (qemu_tcg_mttcg_enabled()) {
-            /* create a thread per vCPU with TCG (MTTCG) */
-            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
-                 cpu->cpu_index);
-
-            qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
-                               cpu, QEMU_THREAD_JOINABLE);
-
-        } else {
-            /* share a single thread for all cpus with TCG */
-            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
-            qemu_thread_create(cpu->thread, thread_name,
-                               tcg_rr_cpu_thread_fn,
-                               cpu, QEMU_THREAD_JOINABLE);
-
-            single_tcg_halt_cond = cpu->halt_cond;
-            single_tcg_cpu_thread = cpu->thread;
-        }
-#ifdef _WIN32
-        cpu->hThread = qemu_thread_get_handle(cpu->thread);
-#endif
-    } else {
-        /* For non-MTTCG cases we share the thread */
-        cpu->thread = single_tcg_cpu_thread;
-        cpu->halt_cond = single_tcg_halt_cond;
-        cpu->thread_id = first_cpu->thread_id;
-        cpu->can_do_io = 1;
-        cpu->created = true;
-    }
-}
-
 void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
     cpu_thread_signal_destroyed(cpu);
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20201015143217.29337-4-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-cpus-icount.h |  6 +--
 accel/tcg/tcg-cpus-rr.h     |  2 +-
 accel/tcg/tcg-cpus.h        |  6 +--
 accel/tcg/tcg-cpus-icount.c | 24 ++++++------
 accel/tcg/tcg-cpus-mttcg.c  | 10 ++---
 accel/tcg/tcg-cpus-rr.c     | 74 ++++++++++++++++++-------------------
 accel/tcg/tcg-cpus.c        |  6 +--
 7 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-cpus-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.h
+++ b/accel/tcg/tcg-cpus-icount.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_CPUS_ICOUNT_H
 #define TCG_CPUS_ICOUNT_H
 
-void handle_icount_deadline(void);
-void prepare_icount_for_run(CPUState *cpu);
-void process_icount_data(CPUState *cpu);
+void icount_handle_deadline(void);
+void icount_prepare_for_run(CPUState *cpu);
+void icount_process_data(CPUState *cpu);
 
 #endif /* TCG_CPUS_ICOUNT_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-cpus-rr.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.h
+++ b/accel/tcg/tcg-cpus-rr.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 
 /* Kick all RR vCPUs. */
-void qemu_cpu_kick_rr_cpus(CPUState *unused);
+void rr_kick_vcpu_thread(CPUState *unused);
 
 /* start the round robin vcpu thread */
 void rr_start_vcpu_thread(CPUState *cpu);
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-cpus.h
@@ -XXX,XX +XXX,XX @@ extern const CpusAccel tcg_cpus_mttcg;
 extern const CpusAccel tcg_cpus_icount;
 extern const CpusAccel tcg_cpus_rr;
 
-void qemu_tcg_destroy_vcpu(CPUState *cpu);
-int tcg_cpu_exec(CPUState *cpu);
-void tcg_handle_interrupt(CPUState *cpu, int mask);
+void tcg_cpus_destroy(CPUState *cpu);
+int tcg_cpus_exec(CPUState *cpu);
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
 
 #endif /* TCG_CPUS_H */
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-cpus-icount.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.c
+++ b/accel/tcg/tcg-cpus-icount.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg-cpus-icount.h"
 #include "tcg-cpus-rr.h"
 
-static int64_t tcg_get_icount_limit(void)
+static int64_t icount_get_limit(void)
 {
     int64_t deadline;
 
@@ -XXX,XX +XXX,XX @@ static int64_t tcg_get_icount_limit(void)
     }
 }
 
-static void notify_aio_contexts(void)
+static void icount_notify_aio_contexts(void)
 {
     /* Wake up other AioContexts.  */
     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
     qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 }
 
-void handle_icount_deadline(void)
+void icount_handle_deadline(void)
 {
     assert(qemu_in_vcpu_thread());
     int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                                   QEMU_TIMER_ATTR_ALL);
 
     if (deadline == 0) {
-        notify_aio_contexts();
+        icount_notify_aio_contexts();
     }
 }
 
-void prepare_icount_for_run(CPUState *cpu)
+void icount_prepare_for_run(CPUState *cpu)
 {
     int insns_left;
 
     /*
-     * These should always be cleared by process_icount_data after
+     * These should always be cleared by icount_process_data after
      * each vCPU execution. However u16.high can be raised
-     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
+     * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
      */
     g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
     g_assert(cpu->icount_extra == 0);
 
-    cpu->icount_budget = tcg_get_icount_limit();
+    cpu->icount_budget = icount_get_limit();
     insns_left = MIN(0xffff, cpu->icount_budget);
     cpu_neg(cpu)->icount_decr.u16.low = insns_left;
     cpu->icount_extra = cpu->icount_budget - insns_left;
@@ -XXX,XX +XXX,XX @@ void prepare_icount_for_run(CPUState *cpu)
     replay_mutex_lock();
 
     if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
-        notify_aio_contexts();
+        icount_notify_aio_contexts();
     }
 }
 
-void process_icount_data(CPUState *cpu)
+void icount_process_data(CPUState *cpu)
 {
     /* Account for executed instructions */
     icount_update(cpu);
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 {
     int old_mask = cpu->interrupt_request;
 
-    tcg_handle_interrupt(cpu, mask);
+    tcg_cpus_handle_interrupt(cpu, mask);
     if (qemu_cpu_is_self(cpu) &&
         !cpu->can_do_io
         && (mask & ~old_mask) != 0) {
@@ -XXX,XX +XXX,XX @@ static void icount_handle_interrupt(CPUState *cpu, int mask)
 
 const CpusAccel tcg_cpus_icount = {
     .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+    .kick_vcpu_thread = rr_kick_vcpu_thread,
 
     .handle_interrupt = icount_handle_interrupt,
     .get_virtual_clock = icount_get,
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-cpus-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-mttcg.c
+++ b/accel/tcg/tcg-cpus-mttcg.c
@@ -XXX,XX +XXX,XX @@
  * current CPUState for a given thread.
  */
 
-static void *tcg_cpu_thread_fn(void *arg)
+static void *mttcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
         if (cpu_can_run(cpu)) {
             int r;
             qemu_mutex_unlock_iothread();
-            r = tcg_cpu_exec(cpu);
+            r = tcg_cpus_exec(cpu);
             qemu_mutex_lock_iothread();
             switch (r) {
             case EXCP_DEBUG:
@@ -XXX,XX +XXX,XX @@ static void *tcg_cpu_thread_fn(void *arg)
         qemu_wait_io_event(cpu);
     } while (!cpu->unplug || cpu_can_run(cpu));
 
-    qemu_tcg_destroy_vcpu(cpu);
+    tcg_cpus_destroy(cpu);
     qemu_mutex_unlock_iothread();
     rcu_unregister_thread();
     return NULL;
@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
              cpu->cpu_index);
 
-    qemu_thread_create(cpu->thread, thread_name, tcg_cpu_thread_fn,
+    qemu_thread_create(cpu->thread, thread_name, mttcg_cpu_thread_fn,
                        cpu, QEMU_THREAD_JOINABLE);
 
 #ifdef _WIN32
@@ -XXX,XX +XXX,XX @@ const CpusAccel tcg_cpus_mttcg = {
     .create_vcpu_thread = mttcg_start_vcpu_thread,
     .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 
-    .handle_interrupt = tcg_handle_interrupt,
+    .handle_interrupt = tcg_cpus_handle_interrupt,
 };
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-cpus-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.c
+++ b/accel/tcg/tcg-cpus-rr.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg-cpus-icount.h"
 
 /* Kick all RR vCPUs */
-void qemu_cpu_kick_rr_cpus(CPUState *unused)
+void rr_kick_vcpu_thread(CPUState *unused)
 {
     CPUState *cpu;
 
@@ -XXX,XX +XXX,XX @@ void qemu_cpu_kick_rr_cpus(CPUState *unused)
  * idleness is complete.
  */
 
-static QEMUTimer *tcg_kick_vcpu_timer;
-static CPUState *tcg_current_rr_cpu;
+static QEMUTimer *rr_kick_vcpu_timer;
+static CPUState *rr_current_cpu;
 
 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 
-static inline int64_t qemu_tcg_next_kick(void)
+static inline int64_t rr_next_kick_time(void)
 {
     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 }
 
 /* Kick the currently round-robin scheduled vCPU to next */
-static void qemu_cpu_kick_rr_next_cpu(void)
+static void rr_kick_next_cpu(void)
 {
     CPUState *cpu;
     do {
-        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
+        cpu = qatomic_mb_read(&rr_current_cpu);
         if (cpu) {
             cpu_exit(cpu);
         }
-    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
+    } while (cpu != qatomic_mb_read(&rr_current_cpu));
 }
 
-static void kick_tcg_thread(void *opaque)
+static void rr_kick_thread(void *opaque)
 {
-    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
-    qemu_cpu_kick_rr_next_cpu();
+    timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
+    rr_kick_next_cpu();
 }
 
-static void start_tcg_kick_timer(void)
+static void rr_start_kick_timer(void)
 {
-    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
-        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                                           kick_tcg_thread, NULL);
+    if (!rr_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+        rr_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+                                           rr_kick_thread, NULL);
     }
-    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
-        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+    if (rr_kick_vcpu_timer && !timer_pending(rr_kick_vcpu_timer)) {
+        timer_mod(rr_kick_vcpu_timer, rr_next_kick_time());
     }
 }
 
-static void stop_tcg_kick_timer(void)
+static void rr_stop_kick_timer(void)
 {
-    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
-        timer_del(tcg_kick_vcpu_timer);
+    if (rr_kick_vcpu_timer && timer_pending(rr_kick_vcpu_timer)) {
+        timer_del(rr_kick_vcpu_timer);
     }
 }
 
-static void qemu_tcg_rr_wait_io_event(void)
+static void rr_wait_io_event(void)
 {
     CPUState *cpu;
 
     while (all_cpu_threads_idle()) {
-        stop_tcg_kick_timer();
+        rr_stop_kick_timer();
         qemu_cond_wait_iothread(first_cpu->halt_cond);
     }
 
-    start_tcg_kick_timer();
+    rr_start_kick_timer();
 
     CPU_FOREACH(cpu) {
         qemu_wait_io_event_common(cpu);
@@ -XXX,XX +XXX,XX @@ static void qemu_tcg_rr_wait_io_event(void)
  * Destroy any remaining vCPUs which have been unplugged and have
  * finished running
  */
-static void deal_with_unplugged_cpus(void)
+static void rr_deal_with_unplugged_cpus(void)
 {
     CPUState *cpu;
 
     CPU_FOREACH(cpu) {
         if (cpu->unplug && !cpu_can_run(cpu)) {
-            qemu_tcg_destroy_vcpu(cpu);
+            tcg_cpus_destroy(cpu);
             break;
         }
     }
@@ -XXX,XX +XXX,XX @@ static void deal_with_unplugged_cpus(void)
  * elsewhere.
  */
 
-static void *tcg_rr_cpu_thread_fn(void *arg)
+static void *rr_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
         }
     }
 
-    start_tcg_kick_timer();
+    rr_start_kick_timer();
 
     cpu = first_cpu;
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
              * Run the timers here.  This is much more efficient than
              * waking up the I/O thread and waiting for completion.
              */
-            handle_icount_deadline();
+            icount_handle_deadline();
         }
 
         replay_mutex_unlock();
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
 
         while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
 
-            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
+            qatomic_mb_set(&rr_current_cpu, cpu);
             current_cpu = cpu;
 
             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
 
                 qemu_mutex_unlock_iothread();
                 if (icount_enabled()) {
-                    prepare_icount_for_run(cpu);
+                    icount_prepare_for_run(cpu);
                 }
-                r = tcg_cpu_exec(cpu);
+                r = tcg_cpus_exec(cpu);
                 if (icount_enabled()) {
-                    process_icount_data(cpu);
+                    icount_process_data(cpu);
                 }
                 qemu_mutex_lock_iothread();
 
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
         } /* while (cpu && !cpu->exit_request).. */
 
         /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
-        qatomic_set(&tcg_current_rr_cpu, NULL);
+        qatomic_set(&rr_current_cpu, NULL);
 
         if (cpu && cpu->exit_request) {
             qatomic_mb_set(&cpu->exit_request, 0);
@@ -XXX,XX +XXX,XX @@ static void *tcg_rr_cpu_thread_fn(void *arg)
             qemu_notify_event();
         }
 
-        qemu_tcg_rr_wait_io_event();
-        deal_with_unplugged_cpus();
+        rr_wait_io_event();
+        rr_deal_with_unplugged_cpus();
     }
 
     rcu_unregister_thread();
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
         /* share a single thread for all cpus with TCG */
         snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
         qemu_thread_create(cpu->thread, thread_name,
-                           tcg_rr_cpu_thread_fn,
+                           rr_cpu_thread_fn,
                            cpu, QEMU_THREAD_JOINABLE);
 
         single_tcg_halt_cond = cpu->halt_cond;
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
 
 const CpusAccel tcg_cpus_rr = {
     .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = qemu_cpu_kick_rr_cpus,
+    .kick_vcpu_thread = rr_kick_vcpu_thread,
 
-    .handle_interrupt = tcg_handle_interrupt,
+    .handle_interrupt = tcg_cpus_handle_interrupt,
 };
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-cpus.c
@@ -XXX,XX +XXX,XX @@
 
 /* common functionality among all TCG variants */
 
-void qemu_tcg_destroy_vcpu(CPUState *cpu)
+void tcg_cpus_destroy(CPUState *cpu)
 {
     cpu_thread_signal_destroyed(cpu);
 }
 
-int tcg_cpu_exec(CPUState *cpu)
+int tcg_cpus_exec(CPUState *cpu)
 {
     int ret;
 #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ int tcg_cpu_exec(CPUState *cpu)
 }
 
 /* mask must never be zero, except for A20 change call */
-void tcg_handle_interrupt(CPUState *cpu, int mask)
+void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
 {
     g_assert(qemu_mutex_iothread_locked());
 
-- 
2.25.1

The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20220208' into staging (2022-02-08 11:40:08 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20220211

for you to fetch changes up to 5c1a101ef6b85537a4ade93c39ea81cadd5c246e:

  tests/tcg/multiarch: Add sigbus.c (2022-02-09 09:00:01 +1100)

----------------------------------------------------------------
Fix safe_syscall_base for sparc64.
Fix host signal handling for sparc64-linux.
Speedups for jump cache and work list probing.
Fix for exception replays.
Raise guest SIGBUS for user-only misaligned accesses.

----------------------------------------------------------------
Idan Horowitz (2):
      accel/tcg: Optimize jump cache flush during tlb range flush
      softmmu/cpus: Check if the cpu work list is empty atomically

Pavel Dovgalyuk (1):
      replay: use CF_NOIRQ for special exception-replaying TB

Richard Henderson (29):
      common-user/host/sparc64: Fix safe_syscall_base
      linux-user: Introduce host_signal_mask
      linux-user: Introduce host_sigcontext
      linux-user: Move sparc/host-signal.h to sparc64/host-signal.h
      linux-user/include/host/sparc64: Fix host_sigcontext
      tcg/i386: Support raising sigbus for user-only
      tcg/aarch64: Support raising sigbus for user-only
      tcg/ppc: Support raising sigbus for user-only
      tcg/riscv: Support raising sigbus for user-only
      tcg/s390x: Support raising sigbus for user-only
      tcg/tci: Support raising sigbus for user-only
      tcg/arm: Drop support for armv4 and armv5 hosts
      tcg/arm: Remove use_armv5t_instructions
      tcg/arm: Remove use_armv6_instructions
      tcg/arm: Check alignment for ldrd and strd
      tcg/arm: Support unaligned access for softmmu
      tcg/arm: Reserve a register for guest_base
      tcg/arm: Support raising sigbus for user-only
      tcg/mips: Support unaligned access for user-only
      tcg/mips: Support unaligned access for softmmu
      tcg/sparc: Use tcg_out_movi_imm13 in tcg_out_addsub2_i64
      tcg/sparc: Split out tcg_out_movi_imm32
      tcg/sparc: Add scratch argument to tcg_out_movi_int
      tcg/sparc: Improve code gen for shifted 32-bit constants
      tcg/sparc: Convert patch_reloc to return bool
      tcg/sparc: Use the constant pool for 64-bit constants
      tcg/sparc: Add tcg_out_jmpl_const for better tail calls
      tcg/sparc: Support unaligned access for user-only
      tests/tcg/multiarch: Add sigbus.c

WANG Xuerui (2):
      tcg/loongarch64: Fix fallout from recent MO_Q renaming
      tcg/loongarch64: Support raising sigbus for user-only

 linux-user/include/host/aarch64/host-signal.h     |  16 +-
 linux-user/include/host/alpha/host-signal.h       |  14 +-
 linux-user/include/host/arm/host-signal.h         |  14 +-
 linux-user/include/host/i386/host-signal.h        |  14 +-
 linux-user/include/host/loongarch64/host-signal.h |  14 +-
 linux-user/include/host/mips/host-signal.h        |  14 +-
 linux-user/include/host/ppc/host-signal.h         |  14 +-
 linux-user/include/host/riscv/host-signal.h       |  14 +-
 linux-user/include/host/s390/host-signal.h        |  14 +-
 linux-user/include/host/sparc/host-signal.h       |  63 ----
 linux-user/include/host/sparc64/host-signal.h     |  65 +++-
 linux-user/include/host/x86_64/host-signal.h      |  14 +-
 tcg/aarch64/tcg-target.h                          |   2 -
 tcg/arm/tcg-target.h                              |   6 +-
 tcg/i386/tcg-target.h                             |   2 -
 tcg/loongarch64/tcg-target.h                      |   2 -
 tcg/mips/tcg-target.h                             |   2 -
 tcg/ppc/tcg-target.h                              |   2 -
 tcg/riscv/tcg-target.h                            |   2 -
 tcg/s390x/tcg-target.h                            |   2 -
 accel/tcg/cpu-exec.c                              |   3 +-
 accel/tcg/cputlb.c                                |   9 +
 linux-user/signal.c                               |  22 +-
 softmmu/cpus.c                                    |   7 +-
 tcg/tci.c                                         |  20 +-
 tests/tcg/multiarch/sigbus.c                      |  68 ++++
 tcg/aarch64/tcg-target.c.inc                      |  91 ++++-
 tcg/arm/tcg-target.c.inc                          | 410 +++++++++-------------
 tcg/i386/tcg-target.c.inc                         | 103 +++++-
 tcg/loongarch64/tcg-target.c.inc                  |  73 +++-
 tcg/mips/tcg-target.c.inc                         | 387 ++++++++++++++++++--
 tcg/ppc/tcg-target.c.inc                          |  98 +++++-
 tcg/riscv/tcg-target.c.inc                        |  63 +++-
 tcg/s390x/tcg-target.c.inc                        |  59 +++-
 tcg/sparc/tcg-target.c.inc                        | 348 +++++++++++++++---
 common-user/host/sparc64/safe-syscall.inc.S       |   5 +-
 36 files changed, 1561 insertions(+), 495 deletions(-)
 delete mode 100644 linux-user/include/host/sparc/host-signal.h
 create mode 100644 tests/tcg/multiarch/sigbus.c

Use the "retl" instead of "ret" instruction alias, since we
do not allocate a register window in this function.

Fix the offset to the first stacked parameter, which lies
beyond the register window save area.

Fixes: 95c021dac835 ("linux-user/host/sparc64: Add safe-syscall.inc.S")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 common-user/host/sparc64/safe-syscall.inc.S | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/common-user/host/sparc64/safe-syscall.inc.S b/common-user/host/sparc64/safe-syscall.inc.S
index XXXXXXX..XXXXXXX 100644
--- a/common-user/host/sparc64/safe-syscall.inc.S
+++ b/common-user/host/sparc64/safe-syscall.inc.S
@@ -XXX,XX +XXX,XX @@
         .type   safe_syscall_end, @function
 
 #define STACK_BIAS  2047
-#define PARAM(N)    STACK_BIAS + N*8
+#define WINDOW_SIZE 16 * 8
+#define PARAM(N)    STACK_BIAS + WINDOW_SIZE + N * 8
 
         /*
          * This is the entry point for making a system call. The calling
@@ -XXX,XX +XXX,XX @@ safe_syscall_end:
         /* code path for having successfully executed the syscall */
         bcs,pn  %xcc, 1f
          nop
-        ret
+        retl
          nop
 
         /* code path when we didn't execute the syscall */
-- 
2.25.1

Do not directly access the uc_sigmask member.
This is preparation for a sparc64 fix.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 linux-user/include/host/aarch64/host-signal.h  |  5 +++++
 linux-user/include/host/alpha/host-signal.h    |  5 +++++
 linux-user/include/host/arm/host-signal.h      |  5 +++++
 linux-user/include/host/i386/host-signal.h     |  5 +++++
 .../include/host/loongarch64/host-signal.h     |  5 +++++
 linux-user/include/host/mips/host-signal.h     |  5 +++++
 linux-user/include/host/ppc/host-signal.h      |  5 +++++
 linux-user/include/host/riscv/host-signal.h    |  5 +++++
 linux-user/include/host/s390/host-signal.h     |  5 +++++
 linux-user/include/host/sparc/host-signal.h    |  5 +++++
 linux-user/include/host/x86_64/host-signal.h   |  5 +++++
 linux-user/signal.c                            | 18 ++++++++----------
 12 files changed, 63 insertions(+), 10 deletions(-)

Do not directly access ucontext_t as the third signal parameter.
This is preparation for a sparc64 fix.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 linux-user/include/host/aarch64/host-signal.h     | 13 ++++++++-----
 linux-user/include/host/alpha/host-signal.h       | 11 +++++++----
 linux-user/include/host/arm/host-signal.h         | 11 +++++++----
 linux-user/include/host/i386/host-signal.h        | 11 +++++++----
 linux-user/include/host/loongarch64/host-signal.h | 11 +++++++----
 linux-user/include/host/mips/host-signal.h        | 11 +++++++----
 linux-user/include/host/ppc/host-signal.h         | 11 +++++++----
 linux-user/include/host/riscv/host-signal.h       | 11 +++++++----
 linux-user/include/host/s390/host-signal.h        | 11 +++++++----
 linux-user/include/host/sparc/host-signal.h       | 11 +++++++----
 linux-user/include/host/x86_64/host-signal.h      | 11 +++++++----
 linux-user/signal.c                               |  4 ++--
 12 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/linux-user/include/host/aarch64/host-signal.h b/linux-user/include/host/aarch64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/aarch64/host-signal.h
+++ b/linux-user/include/host/aarch64/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef AARCH64_HOST_SIGNAL_H
 #define AARCH64_HOST_SIGNAL_H
 
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
 /* Pre-3.16 kernel headers don't have these, so provide fallback definitions */
 #ifndef ESR_MAGIC
 #define ESR_MAGIC 0x45535201
@@ -XXX,XX +XXX,XX @@ struct esr_context {
 };
 #endif
 
-static inline struct _aarch64_ctx *first_ctx(ucontext_t *uc)
+static inline struct _aarch64_ctx *first_ctx(host_sigcontext *uc)
 {
     return (struct _aarch64_ctx *)&uc->uc_mcontext.__reserved;
 }
@@ -XXX,XX +XXX,XX @@ static inline struct _aarch64_ctx *next_ctx(struct _aarch64_ctx *hdr)
     return (struct _aarch64_ctx *)((char *)hdr + hdr->size);
 }
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.pc;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.pc = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     struct _aarch64_ctx *hdr;
     uint32_t insn;
diff --git a/linux-user/include/host/alpha/host-signal.h b/linux-user/include/host/alpha/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/alpha/host-signal.h
+++ b/linux-user/include/host/alpha/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef ALPHA_HOST_SIGNAL_H
 #define ALPHA_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.sc_pc;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.sc_pc = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     uint32_t *pc = (uint32_t *)host_signal_pc(uc);
     uint32_t insn = *pc;
diff --git a/linux-user/include/host/arm/host-signal.h b/linux-user/include/host/arm/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/arm/host-signal.h
+++ b/linux-user/include/host/arm/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef ARM_HOST_SIGNAL_H
 #define ARM_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.arm_pc;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.arm_pc = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     /*
      * In the FSR, bit 11 is WnR, assuming a v6 or
diff --git a/linux-user/include/host/i386/host-signal.h b/linux-user/include/host/i386/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/i386/host-signal.h
+++ b/linux-user/include/host/i386/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef I386_HOST_SIGNAL_H
 #define I386_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.gregs[REG_EIP];
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.gregs[REG_EIP] = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
         && (uc->uc_mcontext.gregs[REG_ERR] & 0x2);
diff --git a/linux-user/include/host/loongarch64/host-signal.h b/linux-user/include/host/loongarch64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/loongarch64/host-signal.h
+++ b/linux-user/include/host/loongarch64/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef LOONGARCH64_HOST_SIGNAL_H
 #define LOONGARCH64_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.__pc;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.__pc = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     const uint32_t *pinsn = (const uint32_t *)host_signal_pc(uc);
     uint32_t insn = pinsn[0];
diff --git a/linux-user/include/host/mips/host-signal.h b/linux-user/include/host/mips/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/mips/host-signal.h
+++ b/linux-user/include/host/mips/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef MIPS_HOST_SIGNAL_H
 #define MIPS_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.pc;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.pc = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
@@ -XXX,XX +XXX,XX @@ static inline void *host_signal_mask(ucontext_t *uc)
 #error "Unsupported encoding"
 #endif
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     uint32_t insn = *(uint32_t *)host_signal_pc(uc);
 
diff --git a/linux-user/include/host/ppc/host-signal.h b/linux-user/include/host/ppc/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/ppc/host-signal.h
+++ b/linux-user/include/host/ppc/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef PPC_HOST_SIGNAL_H
 #define PPC_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.regs->nip;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.regs->nip = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     return uc->uc_mcontext.regs->trap != 0x400
         && (uc->uc_mcontext.regs->dsisr & 0x02000000);
diff --git a/linux-user/include/host/riscv/host-signal.h b/linux-user/include/host/riscv/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/riscv/host-signal.h
+++ b/linux-user/include/host/riscv/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef RISCV_HOST_SIGNAL_H
 #define RISCV_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.__gregs[REG_PC];
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.__gregs[REG_PC] = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     /*
      * Detect store by reading the instruction at the program counter.
diff --git a/linux-user/include/host/s390/host-signal.h b/linux-user/include/host/s390/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/s390/host-signal.h
+++ b/linux-user/include/host/s390/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef S390_HOST_SIGNAL_H
 #define S390_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.psw.addr;
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.psw.addr = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     uint16_t *pinsn = (uint16_t *)host_signal_pc(uc);
 
diff --git a/linux-user/include/host/sparc/host-signal.h b/linux-user/include/host/sparc/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/sparc/host-signal.h
+++ b/linux-user/include/host/sparc/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef SPARC_HOST_SIGNAL_H
 #define SPARC_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
 #ifdef __arch64__
     return uc->uc_mcontext.mc_gregs[MC_PC];
@@ -XXX,XX +XXX,XX @@ static inline uintptr_t host_signal_pc(ucontext_t *uc)
 #endif
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
 #ifdef __arch64__
     uc->uc_mcontext.mc_gregs[MC_PC] = pc;
@@ -XXX,XX +XXX,XX @@ static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
 #endif
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     uint32_t insn = *(uint32_t *)host_signal_pc(uc);
 
diff --git a/linux-user/include/host/x86_64/host-signal.h b/linux-user/include/host/x86_64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/x86_64/host-signal.h
+++ b/linux-user/include/host/x86_64/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef X86_64_HOST_SIGNAL_H
 #define X86_64_HOST_SIGNAL_H
 
-static inline uintptr_t host_signal_pc(ucontext_t *uc)
+/* The third argument to a SA_SIGINFO handler is ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
 {
     return uc->uc_mcontext.gregs[REG_RIP];
 }
 
-static inline void host_signal_set_pc(ucontext_t *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
 {
     uc->uc_mcontext.gregs[REG_RIP] = pc;
 }
 
-static inline void *host_signal_mask(ucontext_t *uc)
+static inline void *host_signal_mask(host_sigcontext *uc)
 {
     return &uc->uc_sigmask;
 }
 
-static inline bool host_signal_write(siginfo_t *info, ucontext_t *uc)
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
 {
     return uc->uc_mcontext.gregs[REG_TRAPNO] == 0xe
         && (uc->uc_mcontext.gregs[REG_ERR] & 0x2);
diff --git a/linux-user/signal.c b/linux-user/signal.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -XXX,XX +XXX,XX @@ void queue_signal(CPUArchState *env, int sig, int si_type,
 /* Adjust the signal context to rewind out of safe-syscall if we're in it */
 static inline void rewind_if_in_safe_syscall(void *puc)
 {
-    ucontext_t *uc = (ucontext_t *)puc;
+    host_sigcontext *uc = (host_sigcontext *)puc;
     uintptr_t pcreg = host_signal_pc(uc);
 
     if (pcreg > (uintptr_t)safe_syscall_start
@@ -XXX,XX +XXX,XX @@ static void host_signal_handler(int host_sig, siginfo_t *info, void *puc)
     CPUState *cpu = env_cpu(env);
     TaskState *ts = cpu->opaque;
     target_siginfo_t tinfo;
-    ucontext_t *uc = puc;
+    host_sigcontext *uc = puc;
     struct emulated_sigtable *k;
     int guest_sig;
     uintptr_t pc = 0;
-- 
2.25.1

We do not support sparc32 as a host, so there's no point in
sparc64 redirecting to sparc.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 linux-user/include/host/sparc/host-signal.h   | 71 -------------------
 linux-user/include/host/sparc64/host-signal.h | 64 ++++++++++++++++-
 2 files changed, 63 insertions(+), 72 deletions(-)
 delete mode 100644 linux-user/include/host/sparc/host-signal.h

diff --git a/linux-user/include/host/sparc/host-signal.h b/linux-user/include/host/sparc/host-signal.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/linux-user/include/host/sparc/host-signal.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/*
- * host-signal.h: signal info dependent on the host architecture
- *
- * Copyright (c) 2003-2005 Fabrice Bellard
- * Copyright (c) 2021 Linaro Limited
- *
- * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
- * See the COPYING file in the top-level directory.
- */
-
-#ifndef SPARC_HOST_SIGNAL_H
-#define SPARC_HOST_SIGNAL_H
-
-/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
-typedef ucontext_t host_sigcontext;
-
-static inline uintptr_t host_signal_pc(host_sigcontext *uc)
-{
-#ifdef __arch64__
-    return uc->uc_mcontext.mc_gregs[MC_PC];
-#else
-    return uc->uc_mcontext.gregs[REG_PC];
-#endif
-}
-
-static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
-{
-#ifdef __arch64__
-    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
-#else
-    uc->uc_mcontext.gregs[REG_PC] = pc;
-#endif
-}
-
-static inline void *host_signal_mask(host_sigcontext *uc)
-{
-    return &uc->uc_sigmask;
-}
-
-static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
-{
-    uint32_t insn = *(uint32_t *)host_signal_pc(uc);
-
-    if ((insn >> 30) == 3) {
-        switch ((insn >> 19) & 0x3f) {
-        case 0x05: /* stb */
-        case 0x15: /* stba */
-        case 0x06: /* sth */
-        case 0x16: /* stha */
-        case 0x04: /* st */
-        case 0x14: /* sta */
-        case 0x07: /* std */
-        case 0x17: /* stda */
-        case 0x0e: /* stx */
-        case 0x1e: /* stxa */
-        case 0x24: /* stf */
-        case 0x34: /* stfa */
-        case 0x27: /* stdf */
-        case 0x37: /* stdfa */
-        case 0x26: /* stqf */
-        case 0x36: /* stqfa */
-        case 0x25: /* stfsr */
-        case 0x3c: /* casa */
-        case 0x3e: /* casxa */
-            return true;
-        }
-    }
-    return false;
-}
-
-#endif
diff --git a/linux-user/include/host/sparc64/host-signal.h b/linux-user/include/host/sparc64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/sparc64/host-signal.h
+++ b/linux-user/include/host/sparc64/host-signal.h
@@ -1 +1,63 @@
-#include "../sparc/host-signal.h"
+/*
+ * host-signal.h: signal info dependent on the host architecture
+ *
+ * Copyright (c) 2003-2005 Fabrice Bellard
+ * Copyright (c) 2021 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef SPARC64_HOST_SIGNAL_H
+#define SPARC64_HOST_SIGNAL_H
+
+/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
+typedef ucontext_t host_sigcontext;
+
+static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+{
+    return uc->uc_mcontext.mc_gregs[MC_PC];
+}
+
+static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+{
+    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
+}
+
+static inline void *host_signal_mask(host_sigcontext *uc)
+{
+    return &uc->uc_sigmask;
+}
+
+static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
+{
+    uint32_t insn = *(uint32_t *)host_signal_pc(uc);
+
+    if ((insn >> 30) == 3) {
+        switch ((insn >> 19) & 0x3f) {
+        case 0x05: /* stb */
+        case 0x15: /* stba */
+        case 0x06: /* sth */
+        case 0x16: /* stha */
+        case 0x04: /* st */
+        case 0x14: /* sta */
+        case 0x07: /* std */
+        case 0x17: /* stda */
+        case 0x0e: /* stx */
+        case 0x1e: /* stxa */
+        case 0x24: /* stf */
+        case 0x34: /* stfa */
+        case 0x27: /* stdf */
+        case 0x37: /* stdfa */
+        case 0x26: /* stqf */
+        case 0x36: /* stqfa */
+        case 0x25: /* stfsr */
+        case 0x3c: /* casa */
+        case 0x3e: /* casxa */
+            return true;
+        }
+    }
+    return false;
+}
+
+#endif
-- 
2.25.1

Sparc64 is unique on Linux in *not* passing ucontext_t as
the third argument to a SA_SIGINFO handler.  It passes the
old struct sigcontext instead.

Set both pc and npc in host_signal_set_pc.

Fixes: 8b5bd461935b ("linux-user/host/sparc: Populate host_signal.h")
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 linux-user/include/host/sparc64/host-signal.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/linux-user/include/host/sparc64/host-signal.h b/linux-user/include/host/sparc64/host-signal.h
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/include/host/sparc64/host-signal.h
+++ b/linux-user/include/host/sparc64/host-signal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef SPARC64_HOST_SIGNAL_H
 #define SPARC64_HOST_SIGNAL_H
 
-/* FIXME: the third argument to a SA_SIGINFO handler is *not* ucontext_t. */
-typedef ucontext_t host_sigcontext;
+/* The third argument to a SA_SIGINFO handler is struct sigcontext.  */
+typedef struct sigcontext host_sigcontext;
 
-static inline uintptr_t host_signal_pc(host_sigcontext *uc)
+static inline uintptr_t host_signal_pc(host_sigcontext *sc)
 {
-    return uc->uc_mcontext.mc_gregs[MC_PC];
+    return sc->sigc_regs.tpc;
 }
 
-static inline void host_signal_set_pc(host_sigcontext *uc, uintptr_t pc)
+static inline void host_signal_set_pc(host_sigcontext *sc, uintptr_t pc)
 {
-    uc->uc_mcontext.mc_gregs[MC_PC] = pc;
+    sc->sigc_regs.tpc = pc;
+    sc->sigc_regs.tnpc = pc + 4;
 }
 
-static inline void *host_signal_mask(host_sigcontext *uc)
+static inline void *host_signal_mask(host_sigcontext *sc)
 {
-    return &uc->uc_sigmask;
+    return &sc->sigc_mask;
 }
 
 static inline bool host_signal_write(siginfo_t *info, host_sigcontext *uc)
-- 
2.25.1

From: Idan Horowitz <idan.horowitz@gmail.com>

When the length of the range is large enough, clearing the whole jump cache
is faster than iterating over the (possibly extremely large) set of pages
contained in the range.

This mirrors the existing optimization applied to the flush of the TLB
itself.

Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
Message-Id: <20220110164754.1066025-1-idan.horowitz@gmail.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
     }
     qemu_spin_unlock(&env_tlb(env)->c.lock);
 
+    /*
+     * If the length is larger than the jump cache size, then it will take
+     * longer to clear each entry individually than it will to clear it all.
+     */
+    if (d.len >= (TARGET_PAGE_SIZE * TB_JMP_CACHE_SIZE)) {
+        cpu_tb_jmp_cache_clear(cpu);
+        return;
+    }
+
     for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) {
         tb_flush_jmp_cache(cpu, d.addr + i);
     }
-- 
2.25.1

From: Idan Horowitz <idan.horowitz@gmail.com>

Instead of taking the lock of the cpu work list in order to check if it's
empty, we can just read the head pointer atomically. This decreases
cpu_work_list_empty's share from 5% to 1.3% in a profile of icount-enabled
aarch64-softmmu.

Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
Message-Id: <20220114004358.299534-1-idan.horowitz@gmail.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/cpus.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -XXX,XX +XXX,XX @@ bool cpu_is_stopped(CPUState *cpu)
 
 bool cpu_work_list_empty(CPUState *cpu)
 {
-    bool ret;
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    ret = QSIMPLEQ_EMPTY(&cpu->work_list);
-    qemu_mutex_unlock(&cpu->work_mutex);
-    return ret;
+    return QSIMPLEQ_EMPTY_ATOMIC(&cpu->work_list);
 }
 
 bool cpu_thread_is_idle(CPUState *cpu)
-- 
2.25.1

From: Pavel Dovgalyuk <pavel.dovgalyuk@ispras.ru>

Commit aff0e204cb1f1c036a496c94c15f5dfafcd9b4b4 introduced CF_NOIRQ usage,
but one case was forgotten. Record/replay uses one special TB which is not
really executed, but is used to raise the correct exception in replay mode.
This patch adds the CF_NOIRQ flag for that block.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgalyuk@ispras.ru>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <164362834054.1754532.7678416881159817273.stgit@pasha-ThinkPad-X280>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cpu-exec.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
         if (replay_has_exception()
             && cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra == 0) {
             /* Execute just one insn to trigger exception pending in the log */
-            cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT) | 1;
+            cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT)
+                | CF_NOIRQ | 1;
         }
 #endif
         return false;
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |   2 -
 tcg/i386/tcg-target.c.inc | 103 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
 
 #define TCG_TARGET_HAS_MEMORY_BSWAP  have_movbe
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 
 #ifdef CONFIG_DEBUG_TCG
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 #define OPC_XCHG_ax_r32	(0x90)
 
-#define OPC_GRP3_Ev	(0xf7)
-#define OPC_GRP5	(0xff)
+#define OPC_GRP3_Eb     (0xf6)
+#define OPC_GRP3_Ev     (0xf7)
+#define OPC_GRP5        (0xff)
 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 
 /* Group 1 opcode extensions for 0x80-0x83.
@@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define SHIFT_SAR 7
 
 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
+#define EXT3_TESTi 0
 #define EXT3_NOT   2
 #define EXT3_NEG   3
 #define EXT3_MUL   4
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nopn(TCGContext *s, int n)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     return true;
 }
-#elif TCG_TARGET_REG_BITS == 32
+#else
+
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
+                                   TCGReg addrhi, unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *label;
+
+    /*
+     * We are expecting a_bits to max out at 7, so we can usually use testb.
+     * For i686, we have to use testl for %esi/%edi.
+     */
+    if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
+        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
+        tcg_out8(s, a_mask);
+    } else {
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
+        tcg_out32(s, a_mask);
+    }
+
+    /* jne slow_path */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+
+    label = new_ldst_label(s);
+    label->is_ld = is_ld;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
+    label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
+    label->label_ptr[0] = s->code_ptr;
+
+    s->code_ptr += 4;
+}
+
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    /* resolve label address */
+    tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        int ofs = 0;
+
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
+        ofs += 4;
+
+        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
+        ofs += 4;
+        if (TARGET_LONG_BITS == 64) {
+            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
+            ofs += 4;
+        }
+
+        tcg_out_pushi(s, (uintptr_t)l->raddr);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
+                    l->addrlo_reg);
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
+        tcg_out_push(s, TCG_REG_RAX);
+    }
+
+    /* "Tail call" to the helper, with the return address back inline. */
+    tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
+                                  : helper_unaligned_st));
+    return true;
+}
+
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+#if TCG_TARGET_REG_BITS == 32
 # define x86_guest_base_seg     0
 # define x86_guest_base_index   -1
 # define x86_guest_base_offset  guest_base
@@ -XXX,XX +XXX,XX @@ static inline int setup_guest_base_seg(void)
     return 0;
 }
 # endif
+#endif
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
     tcg_insn_unit *label_ptr[2];
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
+    }
+
     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
                            x86_guest_base_offset, x86_guest_base_seg,
                            is64, opc);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
     tcg_insn_unit *label_ptr[2];
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
+    }
+
     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
                            x86_guest_base_offset, x86_guest_base_seg, opc);
 #endif
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h     |  2 -
 tcg/aarch64/tcg-target.c.inc | 91 +++++++++++++++++++++++++++++-------
 2 files changed, 74 insertions(+), 19 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
 
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif /* AARCH64_TCG_TARGET_H */
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * See the COPYING file in the top-level directory for details.
  */
 
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 #include "qemu/bitops.h"
 
@@ -XXX,XX +XXX,XX @@ typedef enum {
     I3404_ANDI      = 0x12000000,
     I3404_ORRI      = 0x32000000,
     I3404_EORI      = 0x52000000,
+    I3404_ANDSI     = 0x72000000,
 
     /* Move wide immediate instructions.  */
     I3405_MOVN      = 0x12800000,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
     if (offset == sextract64(offset, 0, 26)) {
         tcg_out_insn(s, 3206, B, offset);
     } else {
-        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
-        tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
+        /* Choose X9 as a call-clobbered non-LR temporary. */
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X9, (intptr_t)target);
+        tcg_out_insn(s, 3207, BR, TCG_REG_X9);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
     }
 }
 
-#ifdef CONFIG_SOFTMMU
-#include "../tcg-ldst.c.inc"
+static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
+{
+    ptrdiff_t offset = tcg_pcrel_diff(s, target);
+    tcg_debug_assert(offset == sextract64(offset, 0, 21));
+    tcg_out_insn(s, 3406, ADR, rd, offset);
+}
 
+#ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     MemOpIdx oi, uintptr_t ra)
  */
@@ -XXX,XX +XXX,XX @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
 #endif
 };
 
-static inline void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-{
-    ptrdiff_t offset = tcg_pcrel_diff(s, target);
-    tcg_debug_assert(offset == sextract64(offset, 0, 21));
-    tcg_out_insn(s, 3406, ADR, rd, offset);
-}
-
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
     MemOpIdx oi = lb->oi;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, MemOp opc,
     tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
 }
 
+#else
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
+                                   unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->addrlo_reg = addr_reg;
+
+    /* tst addr, #mask */
+    tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);
+
+    label->label_ptr[0] = s->code_ptr;
+
+    /* b.ne slow_path */
+    tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
+
+    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_X1, l->addrlo_reg);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
+
+    /* "Tail call" to the helper, with the return address back inline. */
+    tcg_out_adr(s, TCG_REG_LR, l->raddr);
+    tcg_out_goto_long(s, (const void *)(l->is_ld ? helper_unaligned_ld
+                                        : helper_unaligned_st));
+    return true;
+}
+
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
                                    TCGReg data_r, TCGReg addr_r,
                                    TCGType otype, TCGReg off_r)
 {
-    /* Byte swapping is left to middle-end expansion. */
-    tcg_debug_assert((memop & MO_BSWAP) == 0);
-
     switch (memop & MO_SSIZE) {
     case MO_UB:
         tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
                                    TCGReg data_r, TCGReg addr_r,
                                    TCGType otype, TCGReg off_r)
 {
-    /* Byte swapping is left to middle-end expansion. */
-    tcg_debug_assert((memop & MO_BSWAP) == 0);
-
     switch (memop & MO_SIZE) {
     case MO_8:
         tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
 {
     MemOp memop = get_memop(oi);
     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+
+    /* Byte swapping is left to middle-end expansion. */
+    tcg_debug_assert((memop & MO_BSWAP) == 0);
+
 #ifdef CONFIG_SOFTMMU
     unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
     add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
+    unsigned a_bits = get_alignment_bits(memop);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addr_reg, a_bits);
+    }
     if (USE_GUEST_BASE) {
         tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
                                TCG_REG_GUEST_BASE, otype, addr_reg);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
 {
     MemOp memop = get_memop(oi);
     const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+
+    /* Byte swapping is left to middle-end expansion. */
+    tcg_debug_assert((memop & MO_BSWAP) == 0);
+
 #ifdef CONFIG_SOFTMMU
     unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
     add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
                         data_reg, addr_reg, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
+    unsigned a_bits = get_alignment_bits(memop);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addr_reg, a_bits);
+    }
     if (USE_GUEST_BASE) {
         tcg_out_qemu_st_direct(s, memop, data_reg,
                                TCG_REG_GUEST_BASE, otype, addr_reg);
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |  2 -
 tcg/ppc/tcg-target.c.inc | 98 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 #define TCG_TARGET_DEFAULT_MO (0)
 #define TCG_TARGET_HAS_MEMORY_BSWAP     1
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
 
 #include "elf.h"
 #include "../tcg-pool.c.inc"
+#include "../tcg-ldst.c.inc"
 
 /*
  * Standardize on the _CALL_FOO symbols used by GCC:
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
     }
 }
 
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
+static void tcg_out_call_int(TCGContext *s, int lk,
+                             const tcg_insn_unit *target)
 {
 #ifdef _CALL_AIX
     /* Look through the descriptor.  If the branch is in range, and we
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
 
     if (in_range_b(diff) && toc == (uint32_t)toc) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, toc);
-        tcg_out_b(s, LK, tgt);
+        tcg_out_b(s, lk, tgt);
     } else {
         /* Fold the low bits of the constant into the addresses below.  */
         intptr_t arg = (intptr_t)target;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_TMP1, ofs);
         tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_REG_TMP1, ofs + SZP);
-        tcg_out32(s, BCCTR | BO_ALWAYS | LK);
+        tcg_out32(s, BCCTR | BO_ALWAYS | lk);
     }
 #elif defined(_CALL_ELF) && _CALL_ELF == 2
     intptr_t diff;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
 
     diff = tcg_pcrel_diff(s, target);
     if (in_range_b(diff)) {
-        tcg_out_b(s, LK, target);
+        tcg_out_b(s, lk, target);
     } else {
         tcg_out32(s, MTSPR | RS(TCG_REG_R12) | CTR);
-        tcg_out32(s, BCCTR | BO_ALWAYS | LK);
+        tcg_out32(s, BCCTR | BO_ALWAYS | lk);
     }
 #else
-    tcg_out_b(s, LK, target);
+    tcg_out_b(s, lk, target);
 #endif
 }
 
+/*
+ * Emit a call to @target.  This is tcg_out_call_int() with the LK flag
+ * passed as its @lk argument.
+ */
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
+{
+    tcg_out_call_int(s, LK, target);
+}
+
 static const uint32_t qemu_ldx_opc[(MO_SSIZE + MO_BSWAP) + 1] = {
     [MO_UB] = LBZX,
     [MO_UW] = LHZX,
@@ -XXX,XX +XXX,XX @@ static const uint32_t qemu_exts_opc[4] = {
 };
 
 #if defined (CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  *                                 int mmu_idx, uintptr_t ra)
  */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_b(s, 0, lb->raddr);
     return true;
 }
+#else
+
+/*
+ * !CONFIG_SOFTMMU: emit an inline alignment test for a guest memory access.
+ *
+ * The low @a_bits bits of @addrlo are masked with ANDI and a conditional
+ * branch on CR0[EQ] follows.  A TCGLabelQemuLdst is allocated to describe
+ * the out-of-line failure path: it records @is_ld and both address
+ * registers, label_ptr[0] remembers the branch so its target can be filled
+ * in later, and raddr is the code address immediately after the branch.
+ * @addrhi is not tested here; it is only stored in the label.
+ */
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
+                                   TCGReg addrhi, unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
+
+    /* We are expecting a_bits to max out at 7, much lower than ANDI. */
+    tcg_debug_assert(a_bits < 16);
+    /* R0 is the other register operand (presumably a scratch destination). */
+    tcg_out32(s, ANDI | SAI(addrlo, TCG_REG_R0, a_mask));
+
+    /*
+     * Branch when CR0[EQ] is false, i.e. presumably when a masked bit was
+     * set.  No displacement is encoded yet.  LK is set on this branch.
+     */
+    label->label_ptr[0] = s->code_ptr;
+    tcg_out32(s, BC | BI(0, CR_EQ) | BO_COND_FALSE | LK);
+
+    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+/*
+ * Emit the out-of-line code for a failed alignment test described by @l.
+ *
+ * The branch saved in l->label_ptr[0] is relocated to the current output
+ * position, the guest address and TCG_AREG0 are moved into argument
+ * registers, and control is transferred to helper_unaligned_ld or
+ * helper_unaligned_st depending on l->is_ld.
+ *
+ * Returns false if the branch could not be relocated, true otherwise.
+ */
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    if (!reloc_pc14(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+        /*
+         * The guest address occupies two host registers: the high part goes
+         * in @arg and the low part in @arg + 1.  With
+         * TCG_TARGET_CALL_ALIGN_ARGS the pair starts at R5 instead of R4.
+         */
+        TCGReg arg = TCG_REG_R4;
+#ifdef TCG_TARGET_CALL_ALIGN_ARGS
+        arg |= 1;
+#endif
+        /*
+         * Order the moves so that neither source is overwritten before it
+         * has been read; if the two sources are exactly swapped relative to
+         * the destinations, exchange them through R0.
+         */
+        if (l->addrlo_reg != arg) {
+            tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
+            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
+        } else if (l->addrhi_reg != arg + 1) {
+            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
+            tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, arg);
+            tcg_out_mov(s, TCG_TYPE_I32, arg, arg + 1);
+            tcg_out_mov(s, TCG_TYPE_I32, arg + 1, TCG_REG_R0);
+        }
+    } else {
+        tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R4, l->addrlo_reg);
+    }
+    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, TCG_AREG0);
+
+    /* "Tail call" to the helper, with the return address back inline. */
+    /* lk is passed as 0 here, so this transfer itself does not set LK. */
+    tcg_out_call_int(s, 0, (const void *)(l->is_ld ? helper_unaligned_ld
+                                          : helper_unaligned_st));
+    return true;
+}
+
+/*
+ * !CONFIG_SOFTMMU load slow path: the only out-of-line work is the
+ * alignment failure, so defer to tcg_out_fail_alignment() and return its
+ * result.
+ */
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+/*
+ * !CONFIG_SOFTMMU store slow path: identical to the load case; the helper
+ * is chosen inside tcg_out_fail_alignment() from l->is_ld.
+ */
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 #ifdef CONFIG_SOFTMMU
     int mem_index;
     tcg_insn_unit *label_ptr;
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 
     rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
+    }
     rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 #ifdef CONFIG_SOFTMMU
     int mem_index;
     tcg_insn_unit *label_ptr;
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 
     rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
+    }
     rbase = guest_base ? TCG_GUEST_BASE_REG : 0;
     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
         tcg_out_ext32u(s, TCG_REG_TMP1, addrlo);
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.h     |  2 --
 tcg/riscv/tcg-target.c.inc | 63 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
 #define TCG_TARGET_DEFAULT_MO (0)
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #define TCG_TARGET_HAS_MEMORY_BSWAP 0
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 
 #ifdef CONFIG_DEBUG_TCG
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
  */
 
 #if defined(CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     MemOpIdx oi, uintptr_t ra)
  */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_goto(s, l->raddr);
     return true;
 }
+#else
+
+/*
+ * !CONFIG_SOFTMMU: emit an inline alignment test for a guest memory access.
+ *
+ * TCG_REG_TMP1 receives @addr_reg masked with the low @a_bits bits, and a
+ * BNE against TCG_REG_ZERO follows.  A TCGLabelQemuLdst records @is_ld and
+ * the address register; label_ptr[0] remembers the branch for later
+ * relocation and raddr is the code address immediately after it.
+ */
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
+                                   unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *l = new_ldst_label(s);
+
+    l->is_ld = is_ld;
+    l->addrlo_reg = addr_reg;
+
+    /* We are expecting a_bits to max out at 7, so we can always use andi. */
+    tcg_debug_assert(a_bits < 12);
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask);
+
+    /* The branch is emitted with offset 0; the real target is set later. */
+    l->label_ptr[0] = s->code_ptr;
+    tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP1, TCG_REG_ZERO, 0);
+
+    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+/*
+ * Emit the out-of-line code for a failed alignment test described by @l.
+ *
+ * Relocates the branch saved in l->label_ptr[0] to the current output
+ * position, loads the helper arguments (TCG_AREG0 in A0, the guest address
+ * in A1), sets RA to l->raddr and transfers to helper_unaligned_ld or
+ * helper_unaligned_st according to l->is_ld.
+ *
+ * Returns false if the branch could not be relocated, true otherwise.
+ */
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    /* resolve label address */
+    if (!reloc_sbimm12(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
+
+    /* tail call, with the return address back inline. */
+    /* NOTE(review): the final 'true' presumably selects the tail-call form. */
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr);
+    tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld
+                                       : helper_unaligned_st), true);
+    return true;
+}
+
+/*
+ * !CONFIG_SOFTMMU load slow path: the only out-of-line work is the
+ * alignment failure, so defer to tcg_out_fail_alignment() and return its
+ * result.
+ */
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+/*
+ * !CONFIG_SOFTMMU store slow path: identical to the load case; the helper
+ * is chosen inside tcg_out_fail_alignment() from l->is_ld.
+ */
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[1];
+#else
+    unsigned a_bits;
 #endif
     TCGReg base = TCG_REG_TMP0;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
         tcg_out_ext32u(s, base, addr_regl);
         addr_regl = base;
     }
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addr_regl, a_bits);
+    }
     if (guest_base != 0) {
         tcg_out_opc_reg(s, OPC_ADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[1];
+#else
+    unsigned a_bits;
 #endif
     TCGReg base = TCG_REG_TMP0;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
         tcg_out_ext32u(s, base, addr_regl);
         addr_regl = base;
     }
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addr_regl, a_bits);
+    }
     if (guest_base != 0) {
         tcg_out_opc_reg(s, OPC_ADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.h     |  2 --
 tcg/s390x/tcg-target.c.inc | 59 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -XXX,XX +XXX,XX @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
     /* no need to flush icache explicitly */
 }
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
 #error "unsupported code generation mode"
 #endif
 
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 #include "elf.h"
 
@@ -XXX,XX +XXX,XX @@ typedef enum S390Opcode {
     RI_OIHL     = 0xa509,
     RI_OILH     = 0xa50a,
     RI_OILL     = 0xa50b,
+    RI_TMLL     = 0xa701,
 
     RIE_CGIJ    = 0xec7c,
     RIE_CGRJ    = 0xec64,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data,
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 /* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19));
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     return true;
 }
 #else
+/*
+ * !CONFIG_SOFTMMU: emit an inline alignment test for a guest memory access.
+ *
+ * The low @a_bits bits of @addrlo are tested with TMLL and a BRC follows.
+ * A TCGLabelQemuLdst records @is_ld and the address register; label_ptr[0]
+ * points at the slot left for the branch displacement and raddr is the code
+ * address immediately after the branch.
+ */
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld,
+                                   TCGReg addrlo, unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *l = new_ldst_label(s);
+
+    l->is_ld = is_ld;
+    l->addrlo_reg = addrlo;
+
+    /* We are expecting a_bits to max out at 7, much lower than TMLL. */
+    tcg_debug_assert(a_bits < 16);
+    tcg_out_insn(s, RI, TMLL, addrlo, a_mask);
+
+    tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */
+    /*
+     * Only the first half of the BRC has been emitted.  Remember where the
+     * displacement belongs and step over it without writing anything; it is
+     * filled in when the failure path is generated.
+     */
+    l->label_ptr[0] = s->code_ptr;
+    s->code_ptr += 1;
+
+    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+/*
+ * Emit the out-of-line code for a failed alignment test described by @l.
+ *
+ * Patches the displacement slot saved in l->label_ptr[0] so the branch
+ * lands at the current output position, loads the helper arguments
+ * (TCG_AREG0 in R2, the guest address in R3), sets R14 to l->raddr and
+ * jumps unconditionally to helper_unaligned_ld or helper_unaligned_st
+ * according to l->is_ld.
+ *
+ * Returns false if the relocation could not be applied, true otherwise.
+ */
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    /*
+     * NOTE(review): the addend of 2 presumably compensates for label_ptr[0]
+     * pointing at the displacement halfword rather than at the start of the
+     * BRC instruction -- confirm against patch_reloc().
+     */
+    if (!patch_reloc(l->label_ptr[0], R_390_PC16DBL,
+                     (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 2)) {
+        return false;
+    }
+
+    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, l->addrlo_reg);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_AREG0);
+
+    /* "Tail call" to the helper, with the return address back inline. */
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R14, (uintptr_t)l->raddr);
+    tgen_gotoi(s, S390_CC_ALWAYS, (const void *)(l->is_ld ? helper_unaligned_ld
+                                                 : helper_unaligned_st));
+    return true;
+}
+
+/*
+ * !CONFIG_SOFTMMU load slow path: the only out-of-line work is the
+ * alignment failure, so defer to tcg_out_fail_alignment() and return its
+ * result.
+ */
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+/*
+ * !CONFIG_SOFTMMU store slow path: identical to the load case; the helper
+ * is chosen inside tcg_out_fail_alignment() from l->is_ld.
+ */
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
 static void tcg_prepare_user_ldst(TCGContext *s, TCGReg *addr_reg,
                                   TCGReg *index_reg, tcg_target_long *disp)
 {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
 #else
     TCGReg index_reg;
     tcg_target_long disp;
+    unsigned a_bits = get_alignment_bits(opc);
 
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addr_reg, a_bits);
+    }
     tcg_prepare_user_ldst(s, &addr_reg, &index_reg, &disp);
     tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, index_reg, disp);
 #endif
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
 #else
     TCGReg index_reg;
     tcg_target_long disp;
+    unsigned a_bits = get_alignment_bits(opc);
 
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addr_reg, a_bits);
+    }
     tcg_prepare_user_ldst(s, &addr_reg, &index_reg, &disp);
     tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, index_reg, disp);
 #endif
-- 
2.25.1

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
 static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
                             MemOpIdx oi, const void *tb_ptr)
 {
-    MemOp mop = get_memop(oi) & (MO_BSWAP | MO_SSIZE);
+    MemOp mop = get_memop(oi);
     uintptr_t ra = (uintptr_t)tb_ptr;
 
 #ifdef CONFIG_SOFTMMU
-    switch (mop) {
+    switch (mop & (MO_BSWAP | MO_SSIZE)) {
     case MO_UB:
         return helper_ret_ldub_mmu(env, taddr, oi, ra);
     case MO_SB:
@@ -XXX,XX +XXX,XX @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
     }
 #else
     void *haddr = g2h(env_cpu(env), taddr);
+    unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
     uint64_t ret;
 
     set_helper_retaddr(ra);
-    switch (mop) {
+    if (taddr & a_mask) {
+        helper_unaligned_ld(env, taddr);
+    }
+    switch (mop & (MO_BSWAP | MO_SSIZE)) {
     case MO_UB:
         ret = ldub_p(haddr);
         break;
@@ -XXX,XX +XXX,XX @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
 static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
                         MemOpIdx oi, const void *tb_ptr)
 {
-    MemOp mop = get_memop(oi) & (MO_BSWAP | MO_SSIZE);
+    MemOp mop = get_memop(oi);
     uintptr_t ra = (uintptr_t)tb_ptr;
 
 #ifdef CONFIG_SOFTMMU
-    switch (mop) {
+    switch (mop & (MO_BSWAP | MO_SIZE)) {
     case MO_UB:
         helper_ret_stb_mmu(env, taddr, val, oi, ra);
         break;
@@ -XXX,XX +XXX,XX @@ static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
     }
 #else
     void *haddr = g2h(env_cpu(env), taddr);
+    unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
 
     set_helper_retaddr(ra);
-    switch (mop) {
+    if (taddr & a_mask) {
+        helper_unaligned_st(env, taddr);
+    }
+    switch (mop & (MO_BSWAP | MO_SIZE)) {
     case MO_UB:
         stb_p(haddr, val);
         break;
-- 
2.25.1

From: WANG Xuerui <git@xen0n.name>

Signed-off-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20220106134238.3936163-1-git@xen0n.name>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/loongarch64/tcg-target.h     |  2 -
 tcg/loongarch64/tcg-target.c.inc | 71 +++++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
 #define TCG_TARGET_DEFAULT_MO (0)
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 
 #define TCG_TARGET_HAS_MEMORY_BSWAP 0
 
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+#include "../tcg-ldst.c.inc"
+
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "zero",
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
  */
 
 #if defined(CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 /*
  * helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     MemOpIdx oi, uintptr_t ra)
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 
     return tcg_out_goto(s, l->raddr);
 }
+#else
+
+/*
+ * Alignment helpers for user-mode emulation
+ */
+
+/*
+ * Emit an inline alignment test for a guest memory access.
+ *
+ * A bit-field of @addr_reg selected by (0, @a_bits - 1) is extracted into
+ * TCG_REG_TMP1 and a BNE against TCG_REG_ZERO follows.  A TCGLabelQemuLdst
+ * records @is_ld and the address register; label_ptr[0] remembers the
+ * branch for later relocation and raddr is the code address immediately
+ * after it.
+ */
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
+                                   unsigned a_bits)
+{
+    TCGLabelQemuLdst *l = new_ldst_label(s);
+
+    l->is_ld = is_ld;
+    l->addrlo_reg = addr_reg;
+
+    /*
+     * Without micro-architecture details, we don't know which of bstrpick or
+     * andi is faster, so use bstrpick as it's not constrained by imm field
+     * width. (Not to say alignments >= 2^12 are going to happen any time
+     * soon, though)
+     */
+    tcg_out_opc_bstrpick_d(s, TCG_REG_TMP1, addr_reg, 0, a_bits - 1);
+
+    /* The branch is emitted with offset 0; the real target is set later. */
+    l->label_ptr[0] = s->code_ptr;
+    tcg_out_opc_bne(s, TCG_REG_TMP1, TCG_REG_ZERO, 0);
+
+    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+/*
+ * Emit the out-of-line code for a failed alignment test described by @l.
+ *
+ * Relocates the branch saved in l->label_ptr[0] to the current output
+ * position, loads the helper arguments (TCG_AREG0 in A0, the guest address
+ * in A1), sets RA to l->raddr and transfers to helper_unaligned_ld or
+ * helper_unaligned_st according to l->is_ld.
+ *
+ * Returns false if the branch could not be relocated, true otherwise.
+ */
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    /* resolve label address */
+    if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
+
+    /* tail call, with the return address back inline. */
+    /* NOTE(review): the final 'true' presumably selects the tail-call form. */
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr);
+    tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld
+                                       : helper_unaligned_st), true);
+    return true;
+}
+
+/*
+ * !CONFIG_SOFTMMU load slow path: the only out-of-line work is the
+ * alignment failure, so defer to tcg_out_fail_alignment() and return its
+ * result.
+ */
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+/*
+ * !CONFIG_SOFTMMU store slow path: identical to the load case; the helper
+ * is chosen inside tcg_out_fail_alignment() from l->is_ld.
+ */
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
 #endif /* CONFIG_SOFTMMU */
 
 /*
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[1];
+#else
+    unsigned a_bits;
 #endif
     TCGReg base;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, TCGType type)
                         data_regl, addr_regl,
                         s->code_ptr, label_ptr);
 #else
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addr_regl, a_bits);
+    }
     base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
     TCGReg guest_base_reg = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
     tcg_out_qemu_ld_indexed(s, data_regl, base, guest_base_reg, opc, type);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[1];
+#else
+    unsigned a_bits;
 #endif
     TCGReg base;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
                         data_regl, addr_regl,
                         s->code_ptr, label_ptr);
 #else
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addr_regl, a_bits);
+    }
     base = tcg_out_zext_addr_if_32_bit(s, addr_regl, TCG_REG_TMP0);
     TCGReg guest_base_reg = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
     tcg_out_qemu_st_indexed(s, data_regl, base, guest_base_reg, opc);
-- 
2.25.1

This is now always true, since we require armv6.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |  3 +--
 tcg/arm/tcg-target.c.inc | 35 ++++++-----------------------------
 2 files changed, 7 insertions(+), 31 deletions(-)

This is now always true, since we require armv6.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |   1 -
 tcg/arm/tcg-target.c.inc | 192 ++++++---------------------------------
 2 files changed, 27 insertions(+), 166 deletions(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 
 extern int arm_arch;
 
-#define use_armv6_instructions  (__ARM_ARCH >= 6 || arm_arch >= 6)
 #define use_armv7_instructions  (__ARM_ARCH >= 7 || arm_arch >= 7)
 
 #undef TCG_TARGET_STACK_GROWSUP
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dat_rIN(TCGContext *s, ARMCond cond, ARMInsn opc,
 static void tcg_out_mul32(TCGContext *s, ARMCond cond, TCGReg rd,
                           TCGReg rn, TCGReg rm)
 {
-    /* if ArchVersion() < 6 && d == n then UNPREDICTABLE;  */
-    if (!use_armv6_instructions && rd == rn) {
-        if (rd == rm) {
-            /* rd == rn == rm; copy an input to tmp first.  */
-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
-            rm = rn = TCG_REG_TMP;
-        } else {
-            rn = rm;
-            rm = rd;
-        }
-    }
     /* mul */
     tcg_out32(s, (cond << 28) | 0x90 | (rd << 16) | (rm << 8) | rn);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mul32(TCGContext *s, ARMCond cond, TCGReg rd,
 static void tcg_out_umull32(TCGContext *s, ARMCond cond, TCGReg rd0,
                             TCGReg rd1, TCGReg rn, TCGReg rm)
 {
-    /* if ArchVersion() < 6 && (dHi == n || dLo == n) then UNPREDICTABLE;  */
-    if (!use_armv6_instructions && (rd0 == rn || rd1 == rn)) {
-        if (rd0 == rm || rd1 == rm) {
-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
-            rn = TCG_REG_TMP;
-        } else {
-            TCGReg t = rn;
-            rn = rm;
-            rm = t;
-        }
-    }
     /* umull */
     tcg_out32(s, (cond << 28) | 0x00800090 |
               (rd1 << 16) | (rd0 << 12) | (rm << 8) | rn);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_umull32(TCGContext *s, ARMCond cond, TCGReg rd0,
 static void tcg_out_smull32(TCGContext *s, ARMCond cond, TCGReg rd0,
                             TCGReg rd1, TCGReg rn, TCGReg rm)
 {
-    /* if ArchVersion() < 6 && (dHi == n || dLo == n) then UNPREDICTABLE;  */
-    if (!use_armv6_instructions && (rd0 == rn || rd1 == rn)) {
-        if (rd0 == rm || rd1 == rm) {
-            tcg_out_mov_reg(s, cond, TCG_REG_TMP, rn);
-            rn = TCG_REG_TMP;
-        } else {
-            TCGReg t = rn;
-            rn = rm;
-            rm = t;
-        }
-    }
     /* smull */
     tcg_out32(s, (cond << 28) | 0x00c00090 |
               (rd1 << 16) | (rd0 << 12) | (rm << 8) | rn);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_udiv(TCGContext *s, ARMCond cond,
 
 static void tcg_out_ext8s(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
 {
-    if (use_armv6_instructions) {
-        /* sxtb */
-        tcg_out32(s, 0x06af0070 | (cond << 28) | (rd << 12) | rn);
-    } else {
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rn, SHIFT_IMM_LSL(24));
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rd, SHIFT_IMM_ASR(24));
-    }
+    /* sxtb */
+    tcg_out32(s, 0x06af0070 | (cond << 28) | (rd << 12) | rn);
 }
 
 static void __attribute__((unused))
@@ -XXX,XX +XXX,XX @@ tcg_out_ext8u(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
 
 static void tcg_out_ext16s(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
 {
-    if (use_armv6_instructions) {
-        /* sxth */
-        tcg_out32(s, 0x06bf0070 | (cond << 28) | (rd << 12) | rn);
-    } else {
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rn, SHIFT_IMM_LSL(16));
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rd, SHIFT_IMM_ASR(16));
-    }
+    /* sxth */
+    tcg_out32(s, 0x06bf0070 | (cond << 28) | (rd << 12) | rn);
 }
 
 static void tcg_out_ext16u(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
 {
-    if (use_armv6_instructions) {
-        /* uxth */
-        tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rn);
-    } else {
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rn, SHIFT_IMM_LSL(16));
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rd, SHIFT_IMM_LSR(16));
-    }
+    /* uxth */
+    tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rn);
 }
 
 static void tcg_out_bswap16(TCGContext *s, ARMCond cond,
                             TCGReg rd, TCGReg rn, int flags)
 {
-    if (use_armv6_instructions) {
-        if (flags & TCG_BSWAP_OS) {
-            /* revsh */
-            tcg_out32(s, 0x06ff0fb0 | (cond << 28) | (rd << 12) | rn);
-            return;
-        }
-
-        /* rev16 */
-        tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
-        if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
-            /* uxth */
-            tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rd);
-        }
+    if (flags & TCG_BSWAP_OS) {
+        /* revsh */
+        tcg_out32(s, 0x06ff0fb0 | (cond << 28) | (rd << 12) | rn);
         return;
     }
 
-    if (flags == 0) {
-        /*
-         * For stores, no input or output extension:
-         *                              rn  = xxAB
-         * lsr tmp, rn, #8              tmp = 0xxA
-         * and tmp, tmp, #0xff          tmp = 000A
-         * orr rd, tmp, rn, lsl #8      rd  = xABA
-         */
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        TCG_REG_TMP, 0, rn, SHIFT_IMM_LSR(8));
-        tcg_out_dat_imm(s, cond, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, 0xff);
-        tcg_out_dat_reg(s, cond, ARITH_ORR,
-                        rd, TCG_REG_TMP, rn, SHIFT_IMM_LSL(8));
-        return;
+    /* rev16 */
+    tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
+    if ((flags & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
+        /* uxth */
+        tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rd);
     }
-
-    /*
-     * Byte swap, leaving the result at the top of the register.
-     * We will then shift down, zero or sign-extending.
-     */
-    if (flags & TCG_BSWAP_IZ) {
-        /*
-         *                              rn  = 00AB
-         * ror tmp, rn, #8              tmp = B00A
-         * orr tmp, tmp, tmp, lsl #16   tmp = BA00
-         */
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        TCG_REG_TMP, 0, rn, SHIFT_IMM_ROR(8));
-        tcg_out_dat_reg(s, cond, ARITH_ORR,
-                        TCG_REG_TMP, TCG_REG_TMP, TCG_REG_TMP,
-                        SHIFT_IMM_LSL(16));
-    } else {
-        /*
-         *                              rn  = xxAB
-         * and tmp, rn, #0xff00         tmp = 00A0
-         * lsl tmp, tmp, #8             tmp = 0A00
-         * orr tmp, tmp, rn, lsl #24    tmp = BA00
-         */
-        tcg_out_dat_rI(s, cond, ARITH_AND, TCG_REG_TMP, rn, 0xff00, 1);
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        TCG_REG_TMP, 0, TCG_REG_TMP, SHIFT_IMM_LSL(8));
-        tcg_out_dat_reg(s, cond, ARITH_ORR,
-                        TCG_REG_TMP, TCG_REG_TMP, rn, SHIFT_IMM_LSL(24));
-    }
-    tcg_out_dat_reg(s, cond, ARITH_MOV, rd, 0, TCG_REG_TMP,
-                    (flags & TCG_BSWAP_OS
-                     ? SHIFT_IMM_ASR(8) : SHIFT_IMM_LSR(8)));
 }
 
 static void tcg_out_bswap32(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
 {
-    if (use_armv6_instructions) {
-        /* rev */
-        tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
-    } else {
-        tcg_out_dat_reg(s, cond, ARITH_EOR,
-                        TCG_REG_TMP, rn, rn, SHIFT_IMM_ROR(16));
-        tcg_out_dat_imm(s, cond, ARITH_BIC,
-                        TCG_REG_TMP, TCG_REG_TMP, 0xff | 0x800);
-        tcg_out_dat_reg(s, cond, ARITH_MOV,
-                        rd, 0, rn, SHIFT_IMM_ROR(8));
-        tcg_out_dat_reg(s, cond, ARITH_EOR,
-                        rd, rd, TCG_REG_TMP, SHIFT_IMM_LSR(8));
-    }
+    /* rev */
+    tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
 }
 
 static void tcg_out_deposit(TCGContext *s, ARMCond cond, TCGReg rd,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
 {
     if (use_armv7_instructions) {
         tcg_out32(s, INSN_DMB_ISH);
-    } else if (use_armv6_instructions) {
+    } else {
         tcg_out32(s, INSN_DMB_MCR);
     }
 }
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
     if (argreg & 1) {
         argreg++;
     }
-    if (use_armv6_instructions && argreg >= 4
-        && (arglo & 1) == 0 && arghi == arglo + 1) {
+    if (argreg >= 4 && (arglo & 1) == 0 && arghi == arglo + 1) {
         tcg_out_strd_8(s, COND_AL, arglo,
                        TCG_REG_CALL_STACK, (argreg - 4) * 4);
         return argreg + 2;
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
                    : offsetof(CPUTLBEntry, addr_write));
     int fast_off = TLB_MASK_TABLE_OFS(mem_index);
-    int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
-    int table_off = fast_off + offsetof(CPUTLBDescFast, table);
     unsigned s_bits = opc & MO_SIZE;
     unsigned a_bits = get_alignment_bits(opc);
 
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
-    if (use_armv6_instructions) {
-        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
-    } else {
-        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
-        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
-    }
+    tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
 
     /* Extract the tlb index from the address into R0.  */
     tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
      * Load the tlb comparator into R2/R3 and the fast path addend into R1.
      */
     if (cmp_off == 0) {
-        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
+        if (TARGET_LONG_BITS == 64) {
             tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
         } else {
             tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     } else {
         tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
                         TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
-        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
+        if (TARGET_LONG_BITS == 64) {
             tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
         } else {
             tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
         }
     }
-    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
-    }
 
     /* Load the tlb addend.  */
     tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     TCGReg argreg, datalo, datahi;
     MemOpIdx oi = lb->oi;
     MemOp opc = get_memop(oi);
-    void *func;
 
     if (!reloc_pc24(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
         return false;
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    /* For armv6 we can use the canonical unsigned helpers and minimize
-       icache usage.  For pre-armv6, use the signed helpers since we do
-       not have a single insn sign-extend.  */
-    if (use_armv6_instructions) {
-        func = qemu_ld_helpers[opc & MO_SIZE];
-    } else {
-        func = qemu_ld_helpers[opc & MO_SSIZE];
-        if (opc & MO_SIGN) {
-            opc = MO_UL;
-        }
-    }
-    tcg_out_call(s, func);
+    /* Use the canonical unsigned helpers and minimize icache usage. */
+    tcg_out_call(s, qemu_ld_helpers[opc & MO_SIZE]);
 
     datalo = lb->datalo_reg;
     datahi = lb->datahi_reg;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         break;
     case MO_UQ:
         /* Avoid ldrd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU && use_armv6_instructions
+        if (USING_SOFTMMU
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
         } else if (datalo != addend) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         break;
     case MO_UQ:
         /* Avoid ldrd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU && use_armv6_instructions
+        if (USING_SOFTMMU
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
         } else if (datalo == addrlo) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         break;
     case MO_64:
         /* Avoid strd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU && use_armv6_instructions
+        if (USING_SOFTMMU
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         break;
     case MO_64:
         /* Avoid strd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU && use_armv6_instructions
+        if (USING_SOFTMMU
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
-- 
2.25.1

We will shortly allow the use of unaligned memory accesses,
but the LDRD and STRD instructions require proper alignment.
Use get_alignment_bits to verify alignment and remove USING_SOFTMMU.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool use_idiv_instructions;
 bool use_neon_instructions;
 #endif
 
-/* ??? Ought to think about changing CONFIG_SOFTMMU to always defined.  */
-#ifdef CONFIG_SOFTMMU
-# define USING_SOFTMMU 1
-#else
-# define USING_SOFTMMU 0
-#endif
-
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "%r0",  "%r1",  "%r2",  "%r3",  "%r4",  "%r5",  "%r6",  "%r7",
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
         break;
     case MO_UQ:
-        /* Avoid ldrd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU
+        /* LDRD requires alignment; double-check that. */
+        if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
         } else if (datalo != addend) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_ld32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_UQ:
-        /* Avoid ldrd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU
+        /* LDRD requires alignment; double-check that. */
+        if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_ldrd_8(s, COND_AL, datalo, addrlo, 0);
         } else if (datalo == addrlo) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         tcg_out_st32_r(s, cond, datalo, addrlo, addend);
         break;
     case MO_64:
-        /* Avoid strd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU
+        /* STRD requires alignment; double-check that. */
+        if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
         } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
         tcg_out_st32_12(s, COND_AL, datalo, addrlo, 0);
         break;
     case MO_64:
-        /* Avoid strd for user-only emulation, to handle unaligned.  */
-        if (USING_SOFTMMU
+        /* STRD requires alignment; double-check that. */
+        if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_strd_8(s, COND_AL, datalo, addrlo, 0);
         } else {
-- 
2.25.1

From armv6, the architecture supports unaligned accesses.
All we need to do is perform the correct alignment check
in tcg_out_tlb_read.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 41 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
                    : offsetof(CPUTLBEntry, addr_write));
     int fast_off = TLB_MASK_TABLE_OFS(mem_index);
-    unsigned s_bits = opc & MO_SIZE;
-    unsigned a_bits = get_alignment_bits(opc);
-
-    /*
-     * We don't support inline unaligned acceses, but we can easily
-     * support overalignment checks.
-     */
-    if (a_bits < s_bits) {
-        a_bits = s_bits;
-    }
+    unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
+    unsigned a_mask = (1 << get_alignment_bits(opc)) - 1;
+    TCGReg t_addr;
 
     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
     tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
 
     /*
      * Check alignment, check comparators.
-     * Do this in no more than 3 insns.  Use MOVW for v7, if possible,
+     * Do this in 2-4 insns.  Use MOVW for v7, if possible,
      * to reduce the number of sequential conditional instructions.
      * Almost all guests have at least 4k pages, which means that we need
      * to clear at least 9 bits even for an 8-byte memory, which means it
      * isn't worth checking for an immediate operand for BIC.
+     *
+     * For unaligned accesses, test the page of the last unit of alignment.
+     * This leaves the least significant alignment bits unchanged, and of
+     * course must be zero.
      */
+    t_addr = addrlo;
+    if (a_mask < s_mask) {
+        t_addr = TCG_REG_R0;
+        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr,
+                        addrlo, s_mask - a_mask);
+    }
     if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) {
-        tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
-
-        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, mask);
+        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask));
         tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
-                        addrlo, TCG_REG_TMP, 0);
+                        t_addr, TCG_REG_TMP, 0);
         tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
     } else {
-        if (a_bits) {
-            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
-                            (1 << a_bits) - 1);
+        if (a_mask) {
+            tcg_debug_assert(a_mask <= 0xff);
+            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
         }
-        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
+        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr,
                         SHIFT_IMM_LSR(TARGET_PAGE_BITS));
-        tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
+        tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP,
                         0, TCG_REG_R2, TCG_REG_TMP,
                         SHIFT_IMM_LSL(TARGET_PAGE_BITS));
     }
-- 
2.25.1

Reserve a register for the guest_base, using aarch64 for reference.
By doing so, we do not have to recompute it for every memory access.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[2] = {
 
 #define TCG_REG_TMP  TCG_REG_R12
 #define TCG_VEC_TMP  TCG_REG_Q15
+#ifndef CONFIG_SOFTMMU
+#define TCG_REG_GUEST_BASE  TCG_REG_R11
+#endif
 
 typedef enum {
     COND_EQ = 0x0,
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 
 static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
                                   TCGReg datalo, TCGReg datahi,
-                                  TCGReg addrlo, TCGReg addend)
+                                  TCGReg addrlo, TCGReg addend,
+                                  bool scratch_addend)
 {
     /* Byte swapping is left to middle-end expansion. */
     tcg_debug_assert((opc & MO_BSWAP) == 0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
         if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
-        } else if (datalo != addend) {
+        } else if (scratch_addend) {
             tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
             tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
         } else {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     label_ptr = s->code_ptr;
     tcg_out_bl_imm(s, COND_NE, 0);
 
-    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
+    tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend, true);
 
     add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (guest_base) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
-        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, TCG_REG_TMP);
+        tcg_out_qemu_ld_index(s, opc, datalo, datahi,
+                              addrlo, TCG_REG_GUEST_BASE, false);
     } else {
         tcg_out_qemu_ld_direct(s, opc, datalo, datahi, addrlo);
     }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 
 static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
                                   TCGReg datalo, TCGReg datahi,
-                                  TCGReg addrlo, TCGReg addend)
+                                  TCGReg addrlo, TCGReg addend,
+                                  bool scratch_addend)
 {
     /* Byte swapping is left to middle-end expansion. */
     tcg_debug_assert((opc & MO_BSWAP) == 0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_index(TCGContext *s, ARMCond cond, MemOp opc,
         if (get_alignment_bits(opc) >= MO_64
             && (datalo & 1) == 0 && datahi == datalo + 1) {
             tcg_out_strd_r(s, cond, datalo, addrlo, addend);
-        } else {
+        } else if (scratch_addend) {
             tcg_out_st32_rwb(s, cond, datalo, addend, addrlo);
             tcg_out_st32_12(s, cond, datahi, addend, 4);
+        } else {
+            tcg_out_dat_reg(s, cond, ARITH_ADD, TCG_REG_TMP,
+                            addend, addrlo, SHIFT_IMM_LSL(0));
+            tcg_out_st32_12(s, cond, datalo, TCG_REG_TMP, 0);
+            tcg_out_st32_12(s, cond, datahi, TCG_REG_TMP, 4);
         }
         break;
     default:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     mem_index = get_mmuidx(oi);
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 0);
 
-    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
+    tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi,
+                          addrlo, addend, true);
 
     /* The conditional call must come last, as we're going to return here.  */
     label_ptr = s->code_ptr;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (guest_base) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, guest_base);
-        tcg_out_qemu_st_index(s, COND_AL, opc, datalo,
-                              datahi, addrlo, TCG_REG_TMP);
+        tcg_out_qemu_st_index(s, COND_AL, opc, datalo, datahi,
+                              addrlo, TCG_REG_GUEST_BASE, false);
     } else {
         tcg_out_qemu_st_direct(s, opc, datalo, datahi, addrlo);
     }
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
 
+#ifndef CONFIG_SOFTMMU
+    if (guest_base) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
+        tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
+    }
+#endif
+
     tcg_out_b_reg(s, COND_AL, tcg_target_call_iarg_regs[1]);
 
     /*
-- 
2.25.1

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.h     |  2 -
 tcg/arm/tcg-target.c.inc | 83 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
 /* not defined -- call should be eliminated at compile time */
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "elf.h"
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 
 int arm_arch = __ARM_ARCH;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
 }
 
 #ifdef CONFIG_SOFTMMU
-#include "../tcg-ldst.c.inc"
-
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & MO_SIZE]);
     return true;
 }
+#else
+
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
+                                   TCGReg addrhi, unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *label = new_ldst_label(s);
+
+    label->is_ld = is_ld;
+    label->addrlo_reg = addrlo;
+    label->addrhi_reg = addrhi;
+
+    /* We are expecting a_bits to max out at 7, and can easily support 8. */
+    tcg_debug_assert(a_mask <= 0xff);
+    /* tst addr, #mask */
+    tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
+
+    /* blne slow_path */
+    label->label_ptr[0] = s->code_ptr;
+    tcg_out_bl_imm(s, COND_NE, 0);
+
+    label->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    if (!reloc_pc24(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    if (TARGET_LONG_BITS == 64) {
+        /* 64-bit target address is aligned into R2:R3. */
+        if (l->addrhi_reg != TCG_REG_R2) {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
+        } else if (l->addrlo_reg != TCG_REG_R3) {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1);
+        }
+    } else {
+        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
+    }
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_AREG0);
+
+    /*
+     * Tail call to the helper, with the return address back inline,
+     * just for the clarity of the debugging traceback -- the helper
+     * cannot return.  We have used BLNE to arrive here, so LR is
+     * already set.
+     */
+    tcg_out_goto(s, COND_AL, (const void *)
+                 (l->is_ld ? helper_unaligned_ld : helper_unaligned_st));
+    return true;
+}
+
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     int mem_index;
     TCGReg addend;
     tcg_insn_unit *label_ptr;
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
+    }
     if (guest_base) {
         tcg_out_qemu_ld_index(s, opc, datalo, datahi,
                               addrlo, TCG_REG_GUEST_BASE, false);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     int mem_index;
     TCGReg addend;
     tcg_insn_unit *label_ptr;
+#else
+    unsigned a_bits;
 #endif
 
     datalo = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
+    a_bits = get_alignment_bits(opc);
+    if (a_bits) {
+        tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
+    }
     if (guest_base) {
         tcg_out_qemu_st_index(s, COND_AL, opc, datalo, datahi,
                               addrlo, TCG_REG_GUEST_BASE, false);
-- 
2.25.1

This is kinda sorta the opposite of the other tcg hosts, where
we get (normal) alignment checks for free with host SIGBUS and
need to add code to support unaligned accesses.

Fortunately, the ISA contains pairs of instructions that are
used to implement unaligned memory accesses.  Use them.

Tested-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/mips/tcg-target.h     |   2 -
 tcg/mips/tcg-target.c.inc | 334 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 328 insertions(+), 8 deletions(-)

diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool use_mips32r2_instructions;
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t)
     QEMU_ERROR("code path is reachable");
 
-#ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
-#endif
 
 #endif
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+#include "../tcg-ldst.c.inc"
+
 #ifdef HOST_WORDS_BIGENDIAN
 # define MIPS_BE  1
 #else
@@ -XXX,XX +XXX,XX @@ typedef enum {
     OPC_ORI      = 015 << 26,
     OPC_XORI     = 016 << 26,
     OPC_LUI      = 017 << 26,
+    OPC_BNEL     = 025 << 26,
+    OPC_BNEZALC_R6 = 030 << 26,
     OPC_DADDIU   = 031 << 26,
+    OPC_LDL      = 032 << 26,
+    OPC_LDR      = 033 << 26,
     OPC_LB       = 040 << 26,
     OPC_LH       = 041 << 26,
+    OPC_LWL      = 042 << 26,
     OPC_LW       = 043 << 26,
     OPC_LBU      = 044 << 26,
     OPC_LHU      = 045 << 26,
+    OPC_LWR      = 046 << 26,
     OPC_LWU      = 047 << 26,
     OPC_SB       = 050 << 26,
     OPC_SH       = 051 << 26,
+    OPC_SWL      = 052 << 26,
     OPC_SW       = 053 << 26,
+    OPC_SDL      = 054 << 26,
+    OPC_SDR      = 055 << 26,
+    OPC_SWR      = 056 << 26,
     OPC_LD       = 067 << 26,
     OPC_SD       = 077 << 26,
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "../tcg-ldst.c.inc"
-
 static void * const qemu_ld_helpers[(MO_SSIZE | MO_BSWAP) + 1] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_SB]   = helper_ret_ldsb_mmu,
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
     return true;
 }
-#endif
+
+#else
+
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
+                                   TCGReg addrhi, unsigned a_bits)
+{
+    unsigned a_mask = (1 << a_bits) - 1;
+    TCGLabelQemuLdst *l = new_ldst_label(s);
+
+    l->is_ld = is_ld;
+    l->addrlo_reg = addrlo;
+    l->addrhi_reg = addrhi;
+
+    /* We are expecting a_bits to max out at 7, much lower than ANDI. */
+    tcg_debug_assert(a_bits < 16);
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, addrlo, a_mask);
+
+    l->label_ptr[0] = s->code_ptr;
+    if (use_mips32r6_instructions) {
+        tcg_out_opc_br(s, OPC_BNEZALC_R6, TCG_REG_ZERO, TCG_TMP0);
+    } else {
+        tcg_out_opc_br(s, OPC_BNEL, TCG_TMP0, TCG_REG_ZERO);
+        tcg_out_nop(s);
+    }
+
+    l->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    void *target;
+
+    if (!reloc_pc16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+        return false;
+    }
+
+    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+        /* A0 is env, A1 is skipped, A2:A3 is the uint64_t address. */
+        TCGReg a2 = MIPS_BE ? l->addrhi_reg : l->addrlo_reg;
+        TCGReg a3 = MIPS_BE ? l->addrlo_reg : l->addrhi_reg;
+
+        if (a3 != TCG_REG_A2) {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3);
+        } else if (a2 != TCG_REG_A3) {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, a3);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, a2);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A2);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, TCG_REG_A3);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A3, TCG_TMP0);
+        }
+    } else {
+        tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
+    }
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
+
+    /*
+     * Tail call to the helper, with the return address back inline.
+     * We have arrived here via BNEL, so $31 is already set.
+     */
+    target = (l->is_ld ? helper_unaligned_ld : helper_unaligned_st);
+    tcg_out_call_int(s, target, true);
+    return true;
+}
+
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+    return tcg_out_fail_alignment(s, l);
+}
+#endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
                                    TCGReg base, MemOp opc, bool is_64)
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
     }
 }
 
+/*
+ * Emit host code for a guest load from BASE that does not assume the
+ * address is aligned.
+ *
+ * 16-bit values are assembled from two byte loads; 32-bit and 64-bit
+ * values use the left/right instruction pairs (LWL/LWR, LDL/LDR), whose
+ * order is selected by host endianness (MIPS_BE).
+ *
+ * @s:     code generation context
+ * @lo:    destination register (low half when the value spans two regs)
+ * @hi:    destination register for the high half; only written for
+ *         64-bit loads when TCG_TARGET_REG_BITS != 64
+ * @base:  register holding the host address to load from
+ * @opc:   memory operation; only the MO_SSIZE and MO_BSWAP bits are
+ *         examined here
+ * @is_64: true when the destination is a 64-bit value; controls the
+ *         zero-extension of unsigned 32-bit loads on 64-bit hosts
+ *
+ * TCG_TMP0 and TCG_TMP1 are used as scratch.  On hosts without the R2
+ * instructions, byte swapping goes through tcg_out_bswap_subr(): the
+ * value is loaded into TCG_TMP0 (the second half of the load sits in
+ * the branch delay slot) and the result is read back from TCG_TMP3.
+ */
+static void __attribute__((unused))
+tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
+                                    TCGReg base, MemOp opc, bool is_64)
+{
+    const MIPSInsn lw1 = MIPS_BE ? OPC_LWL : OPC_LWR;
+    const MIPSInsn lw2 = MIPS_BE ? OPC_LWR : OPC_LWL;
+    const MIPSInsn ld1 = MIPS_BE ? OPC_LDL : OPC_LDR;
+    const MIPSInsn ld2 = MIPS_BE ? OPC_LDR : OPC_LDL;
+
+    bool sgn = (opc & MO_SIGN);
+
+    switch (opc & (MO_SSIZE | MO_BSWAP)) {
+    case MO_SW | MO_BE:
+    case MO_UW | MO_BE:
+        /* High byte (carrying the sign) into TMP0, low byte into LO. */
+        tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP0, base, 0);
+        tcg_out_opc_imm(s, OPC_LBU, lo, base, 1);
+        if (use_mips32r2_instructions) {
+            tcg_out_opc_bf(s, OPC_INS, lo, TCG_TMP0, 31, 8);
+        } else {
+            tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
+            /* Merge with the low byte in LO; TMP1 is not set on this path. */
+            tcg_out_opc_reg(s, OPC_OR, lo, TCG_TMP0, lo);
+        }
+        break;
+
+    case MO_SW | MO_LE:
+    case MO_UW | MO_LE:
+        /* LO is written first on the INS path, so it must not alias BASE. */
+        if (use_mips32r2_instructions && lo != base) {
+            tcg_out_opc_imm(s, OPC_LBU, lo, base, 0);
+            tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP0, base, 1);
+            tcg_out_opc_bf(s, OPC_INS, lo, TCG_TMP0, 31, 8);
+        } else {
+            tcg_out_opc_imm(s, OPC_LBU, TCG_TMP0, base, 0);
+            tcg_out_opc_imm(s, sgn ? OPC_LB : OPC_LBU, TCG_TMP1, base, 1);
+            tcg_out_opc_sa(s, OPC_SLL, TCG_TMP1, TCG_TMP1, 8);
+            tcg_out_opc_reg(s, OPC_OR, lo, TCG_TMP0, TCG_TMP1);
+        }
+        break;
+
+    case MO_SL:
+    case MO_UL:
+        tcg_out_opc_imm(s, lw1, lo, base, 0);
+        tcg_out_opc_imm(s, lw2, lo, base, 3);
+        if (TCG_TARGET_REG_BITS == 64 && is_64 && !sgn) {
+            tcg_out_ext32u(s, lo, lo);
+        }
+        break;
+
+    case MO_UL | MO_BSWAP:
+    case MO_SL | MO_BSWAP:
+        if (use_mips32r2_instructions) {
+            tcg_out_opc_imm(s, lw1, lo, base, 0);
+            tcg_out_opc_imm(s, lw2, lo, base, 3);
+            tcg_out_bswap32(s, lo, lo,
+                            TCG_TARGET_REG_BITS == 64 && is_64
+                            ? (sgn ? TCG_BSWAP_OS : TCG_BSWAP_OZ) : 0);
+        } else {
+            const tcg_insn_unit *subr =
+                (TCG_TARGET_REG_BITS == 64 && is_64 && !sgn
+                 ? bswap32u_addr : bswap32_addr);
+
+            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0);
+            tcg_out_bswap_subr(s, subr);
+            /* delay slot */
+            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 3);
+            tcg_out_mov(s, is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32, lo, TCG_TMP3);
+        }
+        break;
+
+    case MO_UQ:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_opc_imm(s, ld1, lo, base, 0);
+            tcg_out_opc_imm(s, ld2, lo, base, 7);
+        } else {
+            /* Two 32-bit halves; which register is first depends on MIPS_BE. */
+            tcg_out_opc_imm(s, lw1, MIPS_BE ? hi : lo, base, 0 + 0);
+            tcg_out_opc_imm(s, lw2, MIPS_BE ? hi : lo, base, 0 + 3);
+            tcg_out_opc_imm(s, lw1, MIPS_BE ? lo : hi, base, 4 + 0);
+            tcg_out_opc_imm(s, lw2, MIPS_BE ? lo : hi, base, 4 + 3);
+        }
+        break;
+
+    case MO_UQ | MO_BSWAP:
+        if (TCG_TARGET_REG_BITS == 64) {
+            if (use_mips32r2_instructions) {
+                tcg_out_opc_imm(s, ld1, lo, base, 0);
+                tcg_out_opc_imm(s, ld2, lo, base, 7);
+                tcg_out_bswap64(s, lo, lo);
+            } else {
+                tcg_out_opc_imm(s, ld1, TCG_TMP0, base, 0);
+                tcg_out_bswap_subr(s, bswap64_addr);
+                /* delay slot */
+                tcg_out_opc_imm(s, ld2, TCG_TMP0, base, 7);
+                tcg_out_mov(s, TCG_TYPE_I64, lo, TCG_TMP3);
+            }
+        } else if (use_mips32r2_instructions) {
+            /* Swap each word with WSBH + ROTR 16, and exchange the halves. */
+            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0 + 0);
+            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 0 + 3);
+            tcg_out_opc_imm(s, lw1, TCG_TMP1, base, 4 + 0);
+            tcg_out_opc_imm(s, lw2, TCG_TMP1, base, 4 + 3);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, TCG_TMP0);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, TCG_TMP1);
+            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? lo : hi, TCG_TMP0, 16);
+            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? hi : lo, TCG_TMP1, 16);
+        } else {
+            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0 + 0);
+            tcg_out_bswap_subr(s, bswap32_addr);
+            /* delay slot */
+            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 0 + 3);
+            tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 4 + 0);
+            tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? lo : hi, TCG_TMP3);
+            tcg_out_bswap_subr(s, bswap32_addr);
+            /* delay slot */
+            tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 4 + 3);
+            tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? hi : lo, TCG_TMP3);
+        }
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
+#else
+    unsigned a_bits, s_bits;
 #endif
     TCGReg base = TCG_REG_A0;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     } else {
         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
-    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
+    a_bits = get_alignment_bits(opc);
+    s_bits = opc & MO_SIZE;
+    /*
+     * R6 removes the left/right instructions but requires the
+     * system to support misaligned memory accesses.
+     */
+    if (use_mips32r6_instructions) {
+        if (a_bits) {
+            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
+        }
+        tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
+    } else {
+        if (a_bits && a_bits != s_bits) {
+            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
+        }
+        if (a_bits >= s_bits) {
+            tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
+        } else {
+            tcg_out_qemu_ld_unalign(s, data_regl, data_regh, base, opc, is_64);
+        }
+    }
 #endif
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
     }
 }
 
+/*
+ * Emit a guest store of @lo (plus @hi for a 64-bit value on a 32-bit
+ * host) relative to the address in register @base, using instruction
+ * sequences that do not require the address to be aligned to the
+ * access size.
+ *
+ * 16-bit values are written as two byte stores; 32-bit and 64-bit values
+ * are written with the SWL/SWR (or SDL/SDR) left/right store pairs.
+ * The size and byte order of the access are taken from @opc.
+ * TCG_TMP0, TCG_TMP1 and TCG_TMP3 may be overwritten as scratch.
+ */
+static void __attribute__((unused))
+tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
+                                    TCGReg base, MemOp opc)
+{
+    const MIPSInsn sw1 = MIPS_BE ? OPC_SWL : OPC_SWR;
+    const MIPSInsn sw2 = MIPS_BE ? OPC_SWR : OPC_SWL;
+    const MIPSInsn sd1 = MIPS_BE ? OPC_SDL : OPC_SDR;
+    const MIPSInsn sd2 = MIPS_BE ? OPC_SDR : OPC_SDL;
+
+    /* Don't clutter the code below with checks to avoid bswapping ZERO.  */
+    if ((lo | hi) == 0) {
+        opc &= ~MO_BSWAP;
+    }
+
+    switch (opc & (MO_SIZE | MO_BSWAP)) {
+    case MO_16 | MO_BE:
+        /* High byte first, then low byte.  */
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, lo, 8);
+        tcg_out_opc_imm(s, OPC_SB, TCG_TMP0, base, 0);
+        tcg_out_opc_imm(s, OPC_SB, lo, base, 1);
+        break;
+
+    case MO_16 | MO_LE:
+        /* Low byte first, then high byte.  */
+        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, lo, 8);
+        tcg_out_opc_imm(s, OPC_SB, lo, base, 0);
+        tcg_out_opc_imm(s, OPC_SB, TCG_TMP0, base, 1);
+        break;
+
+    case MO_32 | MO_BSWAP:
+        tcg_out_bswap32(s, TCG_TMP3, lo, 0);
+        lo = TCG_TMP3;
+        /* fall through */
+    case MO_32:
+        tcg_out_opc_imm(s, sw1, lo, base, 0);
+        tcg_out_opc_imm(s, sw2, lo, base, 3);
+        break;
+
+    case MO_64 | MO_BSWAP:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_bswap64(s, TCG_TMP3, lo);
+            lo = TCG_TMP3;
+        } else if (use_mips32r2_instructions) {
+            /*
+             * A 64-bit byte swap must exchange the two halves as well as
+             * swap the bytes within each.  TMP0 becomes the word stored
+             * at offset 0 and TMP1 the word stored at offset 4 by the
+             * MO_64 case below, matching the non-R2 branch of this case.
+             */
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, MIPS_BE ? lo : hi);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, MIPS_BE ? hi : lo);
+            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP0, TCG_TMP0, 16);
+            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP1, TCG_TMP1, 16);
+            hi = MIPS_BE ? TCG_TMP0 : TCG_TMP1;
+            lo = MIPS_BE ? TCG_TMP1 : TCG_TMP0;
+        } else {
+            /* Swap and store one half at a time, reusing TMP3.  */
+            tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? lo : hi, 0);
+            tcg_out_opc_imm(s, sw1, TCG_TMP3, base, 0 + 0);
+            tcg_out_opc_imm(s, sw2, TCG_TMP3, base, 0 + 3);
+            tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? hi : lo, 0);
+            tcg_out_opc_imm(s, sw1, TCG_TMP3, base, 4 + 0);
+            tcg_out_opc_imm(s, sw2, TCG_TMP3, base, 4 + 3);
+            break;
+        }
+        /* fall through */
+    case MO_64:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_opc_imm(s, sd1, lo, base, 0);
+            tcg_out_opc_imm(s, sd2, lo, base, 7);
+        } else {
+            tcg_out_opc_imm(s, sw1, MIPS_BE ? hi : lo, base, 0 + 0);
+            tcg_out_opc_imm(s, sw2, MIPS_BE ? hi : lo, base, 0 + 3);
+            tcg_out_opc_imm(s, sw1, MIPS_BE ? lo : hi, base, 4 + 0);
+            tcg_out_opc_imm(s, sw2, MIPS_BE ? lo : hi, base, 4 + 3);
+        }
+        break;
+
+    default:
+        tcg_abort();
+    }
+}
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
+#else
+    unsigned a_bits, s_bits;
 #endif
     TCGReg base = TCG_REG_A0;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
                         data_regl, data_regh, addr_regl, addr_regh,
                         s->code_ptr, label_ptr);
 #else
-    base = TCG_REG_A0;
     if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
         tcg_out_ext32u(s, base, addr_regl);
         addr_regl = base;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     } else {
         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
-    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+    a_bits = get_alignment_bits(opc);
+    s_bits = opc & MO_SIZE;
+    /*
+     * R6 removes the left/right instructions but requires the
+     * system to support misaligned memory accesses.
+     */
+    if (use_mips32r6_instructions) {
+        if (a_bits) {
+            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
+        }
+        tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+    } else {
+        if (a_bits && a_bits != s_bits) {
+            tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
+        }
+        if (a_bits >= s_bits) {
+            tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+        } else {
+            tcg_out_qemu_st_unalign(s, data_regl, data_regh, base, opc);
+        }
+    }
 #endif
 }
 
-- 
2.25.1

We can use the routines just added for user-only to emit
unaligned accesses in softmmu mode too.

Tested-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/mips/tcg-target.c.inc | 91 ++++++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 40 deletions(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
                              tcg_insn_unit *label_ptr[2], bool is_load)
 {
     MemOp opc = get_memop(oi);
-    unsigned s_bits = opc & MO_SIZE;
     unsigned a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_mask = (1 << a_bits) - 1;
+    unsigned s_mask = (1 << s_bits) - 1;
     int mem_index = get_mmuidx(oi);
     int fast_off = TLB_MASK_TABLE_OFS(mem_index);
     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     int add_off = offsetof(CPUTLBEntry, addend);
     int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
                    : offsetof(CPUTLBEntry, addr_write));
-    target_ulong mask;
+    target_ulong tlb_mask;
 
     /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3.  */
     tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
 
-    /* We don't currently support unaligned accesses.
-       We could do so with mips32r6.  */
-    if (a_bits < s_bits) {
-        a_bits = s_bits;
-    }
-
-    /* Mask the page bits, keeping the alignment bits to compare against.  */
-    mask = (target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
-
     /* Load the (low-half) tlb comparator.  */
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, mask);
+        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
     } else {
         tcg_out_ldst(s, (TARGET_LONG_BITS == 64 ? OPC_LD
                          : TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
                      TCG_TMP0, TCG_TMP3, cmp_off);
-        tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, mask);
-        /* No second compare is required here;
-           load the tlb addend for the fast path.  */
-        tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
     }
 
     /* Zero extend a 32-bit guest address for a 64-bit host. */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
         tcg_out_ext32u(s, base, addrl);
         addrl = base;
     }
-    tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
+
+    /*
+     * Mask the page bits, keeping the alignment bits to compare against.
+     * For unaligned accesses, compare against the end of the access to
+     * verify that it does not cross a page boundary.
+     */
+    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, tlb_mask);
+    if (a_mask >= s_mask) {
+        tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
+    } else {
+        tcg_out_opc_imm(s, ALIAS_PADDI, TCG_TMP2, addrl, s_mask - a_mask);
+        tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2);
+    }
+
+    if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
+        /* Load the tlb addend for the fast path.  */
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
+    }
 
     label_ptr[0] = s->code_ptr;
     tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     /* Load and test the high half tlb comparator.  */
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
         /* delay slot */
-        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
+        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
 
         /* Load the tlb addend for the fast path.  */
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
     }
 }
 
-static void __attribute__((unused))
-tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
+static void tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
                                     TCGReg base, MemOp opc, bool is_64)
 {
     const MIPSInsn lw1 = MIPS_BE ? OPC_LWL : OPC_LWR;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
 #else
-    unsigned a_bits, s_bits;
 #endif
+    unsigned a_bits, s_bits;
     TCGReg base = TCG_REG_A0;
 
     data_regl = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
     oi = *args++;
     opc = get_memop(oi);
+    a_bits = get_alignment_bits(opc);
+    s_bits = opc & MO_SIZE;
 
+    /*
+     * R6 removes the left/right instructions but requires the
+     * system to support misaligned memory accesses.
+     */
 #if defined(CONFIG_SOFTMMU)
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 1);
-    tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
+    if (use_mips32r6_instructions || a_bits >= s_bits) {
+        tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc, is_64);
+    } else {
+        tcg_out_qemu_ld_unalign(s, data_regl, data_regh, base, opc, is_64);
+    }
     add_qemu_ldst_label(s, 1, oi,
                         (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                         data_regl, data_regh, addr_regl, addr_regh,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
     } else {
         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
-    a_bits = get_alignment_bits(opc);
-    s_bits = opc & MO_SIZE;
-    /*
-     * R6 removes the left/right instructions but requires the
-     * system to support misaligned memory accesses.
-     */
     if (use_mips32r6_instructions) {
         if (a_bits) {
             tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
     }
 }
 
-static void __attribute__((unused))
-tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
+static void tcg_out_qemu_st_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
                                     TCGReg base, MemOp opc)
 {
     const MIPSInsn sw1 = MIPS_BE ? OPC_SWL : OPC_SWR;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     MemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
-#else
-    unsigned a_bits, s_bits;
 #endif
+    unsigned a_bits, s_bits;
     TCGReg base = TCG_REG_A0;
 
     data_regl = *args++;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
     oi = *args++;
     opc = get_memop(oi);
+    a_bits = get_alignment_bits(opc);
+    s_bits = opc & MO_SIZE;
 
+    /*
+     * R6 removes the left/right instructions but requires the
+     * system to support misaligned memory accesses.
+     */
 #if defined(CONFIG_SOFTMMU)
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 0);
-    tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+    if (use_mips32r6_instructions || a_bits >= s_bits) {
+        tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
+    } else {
+        tcg_out_qemu_st_unalign(s, data_regl, data_regh, base, opc);
+    }
     add_qemu_ldst_label(s, 0, oi,
                         (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                         data_regl, data_regh, addr_regl, addr_regh,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
     } else {
         tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_GUEST_BASE_REG, addr_regl);
     }
-    a_bits = get_alignment_bits(opc);
-    s_bits = opc & MO_SIZE;
-    /*
-     * R6 removes the left/right instructions but requires the
-     * system to support misaligned memory accesses.
-     */
     if (use_mips32r6_instructions) {
         if (a_bits) {
             tcg_out_test_alignment(s, true, addr_regl, addr_regh, a_bits);
-- 
2.25.1

When BH is constant, it is constrained to 11 bits for use in MOVCC.
For the cases in which we must load the constant BH into a register,
we do not need the full logic of tcg_out_movi; we can use the simpler
function for emitting a 13 bit constant.

This eliminates the only case in which TCG_REG_T2 was passed to
tcg_out_movi, which will shortly become invalid.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
     if (use_vis3_instructions && !is_sub) {
         /* Note that ADDXC doesn't accept immediates.  */
         if (bhconst && bh != 0) {
-           tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh);
+           tcg_out_movi_imm13(s, TCG_REG_T2, bh);
            bh = TCG_REG_T2;
         }
         tcg_out_arith(s, rh, ah, bh, ARITH_ADDXC);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
 	    tcg_out_movcc(s, TCG_COND_GEU, MOVCC_XCC, rh, ah, 0);
 	}
     } else {
-        /* Otherwise adjust BH as if there is carry into T2 ... */
+        /*
+         * Otherwise adjust BH as if there is carry into T2.
+         * Note that constant BH is constrained to 11 bits for the MOVCC,
+         * so the adjustment fits 12 bits.
+         */
         if (bhconst) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh + (is_sub ? -1 : 1));
+            tcg_out_movi_imm13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1));
         } else {
             tcg_out_arithi(s, TCG_REG_T2, bh, 1,
                            is_sub ? ARITH_SUB : ARITH_ADD);
-- 
2.25.1

Handle 32-bit constants with a separate function, so that
tcg_out_movi_int does not need to recurse.  This slightly
rearranges the order of tests for small constants, but
produces the same output.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_imm13(TCGContext *s, TCGReg ret, int32_t arg)
     tcg_out_arithi(s, ret, TCG_REG_G0, arg, ARITH_OR);
 }
 
+/*
+ * Load the 32-bit constant @arg into register @ret.
+ *
+ * A value that fits in 13 bits is handed to tcg_out_movi_imm13.
+ * Anything else is built with SETHI, followed by an OR of the low
+ * 10 bits when those bits are non-zero.
+ */
+static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, int32_t arg)
+{
+    if (check_fit_i32(arg, 13)) {
+        /* A 13-bit constant sign-extended to 64-bits.  */
+        tcg_out_movi_imm13(s, ret, arg);
+    } else {
+        /* A 32-bit constant zero-extended to 64 bits.  */
+        tcg_out_sethi(s, ret, arg);
+        /* Skip the OR when the low 10 bits contribute nothing.  */
+        if (arg & 0x3ff) {
+            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
+        }
+    }
+}
+
 static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
                              tcg_target_long arg, bool in_prologue)
 {
     tcg_target_long hi, lo = (int32_t)arg;
     tcg_target_long test, lsb;
 
-    /* Make sure we test 32-bit constants for imm13 properly.  */
-    if (type == TCG_TYPE_I32) {
-        arg = lo;
+    /* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
+    if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
+        tcg_out_movi_imm32(s, ret, arg);
+        return;
     }
 
     /* A 13-bit constant sign-extended to 64-bits.  */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
         }
     }
 
-    /* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
-    if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
-        tcg_out_sethi(s, ret, arg);
-        if (arg & 0x3ff) {
-            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
-        }
-        return;
-    }
-
     /* A 32-bit constant sign-extended to 64-bits.  */
     if (arg == lo) {
         tcg_out_sethi(s, ret, ~arg);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
     if (check_fit_i32(lo, 13)) {
         hi = (arg - lo) >> 32;
-        tcg_out_movi(s, TCG_TYPE_I32, ret, hi);
+        tcg_out_movi_imm32(s, ret, hi);
         tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
         tcg_out_arithi(s, ret, ret, lo, ARITH_ADD);
     } else {
         hi = arg >> 32;
-        tcg_out_movi(s, TCG_TYPE_I32, ret, hi);
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T2, lo);
+        tcg_out_movi_imm32(s, ret, hi);
+        tcg_out_movi_imm32(s, TCG_REG_T2, lo);
         tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
         tcg_out_arith(s, ret, ret, TCG_REG_T2, ARITH_OR);
     }
-- 
2.25.1

This will allow us to control exactly which scratch register is
used for loading the constant.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, int32_t arg)
 }
 
 static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
-                             tcg_target_long arg, bool in_prologue)
+                             tcg_target_long arg, bool in_prologue,
+                             TCGReg scratch)
 {
     tcg_target_long hi, lo = (int32_t)arg;
     tcg_target_long test, lsb;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     } else {
         hi = arg >> 32;
         tcg_out_movi_imm32(s, ret, hi);
-        tcg_out_movi_imm32(s, TCG_REG_T2, lo);
+        tcg_out_movi_imm32(s, scratch, lo);
         tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
-        tcg_out_arith(s, ret, ret, TCG_REG_T2, ARITH_OR);
+        tcg_out_arith(s, ret, ret, scratch, ARITH_OR);
     }
 }
 
 static void tcg_out_movi(TCGContext *s, TCGType type,
                          TCGReg ret, tcg_target_long arg)
 {
-    tcg_out_movi_int(s, type, ret, arg, false);
+    tcg_debug_assert(ret != TCG_REG_T2);
+    tcg_out_movi_int(s, type, ret, arg, false, TCG_REG_T2);
 }
 
 static void tcg_out_ldst_rr(TCGContext *s, TCGReg data, TCGReg a1,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
     } else {
         uintptr_t desti = (uintptr_t)dest;
         tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
-                         desti & ~0xfff, in_prologue);
+                         desti & ~0xfff, in_prologue, TCG_REG_O7);
         tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 #ifndef CONFIG_SOFTMMU
     if (guest_base != 0) {
-        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
+        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG,
+                         guest_base, true, TCG_REG_T1);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
-- 
2.25.1

We had code for checking for 13 and 21-bit shifted constants,
but we can do better and allow 32-bit shifted constants.
This is still 2 insns shorter than the full 64-bit sequence.

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
         return;
     }
 
-    /* A 21-bit constant, shifted.  */
+    /* A 32-bit constant, shifted.  */
     lsb = ctz64(arg);
     test = (tcg_target_long)arg >> lsb;
-    if (check_fit_tl(test, 13)) {
-        tcg_out_movi_imm13(s, ret, test);
-        tcg_out_arithi(s, ret, ret, lsb, SHIFT_SLLX);
-        return;
-    } else if (lsb > 10 && test == extract64(test, 0, 21)) {
+    if (lsb > 10 && test == extract64(test, 0, 21)) {
         tcg_out_sethi(s, ret, test << 10);
         tcg_out_arithi(s, ret, ret, lsb - 10, SHIFT_SLLX);
         return;
+    } else if (test == (uint32_t)test || test == (int32_t)test) {
+        tcg_out_movi_int(s, TCG_TYPE_I64, ret, test, in_prologue, scratch);
+        tcg_out_arithi(s, ret, ret, lsb, SHIFT_SLLX);
+        return;
     }
 
     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
-- 
2.25.1

Since 7ecd02a06f8, if patch_reloc fails we restart translation
with a smaller TB.  SPARC had its function signature changed,
but not the logic.  Replace assert with return false.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
 
     switch (type) {
     case R_SPARC_WDISP16:
-        assert(check_fit_ptr(pcrel >> 2, 16));
+        if (!check_fit_ptr(pcrel >> 2, 16)) {
+            return false;
+        }
         insn &= ~INSN_OFF16(-1);
         insn |= INSN_OFF16(pcrel);
         break;
     case R_SPARC_WDISP19:
-        assert(check_fit_ptr(pcrel >> 2, 19));
+        if (!check_fit_ptr(pcrel >> 2, 19)) {
+            return false;
+        }
         insn &= ~INSN_OFF19(-1);
         insn |= INSN_OFF19(pcrel);
         break;
-- 
2.25.1

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
         insn &= ~INSN_OFF19(-1);
         insn |= INSN_OFF19(pcrel);
         break;
+    case R_SPARC_13:
+        if (!check_fit_ptr(value, 13)) {
+            return false;
+        }
+        insn &= ~INSN_IMM13(-1);
+        insn |= INSN_IMM13(value);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
         return;
     }
 
+    /* Use the constant pool, if possible. */
+    if (!in_prologue && USE_REG_TB) {
+        new_pool_label(s, arg, R_SPARC_13, s->code_ptr,
+                       tcg_tbrel_diff(s, NULL));
+        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(TCG_REG_TB));
+        return;
+    }
+
     /* A 64-bit constant decomposed into 2 32-bit pieces.  */
     if (check_fit_i32(lo, 13)) {
         hi = (arg - lo) >> 32;
-- 
2.25.1

Due to mapping changes, we now rarely place the code_gen_buffer
near the main executable, which means that direct calls will
now rarely be in range.

So, always use indirect calls for tail calls, which allows us to
avoid clobbering %o7, and therefore we need not save and restore it.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
     tcg_out_mov(s, TCG_TYPE_I64, rl, tmp);
 }
 
+/*
+ * Emit an indirect jump to the constant address @dest.
+ *
+ * The address with its low 12 bits cleared is loaded into TCG_REG_T1,
+ * and the JMPL supplies the low 12 bits as its immediate.
+ *
+ * When @tail_call is false, %o7 is both the scratch register for building
+ * the constant and the destination register of the JMPL.  When @tail_call
+ * is true, %g2 is the scratch and the JMPL destination is %g0, so %o7 is
+ * left untouched.  @in_prologue is passed through to tcg_out_movi_int.
+ *
+ * Only the constant load and the JMPL are emitted here; nothing is
+ * emitted after the JMPL.
+ */
+static void tcg_out_jmpl_const(TCGContext *s, const tcg_insn_unit *dest,
+                               bool in_prologue, bool tail_call)
+{
+    uintptr_t desti = (uintptr_t)dest;
+
+    /* Be careful not to clobber %o7 for a tail call. */
+    tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
+                     desti & ~0xfff, in_prologue,
+                     tail_call ? TCG_REG_G2 : TCG_REG_O7);
+    tcg_out_arithi(s, tail_call ? TCG_REG_G0 : TCG_REG_O7,
+                   TCG_REG_T1, desti & 0xfff, JMPL);
+}
+
 static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
                                  bool in_prologue)
 {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
     if (disp == (int32_t)disp) {
         tcg_out32(s, CALL | (uint32_t)disp >> 2);
     } else {
-        uintptr_t desti = (uintptr_t)dest;
-        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_REG_T1,
-                         desti & ~0xfff, in_prologue, TCG_REG_O7);
-        tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL);
+        tcg_out_jmpl_const(s, dest, in_prologue, false);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
 
         /* Set the retaddr operand.  */
         tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
-        /* Set the env operand.  */
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
         /* Tail call.  */
-        tcg_out_call_nodelay(s, qemu_ld_helpers[i], true);
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
+        tcg_out_jmpl_const(s, qemu_ld_helpers[i], true, true);
+        /* delay slot -- set the env argument */
+        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
     }
 
     for (i = 0; i < ARRAY_SIZE(qemu_st_helpers); ++i) {
@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
         if (ra >= TCG_REG_O6) {
             tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_O7, TCG_REG_CALL_STACK,
                        TCG_TARGET_CALL_STACK_OFFSET);
-            ra = TCG_REG_G1;
+        } else {
+            tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
         }
-        tcg_out_mov(s, TCG_TYPE_PTR, ra, TCG_REG_O7);
-        /* Set the env operand.  */
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
+
         /* Tail call.  */
-        tcg_out_call_nodelay(s, qemu_st_helpers[i], true);
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
+        tcg_out_jmpl_const(s, qemu_st_helpers[i], true, true);
+        /* delay slot -- set the env argument */
+        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
     }
 }
 #endif
-- 
2.25.1

This is roughly the opposite of the other tcg hosts, where
we get (normal) alignment checks for free with host SIGBUS and
need to add code to support unaligned accesses.

This inline code expansion is somewhat large, but it takes quite
a few instructions to make a function call to a helper anyway.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target.c.inc | 219 +++++++++++++++++++++++++++++++++++--
 1 file changed, 211 insertions(+), 8 deletions(-)

diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_oarg_regs[] = {
 #define ARITH_ADD  (INSN_OP(2) | INSN_OP3(0x00))
 #define ARITH_ADDCC (INSN_OP(2) | INSN_OP3(0x10))
 #define ARITH_AND  (INSN_OP(2) | INSN_OP3(0x01))
+#define ARITH_ANDCC (INSN_OP(2) | INSN_OP3(0x11))
 #define ARITH_ANDN (INSN_OP(2) | INSN_OP3(0x05))
 #define ARITH_OR   (INSN_OP(2) | INSN_OP3(0x02))
 #define ARITH_ORCC (INSN_OP(2) | INSN_OP3(0x12))
@@ -XXX,XX +XXX,XX @@ static void build_trampolines(TCGContext *s)
         tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
     }
 }
+#else
+static const tcg_insn_unit *qemu_unalign_ld_trampoline;
+static const tcg_insn_unit *qemu_unalign_st_trampoline;
+
+/*
+ * Emit two trampolines that tail-call helper_unaligned_st and
+ * helper_unaligned_ld, in that order.
+ *
+ * Each trampoline is padded with nops to start on a 16-byte boundary.
+ * The tcg_splitwx_to_rx() view of its start address is recorded in
+ * qemu_unalign_st_trampoline or qemu_unalign_ld_trampoline.
+ */
+static void build_trampolines(TCGContext *s)
+{
+    for (int ld = 0; ld < 2; ++ld) {
+        void *helper;
+
+        /* Pad with nops until code_ptr is 16-byte aligned.  */
+        while ((uintptr_t)s->code_ptr & 15) {
+            tcg_out_nop(s);
+        }
+
+        if (ld) {
+            helper = helper_unaligned_ld;
+            qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr);
+        } else {
+            helper = helper_unaligned_st;
+            qemu_unalign_st_trampoline = tcg_splitwx_to_rx(s->code_ptr);
+        }
+
+        if (!SPARC64 && TARGET_LONG_BITS == 64) {
+            /* Install the high part of the address.  */
+            tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O2, 32, SHIFT_SRLX);
+        }
+
+        /* Tail call.  */
+        tcg_out_jmpl_const(s, helper, true, true);
+        /* delay slot -- set the env argument */
+        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
+    }
+}
 #endif
 
 /* Generate global QEMU prologue and epilogue code */
@@ -XXX,XX +XXX,XX @@ static void tcg_target_qemu_prologue(TCGContext *s)
     /* delay slot */
     tcg_out_movi_imm13(s, TCG_REG_O0, 0);
 
-#ifdef CONFIG_SOFTMMU
     build_trampolines(s);
-#endif
 }
 
 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
 static const int qemu_ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = {
     [MO_UB]   = LDUB,
     [MO_SB]   = LDSB,
+    [MO_UB | MO_LE] = LDUB,     /* single byte: same insn as [MO_UB] */
+    [MO_SB | MO_LE] = LDSB,     /* single byte: same insn as [MO_SB] */
 
     [MO_BEUW] = LDUH,
     [MO_BESW] = LDSH,
     [MO_BEUL] = LDUW,
     [MO_BESL] = LDSW,
     [MO_BEUQ] = LDX,
+    [MO_BESQ] = LDX,            /* 64-bit: same insn as [MO_BEUQ] */
 
     [MO_LEUW] = LDUH_LE,
     [MO_LESW] = LDSH_LE,
     [MO_LEUL] = LDUW_LE,
     [MO_LESL] = LDSW_LE,
     [MO_LEUQ] = LDX_LE,
+    [MO_LESQ] = LDX_LE,         /* 64-bit: same insn as [MO_LEUQ] */
 };
 
 static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
                             MemOpIdx oi, bool is_64)
 {
     MemOp memop = get_memop(oi);
+    tcg_insn_unit *label_ptr;
+
 #ifdef CONFIG_SOFTMMU
     unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     const tcg_insn_unit *func;
-    tcg_insn_unit *label_ptr;
 
     addrz = tcg_out_tlb_load(s, addr, memi, memop,
                              offsetof(CPUTLBEntry, addr_read));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
 
     *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
+    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
+    unsigned a_bits = get_alignment_bits(memop);    /* log2 alignment */
+    unsigned s_bits = memop & MO_SIZE;              /* log2 access size */
+    unsigned t_bits;
+
     if (SPARC64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
         addr = TCG_REG_T1;
     }
-    tcg_out_ldst_rr(s, data, addr,
-                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+
+    /*
+     * Normal case: alignment equal to access size.
+     */
+    if (a_bits == s_bits) {
+        tcg_out_ldst_rr(s, data, addr, index,
+                        qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
+        return;
+    }
+
+    /*
+     * Test for at least natural alignment, and assume most accesses
+     * will be aligned -- perform a straight load in the delay slot.
+     * This is required to preserve atomicity for aligned accesses.
+     */
+    t_bits = MAX(a_bits, s_bits);
+    tcg_debug_assert(t_bits < 13);
+    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
+
+    /* beq,a,pt %icc, label */
+    label_ptr = s->code_ptr;
+    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
+    /* delay slot */
+    tcg_out_ldst_rr(s, data, addr, index,
                     qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
+
+    if (a_bits >= s_bits) {
+        /*
+         * Overalignment: A successful alignment test will perform the memory
+         * operation in the delay slot, and failure need only invoke the
+         * handler for SIGBUS.
+         */
+        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
+        tcg_out_call_nodelay(s, qemu_unalign_ld_trampoline, false);
+        /* delay slot -- move to low part of argument reg */
+        tcg_out_mov_delay(s, arg_low, addr);
+    } else {
+        /* Underalignment: load a_size-byte pieces; shift counts in bits. */
+        int ld_opc, a_size, s_size, i;
+
+        /*
+         * Force full address into T1 early; avoids problems with
+         * overlap between @addr and @data.
+         */
+        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
+
+        a_size = 1 << a_bits;
+        s_size = 1 << s_bits;
+        if ((memop & MO_BSWAP) == MO_BE) {      /* high piece first */
+            ld_opc = qemu_ld_opc[a_bits | MO_BE | (memop & MO_SIGN)];
+            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
+            ld_opc = qemu_ld_opc[a_bits | MO_BE];
+            for (i = a_size; i < s_size; i += a_size) { /* shift by bits */
+                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
+                tcg_out_arithi(s, data, data, a_size * 8, SHIFT_SLLX);
+                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
+            }
+        } else if (a_bits == 0) {               /* LE, byte pieces */
+            ld_opc = LDUB;
+            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
+            for (i = a_size; i < s_size; i += a_size) {
+                if ((memop & MO_SIGN) && i == s_size - a_size) {
+                    ld_opc = LDSB;
+                }
+                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
+                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
+                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
+            }
+        } else {                                /* LE, multi-byte pieces */
+            ld_opc = qemu_ld_opc[a_bits | MO_LE];
+            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, ld_opc);
+            for (i = a_size; i < s_size; i += a_size) {
+                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
+                if ((memop & MO_SIGN) && i == s_size - a_size) {
+                    ld_opc = qemu_ld_opc[a_bits | MO_LE | MO_SIGN];
+                }
+                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, ld_opc);
+                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
+                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
+            }
+        }
+    }
+
+    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
                             MemOpIdx oi)
 {
     MemOp memop = get_memop(oi);
+    tcg_insn_unit *label_ptr;
+
 #ifdef CONFIG_SOFTMMU
     unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     const tcg_insn_unit *func;
-    tcg_insn_unit *label_ptr;
 
     addrz = tcg_out_tlb_load(s, addr, memi, memop,
                              offsetof(CPUTLBEntry, addr_write));
@@ -XXX,XX +XXX,XX @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
 
     *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
+    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
+    unsigned a_bits = get_alignment_bits(memop);
+    unsigned s_bits = memop & MO_SIZE;
+    unsigned t_bits;
+
     if (SPARC64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
         addr = TCG_REG_T1;
     }
-    tcg_out_ldst_rr(s, data, addr,
-                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+
+    /*
+     * Normal case: alignment equal to access size.
+     */
+    if (a_bits == s_bits) {
+        tcg_out_ldst_rr(s, data, addr, index,
+                        qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
+        return;
+    }
+
+    /*
+     * Test for at least natural alignment, and assume most accesses
+     * will be aligned -- perform a straight store in the delay slot.
+     * This is required to preserve atomicity for aligned accesses.
+     */
+    t_bits = MAX(a_bits, s_bits);
+    tcg_debug_assert(t_bits < 13);
+    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
+
+    /* beq,a,pt %icc, label */
+    label_ptr = s->code_ptr;
+    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
+    /* delay slot */
+    tcg_out_ldst_rr(s, data, addr, index,
                     qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
+
+    if (a_bits >= s_bits) {
+        /*
+         * Overalignment: A successful alignment test will perform the memory
+         * operation in the delay slot, and failure need only invoke the
+         * handler for SIGBUS.
+         */
+        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
+        tcg_out_call_nodelay(s, qemu_unalign_st_trampoline, false);
+        /* delay slot -- move to low part of argument reg */
+        tcg_out_mov_delay(s, arg_low, addr);
+    } else {
+        /* Underalignment: store by pieces of minimum alignment. */
+        int st_opc, a_size, s_size, i;
+
+        /*
+         * Force full address into T1 early; avoids problems with
+         * overlap between @addr and @data.
+         */
+        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
+
+        a_size = 1 << a_bits;
+        s_size = 1 << s_bits;
+        if ((memop & MO_BSWAP) == MO_BE) {
+            st_opc = qemu_st_opc[a_bits | MO_BE];
+            for (i = 0; i < s_size; i += a_size) {
+                TCGReg d = data;
+                int shift = (s_size - a_size - i) * 8;
+                if (shift) {
+                    d = TCG_REG_T2;
+                    tcg_out_arithi(s, d, data, shift, SHIFT_SRLX);
+                }
+                tcg_out_ldst(s, d, TCG_REG_T1, i, st_opc);
+            }
+        } else if (a_bits == 0) {
+            tcg_out_ldst(s, data, TCG_REG_T1, 0, STB);
+            for (i = 1; i < s_size; i++) {
+                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
+                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, STB);
+            }
+        } else {
+            /* Note that ST*A with immediate asi must use indexed address. */
+            st_opc = qemu_st_opc[a_bits + MO_LE];
+            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, st_opc);
+            for (i = a_size; i < s_size; i += a_size) {
+                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
+                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
+                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, st_opc);
+            }
+        }
+    }
+
+    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #endif /* CONFIG_SOFTMMU */
 }
 
-- 
2.25.1

A mostly generic test for unaligned access raising SIGBUS.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/multiarch/sigbus.c | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 tests/tcg/multiarch/sigbus.c

diff --git a/tests/tcg/multiarch/sigbus.c b/tests/tcg/multiarch/sigbus.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/tcg/multiarch/sigbus.c
@@ -XXX,XX +XXX,XX @@
+#define _GNU_SOURCE 1
+
+#include <assert.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <endian.h>
+
+
+unsigned long long x = 0x8877665544332211ull;
+void * volatile p = (void *)&x + 1;
+
+void sigbus(int sig, siginfo_t *info, void *uc)  /* SIGBUS handler */
+{
+    assert(sig == SIGBUS);
+    assert(info->si_signo == SIGBUS);
+#ifdef BUS_ADRALN
+    assert(info->si_code == BUS_ADRALN);  /* only where libc defines it */
+#endif
+    assert(info->si_addr == p);           /* fault address must equal p */
+    exit(EXIT_SUCCESS);                   /* expected signal: test passes */
+}
+
+int main()  /* expect SIGBUS from a misaligned access through p */
+{
+    struct sigaction sa = {
+        .sa_sigaction = sigbus,
+        .sa_flags = SA_SIGINFO  /* handler takes (sig, info, uc) */
+    };
+    int allow_fail = 0;         /* 1 if the access is not sure to fault */
+    int tmp;
+
+    tmp = sigaction(SIGBUS, &sa, NULL);
+    assert(tmp == 0);
+
+    /*
+     * Select an operation that's likely to enforce alignment.
+     * On many guests that support unaligned accesses by default,
+     * this is often an atomic operation.
+     */
+#if defined(__aarch64__)
+    asm volatile("ldxr %w0,[%1]" : "=r"(tmp) : "r"(p) : "memory");
+#elif defined(__alpha__)
+    asm volatile("ldl_l %0,0(%1)" : "=r"(tmp) : "r"(p) : "memory");
+#elif defined(__arm__)
+    asm volatile("ldrex %0,[%1]" : "=r"(tmp) : "r"(p) : "memory");
+#elif defined(__powerpc__)
+    asm volatile("lwarx %0,0,%1" : "=r"(tmp) : "r"(p) : "memory");
+#elif defined(__riscv_atomic)
+    asm volatile("lr.w %0,(%1)" : "=r"(tmp) : "r"(p) : "memory");
+#else
+    /* No insn known to fault unaligned -- try for a straight load. */
+    allow_fail = 1;
+    tmp = *(volatile int *)p;
+#endif
+
+    assert(allow_fail);         /* reaching here is an error otherwise */
+
+    /*
+     * We didn't see a signal.
+     * We might as well validate the unaligned load worked.
+     */
+    if (BYTE_ORDER == LITTLE_ENDIAN) {
+        assert(tmp == 0x55443322);  /* x bytes 1..4, low address = LSB */
+    } else {
+        assert(tmp == 0x77665544);  /* x bytes 1..4, low address = MSB */
+    }
+    return EXIT_SUCCESS;
+}
-- 
2.25.1