Series comparison

-[PULL 00/46] tcg patch queue
+[PULL 00/56] tcg patch queue
-The following changes since commit d0dddab40e472ba62b5f43f11cc7dba085dabe71:
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
-  Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging (2021-02-05 15:27:02 +0000)
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20210205
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
-for you to fetch changes up to fb6916dd6ca8bb4b42d44baba9c67ecaf2279577:
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-  accel: introduce AccelCPUClass extending CPUClass (2021-02-05 10:24:15 -1000)
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
 ----------------------------------------------------------------
-TCGCPUOps cleanups (claudio)
+Improvements to qemu/int128
-tcg/s390 compare fix (phil)
+Fixes for 128/64 division.
-tcg/aarch64 rotli_vec fix
+Cleanup tcg/optimize.c
-tcg/tci cleanups and fixes
+Optimize redundant sign extensions
 ----------------------------------------------------------------
-Claudio Fontana (13):
+Frédéric Pétrot (1):
-      target/riscv: remove CONFIG_TCG, as it is always TCG
+      qemu/int128: Add int128_{not,xor}
       accel/tcg: split TCG-only code from cpu_exec_realizefn
       target/arm: do not use cc->do_interrupt for KVM directly
       cpu: move cc->do_interrupt to tcg_ops
       cpu: move cc->transaction_failed to tcg_ops
       cpu: move do_unaligned_access to tcg_ops
       physmem: make watchpoint checking code TCG-only
       cpu: move adjust_watchpoint_address to tcg_ops
       cpu: move debug_check_watchpoint to tcg_ops
       cpu: tcg_ops: move to tcg-cpu-ops.h, keep a pointer in CPUClass
       accel: extend AccelState and AccelClass to user-mode
       accel: replace struct CpusAccel with AccelOpsClass
       accel: introduce AccelCPUClass extending CPUClass
-Eduardo Habkost (5):
+Luis Pires (4):
-      cpu: Introduce TCGCpuOperations struct
+      host-utils: move checks out of divu128/divs128
-      cpu: Move synchronize_from_tb() to tcg_ops
+      host-utils: move udiv_qrnnd() to host-utils
-      cpu: Move cpu_exec_* to tcg_ops
+      host-utils: add 128-bit quotient support to divu128/divs128
-      cpu: Move tlb_fill to tcg_ops
+      host-utils: add unit tests for divu128/divs128
       cpu: Move debug_excp_handler to tcg_ops
-Philippe Mathieu-Daudé (2):
+Richard Henderson (51):
-      tcg/s390: Fix compare instruction from extended-immediate facility
+      tcg/optimize: Rename "mask" to "z_mask"
-      exec/cpu-defs: Remove TCG backends dependency
+      tcg/optimize: Split out OptContext
       tcg/optimize: Remove do_default label
       tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
       tcg/optimize: Move prev_mb into OptContext
       tcg/optimize: Split out init_arguments
       tcg/optimize: Split out copy_propagate
       tcg/optimize: Split out fold_call
       tcg/optimize: Drop nb_oargs, nb_iargs locals
       tcg/optimize: Change fail return for do_constant_folding_cond*
       tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
       tcg/optimize: Split out finish_folding
       tcg/optimize: Use a boolean to avoid a mass of continues
       tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
       tcg/optimize: Split out fold_const{1,2}
       tcg/optimize: Split out fold_setcond2
       tcg/optimize: Split out fold_brcond2
       tcg/optimize: Split out fold_brcond
       tcg/optimize: Split out fold_setcond
       tcg/optimize: Split out fold_mulu2_i32
       tcg/optimize: Split out fold_addsub2_i32
       tcg/optimize: Split out fold_movcond
       tcg/optimize: Split out fold_extract2
       tcg/optimize: Split out fold_extract, fold_sextract
       tcg/optimize: Split out fold_deposit
       tcg/optimize: Split out fold_count_zeros
       tcg/optimize: Split out fold_bswap
       tcg/optimize: Split out fold_dup, fold_dup2
       tcg/optimize: Split out fold_mov
       tcg/optimize: Split out fold_xx_to_i
       tcg/optimize: Split out fold_xx_to_x
       tcg/optimize: Split out fold_xi_to_i
       tcg/optimize: Add type to OptContext
       tcg/optimize: Split out fold_to_not
       tcg/optimize: Split out fold_sub_to_neg
       tcg/optimize: Split out fold_xi_to_x
       tcg/optimize: Split out fold_ix_to_i
       tcg/optimize: Split out fold_masks
       tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
       tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
       tcg/optimize: Sink commutative operand swapping into fold functions
       tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
       tcg/optimize: Use fold_xx_to_i for orc
       tcg/optimize: Use fold_xi_to_x for mul
       tcg/optimize: Use fold_xi_to_x for div
       tcg/optimize: Use fold_xx_to_i for rem
       tcg/optimize: Optimize sign extensions
       tcg/optimize: Propagate sign info for logical operations
       tcg/optimize: Propagate sign info for setcond
       tcg/optimize: Propagate sign info for bit counting
       tcg/optimize: Propagate sign info for shifting
-Richard Henderson (24):
+ include/fpu/softfloat-macros.h |   82 --
-      tcg/aarch64: Do not convert TCGArg to temps that are not temps
+ include/hw/clock.h             |    5 +-
-      configure: Fix --enable-tcg-interpreter
+ include/qemu/host-utils.h      |  121 +-
-      tcg/tci: Make tci_tb_ptr thread-local
+ include/qemu/int128.h          |   20 +
-      tcg/tci: Inline tci_write_reg32s into the only caller
+ target/ppc/int_helper.c        |   23 +-
-      tcg/tci: Inline tci_write_reg8 into its callers
+ tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
-      tcg/tci: Inline tci_write_reg16 into the only caller
+ tests/unit/test-div128.c       |  197 +++
-      tcg/tci: Inline tci_write_reg32 into all callers
+ util/host-utils.c              |  147 ++-
-      tcg/tci: Inline tci_write_reg64 into 64-bit callers
+ tests/unit/meson.build         |    1 +
-      tcg/tci: Merge INDEX_op_ld8u_{i32,i64}
+files changed, 2053 insertions(+), 1187 deletions(-)
-      tcg/tci: Merge INDEX_op_ld8s_{i32,i64}
+ create mode 100644 tests/unit/test-div128.c
       tcg/tci: Merge INDEX_op_ld16u_{i32,i64}
       tcg/tci: Merge INDEX_op_ld16s_{i32,i64}
       tcg/tci: Merge INDEX_op_{ld_i32,ld32u_i64}
       tcg/tci: Merge INDEX_op_st8_{i32,i64}
       tcg/tci: Merge INDEX_op_st16_{i32,i64}
       tcg/tci: Move stack bounds check to compile-time
       tcg/tci: Merge INDEX_op_{st_i32,st32_i64}
       tcg/tci: Use g_assert_not_reached
       tcg/tci: Remove dead code for TCG_TARGET_HAS_div2_*
       tcg/tci: Implement 64-bit division
       tcg/tci: Remove TODO as unused
       tcg/tci: Restrict TCG_TARGET_NB_REGS to 16
       tcg/tci: Fix TCG_REG_R4 misusage
       tcg/tci: Remove TCG_CONST
-Stefan Weil (2):
-      tcg/tci: Implement INDEX_op_ld16s_i32
-      tcg/tci: Implement INDEX_op_ld8s_i64
- configure                                          |   5 +-
- accel/accel-softmmu.h                              |  15 +
- accel/kvm/kvm-cpus.h                               |   2 -
- .../{tcg-cpus-icount.h => tcg-accel-ops-icount.h}  |   2 +
- accel/tcg/tcg-accel-ops-mttcg.h                    |  19 +
- accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h}    |   0
- accel/tcg/{tcg-cpus.h => tcg-accel-ops.h}          |   6 +-
- include/exec/cpu-all.h                             |  11 +-
- include/exec/cpu-defs.h                            |   3 -
- include/exec/exec-all.h                            |   2 +-
- include/hw/boards.h                                |   2 +-
- include/hw/core/accel-cpu.h                        |  38 ++
- include/hw/core/cpu.h                              |  86 +---
- include/hw/core/tcg-cpu-ops.h                      |  97 +++++
- include/{sysemu => qemu}/accel.h                   |  16 +-
- include/sysemu/accel-ops.h                         |  45 ++
- include/sysemu/cpus.h                              |  26 +-
- include/sysemu/hvf.h                               |   2 +-
- include/sysemu/kvm.h                               |   2 +-
- include/sysemu/kvm_int.h                           |   2 +-
- target/arm/internals.h                             |   6 +
- target/i386/hax/{hax-cpus.h => hax-accel-ops.h}    |   2 -
- target/i386/hax/hax-windows.h                      |   2 +-
- target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h}    |   2 -
- target/i386/hvf/hvf-i386.h                         |   2 +-
- target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} |   2 -
- tcg/tci/tcg-target-con-set.h                       |   6 +-
- tcg/tci/tcg-target.h                               |  37 +-
- accel/accel-common.c                               | 105 +++++
- accel/{accel.c => accel-softmmu.c}                 |  61 ++-
- accel/accel-user.c                                 |  24 ++
- accel/kvm/{kvm-cpus.c => kvm-accel-ops.c}          |  28 +-
- accel/kvm/kvm-all.c                                |   2 -
- accel/qtest/qtest.c                                |  25 +-
- accel/tcg/cpu-exec.c                               |  53 ++-
- accel/tcg/cputlb.c                                 |  34 +-
- .../{tcg-cpus-icount.c => tcg-accel-ops-icount.c}  |  21 +-
- .../{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c}    |  14 +-
- accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c}    |  13 +-
- accel/tcg/{tcg-cpus.c => tcg-accel-ops.c}          |  47 +-
- accel/tcg/tcg-all.c                                |  19 +-
- accel/tcg/user-exec.c                              |   8 +-
- accel/xen/xen-all.c                                |  26 +-
- bsd-user/main.c                                    |  11 +-
- cpu.c                                              |  66 +--
- hw/core/cpu.c                                      |  21 +-
- hw/mips/jazz.c                                     |  12 +-
- linux-user/main.c                                  |   7 +-
- softmmu/cpus.c                                     |  12 +-
- softmmu/memory.c                                   |   2 +-
- softmmu/physmem.c                                  | 149 ++++---
- softmmu/qtest.c                                    |   2 +-
- softmmu/vl.c                                       |   9 +-
- target/alpha/cpu.c                                 |  21 +-
- target/arm/cpu.c                                   |  45 +-
- target/arm/cpu64.c                                 |   4 +-
- target/arm/cpu_tcg.c                               |  32 +-
- target/arm/helper.c                                |   4 +
- target/arm/kvm64.c                                 |   6 +-
- target/avr/cpu.c                                   |  19 +-
- target/avr/helper.c                                |   5 +-
- target/cris/cpu.c                                  |  43 +-
- target/cris/helper.c                               |   5 +-
- target/hppa/cpu.c                                  |  24 +-
- target/i386/hax/{hax-cpus.c => hax-accel-ops.c}    |  33 +-
- target/i386/hax/hax-all.c                          |   7 +-
- target/i386/hax/hax-mem.c                          |   2 +-
- target/i386/hax/hax-posix.c                        |   2 +-
- target/i386/hax/hax-windows.c                      |   2 +-
- target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c}    |  29 +-
- target/i386/hvf/hvf.c                              |   5 +-
- target/i386/hvf/x86_task.c                         |   2 +-
- target/i386/hvf/x86hvf.c                           |   2 +-
- target/i386/tcg/tcg-cpu.c                          |  26 +-
- target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} |  33 +-
- target/i386/whpx/whpx-all.c                        |   9 +-
- target/lm32/cpu.c                                  |  19 +-
- target/m68k/cpu.c                                  |  19 +-
- target/microblaze/cpu.c                            |  25 +-
- target/mips/cpu.c                                  |  35 +-
- target/moxie/cpu.c                                 |  15 +-
- target/nios2/cpu.c                                 |  18 +-
- target/openrisc/cpu.c                              |  17 +-
- target/riscv/cpu.c                                 |  26 +-
- target/riscv/cpu_helper.c                          |   2 +-
- target/rx/cpu.c                                    |  20 +-
- target/s390x/cpu.c                                 |  33 +-
- target/s390x/excp_helper.c                         |   2 +-
- target/sh4/cpu.c                                   |  21 +-
- target/sparc/cpu.c                                 |  25 +-
- target/tilegx/cpu.c                                |  17 +-
- target/tricore/cpu.c                               |  12 +-
- target/unicore32/cpu.c                             |  17 +-
- target/xtensa/cpu.c                                |  23 +-
- target/xtensa/helper.c                             |   4 +-
- tcg/tcg-common.c                                   |   4 -
- tcg/tci.c                                          | 479 ++++++++-------------
- target/ppc/translate_init.c.inc                    |  39 +-
- tcg/aarch64/tcg-target.c.inc                       |   7 +-
- tcg/s390/tcg-target.c.inc                          |   2 +-
- tcg/tci/tcg-target.c.inc                           | 149 ++-----
- MAINTAINERS                                        |   7 +-
- accel/kvm/meson.build                              |   2 +-
- accel/meson.build                                  |   4 +-
- accel/tcg/meson.build                              |  10 +-
- target/i386/hax/meson.build                        |   2 +-
- target/i386/hvf/meson.build                        |   2 +-
- target/i386/whpx/meson.build                       |   2 +-
-files changed, 1565 insertions(+), 1065 deletions(-)
- create mode 100644 accel/accel-softmmu.h
- rename accel/tcg/{tcg-cpus-icount.h => tcg-accel-ops-icount.h} (88%)
- create mode 100644 accel/tcg/tcg-accel-ops-mttcg.h
- rename accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} (100%)
- rename accel/tcg/{tcg-cpus.h => tcg-accel-ops.h} (72%)
- create mode 100644 include/hw/core/accel-cpu.h
- create mode 100644 include/hw/core/tcg-cpu-ops.h
- rename include/{sysemu => qemu}/accel.h (94%)
- create mode 100644 include/sysemu/accel-ops.h
- rename target/i386/hax/{hax-cpus.h => hax-accel-ops.h} (95%)
- rename target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h} (94%)
- rename target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} (96%)
- create mode 100644 accel/accel-common.c
- rename accel/{accel.c => accel-softmmu.c} (64%)
- create mode 100644 accel/accel-user.c
- rename accel/kvm/{kvm-cpus.c => kvm-accel-ops.c} (72%)
- rename accel/tcg/{tcg-cpus-icount.c => tcg-accel-ops-icount.c} (89%)
- rename accel/tcg/{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} (92%)
- rename accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} (97%)
- rename accel/tcg/{tcg-cpus.c => tcg-accel-ops.c} (63%)
- rename target/i386/hax/{hax-cpus.c => hax-accel-ops.c} (69%)
- rename target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c} (84%)
- rename target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} (71%)

-[PULL 45/46] accel: replace struct CpusAccel with AccelOpsClass
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
-From: Claudio Fontana <cfontana@suse.de>
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
-This will allow us to centralize the registration of
+Addition of not and xor on 128-bit integers.
 the cpus.c module accelerator operations (in accel/accel-softmmu.c),
 and trigger it automatically using object hierarchy lookup from the
 new accel_init_interfaces() initialization step, depending just on
 which accelerators are available in the code.
-Rename all tcg-cpus.c, kvm-cpus.c, etc to tcg-accel-ops.c,
+Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
-kvm-accel-ops.c, etc, matching the object type names.
+Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
+Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
+[rth: Split out logical operations.]
-Message-Id: <20210204163931.7358-18-cfontana@suse.de>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/accel-softmmu.h                         | 15 ++++++
+ include/qemu/int128.h | 20 ++++++++++++++++++++
- accel/kvm/kvm-cpus.h                          |  2 -
+file changed, 20 insertions(+)
  ...g-cpus-icount.h => tcg-accel-ops-icount.h} |  2 +
  accel/tcg/tcg-accel-ops-mttcg.h               | 19 ++++++++
  .../tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} |  0
  accel/tcg/{tcg-cpus.h => tcg-accel-ops.h}     |  6 +--
  include/qemu/accel.h                          |  2 +
  include/sysemu/accel-ops.h                    | 45 ++++++++++++++++++
  include/sysemu/cpus.h                         | 26 ++--------
  .../i386/hax/{hax-cpus.h => hax-accel-ops.h}  |  2 -
  target/i386/hax/hax-windows.h                 |  2 +-
  .../i386/hvf/{hvf-cpus.h => hvf-accel-ops.h}  |  2 -
  .../whpx/{whpx-cpus.h => whpx-accel-ops.h}    |  2 -
  accel/accel-common.c                          | 11 +++++
  accel/accel-softmmu.c                         | 44 +++++++++++++++--
  accel/kvm/{kvm-cpus.c => kvm-accel-ops.c}     | 28 ++++++++---
  accel/kvm/kvm-all.c                           |  2 -
  accel/qtest/qtest.c                           | 23 ++++++---
  ...g-cpus-icount.c => tcg-accel-ops-icount.c} | 21 +++------
  ...tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} | 14 ++----
  .../tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} | 13 ++---
  accel/tcg/{tcg-cpus.c => tcg-accel-ops.c}     | 47 ++++++++++++++++++-
  accel/tcg/tcg-all.c                           | 12 -----
  accel/xen/xen-all.c                           | 24 ++++++----
  bsd-user/main.c                               |  3 +-
  linux-user/main.c                             |  1 +
  softmmu/cpus.c                                | 12 ++---
  softmmu/vl.c                                  |  7 ++-
  .../i386/hax/{hax-cpus.c => hax-accel-ops.c}  | 33 +++++++++----
  target/i386/hax/hax-all.c                     |  5 +-
  target/i386/hax/hax-mem.c                     |  2 +-
  target/i386/hax/hax-posix.c                   |  2 +-
  target/i386/hax/hax-windows.c                 |  2 +-
  .../i386/hvf/{hvf-cpus.c => hvf-accel-ops.c}  | 29 +++++++++---
  target/i386/hvf/hvf.c                         |  3 +-
  target/i386/hvf/x86hvf.c                      |  2 +-
  .../whpx/{whpx-cpus.c => whpx-accel-ops.c}    | 33 +++++++++----
  target/i386/whpx/whpx-all.c                   |  7 +--
  MAINTAINERS                                   |  3 +-
  accel/kvm/meson.build                         |  2 +-
  accel/tcg/meson.build                         |  8 ++--
  target/i386/hax/meson.build                   |  2 +-
  target/i386/hvf/meson.build                   |  2 +-
  target/i386/whpx/meson.build                  |  2 +-
 files changed, 361 insertions(+), 163 deletions(-)
  create mode 100644 accel/accel-softmmu.h
  rename accel/tcg/{tcg-cpus-icount.h => tcg-accel-ops-icount.h} (88%)
  create mode 100644 accel/tcg/tcg-accel-ops-mttcg.h
  rename accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} (100%)
  rename accel/tcg/{tcg-cpus.h => tcg-accel-ops.h} (72%)
  create mode 100644 include/sysemu/accel-ops.h
  rename target/i386/hax/{hax-cpus.h => hax-accel-ops.h} (95%)
  rename target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h} (94%)
  rename target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} (96%)
  rename accel/kvm/{kvm-cpus.c => kvm-accel-ops.c} (72%)
  rename accel/tcg/{tcg-cpus-icount.c => tcg-accel-ops-icount.c} (89%)
  rename accel/tcg/{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} (92%)
  rename accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} (97%)
  rename accel/tcg/{tcg-cpus.c => tcg-accel-ops.c} (63%)
  rename target/i386/hax/{hax-cpus.c => hax-accel-ops.c} (69%)
  rename target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c} (84%)
  rename target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} (71%)
-diff --git a/accel/accel-softmmu.h b/accel/accel-softmmu.h
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/accel-softmmu.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU System Emulation accel internal functions
 + *
 + * Copyright 2021 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef ACCEL_SOFTMMU_H
 +#define ACCEL_SOFTMMU_H
 +
 +void accel_init_ops_interfaces(AccelClass *ac);
 +
 +#endif /* ACCEL_SOFTMMU_H */
 diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h
 index XXXXXXX..XXXXXXX 100644
---- a/accel/kvm/kvm-cpus.h
+--- a/include/qemu/int128.h
-+++ b/accel/kvm/kvm-cpus.h
++++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return a;
  #include "sysemu/cpus.h"
 -extern const CpusAccel kvm_cpus;
 -
  int kvm_init_vcpu(CPUState *cpu, Error **errp);
  int kvm_cpu_exec(CPUState *cpu);
  void kvm_destroy_vcpu(CPUState *cpu);
 diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-accel-ops-icount.h
 similarity index 88%
 rename from accel/tcg/tcg-cpus-icount.h
 rename to accel/tcg/tcg-accel-ops-icount.h
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-icount.h
 +++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -XXX,XX +XXX,XX @@ void icount_handle_deadline(void);
  void icount_prepare_for_run(CPUState *cpu);
  void icount_process_data(CPUState *cpu);
 +void icount_handle_interrupt(CPUState *cpu, int mask);
 +
  #endif /* TCG_CPUS_ICOUNT_H */
 diff --git a/accel/tcg/tcg-accel-ops-mttcg.h b/accel/tcg/tcg-accel-ops-mttcg.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/tcg/tcg-accel-ops-mttcg.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU TCG Multi Threaded vCPUs implementation
 + *
 + * Copyright 2021 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef TCG_CPUS_MTTCG_H
 +#define TCG_CPUS_MTTCG_H
 +
 +/* kick MTTCG vCPU thread */
 +void mttcg_kick_vcpu_thread(CPUState *cpu);
 +
 +/* start an mttcg vCPU thread */
 +void mttcg_start_vcpu_thread(CPUState *cpu);
 +
 +#endif /* TCG_CPUS_MTTCG_H */
 diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-accel-ops-rr.h
 similarity index 100%
 rename from accel/tcg/tcg-cpus-rr.h
 rename to accel/tcg/tcg-accel-ops-rr.h
 diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-accel-ops.h
 similarity index 72%
 rename from accel/tcg/tcg-cpus.h
 rename to accel/tcg/tcg-accel-ops.h
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus.h
 +++ b/accel/tcg/tcg-accel-ops.h
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
 -extern const CpusAccel tcg_cpus_mttcg;
 -extern const CpusAccel tcg_cpus_icount;
 -extern const CpusAccel tcg_cpus_rr;
 -
  void tcg_cpus_destroy(CPUState *cpu);
  int tcg_cpus_exec(CPUState *cpu);
 -void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
 +void tcg_handle_interrupt(CPUState *cpu, int mask);
  #endif /* TCG_CPUS_H */
 diff --git a/include/qemu/accel.h b/include/qemu/accel.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/accel.h
 +++ b/include/qemu/accel.h
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
  AccelClass *accel_find(const char *opt_name);
  AccelState *current_accel(void);
 +void accel_init_interfaces(AccelClass *ac);
 +
  #ifndef CONFIG_USER_ONLY
  int accel_init_machine(AccelState *accel, MachineState *ms);
 diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/sysemu/accel-ops.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Accelerator OPS, used for cpus.c module
 + *
 + * Copyright 2021 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef ACCEL_OPS_H
 +#define ACCEL_OPS_H
 +
 +#include "qom/object.h"
 +
 +#define ACCEL_OPS_SUFFIX "-ops"
 +#define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX
 +#define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS)
 +
 +typedef struct AccelOpsClass AccelOpsClass;
 +DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS)
 +
 +/* cpus.c operations interface */
 +struct AccelOpsClass {
 +    /*< private >*/
 +    ObjectClass parent_class;
 +    /*< public >*/
 +
 +    /* initialization function called when accel is chosen */
 +    void (*ops_init)(AccelOpsClass *ops);
 +
 +    void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */
 +    void (*kick_vcpu_thread)(CPUState *cpu);
 +
 +    void (*synchronize_post_reset)(CPUState *cpu);
 +    void (*synchronize_post_init)(CPUState *cpu);
 +    void (*synchronize_state)(CPUState *cpu);
 +    void (*synchronize_pre_loadvm)(CPUState *cpu);
 +
 +    void (*handle_interrupt)(CPUState *cpu, int mask);
 +
 +    int64_t (*get_virtual_clock)(void);
 +    int64_t (*get_elapsed_ticks)(void);
 +};
 +
 +#endif /* ACCEL_OPS_H */
 diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/cpus.h
 +++ b/include/sysemu/cpus.h
@@ -XXX,XX +XXX,XX @@
  #define QEMU_CPUS_H
  #include "qemu/timer.h"
 +#include "sysemu/accel-ops.h"
 -/* cpus.c */
 +/* register accel-specific operations */
 +void cpus_register_accel(const AccelOpsClass *i);
 -/* CPU execution threads */
 +/* accel/dummy-cpus.c */
 -typedef struct CpusAccel {
 -    void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY */
 -    void (*kick_vcpu_thread)(CPUState *cpu);
 -
 -    void (*synchronize_post_reset)(CPUState *cpu);
 -    void (*synchronize_post_init)(CPUState *cpu);
 -    void (*synchronize_state)(CPUState *cpu);
 -    void (*synchronize_pre_loadvm)(CPUState *cpu);
 -
 -    void (*handle_interrupt)(CPUState *cpu, int mask);
 -
 -    int64_t (*get_virtual_clock)(void);
 -    int64_t (*get_elapsed_ticks)(void);
 -} CpusAccel;
 -
 -/* register accel-specific cpus interface implementation */
 -void cpus_register_accel(const CpusAccel *i);
 -
 -/* Create a dummy vcpu for CpusAccel->create_vcpu_thread */
 +/* Create a dummy vcpu for AccelOpsClass->create_vcpu_thread */
  void dummy_start_vcpu_thread(CPUState *);
  /* interface available for cpus accelerator threads */
 diff --git a/target/i386/hax/hax-cpus.h b/target/i386/hax/hax-accel-ops.h
 similarity index 95%
 rename from target/i386/hax/hax-cpus.h
 rename to target/i386/hax/hax-accel-ops.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-cpus.h
 +++ b/target/i386/hax/hax-accel-ops.h
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
 -extern const CpusAccel hax_cpus;
 -
  #include "hax-interface.h"
  #include "hax-i386.h"
 diff --git a/target/i386/hax/hax-windows.h b/target/i386/hax/hax-windows.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-windows.h
 +++ b/target/i386/hax/hax-windows.h
@@ -XXX,XX +XXX,XX @@
  #include <winioctl.h>
  #include <windef.h>
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  #define HAX_INVALID_FD INVALID_HANDLE_VALUE
 diff --git a/target/i386/hvf/hvf-cpus.h b/target/i386/hvf/hvf-accel-ops.h
 similarity index 94%
 rename from target/i386/hvf/hvf-cpus.h
 rename to target/i386/hvf/hvf-accel-ops.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/hvf-cpus.h
 +++ b/target/i386/hvf/hvf-accel-ops.h
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
 -extern const CpusAccel hvf_cpus;
 -
  int hvf_init_vcpu(CPUState *);
  int hvf_vcpu_exec(CPUState *);
  void hvf_cpu_synchronize_state(CPUState *);
 diff --git a/target/i386/whpx/whpx-cpus.h b/target/i386/whpx/whpx-accel-ops.h
 similarity index 96%
 rename from target/i386/whpx/whpx-cpus.h
 rename to target/i386/whpx/whpx-accel-ops.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/whpx/whpx-cpus.h
 +++ b/target/i386/whpx/whpx-accel-ops.h
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
 -extern const CpusAccel whpx_cpus;
 -
  int whpx_init_vcpu(CPUState *cpu);
  int whpx_vcpu_exec(CPUState *cpu);
  void whpx_destroy_vcpu(CPUState *cpu);
 diff --git a/accel/accel-common.c b/accel/accel-common.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/accel-common.c
 +++ b/accel/accel-common.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/accel.h"
 +#ifndef CONFIG_USER_ONLY
 +#include "accel-softmmu.h"
 +#endif /* !CONFIG_USER_ONLY */
 +
  static const TypeInfo accel_type = {
      .name = TYPE_ACCEL,
      .parent = TYPE_OBJECT,
@@ -XXX,XX +XXX,XX @@ AccelClass *accel_find(const char *opt_name)
      return ac;
  }
-+void accel_init_interfaces(AccelClass *ac)
++static inline Int128 int128_not(Int128 a)
 +{
-+#ifndef CONFIG_USER_ONLY
++    return ~a;
 +    accel_init_ops_interfaces(ac);
 +#endif /* !CONFIG_USER_ONLY */
 +}
 +
- static void register_accel_types(void)
+ static inline Int128 int128_and(Int128 a, Int128 b)
  {
-     type_register_static(&accel_type);
+     return a & b;
-diff --git a/accel/accel-softmmu.c b/accel/accel-softmmu.c
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
-index XXXXXXX..XXXXXXX 100644
+     return a | b;
 --- a/accel/accel-softmmu.c
 +++ b/accel/accel-softmmu.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/accel.h"
  #include "hw/boards.h"
 -#include "sysemu/arch_init.h"
 -#include "sysemu/sysemu.h"
 -#include "qom/object.h"
 +#include "sysemu/cpus.h"
 +
 +#include "accel-softmmu.h"
  int accel_init_machine(AccelState *accel, MachineState *ms)
  {
@@ -XXX,XX +XXX,XX @@ void accel_setup_post(MachineState *ms)
          acc->setup_post(ms, accel);
      }
  }
-+
-+/* initialize the arch-independent accel operation interfaces */
++static inline Int128 int128_xor(Int128 a, Int128 b)
 +void accel_init_ops_interfaces(AccelClass *ac)
 +{
-+    const char *ac_name;
++    return a ^ b;
 +    char *ops_name;
 +    AccelOpsClass *ops;
 +
 +    ac_name = object_class_get_name(OBJECT_CLASS(ac));
 +    g_assert(ac_name != NULL);
 +
 +    ops_name = g_strdup_printf("%s" ACCEL_OPS_SUFFIX, ac_name);
 +    ops = ACCEL_OPS_CLASS(object_class_by_name(ops_name));
 +    g_free(ops_name);
 +
 +    /*
 +     * all accelerators need to define ops, providing at least a mandatory
 +     * non-NULL create_vcpu_thread operation.
 +     */
 +    g_assert(ops != NULL);
 +    if (ops->ops_init) {
 +        ops->ops_init(ops);
 +    }
 +    cpus_register_accel(ops);
 +}
 +
-+static const TypeInfo accel_ops_type_info = {
+ static inline Int128 int128_rshift(Int128 a, int n)
-+    .name = TYPE_ACCEL_OPS,
+ {
-+    .parent = TYPE_OBJECT,
+     return a >> n;
-+    .abstract = true,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
-+    .class_size = sizeof(AccelOpsClass),
+     return int128_make128(a, (a < 0) ? -1 : 0);
-+};
+ }
-+
-+static void accel_softmmu_register_types(void)
++static inline Int128 int128_not(Int128 a)
 +{
-+    type_register_static(&accel_ops_type_info);
++    return int128_make128(~a.lo, ~a.hi);
 +}
 +type_init(accel_softmmu_register_types);
 diff --git a/accel/kvm/kvm-cpus.c b/accel/kvm/kvm-accel-ops.c
 similarity index 72%
 rename from accel/kvm/kvm-cpus.c
 rename to accel/kvm/kvm-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/kvm/kvm-cpus.c
 +++ b/accel/kvm/kvm-accel-ops.c
@@ -XXX,XX +XXX,XX @@ static void kvm_start_vcpu_thread(CPUState *cpu)
                         cpu, QEMU_THREAD_JOINABLE);
  }
 -const CpusAccel kvm_cpus = {
 -    .create_vcpu_thread = kvm_start_vcpu_thread,
 +static void kvm_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 -    .synchronize_post_reset = kvm_cpu_synchronize_post_reset,
 -    .synchronize_post_init = kvm_cpu_synchronize_post_init,
 -    .synchronize_state = kvm_cpu_synchronize_state,
 -    .synchronize_pre_loadvm = kvm_cpu_synchronize_pre_loadvm,
 +    ops->create_vcpu_thread = kvm_start_vcpu_thread;
 +    ops->synchronize_post_reset = kvm_cpu_synchronize_post_reset;
 +    ops->synchronize_post_init = kvm_cpu_synchronize_post_init;
 +    ops->synchronize_state = kvm_cpu_synchronize_state;
 +    ops->synchronize_pre_loadvm = kvm_cpu_synchronize_pre_loadvm;
 +}
 +
-+static const TypeInfo kvm_accel_ops_type = {
+ static inline Int128 int128_and(Int128 a, Int128 b)
-+    .name = ACCEL_OPS_NAME("kvm"),
+ {
-+
+     return int128_make128(a.lo & b.lo, a.hi & b.hi);
-+    .parent = TYPE_ACCEL_OPS,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
-+    .class_init = kvm_accel_ops_class_init,
+     return int128_make128(a.lo | b.lo, a.hi | b.hi);
-+    .abstract = true,
+ }
- };
-+
++static inline Int128 int128_xor(Int128 a, Int128 b)
 +static void kvm_accel_ops_register_types(void)
 +{
-+    type_register_static(&kvm_accel_ops_type);
++    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
 +}
 +type_init(kvm_accel_ops_register_types);
 diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/kvm/kvm-all.c
 +++ b/accel/kvm/kvm-all.c
@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
          ret = ram_block_discard_disable(true);
          assert(!ret);
      }
 -
 -    cpus_register_accel(&kvm_cpus);
      return 0;
  err:
 diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/qtest/qtest.c
 +++ b/accel/qtest/qtest.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/main-loop.h"
  #include "hw/core/cpu.h"
 -const CpusAccel qtest_cpus = {
 -    .create_vcpu_thread = dummy_start_vcpu_thread,
 -    .get_virtual_clock = qtest_get_virtual_clock,
 -};
 -
  static int qtest_init_accel(MachineState *ms)
  {
 -    cpus_register_accel(&qtest_cpus);
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ static const TypeInfo qtest_accel_type = {
      .class_init = qtest_accel_class_init,
  };
 +static void qtest_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 +
 +    ops->create_vcpu_thread = dummy_start_vcpu_thread;
 +    ops->get_virtual_clock = qtest_get_virtual_clock;
 +};
 +
 +static const TypeInfo qtest_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("qtest"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = qtest_accel_ops_class_init,
 +    .abstract = true,
 +};
 +
  static void qtest_type_init(void)
  {
      type_register_static(&qtest_accel_type);
 +    type_register_static(&qtest_accel_ops_type);
  }
  type_init(qtest_type_init);
 diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-accel-ops-icount.c
 similarity index 89%
 rename from accel/tcg/tcg-cpus-icount.c
 rename to accel/tcg/tcg-accel-ops-icount.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-icount.c
 +++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "hw/boards.h"
 -#include "tcg-cpus.h"
 -#include "tcg-cpus-icount.h"
 -#include "tcg-cpus-rr.h"
 +#include "tcg-accel-ops.h"
 +#include "tcg-accel-ops-icount.h"
 +#include "tcg-accel-ops-rr.h"
  static int64_t icount_get_limit(void)
  {
@@ -XXX,XX +XXX,XX @@ void icount_prepare_for_run(CPUState *cpu)
      /*
       * These should always be cleared by icount_process_data after
       * each vCPU execution. However u16.high can be raised
 -     * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
 +     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
       */
      g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
      g_assert(cpu->icount_extra == 0);
@@ -XXX,XX +XXX,XX @@ void icount_process_data(CPUState *cpu)
      replay_mutex_unlock();
  }
 -static void icount_handle_interrupt(CPUState *cpu, int mask)
 +void icount_handle_interrupt(CPUState *cpu, int mask)
  {
      int old_mask = cpu->interrupt_request;
 -    tcg_cpus_handle_interrupt(cpu, mask);
 +    tcg_handle_interrupt(cpu, mask);
      if (qemu_cpu_is_self(cpu) &&
          !cpu->can_do_io
          && (mask & ~old_mask) != 0) {
          cpu_abort(cpu, "Raised interrupt while not in I/O function");
      }
  }
 -
 -const CpusAccel tcg_cpus_icount = {
 -    .create_vcpu_thread = rr_start_vcpu_thread,
 -    .kick_vcpu_thread = rr_kick_vcpu_thread,
 -
 -    .handle_interrupt = icount_handle_interrupt,
 -    .get_virtual_clock = icount_get,
 -    .get_elapsed_ticks = icount_get,
 -};
 diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
 similarity index 92%
 rename from accel/tcg/tcg-cpus-mttcg.c
 rename to accel/tcg/tcg-accel-ops-mttcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-mttcg.c
 +++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "hw/boards.h"
 -#include "tcg-cpus.h"
 +#include "tcg-accel-ops.h"
 +#include "tcg-accel-ops-mttcg.h"
  /*
   * In the multi-threaded case each vCPU has its own thread. The TLS
@@ -XXX,XX +XXX,XX @@ static void *mttcg_cpu_thread_fn(void *arg)
      return NULL;
  }
 -static void mttcg_kick_vcpu_thread(CPUState *cpu)
 +void mttcg_kick_vcpu_thread(CPUState *cpu)
  {
      cpu_exit(cpu);
  }
 -static void mttcg_start_vcpu_thread(CPUState *cpu)
 +void mttcg_start_vcpu_thread(CPUState *cpu)
  {
      char thread_name[VCPU_THREAD_NAME_SIZE];
@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
      cpu->hThread = qemu_thread_get_handle(cpu->thread);
  #endif
  }
 -
 -const CpusAccel tcg_cpus_mttcg = {
 -    .create_vcpu_thread = mttcg_start_vcpu_thread,
 -    .kick_vcpu_thread = mttcg_kick_vcpu_thread,
 -
 -    .handle_interrupt = tcg_cpus_handle_interrupt,
 -};
 diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-accel-ops-rr.c
 similarity index 97%
 rename from accel/tcg/tcg-cpus-rr.c
 rename to accel/tcg/tcg-accel-ops-rr.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus-rr.c
 +++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "hw/boards.h"
 -#include "tcg-cpus.h"
 -#include "tcg-cpus-rr.h"
 -#include "tcg-cpus-icount.h"
 +#include "tcg-accel-ops.h"
 +#include "tcg-accel-ops-rr.h"
 +#include "tcg-accel-ops-icount.h"
  /* Kick all RR vCPUs */
  void rr_kick_vcpu_thread(CPUState *unused)
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
          cpu->created = true;
      }
  }
 -
 -const CpusAccel tcg_cpus_rr = {
 -    .create_vcpu_thread = rr_start_vcpu_thread,
 -    .kick_vcpu_thread = rr_kick_vcpu_thread,
 -
 -    .handle_interrupt = tcg_cpus_handle_interrupt,
 -};
 diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-accel-ops.c
 similarity index 63%
 rename from accel/tcg/tcg-cpus.c
 rename to accel/tcg/tcg-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-cpus.c
 +++ b/accel/tcg/tcg-accel-ops.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "hw/boards.h"
 -#include "tcg-cpus.h"
 +#include "tcg-accel-ops.h"
 +#include "tcg-accel-ops-mttcg.h"
 +#include "tcg-accel-ops-rr.h"
 +#include "tcg-accel-ops-icount.h"
  /* common functionality among all TCG variants */
@@ -XXX,XX +XXX,XX @@ int tcg_cpus_exec(CPUState *cpu)
  }
  /* mask must never be zero, except for A20 change call */
 -void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
 +void tcg_handle_interrupt(CPUState *cpu, int mask)
  {
      g_assert(qemu_mutex_iothread_locked());
@@ -XXX,XX +XXX,XX @@ void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
          qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
      }
  }
 +
 +static void tcg_accel_ops_init(AccelOpsClass *ops)
 +{
 +    if (qemu_tcg_mttcg_enabled()) {
 +        ops->create_vcpu_thread = mttcg_start_vcpu_thread;
 +        ops->kick_vcpu_thread = mttcg_kick_vcpu_thread;
 +        ops->handle_interrupt = tcg_handle_interrupt;
 +    } else if (icount_enabled()) {
 +        ops->create_vcpu_thread = rr_start_vcpu_thread;
 +        ops->kick_vcpu_thread = rr_kick_vcpu_thread;
 +        ops->handle_interrupt = icount_handle_interrupt;
 +        ops->get_virtual_clock = icount_get;
 +        ops->get_elapsed_ticks = icount_get;
 +    } else {
 +        ops->create_vcpu_thread = rr_start_vcpu_thread;
 +        ops->kick_vcpu_thread = rr_kick_vcpu_thread;
 +        ops->handle_interrupt = tcg_handle_interrupt;
 +    }
 +}
 +
-+static void tcg_accel_ops_class_init(ObjectClass *oc, void *data)
+ static inline Int128 int128_rshift(Int128 a, int n)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 +
 +    ops->ops_init = tcg_accel_ops_init;
 +}
 +
 +static const TypeInfo tcg_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("tcg"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = tcg_accel_ops_class_init,
 +    .abstract = true,
 +};
 +
 +static void tcg_accel_ops_register_types(void)
 +{
 +    type_register_static(&tcg_accel_ops_type);
 +}
 +type_init(tcg_accel_ops_register_types);
 diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-all.c
 +++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/accel.h"
  #include "qapi/qapi-builtin-visit.h"
 -#ifndef CONFIG_USER_ONLY
 -#include "tcg-cpus.h"
 -#endif /* CONFIG_USER_ONLY */
 -
  struct TCGState {
      AccelState parent_obj;
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
       */
  #ifndef CONFIG_USER_ONLY
      tcg_region_init();
 -
 -    if (mttcg_enabled) {
 -        cpus_register_accel(&tcg_cpus_mttcg);
 -    } else if (icount_enabled()) {
 -        cpus_register_accel(&tcg_cpus_icount);
 -    } else {
 -        cpus_register_accel(&tcg_cpus_rr);
 -    }
  #endif /* !CONFIG_USER_ONLY */
      return 0;
 diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/xen/xen-all.c
 +++ b/accel/xen/xen-all.c
@@ -XXX,XX +XXX,XX @@ static void xen_setup_post(MachineState *ms, AccelState *accel)
      }
  }
 -const CpusAccel xen_cpus = {
 -    .create_vcpu_thread = dummy_start_vcpu_thread,
 -};
 -
  static int xen_init(MachineState *ms)
  {
-     MachineClass *mc = MACHINE_GET_CLASS(ms);
+     int64_t h;
@@ -XXX,XX +XXX,XX @@ static int xen_init(MachineState *ms)
       * opt out of system RAM being allocated by generic code
       */
      mc->default_ram_id = NULL;
 -
 -    cpus_register_accel(&xen_cpus);
 -
      return 0;
  }
@@ -XXX,XX +XXX,XX @@ static const TypeInfo xen_accel_type = {
      .class_init = xen_accel_class_init,
  };
 +static void xen_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 +
 +    ops->create_vcpu_thread = dummy_start_vcpu_thread;
 +}
 +
 +static const TypeInfo xen_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("xen"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = xen_accel_ops_class_init,
 +    .abstract = true,
 +};
 +
  static void xen_type_init(void)
  {
      type_register_static(&xen_accel_type);
 +    type_register_static(&xen_accel_ops_type);
  }
 -
  type_init(xen_type_init);
 diff --git a/bsd-user/main.c b/bsd-user/main.c
 index XXXXXXX..XXXXXXX 100644
 --- a/bsd-user/main.c
 +++ b/bsd-user/main.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
  #endif
      }
 +    cpu_type = parse_cpu_option(cpu_model);
      /* init tcg before creating CPUs and to get qemu_host_page_size */
      {
          AccelClass *ac = ACCEL_GET_CLASS(current_accel());
          ac->init_machine(NULL);
 +        accel_init_interfaces(ac);
      }
 -    cpu_type = parse_cpu_option(cpu_model);
      cpu = cpu_create(cpu_type);
      env = cpu->env_ptr;
  #if defined(TARGET_SPARC) || defined(TARGET_PPC)
 diff --git a/linux-user/main.c b/linux-user/main.c
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/main.c
 +++ b/linux-user/main.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
          AccelClass *ac = ACCEL_GET_CLASS(current_accel());
          ac->init_machine(NULL);
 +        accel_init_interfaces(ac);
      }
      cpu = cpu_create(cpu_type);
      env = cpu->env_ptr;
 diff --git a/softmmu/cpus.c b/softmmu/cpus.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/cpus.c
 +++ b/softmmu/cpus.c
@@ -XXX,XX +XXX,XX @@ void hw_error(const char *fmt, ...)
  /*
   * The chosen accelerator is supposed to register this.
   */
 -static const CpusAccel *cpus_accel;
 +static const AccelOpsClass *cpus_accel;
  void cpu_synchronize_all_states(void)
  {
@@ -XXX,XX +XXX,XX @@ void cpu_remove_sync(CPUState *cpu)
      qemu_mutex_lock_iothread();
  }
 -void cpus_register_accel(const CpusAccel *ca)
 +void cpus_register_accel(const AccelOpsClass *ops)
  {
 -    assert(ca != NULL);
 -    assert(ca->create_vcpu_thread != NULL); /* mandatory */
 -    cpus_accel = ca;
 +    assert(ops != NULL);
 +    assert(ops->create_vcpu_thread != NULL); /* mandatory */
 +    cpus_accel = ops;
  }
  void qemu_init_vcpu(CPUState *cpu)
@@ -XXX,XX +XXX,XX @@ void qemu_init_vcpu(CPUState *cpu)
          cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
      }
 -    /* accelerators all implement the CpusAccel interface */
 +    /* accelerators all implement the AccelOpsClass */
      g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
      cpus_accel->create_vcpu_thread(cpu);
 diff --git a/softmmu/vl.c b/softmmu/vl.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/vl.c
 +++ b/softmmu/vl.c
@@ -XXX,XX +XXX,XX @@ static bool object_create_early(const char *type, QemuOpts *opts)
          return false;
      }
 -    /* Allocation of large amounts of memory may delay
 +    /*
 +     * Allocation of large amounts of memory may delay
       * chardev initialization for too long, and trigger timeouts
       * on software that waits for a monitor socket to be created
       * (e.g. libvirt).
@@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv, char **envp)
       *
       * Machine compat properties: object_set_machine_compat_props().
       * Accelerator compat props: object_set_accelerator_compat_props(),
 -     * called from configure_accelerator().
 +     * called from do_configure_accelerator().
       */
      machine_class = MACHINE_GET_CLASS(current_machine);
@@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv, char **envp)
      if (cpu_option) {
          current_machine->cpu_type = parse_cpu_option(cpu_option);
      }
 +    /* NB: for machine none cpu_type could STILL be NULL here! */
 +    accel_init_interfaces(ACCEL_GET_CLASS(current_machine->accelerator));
      qemu_resolve_machine_memdev();
      parse_numa_opts(current_machine);
 diff --git a/target/i386/hax/hax-cpus.c b/target/i386/hax/hax-accel-ops.c
 similarity index 69%
 rename from target/i386/hax/hax-cpus.c
 rename to target/i386/hax/hax-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-cpus.c
 +++ b/target/i386/hax/hax-accel-ops.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpus.h"
  #include "qemu/guest-random.h"
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  static void *hax_cpu_thread_fn(void *arg)
  {
@@ -XXX,XX +XXX,XX @@ static void hax_start_vcpu_thread(CPUState *cpu)
  #endif
  }
 -const CpusAccel hax_cpus = {
 -    .create_vcpu_thread = hax_start_vcpu_thread,
 -    .kick_vcpu_thread = hax_kick_vcpu_thread,
 +static void hax_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 -    .synchronize_post_reset = hax_cpu_synchronize_post_reset,
 -    .synchronize_post_init = hax_cpu_synchronize_post_init,
 -    .synchronize_state = hax_cpu_synchronize_state,
 -    .synchronize_pre_loadvm = hax_cpu_synchronize_pre_loadvm,
 +    ops->create_vcpu_thread = hax_start_vcpu_thread;
 +    ops->kick_vcpu_thread = hax_kick_vcpu_thread;
 +
 +    ops->synchronize_post_reset = hax_cpu_synchronize_post_reset;
 +    ops->synchronize_post_init = hax_cpu_synchronize_post_init;
 +    ops->synchronize_state = hax_cpu_synchronize_state;
 +    ops->synchronize_pre_loadvm = hax_cpu_synchronize_pre_loadvm;
 +}
 +
 +static const TypeInfo hax_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("hax"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = hax_accel_ops_class_init,
 +    .abstract = true,
  };
 +
 +static void hax_accel_ops_register_types(void)
 +{
 +    type_register_static(&hax_accel_ops_type);
 +}
 +type_init(hax_accel_ops_register_types);
 diff --git a/target/i386/hax/hax-all.c b/target/i386/hax/hax-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-all.c
 +++ b/target/i386/hax/hax-all.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/runstate.h"
  #include "hw/boards.h"
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  #define DEBUG_HAX 0
@@ -XXX,XX +XXX,XX @@ static int hax_accel_init(MachineState *ms)
                  !ret ? "working" : "not working",
                  !ret ? "fast virt" : "emulation");
      }
 -    if (ret == 0) {
 -        cpus_register_accel(&hax_cpus);
 -    }
      return ret;
  }
 diff --git a/target/i386/hax/hax-mem.c b/target/i386/hax/hax-mem.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-mem.c
 +++ b/target/i386/hax/hax-mem.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/address-spaces.h"
  #include "qemu/error-report.h"
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  #include "qemu/queue.h"
  #define DEBUG_HAX_MEM 0
 diff --git a/target/i386/hax/hax-posix.c b/target/i386/hax/hax-posix.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-posix.c
 +++ b/target/i386/hax/hax-posix.c
@@ -XXX,XX +XXX,XX @@
  #include <sys/ioctl.h>
  #include "sysemu/cpus.h"
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  hax_fd hax_mod_open(void)
  {
 diff --git a/target/i386/hax/hax-windows.c b/target/i386/hax/hax-windows.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-windows.c
 +++ b/target/i386/hax/hax-windows.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "cpu.h"
 -#include "hax-cpus.h"
 +#include "hax-accel-ops.h"
  /*
   * return 0 when success, -1 when driver not loaded,
 diff --git a/target/i386/hvf/hvf-cpus.c b/target/i386/hvf/hvf-accel-ops.c
 similarity index 84%
 rename from target/i386/hvf/hvf-cpus.c
 rename to target/i386/hvf/hvf-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/hvf-cpus.c
 +++ b/target/i386/hvf/hvf-accel-ops.c
@@ -XXX,XX +XXX,XX @@
  #include "target/i386/cpu.h"
  #include "qemu/guest-random.h"
 -#include "hvf-cpus.h"
 +#include "hvf-accel-ops.h"
  /*
   * The HVF-specific vCPU thread function. This one should only run when the host
@@ -XXX,XX +XXX,XX @@ static void hvf_start_vcpu_thread(CPUState *cpu)
                         cpu, QEMU_THREAD_JOINABLE);
  }
 -const CpusAccel hvf_cpus = {
 -    .create_vcpu_thread = hvf_start_vcpu_thread,
 +static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 -    .synchronize_post_reset = hvf_cpu_synchronize_post_reset,
 -    .synchronize_post_init = hvf_cpu_synchronize_post_init,
 -    .synchronize_state = hvf_cpu_synchronize_state,
 -    .synchronize_pre_loadvm = hvf_cpu_synchronize_pre_loadvm,
 +    ops->create_vcpu_thread = hvf_start_vcpu_thread;
 +
 +    ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
 +    ops->synchronize_post_init = hvf_cpu_synchronize_post_init;
 +    ops->synchronize_state = hvf_cpu_synchronize_state;
 +    ops->synchronize_pre_loadvm = hvf_cpu_synchronize_pre_loadvm;
  };
 +static const TypeInfo hvf_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("hvf"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = hvf_accel_ops_class_init,
 +    .abstract = true,
 +};
 +static void hvf_accel_ops_register_types(void)
 +{
 +    type_register_static(&hvf_accel_ops_type);
 +}
 +type_init(hvf_accel_ops_register_types);
 diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/hvf.c
 +++ b/target/i386/hvf/hvf.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/accel.h"
  #include "target/i386/cpu.h"
 -#include "hvf-cpus.h"
 +#include "hvf-accel-ops.h"
  HVFState *hvf_state;
@@ -XXX,XX +XXX,XX @@ static int hvf_accel_init(MachineState *ms)
      hvf_state = s;
      memory_listener_register(&hvf_memory_listener, &address_space_memory);
 -    cpus_register_accel(&hvf_cpus);
      return 0;
  }
 diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/x86hvf.c
 +++ b/target/i386/hvf/x86hvf.c
@@ -XXX,XX +XXX,XX @@
  #include <Hypervisor/hv.h>
  #include <Hypervisor/hv_vmx.h>
 -#include "hvf-cpus.h"
 +#include "hvf-accel-ops.h"
  void hvf_set_segment(struct CPUState *cpu, struct vmx_segment *vmx_seg,
                       SegmentCache *qseg, bool is_tr)
 diff --git a/target/i386/whpx/whpx-cpus.c b/target/i386/whpx/whpx-accel-ops.c
 similarity index 71%
 rename from target/i386/whpx/whpx-cpus.c
 rename to target/i386/whpx/whpx-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/whpx/whpx-cpus.c
 +++ b/target/i386/whpx/whpx-accel-ops.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/whpx.h"
  #include "whpx-internal.h"
 -#include "whpx-cpus.h"
 +#include "whpx-accel-ops.h"
  static void *whpx_cpu_thread_fn(void *arg)
  {
@@ -XXX,XX +XXX,XX @@ static void whpx_kick_vcpu_thread(CPUState *cpu)
      }
  }
 -const CpusAccel whpx_cpus = {
 -    .create_vcpu_thread = whpx_start_vcpu_thread,
 -    .kick_vcpu_thread = whpx_kick_vcpu_thread,
 +static void whpx_accel_ops_class_init(ObjectClass *oc, void *data)
 +{
 +    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 -    .synchronize_post_reset = whpx_cpu_synchronize_post_reset,
 -    .synchronize_post_init = whpx_cpu_synchronize_post_init,
 -    .synchronize_state = whpx_cpu_synchronize_state,
 -    .synchronize_pre_loadvm = whpx_cpu_synchronize_pre_loadvm,
 +    ops->create_vcpu_thread = whpx_start_vcpu_thread;
 +    ops->kick_vcpu_thread = whpx_kick_vcpu_thread;
 +
 +    ops->synchronize_post_reset = whpx_cpu_synchronize_post_reset;
 +    ops->synchronize_post_init = whpx_cpu_synchronize_post_init;
 +    ops->synchronize_state = whpx_cpu_synchronize_state;
 +    ops->synchronize_pre_loadvm = whpx_cpu_synchronize_pre_loadvm;
 +}
 +
 +static const TypeInfo whpx_accel_ops_type = {
 +    .name = ACCEL_OPS_NAME("whpx"),
 +
 +    .parent = TYPE_ACCEL_OPS,
 +    .class_init = whpx_accel_ops_class_init,
 +    .abstract = true,
  };
 +
 +static void whpx_accel_ops_register_types(void)
 +{
 +    type_register_static(&whpx_accel_ops_type);
 +}
 +type_init(whpx_accel_ops_register_types);
 diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/whpx/whpx-all.c
 +++ b/target/i386/whpx/whpx-all.c
@@ -XXX,XX +XXX,XX @@
  #include "migration/blocker.h"
  #include <winerror.h>
 -#include "whpx-cpus.h"
  #include "whpx-internal.h"
 +#include "whpx-accel-ops.h"
 +
 +#include <WinHvPlatform.h>
 +#include <WinHvEmulation.h>
  #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
@@ -XXX,XX +XXX,XX @@ static int whpx_accel_init(MachineState *ms)
      whpx_memory_init();
 -    cpus_register_accel(&whpx_cpus);
 -
      printf("Windows Hypervisor Platform accelerator is operational\n");
      return 0;
 diff --git a/MAINTAINERS b/MAINTAINERS
 index XXXXXXX..XXXXXXX 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ M: Richard Henderson <richard.henderson@linaro.org>
  R: Paolo Bonzini <pbonzini@redhat.com>
  S: Maintained
  F: include/qemu/accel.h
 -F: accel/accel.c
 +F: include/sysemu/accel-ops.h
 +F: accel/accel-*.c
  F: accel/Makefile.objs
  F: accel/stubs/Makefile.objs
 diff --git a/accel/kvm/meson.build b/accel/kvm/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/kvm/meson.build
 +++ b/accel/kvm/meson.build
@@ -XXX,XX +XXX,XX @@
  kvm_ss = ss.source_set()
  kvm_ss.add(files(
    'kvm-all.c',
 -  'kvm-cpus.c',
 +  'kvm-accel-ops.c',
  ))
  kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c'))
 diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/meson.build
 +++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@ specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
  specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
    'cputlb.c',
 -  'tcg-cpus.c',
 -  'tcg-cpus-mttcg.c',
 -  'tcg-cpus-icount.c',
 -  'tcg-cpus-rr.c'
 +  'tcg-accel-ops.c',
 +  'tcg-accel-ops-mttcg.c',
 +  'tcg-accel-ops-icount.c',
 +  'tcg-accel-ops-rr.c'
  ))
 diff --git a/target/i386/hax/meson.build b/target/i386/hax/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/meson.build
 +++ b/target/i386/hax/meson.build
@@ -XXX,XX +XXX,XX @@
  i386_softmmu_ss.add(when: 'CONFIG_HAX', if_true: files(
    'hax-all.c',
    'hax-mem.c',
 -  'hax-cpus.c',
 +  'hax-accel-ops.c',
  ))
  i386_softmmu_ss.add(when: ['CONFIG_HAX', 'CONFIG_POSIX'], if_true: files('hax-posix.c'))
  i386_softmmu_ss.add(when: ['CONFIG_HAX', 'CONFIG_WIN32'], if_true: files('hax-windows.c'))
 diff --git a/target/i386/hvf/meson.build b/target/i386/hvf/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/meson.build
 +++ b/target/i386/hvf/meson.build
@@ -XXX,XX +XXX,XX @@
  i386_softmmu_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files(
    'hvf.c',
 -  'hvf-cpus.c',
 +  'hvf-accel-ops.c',
    'x86.c',
    'x86_cpuid.c',
    'x86_decode.c',
 diff --git a/target/i386/whpx/meson.build b/target/i386/whpx/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/whpx/meson.build
 +++ b/target/i386/whpx/meson.build
@@ -XXX,XX +XXX,XX @@
  i386_softmmu_ss.add(when: 'CONFIG_WHPX', if_true: files(
    'whpx-all.c',
    'whpx-apic.c',
 -  'whpx-cpus.c',
 +  'whpx-accel-ops.c',
  ))
 --
 .25.1

-[PULL 13/46] tcg/tci: Merge INDEX_op_ld8u_{i32,i64}
+[PULL 02/56] host-utils: move checks out of divu128/divs128
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+From: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 20 +++++++++++++-------
+ include/hw/clock.h        |  5 +++--
-file changed, 13 insertions(+), 7 deletions(-)
+ include/qemu/host-utils.h | 34 ++++++++++++---------------------
+ target/ppc/int_helper.c   | 14 +++++++++-----
-diff --git a/tcg/tci.c b/tcg/tci.c
+ util/host-utils.c         | 40 ++++++++++++++++++---------------------
-index XXXXXXX..XXXXXXX 100644
+files changed, 42 insertions(+), 51 deletions(-)
---- a/tcg/tci.c
-+++ b/tcg/tci.c
+diff --git a/include/hw/clock.h b/include/hw/clock.h
-@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
+index XXXXXXX..XXXXXXX 100644
- # define qemu_st_beq(X)  stq_be_p(g2h(taddr), X)
+--- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
-+#if TCG_TARGET_REG_BITS == 64
-+# define CASE_32_64(x) \
-+        case glue(glue(INDEX_op_, x), _i64): \
-+        case glue(glue(INDEX_op_, x), _i32):
-+# define CASE_64(x) \
-+        case glue(glue(INDEX_op_, x), _i64):
-+#else
-+# define CASE_32_64(x) \
-+        case glue(glue(INDEX_op_, x), _i32):
-+# define CASE_64(x)
-+#endif
-+
- /* Interpret pseudo code in tb. */
- /*
-  * Disable CFI checks.
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-             /* Load/store operations (32 bit). */
--        case INDEX_op_ld8u_i32:
-+        CASE_32_64(ld8u)
-             t0 = *tb_ptr++;
-             t1 = tci_read_r(regs, &tb_ptr);
-             t2 = tci_read_s32(&tb_ptr);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-             /* Load/store operations (64 bit). */
--        case INDEX_op_ld8u_i64:
--            t0 = *tb_ptr++;
--            t1 = tci_read_r(regs, &tb_ptr);
--            t2 = tci_read_s32(&tb_ptr);
--            tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
--            break;
-         case INDEX_op_ld8s_i64:
-             t0 = *tb_ptr++;
-             t1 = tci_read_r(regs, &tb_ptr);
 --
 .25.1

-[PULL 25/46] tcg/tci: Remove TODO as unused
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+From: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 8 --------
+ include/fpu/softfloat-macros.h | 82 ----------------------------------
-file changed, 8 deletions(-)
+ include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
+files changed, 81 insertions(+), 82 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/include/fpu/softfloat-macros.h
-+++ b/tcg/tci.c
++++ b/include/fpu/softfloat-macros.h
 @@ -XXX,XX +XXX,XX @@
- #include "tcg/tcg-op.h"
+  * so some portions are provided under:
- #include "qemu/compiler.h"
+  *  the SoftFloat-2a license
+  *  the BSD license
--/* Marker for missing code. */
+- *  GPL-v2-or-later
--#define TODO() \
+  *
--    do { \
+  * Any future contributions to this file after December 1st 2014 will be
--        fprintf(stderr, "TODO %s:%u: %s()\n", \
+  * taken to be licensed under the Softfloat-2a license unless specifically
--                __FILE__, __LINE__, __func__); \
+@@ -XXX,XX +XXX,XX @@ this code that are retained.
--        tcg_abort(); \
+  * THE POSSIBILITY OF SUCH DAMAGE.
--    } while (0)
+  */
--
- #if MAX_OPC_PARAM_IARGS != 6
+-/* Portions of this work are licensed under the terms of the GNU GPL,
- # error Fix needed, number of supported input arguments changed!
+- * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-[PULL 02/46] exec/cpu-defs: Remove TCG backends dependency
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
-From: Philippe Mathieu-Daudé <f4bug@amsat.org>
+From: Luis Pires <luis.pires@eldorado.org.br>
-"exec/cpu-defs.h" contains generic CPU definitions for the
+These will be used to implement new decimal floating point
-TCG frontends (mostly related to TLB). TCG backends definitions
+instructions from Power ISA 3.1.
-aren't relevant here.
+The remainder is now returned directly by divu128/divs128,
-See tcg/README description:
+freeing up phigh to receive the high 64 bits of the quotient.
-) Backend
+Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-  tcg-target.h contains the target specific definitions. tcg-target.c.inc
+Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
   contains the target specific code; it is #included by tcg/tcg.c, rather
   than being a standalone C file.
 So far only "tcg/tcg.h" requires these headers.
 Remove the "target-tcg.h" header dependency on TCG frontends, so we
 don't have to rebuild all frontends when hacking a single backend.
 Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20210204191423.1754158-1-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-defs.h | 3 ---
+ include/hw/clock.h        |   6 +-
-file changed, 3 deletions(-)
+ include/qemu/host-utils.h |  20 ++++--
+ target/ppc/int_helper.c   |   9 +--
-diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
+ util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
-index XXXXXXX..XXXXXXX 100644
+files changed, 108 insertions(+), 60 deletions(-)
---- a/include/exec/cpu-defs.h
-+++ b/include/exec/cpu-defs.h
+diff --git a/include/hw/clock.h b/include/hw/clock.h
-@@ -XXX,XX +XXX,XX @@
+index XXXXXXX..XXXXXXX 100644
+--- a/include/hw/clock.h
- #include "qemu/host-utils.h"
++++ b/include/hw/clock.h
- #include "qemu/thread.h"
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
--#ifdef CONFIG_TCG
+     if (clk->period == 0) {
--#include "tcg-target.h"
+         return 0;
--#endif
+     }
- #ifndef CONFIG_USER_ONLY
+-    /*
- #include "exec/hwaddr.h"
+-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-[PULL 44/46] accel: extend AccelState and AccelClass to user-mode
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
-From: Claudio Fontana <cfontana@suse.de>
+From: Luis Pires <luis.pires@eldorado.org.br>
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
+Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 [claudio: rebased on Richard's splitwx work]
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Message-Id: <20210204163931.7358-17-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/boards.h                |  2 +-
+ tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
- include/{sysemu => qemu}/accel.h   | 14 +++++----
+ tests/unit/meson.build   |   1 +
- include/sysemu/hvf.h               |  2 +-
+files changed, 198 insertions(+)
- include/sysemu/kvm.h               |  2 +-
+ create mode 100644 tests/unit/test-div128.c
- include/sysemu/kvm_int.h           |  2 +-
- target/i386/hvf/hvf-i386.h         |  2 +-
+diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
  accel/accel-common.c               | 50 ++++++++++++++++++++++++++++++
  accel/{accel.c => accel-softmmu.c} | 27 ++--------------
  accel/accel-user.c                 | 24 ++++++++++++++
  accel/qtest/qtest.c                |  2 +-
  accel/tcg/tcg-all.c                | 15 +++++++--
  accel/xen/xen-all.c                |  2 +-
  bsd-user/main.c                    |  6 +++-
  linux-user/main.c                  |  6 +++-
  softmmu/memory.c                   |  2 +-
  softmmu/qtest.c                    |  2 +-
  softmmu/vl.c                       |  2 +-
  target/i386/hax/hax-all.c          |  2 +-
  target/i386/hvf/hvf.c              |  2 +-
  target/i386/hvf/x86_task.c         |  2 +-
  target/i386/whpx/whpx-all.c        |  2 +-
  MAINTAINERS                        |  2 +-
  accel/meson.build                  |  4 ++-
  accel/tcg/meson.build              |  2 +-
 files changed, 125 insertions(+), 53 deletions(-)
  rename include/{sysemu => qemu}/accel.h (95%)
  create mode 100644 accel/accel-common.c
  rename accel/{accel.c => accel-softmmu.c} (75%)
  create mode 100644 accel/accel-user.c
 diff --git a/include/hw/boards.h b/include/hw/boards.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/boards.h
 +++ b/include/hw/boards.h
@@ -XXX,XX +XXX,XX @@
  #include "exec/memory.h"
  #include "sysemu/hostmem.h"
  #include "sysemu/blockdev.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "qapi/qapi-types-machine.h"
  #include "qemu/module.h"
  #include "qom/object.h"
 diff --git a/include/sysemu/accel.h b/include/qemu/accel.h
 similarity index 95%
 rename from include/sysemu/accel.h
 rename to include/qemu/accel.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/accel.h
 +++ b/include/qemu/accel.h
@@ -XXX,XX +XXX,XX @@
   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   * THE SOFTWARE.
   */
 -#ifndef HW_ACCEL_H
 -#define HW_ACCEL_H
 +#ifndef QEMU_ACCEL_H
 +#define QEMU_ACCEL_H
  #include "qom/object.h"
  #include "exec/hwaddr.h"
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
      /*< public >*/
      const char *name;
 -#ifndef CONFIG_USER_ONLY
      int (*init_machine)(MachineState *ms);
 +#ifndef CONFIG_USER_ONLY
      void (*setup_post)(MachineState *ms, AccelState *accel);
      bool (*has_memory)(MachineState *ms, AddressSpace *as,
                         hwaddr start_addr, hwaddr size);
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
      OBJECT_GET_CLASS(AccelClass, (obj), TYPE_ACCEL)
  AccelClass *accel_find(const char *opt_name);
 +AccelState *current_accel(void);
 +
 +#ifndef CONFIG_USER_ONLY
  int accel_init_machine(AccelState *accel, MachineState *ms);
  /* Called just before os_setup_post (ie just before drop OS privs) */
  void accel_setup_post(MachineState *ms);
 +#endif /* !CONFIG_USER_ONLY */
 -AccelState *current_accel(void);
 -
 -#endif
 +#endif /* QEMU_ACCEL_H */
 diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/hvf.h
 +++ b/include/sysemu/hvf.h
@@ -XXX,XX +XXX,XX @@
  #ifndef HVF_H
  #define HVF_H
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "qom/object.h"
  #ifdef CONFIG_HVF
 diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/kvm.h
 +++ b/include/sysemu/kvm.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu/queue.h"
  #include "hw/core/cpu.h"
  #include "exec/memattrs.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "qom/object.h"
  #ifdef NEED_CPU_H
 diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/sysemu/kvm_int.h
 +++ b/include/sysemu/kvm_int.h
@@ -XXX,XX +XXX,XX @@
  #define QEMU_KVM_INT_H
  #include "exec/memory.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/kvm.h"
  typedef struct KVMSlot
 diff --git a/target/i386/hvf/hvf-i386.h b/target/i386/hvf/hvf-i386.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/hvf-i386.h
 +++ b/target/i386/hvf/hvf-i386.h
@@ -XXX,XX +XXX,XX @@
  #ifndef HVF_I386_H
  #define HVF_I386_H
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/hvf.h"
  #include "cpu.h"
  #include "x86.h"
 diff --git a/accel/accel-common.c b/accel/accel-common.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/accel/accel-common.c
++++ b/tests/unit/test-div128.c
 @@ -XXX,XX +XXX,XX @@
 +/*
-+ * QEMU accel class, components common to system emulation and user mode
++ * Test 128-bit division functions
 + *
-+ * Copyright (c) 2003-2008 Fabrice Bellard
++ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
-+ * Copyright (c) 2014 Red Hat Inc.
++ *
-+ *
++ * This library is free software; you can redistribute it and/or
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * modify it under the terms of the GNU Lesser General Public
-+ * of this software and associated documentation files (the "Software"), to deal
++ * License as published by the Free Software Foundation; either
-+ * in the Software without restriction, including without limitation the rights
++ * version 2.1 of the License, or (at your option) any later version.
-+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++ *
-+ * copies of the Software, and to permit persons to whom the Software is
++ * This library is distributed in the hope that it will be useful,
-+ * furnished to do so, subject to the following conditions:
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ *
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * The above copyright notice and this permission notice shall be included in
++ * Lesser General Public License for more details.
-+ * all copies or substantial portions of the Software.
++ *
-+ *
++ * You should have received a copy of the GNU Lesser General Public
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
-+#include "qemu/accel.h"
++#include "qemu/host-utils.h"
 +
-+static const TypeInfo accel_type = {
++typedef struct {
-+    .name = TYPE_ACCEL,
++    uint64_t high;
-+    .parent = TYPE_OBJECT,
++    uint64_t low;
-+    .class_size = sizeof(AccelClass),
++    uint64_t rhigh;
-+    .instance_size = sizeof(AccelState),
++    uint64_t rlow;
 +    uint64_t divisor;
 +    uint64_t remainder;
 +} test_data_unsigned;
 +
 +typedef struct {
 +    int64_t high;
 +    uint64_t low;
 +    int64_t rhigh;
 +    uint64_t rlow;
 +    int64_t divisor;
 +    int64_t remainder;
 +} test_data_signed;
 +
 +static const test_data_unsigned test_table_unsigned[] = {
 +    /* Dividend fits in 64 bits */
 +    { 0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000003ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000002ULL, 0x0000000000000001ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0xa000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000002ULL,
 +      0x4000000000000000ULL, 0x2000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x8000000000000000ULL, 0x0000000000000000ULL},
 +
 +    /* Dividend > 64 bits, with MSB 0 */
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x000000000000000dULL,
 +      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
 +      0x0000000000000010ULL, 0x0000000000000001ULL},
 +
 +    /* Dividend > 64 bits, with MSB 1 */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
 +      0x0000000000000010ULL, 0x000000000000000fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
 +      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
 +
 +    /**
 +     * Divisor == 64 bits, with MSB 1
 +     * and high 64 bits of dividend >= divisor
 +     * (for testing normalization)
 +     */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0xfddbb9977553310aULL,
 +      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
 +
 +    /* Dividend > 64 bits, divisor almost as big */
 +    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
 +      0x0000000000000000ULL, 0x000000000000000fULL,
 +      0x123456789abcdefeULL, 0x123456789abcde1fULL},
 +};
 +
-+/* Lookup AccelClass from opt_name. Returns NULL if not found */
++static const test_data_signed test_table_signed[] = {
-+AccelClass *accel_find(const char *opt_name)
++    /* Positive dividend, positive/negative divisors */
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
-+    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
++    int i;
-+    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
++    uint64_t rem;
-+    g_free(class_name);
++    test_data_unsigned tmp;
-+    return ac;
++
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
-+static void register_accel_types(void)
++static void test_divs128(void)
 +{
-+    type_register_static(&accel_type);
++    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
-+type_init(register_accel_types);
++int main(int argc, char **argv)
-diff --git a/accel/accel.c b/accel/accel-softmmu.c
++{
-similarity index 75%
++    g_test_init(&argc, &argv, NULL);
-rename from accel/accel.c
++    g_test_add_func("/host-utils/test_divu128", test_divu128);
-rename to accel/accel-softmmu.c
++    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
---- a/accel/accel.c
+--- a/tests/unit/meson.build
-+++ b/accel/accel-softmmu.c
++++ b/tests/unit/meson.build
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ tests = {
- /*
+   # all code tested by test-x86-cpuid is inside topology.h
-- * QEMU System Emulator, accelerator interfaces
+   'test-x86-cpuid': [],
-+ * QEMU accel class, system emulation components
+   'test-cutils': [],
-  *
++  'test-div128': [],
-  * Copyright (c) 2003-2008 Fabrice Bellard
+   'test-shift128': [],
-  * Copyright (c) 2014 Red Hat Inc.
+   'test-mul64': [],
-@@ -XXX,XX +XXX,XX @@
+   # all code tested by test-int128 is inside int128.h
   */
  #include "qemu/osdep.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "hw/boards.h"
  #include "sysemu/arch_init.h"
  #include "sysemu/sysemu.h"
  #include "qom/object.h"
 -static const TypeInfo accel_type = {
 -    .name = TYPE_ACCEL,
 -    .parent = TYPE_OBJECT,
 -    .class_size = sizeof(AccelClass),
 -    .instance_size = sizeof(AccelState),
 -};
 -
 -/* Lookup AccelClass from opt_name. Returns NULL if not found */
 -AccelClass *accel_find(const char *opt_name)
 -{
 -    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
 -    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
 -    g_free(class_name);
 -    return ac;
 -}
 -
  int accel_init_machine(AccelState *accel, MachineState *ms)
  {
      AccelClass *acc = ACCEL_GET_CLASS(accel);
@@ -XXX,XX +XXX,XX @@ void accel_setup_post(MachineState *ms)
          acc->setup_post(ms, accel);
      }
  }
 -
 -static void register_accel_types(void)
 -{
 -    type_register_static(&accel_type);
 -}
 -
 -type_init(register_accel_types);
 diff --git a/accel/accel-user.c b/accel/accel-user.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/accel/accel-user.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * QEMU accel class, user-mode components
 + *
 + * Copyright 2021 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/accel.h"
 +
 +AccelState *current_accel(void)
 +{
 +    static AccelState *accel;
 +
 +    if (!accel) {
 +        AccelClass *ac = accel_find("tcg");
 +
 +        g_assert(ac != NULL);
 +        accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));
 +    }
 +    return accel;
 +}
 diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/qtest/qtest.c
 +++ b/accel/qtest/qtest.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/module.h"
  #include "qemu/option.h"
  #include "qemu/config-file.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/qtest.h"
  #include "sysemu/cpus.h"
  #include "sysemu/cpu-timers.h"
 diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-all.c
 +++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg.h"
  #include "qapi/error.h"
  #include "qemu/error-report.h"
 -#include "hw/boards.h"
 +#include "qemu/accel.h"
  #include "qapi/qapi-builtin-visit.h"
 +
 +#ifndef CONFIG_USER_ONLY
  #include "tcg-cpus.h"
 +#endif /* CONFIG_USER_ONLY */
  struct TCGState {
      AccelState parent_obj;
@@ -XXX,XX +XXX,XX @@ static void tcg_accel_instance_init(Object *obj)
      s->mttcg_enabled = default_mttcg_enabled();
      /* If debugging enabled, default "auto on", otherwise off. */
 -#ifdef CONFIG_DEBUG_TCG
 +#if defined(CONFIG_DEBUG_TCG) && !defined(CONFIG_USER_ONLY)
      s->splitwx_enabled = -1;
  #else
      s->splitwx_enabled = 0;
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
      mttcg_enabled = s->mttcg_enabled;
      /*
 -     * Initialize TCG regions
 +     * Initialize TCG regions only for softmmu.
 +     *
 +     * This needs to be done later for user mode, because the prologue
 +     * generation needs to be delayed so that GUEST_BASE is already set.
       */
 +#ifndef CONFIG_USER_ONLY
      tcg_region_init();
      if (mttcg_enabled) {
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
      } else {
          cpus_register_accel(&tcg_cpus_rr);
      }
 +#endif /* !CONFIG_USER_ONLY */
 +
      return 0;
  }
 diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/xen/xen-all.c
 +++ b/accel/xen/xen-all.c
@@ -XXX,XX +XXX,XX @@
  #include "hw/xen/xen-legacy-backend.h"
  #include "hw/xen/xen_pt.h"
  #include "chardev/char.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/cpus.h"
  #include "sysemu/xen.h"
  #include "sysemu/runstate.h"
 diff --git a/bsd-user/main.c b/bsd-user/main.c
 index XXXXXXX..XXXXXXX 100644
 --- a/bsd-user/main.c
 +++ b/bsd-user/main.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu-common.h"
  #include "qemu/units.h"
 +#include "qemu/accel.h"
  #include "sysemu/tcg.h"
  #include "qemu-version.h"
  #include <machine/trap.h>
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      }
      /* init tcg before creating CPUs and to get qemu_host_page_size */
 -    tcg_exec_init(0, false);
 +    {
 +        AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 +        ac->init_machine(NULL);
 +    }
      cpu_type = parse_cpu_option(cpu_model);
      cpu = cpu_create(cpu_type);
      env = cpu->env_ptr;
 diff --git a/linux-user/main.c b/linux-user/main.c
 index XXXXXXX..XXXXXXX 100644
 --- a/linux-user/main.c
 +++ b/linux-user/main.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu-common.h"
  #include "qemu/units.h"
 +#include "qemu/accel.h"
  #include "sysemu/tcg.h"
  #include "qemu-version.h"
  #include <sys/syscall.h>
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
      cpu_type = parse_cpu_option(cpu_model);
      /* init tcg before creating CPUs and to get qemu_host_page_size */
 -    tcg_exec_init(0, false);
 +    {
 +        AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 +        ac->init_machine(NULL);
 +    }
      cpu = cpu_create(cpu_type);
      env = cpu->env_ptr;
      cpu_reset(cpu);
 diff --git a/softmmu/memory.c b/softmmu/memory.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/memory.c
 +++ b/softmmu/memory.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/kvm.h"
  #include "sysemu/runstate.h"
  #include "sysemu/tcg.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "hw/boards.h"
  #include "migration/vmstate.h"
 diff --git a/softmmu/qtest.c b/softmmu/qtest.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/qtest.c
 +++ b/softmmu/qtest.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/ioport.h"
  #include "exec/memory.h"
  #include "hw/irq.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/cpu-timers.h"
  #include "qemu/config-file.h"
  #include "qemu/option.h"
 diff --git a/softmmu/vl.c b/softmmu/vl.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/vl.c
 +++ b/softmmu/vl.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/error-report.h"
  #include "qemu/sockets.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "hw/usb.h"
  #include "hw/isa/isa.h"
  #include "hw/scsi/scsi.h"
 diff --git a/target/i386/hax/hax-all.c b/target/i386/hax/hax-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hax/hax-all.c
 +++ b/target/i386/hax/hax-all.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/address-spaces.h"
  #include "qemu-common.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/reset.h"
  #include "sysemu/runstate.h"
  #include "hw/boards.h"
 diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/hvf.c
 +++ b/target/i386/hvf/hvf.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/address-spaces.h"
  #include "hw/i386/apic_internal.h"
  #include "qemu/main-loop.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "target/i386/cpu.h"
  #include "hvf-cpus.h"
 diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/hvf/x86_task.c
 +++ b/target/i386/hvf/x86_task.c
@@ -XXX,XX +XXX,XX @@
  #include "hw/i386/apic_internal.h"
  #include "qemu/main-loop.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "target/i386/cpu.h"
  // TODO: taskswitch handling
 diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/whpx/whpx-all.c
 +++ b/target/i386/whpx/whpx-all.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/address-spaces.h"
  #include "exec/ioport.h"
  #include "qemu-common.h"
 -#include "sysemu/accel.h"
 +#include "qemu/accel.h"
  #include "sysemu/whpx.h"
  #include "sysemu/cpus.h"
  #include "sysemu/runstate.h"
 diff --git a/MAINTAINERS b/MAINTAINERS
 index XXXXXXX..XXXXXXX 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ Overall
  M: Richard Henderson <richard.henderson@linaro.org>
  R: Paolo Bonzini <pbonzini@redhat.com>
  S: Maintained
 -F: include/sysemu/accel.h
 +F: include/qemu/accel.h
  F: accel/accel.c
  F: accel/Makefile.objs
  F: accel/stubs/Makefile.objs
 diff --git a/accel/meson.build b/accel/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/meson.build
 +++ b/accel/meson.build
@@ -XXX,XX +XXX,XX @@
 -softmmu_ss.add(files('accel.c'))
 +specific_ss.add(files('accel-common.c'))
 +softmmu_ss.add(files('accel-softmmu.c'))
 +user_ss.add(files('accel-user.c'))
  subdir('qtest')
  subdir('kvm')
 diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/meson.build
 +++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@
  tcg_ss = ss.source_set()
  tcg_ss.add(files(
 +  'tcg-all.c',
    'cpu-exec-common.c',
    'cpu-exec.c',
    'tcg-runtime-gvec.c',
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl])
  specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
  specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
 -  'tcg-all.c',
    'cputlb.c',
    'tcg-cpus.c',
    'tcg-cpus-mttcg.c',
 --
 2.25.1

-[PULL 12/46] tcg/tci: Inline tci_write_reg64 into 64-bit callers
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
-Note that we had two functions of the same name: a 32-bit version
+Prepare for tracking different masks by renaming this one.
 which took two register numbers and a 64-bit version which was a
 no-op wrapper for tcg_write_reg.  After this, we are left with
 only the 32-bit version.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 60 +++++++++++++++++++++++++------------------------------
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
- 1 file changed, 27 insertions(+), 33 deletions(-)
+ 1 file changed, 72 insertions(+), 70 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tci_write_reg64(tcg_target_ulong *regs, uint32_t high_index,
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     tci_write_reg(regs, low_index, value);
+     TCGTemp *prev_copy;
-     tci_write_reg(regs, high_index, value >> 32);
+     TCGTemp *next_copy;
      uint64_t val;
 -    uint64_t mask;
 +    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
  } TempOptInfo;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->next_copy = ts;
      ti->prev_copy = ts;
      ti->is_const = false;
 -    ti->mask = -1;
 +    ti->z_mask = -1;
  }
--#elif TCG_TARGET_REG_BITS == 64
--static void
+ static void reset_temp(TCGArg arg)
--tci_write_reg64(tcg_target_ulong *regs, TCGReg index, uint64_t value)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--{
+     if (ts->kind == TEMP_CONST) {
--    tci_write_reg(regs, index, value);
+         ti->is_const = true;
--}
+         ti->val = ts->val;
- #endif
+-        ti->mask = ts->val;
++        ti->z_mask = ts->val;
- #if TCG_TARGET_REG_BITS == 32
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             /* High bits of a 32-bit quantity are garbage.  */
-             t1 = tci_read_r64(regs, &tb_ptr);
+-            ti->mask |= ~0xffffffffull;
-             t2 = tci_read_ri64(regs, &tb_ptr);
++            ti->z_mask |= ~0xffffffffull;
-             condition = *tb_ptr++;
+         }
--            tci_write_reg64(regs, t0, tci_compare64(t1, t2, condition));
+     } else {
-+            tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
+         ti->is_const = false;
-             break;
+-        ti->mask = -1;
- #endif
++        ti->z_mask = -1;
-         case INDEX_op_mov_i32:
+     }
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+ }
-         case INDEX_op_mov_i64:
-             t0 = *tb_ptr++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-             t1 = tci_read_r64(regs, &tb_ptr);
+     const TCGOpDef *def;
--            tci_write_reg64(regs, t0, t1);
+     TempOptInfo *di;
-+            tci_write_reg(regs, t0, t1);
+     TempOptInfo *si;
-             break;
+-    uint64_t mask;
-         case INDEX_op_tci_movi_i64:
++    uint64_t z_mask;
-             t0 = *tb_ptr++;
+     TCGOpcode new_op;
-             t1 = tci_read_i64(&tb_ptr);
--            tci_write_reg64(regs, t0, t1);
+     if (ts_are_copies(dst_ts, src_ts)) {
-+            tci_write_reg(regs, t0, t1);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-             break;
+     op->args[0] = dst;
+     op->args[1] = src;
-             /* Load/store operations (64 bit). */
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-    mask = si->mask;
-             t0 = *tb_ptr++;
++    z_mask = si->z_mask;
-             t1 = tci_read_r(regs, &tb_ptr);
+     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-             t2 = tci_read_s32(&tb_ptr);
+         /* High bits of the destination are now garbage.  */
--            tci_write_reg64(regs, t0, *(uint64_t *)(t1 + t2));
+-        mask |= ~0xffffffffull;
-+            tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
++        z_mask |= ~0xffffffffull;
-             break;
+     }
-         case INDEX_op_st8_i64:
+-    di->mask = mask;
-             t0 = tci_read_r8(regs, &tb_ptr);
++    di->z_mask = z_mask;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-             t0 = *tb_ptr++;
+     if (src_ts->type == dst_ts->type) {
-             t1 = tci_read_ri64(regs, &tb_ptr);
+         TempOptInfo *ni = ts_info(si->next_copy);
-             t2 = tci_read_ri64(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            tci_write_reg64(regs, t0, t1 + t2);
+     }
-+            tci_write_reg(regs, t0, t1 + t2);
-             break;
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-         case INDEX_op_sub_i64:
+-        uint64_t mask, partmask, affected, tmp;
-             t0 = *tb_ptr++;
++        uint64_t z_mask, partmask, affected, tmp;
-             t1 = tci_read_ri64(regs, &tb_ptr);
+         int nb_oargs, nb_iargs;
-             t2 = tci_read_ri64(regs, &tb_ptr);
+         TCGOpcode opc = op->opc;
--            tci_write_reg64(regs, t0, t1 - t2);
+         const TCGOpDef *def = &tcg_op_defs[opc];
-+            tci_write_reg(regs, t0, t1 - t2);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-             break;
-         case INDEX_op_mul_i64:
+         /* Simplify using known-zero bits. Currently only ops with a single
-             t0 = *tb_ptr++;
+            output argument is supported. */
-             t1 = tci_read_ri64(regs, &tb_ptr);
+-        mask = -1;
-             t2 = tci_read_ri64(regs, &tb_ptr);
++        z_mask = -1;
--            tci_write_reg64(regs, t0, t1 * t2);
+         affected = -1;
-+            tci_write_reg(regs, t0, t1 * t2);
+         switch (opc) {
-             break;
+         CASE_OP_32_64(ext8s):
- #if TCG_TARGET_HAS_div_i64
+-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
-         case INDEX_op_div_i64:
++            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+                 break;
-             t0 = *tb_ptr++;
+             }
-             t1 = tci_read_ri64(regs, &tb_ptr);
+             QEMU_FALLTHROUGH;
-             t2 = tci_read_ri64(regs, &tb_ptr);
+         CASE_OP_32_64(ext8u):
--            tci_write_reg64(regs, t0, t1 & t2);
+-            mask = 0xff;
-+            tci_write_reg(regs, t0, t1 & t2);
++            z_mask = 0xff;
-             break;
+             goto and_const;
-         case INDEX_op_or_i64:
+         CASE_OP_32_64(ext16s):
-             t0 = *tb_ptr++;
+-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
-             t1 = tci_read_ri64(regs, &tb_ptr);
++            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-             t2 = tci_read_ri64(regs, &tb_ptr);
+                 break;
--            tci_write_reg64(regs, t0, t1 | t2);
+             }
-+            tci_write_reg(regs, t0, t1 | t2);
+             QEMU_FALLTHROUGH;
-             break;
+         CASE_OP_32_64(ext16u):
-         case INDEX_op_xor_i64:
+-            mask = 0xffff;
-             t0 = *tb_ptr++;
++            z_mask = 0xffff;
-             t1 = tci_read_ri64(regs, &tb_ptr);
+             goto and_const;
-             t2 = tci_read_ri64(regs, &tb_ptr);
+         case INDEX_op_ext32s_i64:
--            tci_write_reg64(regs, t0, t1 ^ t2);
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
-+            tci_write_reg(regs, t0, t1 ^ t2);
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-             break;
+                 break;
+             }
-             /* Shift/rotate operations (64 bit). */
+             QEMU_FALLTHROUGH;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         case INDEX_op_ext32u_i64:
-             t0 = *tb_ptr++;
+-            mask = 0xffffffffU;
-             t1 = tci_read_ri64(regs, &tb_ptr);
++            z_mask = 0xffffffffU;
-             t2 = tci_read_ri64(regs, &tb_ptr);
+             goto and_const;
--            tci_write_reg64(regs, t0, t1 << (t2 & 63));
-+            tci_write_reg(regs, t0, t1 << (t2 & 63));
+         CASE_OP_32_64(and):
 -            mask = arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[2])->z_mask;
              if (arg_is_const(op->args[2])) {
          and_const:
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
 -            mask = arg_info(op->args[1])->mask & mask;
 +            z_mask = arg_info(op->args[1])->z_mask & z_mask;
              break;
          case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          case INDEX_op_extu_i32_i64:
              /* We do not compute affected as it is a size changing op.  */
 -            mask = (uint32_t)arg_info(op->args[1])->mask;
 +            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
          CASE_OP_32_64(andc):
              /* Known-zeros does not imply known-ones.  Therefore unless
                 op->args[2] is constant, we can't infer anything from it.  */
              if (arg_is_const(op->args[2])) {
 -                mask = ~arg_info(op->args[2])->mask;
 +                z_mask = ~arg_info(op->args[2])->z_mask;
                  goto and_const;
              }
              /* But we certainly know nothing outside args[1] may be set. */
 -            mask = arg_info(op->args[1])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask;
              break;
          case INDEX_op_sar_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_sar_i64:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 63;
 -                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i64:
-             t0 = *tb_ptr++;
+             if (arg_is_const(op->args[2])) {
-             t1 = tci_read_ri64(regs, &tb_ptr);
+                 tmp = arg_info(op->args[2])->val & 63;
-             t2 = tci_read_ri64(regs, &tb_ptr);
+-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
--            tci_write_reg64(regs, t0, t1 >> (t2 & 63));
++                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-+            tci_write_reg(regs, t0, t1 >> (t2 & 63));
+             }
              break;
-         case INDEX_op_sar_i64:
-             t0 = *tb_ptr++;
+         case INDEX_op_extrl_i64_i32:
-             t1 = tci_read_ri64(regs, &tb_ptr);
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
-             t2 = tci_read_ri64(regs, &tb_ptr);
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
--            tci_write_reg64(regs, t0, ((int64_t)t1 >> (t2 & 63)));
+             break;
-+            tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63)));
+         case INDEX_op_extrh_i64_i32:
-             break;
+-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
- #if TCG_TARGET_HAS_rot_i64
++            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-         case INDEX_op_rotl_i64:
+             break;
-             t0 = *tb_ptr++;
-             t1 = tci_read_ri64(regs, &tb_ptr);
+         CASE_OP_32_64(shl):
-             t2 = tci_read_ri64(regs, &tb_ptr);
+             if (arg_is_const(op->args[2])) {
--            tci_write_reg64(regs, t0, rol64(t1, t2 & 63));
+                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-+            tci_write_reg(regs, t0, rol64(t1, t2 & 63));
+-                mask = arg_info(op->args[1])->mask << tmp;
-             break;
++                z_mask = arg_info(op->args[1])->z_mask << tmp;
-         case INDEX_op_rotr_i64:
+             }
-             t0 = *tb_ptr++;
+             break;
-             t1 = tci_read_ri64(regs, &tb_ptr);
-             t2 = tci_read_ri64(regs, &tb_ptr);
+         CASE_OP_32_64(neg):
--            tci_write_reg64(regs, t0, ror64(t1, t2 & 63));
+             /* Set to 1 all bits to the left of the rightmost.  */
-+            tci_write_reg(regs, t0, ror64(t1, t2 & 63));
+-            mask = -(arg_info(op->args[1])->mask
-             break;
+-                     & -arg_info(op->args[1])->mask);
- #endif
++            z_mask = -(arg_info(op->args[1])->z_mask
- #if TCG_TARGET_HAS_deposit_i64
++                       & -arg_info(op->args[1])->z_mask);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             break;
-             tmp16 = *tb_ptr++;
-             tmp8 = *tb_ptr++;
+         CASE_OP_32_64(deposit):
-             tmp64 = (((1ULL << tmp8) - 1) << tmp16);
+-            mask = deposit64(arg_info(op->args[1])->mask,
--            tci_write_reg64(regs, t0, (t1 & ~tmp64) | ((t2 << tmp16) & tmp64));
+-                             op->args[3], op->args[4],
-+            tci_write_reg(regs, t0, (t1 & ~tmp64) | ((t2 << tmp16) & tmp64));
+-                             arg_info(op->args[2])->mask);
-             break;
++            z_mask = deposit64(arg_info(op->args[1])->z_mask,
- #endif
++                               op->args[3], op->args[4],
-         case INDEX_op_brcond_i64:
++                               arg_info(op->args[2])->z_mask);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             break;
-         case INDEX_op_ext8u_i64:
-             t0 = *tb_ptr++;
+         CASE_OP_32_64(extract):
-             t1 = tci_read_r8(regs, &tb_ptr);
+-            mask = extract64(arg_info(op->args[1])->mask,
--            tci_write_reg64(regs, t0, t1);
+-                             op->args[2], op->args[3]);
-+            tci_write_reg(regs, t0, t1);
++            z_mask = extract64(arg_info(op->args[1])->z_mask,
-             break;
++                               op->args[2], op->args[3]);
- #endif
+             if (op->args[2] == 0) {
- #if TCG_TARGET_HAS_ext8s_i64
+-                affected = arg_info(op->args[1])->mask & ~mask;
-         case INDEX_op_ext8s_i64:
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-             t0 = *tb_ptr++;
+             }
-             t1 = tci_read_r8s(regs, &tb_ptr);
+             break;
--            tci_write_reg64(regs, t0, t1);
+         CASE_OP_32_64(sextract):
-+            tci_write_reg(regs, t0, t1);
+-            mask = sextract64(arg_info(op->args[1])->mask,
-             break;
+-                              op->args[2], op->args[3]);
- #endif
+-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
- #if TCG_TARGET_HAS_ext16s_i64
+-                affected = arg_info(op->args[1])->mask & ~mask;
-         case INDEX_op_ext16s_i64:
++            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-             t0 = *tb_ptr++;
++                                op->args[2], op->args[3]);
-             t1 = tci_read_r16s(regs, &tb_ptr);
++            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
--            tci_write_reg64(regs, t0, t1);
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-+            tci_write_reg(regs, t0, t1);
+             }
              break;
- #endif
- #if TCG_TARGET_HAS_ext16u_i64
+         CASE_OP_32_64(or):
-         case INDEX_op_ext16u_i64:
+         CASE_OP_32_64(xor):
-             t0 = *tb_ptr++;
+-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
-             t1 = tci_read_r16(regs, &tb_ptr);
++            z_mask = arg_info(op->args[1])->z_mask
--            tci_write_reg64(regs, t0, t1);
++                   | arg_info(op->args[2])->z_mask;
-+            tci_write_reg(regs, t0, t1);
+             break;
-             break;
- #endif
+         case INDEX_op_clz_i32:
- #if TCG_TARGET_HAS_ext32s_i64
+         case INDEX_op_ctz_i32:
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-            mask = arg_info(op->args[2])->mask | 31;
-         case INDEX_op_ext_i32_i64:
++            z_mask = arg_info(op->args[2])->z_mask | 31;
-             t0 = *tb_ptr++;
+             break;
-             t1 = tci_read_r32s(regs, &tb_ptr);
--            tci_write_reg64(regs, t0, t1);
+         case INDEX_op_clz_i64:
-+            tci_write_reg(regs, t0, t1);
+         case INDEX_op_ctz_i64:
-             break;
+-            mask = arg_info(op->args[2])->mask | 63;
- #if TCG_TARGET_HAS_ext32u_i64
++            z_mask = arg_info(op->args[2])->z_mask | 63;
-         case INDEX_op_ext32u_i64:
+             break;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-         case INDEX_op_extu_i32_i64:
+         case INDEX_op_ctpop_i32:
-             t0 = *tb_ptr++;
+-            mask = 32 | 31;
-             t1 = tci_read_r32(regs, &tb_ptr);
++            z_mask = 32 | 31;
--            tci_write_reg64(regs, t0, t1);
+             break;
-+            tci_write_reg(regs, t0, t1);
+         case INDEX_op_ctpop_i64:
-             break;
+-            mask = 64 | 63;
- #if TCG_TARGET_HAS_bswap16_i64
++            z_mask = 64 | 63;
-         case INDEX_op_bswap16_i64:
+             break;
-             t0 = *tb_ptr++;
-             t1 = tci_read_r16(regs, &tb_ptr);
+         CASE_OP_32_64(setcond):
--            tci_write_reg64(regs, t0, bswap16(t1));
+         case INDEX_op_setcond2_i32:
-+            tci_write_reg(regs, t0, bswap16(t1));
+-            mask = 1;
-             break;
++            z_mask = 1;
- #endif
+             break;
- #if TCG_TARGET_HAS_bswap32_i64
          CASE_OP_32_64(movcond):
 -            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
 +            z_mask = arg_info(op->args[3])->z_mask
 +                   | arg_info(op->args[4])->z_mask;
              break;
          CASE_OP_32_64(ld8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              break;
          CASE_OP_32_64(ld16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              break;
          case INDEX_op_ld32u_i64:
 -            mask = 0xffffffffu;
 +            z_mask = 0xffffffffu;
              break;
          CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
 -                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 +                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                  }
              }
              break;
          CASE_OP_32_64(bswap16):
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffff) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffff) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap16(mask);
 +            z_mask = bswap16(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int16_t)mask;
 +                z_mask = (int16_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(16, 48);
 +                z_mask |= MAKE_64BIT_MASK(16, 48);
                  break;
              }
              break;
          case INDEX_op_bswap32_i64:
-             t0 = *tb_ptr++;
+-            mask = arg_info(op->args[1])->mask;
-             t1 = tci_read_r32(regs, &tb_ptr);
+-            if (mask <= 0xffffffffu) {
--            tci_write_reg64(regs, t0, bswap32(t1));
++            z_mask = arg_info(op->args[1])->z_mask;
-+            tci_write_reg(regs, t0, bswap32(t1));
++            if (z_mask <= 0xffffffffu) {
-             break;
+                 op->args[2] |= TCG_BSWAP_IZ;
- #endif
+             }
- #if TCG_TARGET_HAS_bswap64_i64
+-            mask = bswap32(mask);
-         case INDEX_op_bswap64_i64:
++            z_mask = bswap32(z_mask);
-             t0 = *tb_ptr++;
+             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-             t1 = tci_read_r64(regs, &tb_ptr);
+             case TCG_BSWAP_OZ:
--            tci_write_reg64(regs, t0, bswap64(t1));
+                 break;
-+            tci_write_reg(regs, t0, bswap64(t1));
+             case TCG_BSWAP_OS:
-             break;
+-                mask = (int32_t)mask;
- #endif
++                z_mask = (int32_t)z_mask;
- #if TCG_TARGET_HAS_not_i64
+                 break;
-         case INDEX_op_not_i64:
+             default: /* undefined high bits */
-             t0 = *tb_ptr++;
+-                mask |= MAKE_64BIT_MASK(32, 32);
-             t1 = tci_read_r64(regs, &tb_ptr);
++                z_mask |= MAKE_64BIT_MASK(32, 32);
--            tci_write_reg64(regs, t0, ~t1);
+                 break;
-+            tci_write_reg(regs, t0, ~t1);
+             }
              break;
- #endif
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #if TCG_TARGET_HAS_neg_i64
+         /* 32-bit ops generate 32-bit results.  For the result is zero test
-         case INDEX_op_neg_i64:
+            below, we can ignore high bits, but for further optimizations we
-             t0 = *tb_ptr++;
+            need to record that the high bits contain garbage.  */
-             t1 = tci_read_r64(regs, &tb_ptr);
+-        partmask = mask;
--            tci_write_reg64(regs, t0, -t1);
++        partmask = z_mask;
-+            tci_write_reg(regs, t0, -t1);
+         if (!(def->flags & TCG_OPF_64BIT)) {
-             break;
+-            mask |= ~(tcg_target_ulong)0xffffffffu;
- #endif
++            z_mask |= ~(tcg_target_ulong)0xffffffffu;
- #endif /* TCG_TARGET_REG_BITS == 64 */
+             partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     vs the high word of the input.  */
              do_setcond_high:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              /* Default case: we know nothing about operation (or were unable
                 to compute the operation result) so no propagation is done.
                 We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "mask" is
 +               block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
                  memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Save the corresponding known-zero bits mask for the
                         first output argument (only one supported so far). */
                      if (i == 0) {
 -                        arg_info(op->args[i])->mask = mask;
 +                        arg_info(op->args[i])->z_mask = z_mask;
                      }
                  }
              }
 --
 2.25.1

-New patch
+[PULL 07/56] tcg/optimize: Split out OptContext
+Provide what will become a larger context for splitting
+the very large tcg_optimize function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
+ 1 file changed, 40 insertions(+), 37 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+ } TempOptInfo;
++typedef struct OptContext {
++    TCGTempSet temps_used;
++} OptContext;
++
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+ {
+     return ts->state_ptr;
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
+ }
+ /* Initialize and activate a temporary.  */
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
+ {
+     size_t idx = temp_idx(ts);
+     TempOptInfo *ti;
+-    if (test_bit(idx, temps_used->l)) {
++    if (test_bit(idx, ctx->temps_used.l)) {
+         return;
+     }
+-    set_bit(idx, temps_used->l);
++    set_bit(idx, ctx->temps_used.l);
+     ti = ts->state_ptr;
+     if (ti == NULL) {
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+     }
+ }
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
++static void init_arg_info(OptContext *ctx, TCGArg arg)
+ {
+-    init_ts_info(temps_used, arg_temp(arg));
++    init_ts_info(ctx, arg_temp(arg));
+ }
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     }
+ }
+-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
++static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
+                              TCGOp *op, TCGArg dst, uint64_t val)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+     /* Convert movi to mov with constant temp. */
+     tv = tcg_constant_internal(type, val);
+-    init_ts_info(temps_used, tv);
++    init_ts_info(ctx, tv);
+     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+ }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+     TCGOp *op, *op_next, *prev_mb = NULL;
+-    TCGTempSet temps_used;
++    OptContext ctx = {};
+     /* Array VALS has an element for each temp.
+        If this temp holds a constant then its value is kept in VALS' element.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     nb_temps = s->nb_temps;
+     nb_globals = s->nb_globals;
+-    memset(&temps_used, 0, sizeof(temps_used));
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                 TCGTemp *ts = arg_temp(op->args[i]);
+                 if (ts) {
+-                    init_ts_info(&temps_used, ts);
++                    init_ts_info(&ctx, ts);
+                 }
+             }
+         } else {
+             nb_oargs = def->nb_oargs;
+             nb_iargs = def->nb_iargs;
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&temps_used, op->args[i]);
++                init_arg_info(&ctx, op->args[i]);
+             }
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(rotr):
+             if (arg_is_const(op->args[1])
+                 && arg_info(op->args[1])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (partmask == 0) {
+             tcg_debug_assert(nb_oargs == 1);
+-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(mulsh):
+             if (arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+         CASE_OP_32_64_VEC(xor):
+             if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_extrh_i64_i32:
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 TCGArg v = arg_info(op->args[1])->val;
+                 if (v != 0) {
+                     tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = deposit64(arg_info(op->args[1])->val,
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                            op->args[1], op->args[2]);
+             if (tmp != 2) {
+                 if (tmp) {
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[3];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+                 if (tmp) {
+             do_brcond_true:
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[5];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     goto do_default;
+                 }
+             do_brcond_low:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                             op->args[5]);
+             if (tmp != 2) {
+             do_setcond_const:
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+             } else if ((op->args[5] == TCG_COND_LT
+                         || op->args[5] == TCG_COND_GE)
+                        && arg_is_const(op->args[3])
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (!(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, temps_used.l)) {
++                    if (test_bit(i, ctx.temps_used.l)) {
+                         reset_ts(&s->temps[i]);
+                     }
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                block, otherwise we only trash the output args.  "z_mask" is
+                the non-zero bits mask for the first output arg.  */
+             if (def->flags & TCG_OPF_BB_END) {
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+             } else {
+         do_reset_output:
+                 for (i = 0; i < nb_oargs; i++) {
+--
+2.25.1

-New patch
+[PULL 08/56] tcg/optimize: Remove do_default label
+Break the final cleanup clause out of the main switch
+statement.  When fully folding an opcode to mov/movi,
+use "continue" to process the next opcode, else break
+to fall into the final cleanup.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
+ 1 file changed, 94 insertions(+), 96 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+-            break;
++            continue;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+-                break;
++                continue;
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+                 nb_iargs = 1;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(not):
+         CASE_OP_32_64(neg):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(add):
+         CASE_OP_32_64(sub):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(deposit):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(sextract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract2):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(setcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(brcond):
+             tmp = do_constant_folding_cond(opc, op->args[0],
+                                            op->args[1], op->args[2]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[3];
+-                } else {
+-                    tcg_op_remove(s, op);
+-                }
++            switch (tmp) {
++            case 0:
++                tcg_op_remove(s, op);
++                continue;
++            case 1:
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[3];
+                 break;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(movcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[5]);
+             if (tmp != 2) {
+                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+-                break;
++                continue;
+             }
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+                 uint64_t tv = arg_info(op->args[3])->val;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (fv == 1 && tv == 0) {
+                     cond = tcg_invert_cond(cond);
+                 } else if (!(tv == 1 && fv == 0)) {
+-                    goto do_default;
++                    break;
+                 }
+                 op->args[3] = cond;
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                  : INDEX_op_setcond_i64);
+                 nb_iargs = 2;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_add2_i32:
+         case INDEX_op_sub2_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_mulu2_i32:
+             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_brcond2_i32:
+             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                             op->args[4]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-            do_brcond_true:
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[5];
+-                } else {
++            if (tmp == 0) {
+             do_brcond_false:
+-                    tcg_op_remove(s, op);
+-                }
+-            } else if ((op->args[4] == TCG_COND_LT
+-                        || op->args[4] == TCG_COND_GE)
+-                       && arg_is_const(op->args[2])
+-                       && arg_info(op->args[2])->val == 0
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0) {
++                tcg_op_remove(s, op);
++                continue;
++            }
++            if (tmp == 1) {
++            do_brcond_true:
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[5];
++                break;
++            }
++            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
++                 && arg_is_const(op->args[2])
++                 && arg_info(op->args[2])->val == 0
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
++                op->opc = opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_brcond_false;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_brcond_low:
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_brcond_true;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+             do_setcond_const:
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-            } else if ((op->args[5] == TCG_COND_LT
+-                        || op->args[5] == TCG_COND_GE)
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0
+-                       && arg_is_const(op->args[4])
+-                       && arg_info(op->args[4])->val == 0) {
++                continue;
++            }
++            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0
++                 && arg_is_const(op->args[4])
++                 && arg_info(op->args[4])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_setcond_high:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_setcond_high;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_setcond_low:
+                 reset_temp(op->args[0]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[2] = op->args[3];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_setcond_const;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+-        case INDEX_op_call:
+-            if (!(tcg_call_flags(op)
++        default:
++            break;
++        }
++
++        /* Some of the folding above can change opc. */
++        opc = op->opc;
++        def = &tcg_op_defs[opc];
++        if (def->flags & TCG_OPF_BB_END) {
++            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++        } else {
++            if (opc == INDEX_op_call &&
++                !(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+                     if (test_bit(i, ctx.temps_used.l)) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     }
+                 }
+             }
+-            goto do_reset_output;
+-        default:
+-        do_default:
+-            /* Default case: we know nothing about operation (or were unable
+-               to compute the operation result) so no propagation is done.
+-               We trash everything if the operation is the end of a basic
+-               block, otherwise we only trash the output args.  "z_mask" is
+-               the non-zero bits mask for the first output arg.  */
+-            if (def->flags & TCG_OPF_BB_END) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-            } else {
+-        do_reset_output:
+-                for (i = 0; i < nb_oargs; i++) {
+-                    reset_temp(op->args[i]);
+-                    /* Save the corresponding known-zero bits mask for the
+-                       first output argument (only one supported so far). */
+-                    if (i == 0) {
+-                        arg_info(op->args[i])->z_mask = z_mask;
+-                    }
++            for (i = 0; i < nb_oargs; i++) {
++                reset_temp(op->args[i]);
++                /* Save the corresponding known-zero bits mask for the
++                   first output argument (only one supported so far). */
++                if (i == 0) {
++                    arg_info(op->args[i])->z_mask = z_mask;
+                 }
+             }
+-            break;
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+--
+.25.1

-[PULL 35/46] cpu: Move debug_excp_handler to tcg_ops
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
-From: Eduardo Habkost <ehabkost@redhat.com>
+Adjust the interface to take the OptContext parameter instead
+of TCGContext or both.
-Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210204163931.7358-8-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h     | 4 ++--
+ tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
- accel/tcg/cpu-exec.c      | 4 ++--
+file changed, 34 insertions(+), 33 deletions(-)
- target/arm/cpu.c          | 2 +-
- target/i386/tcg/tcg-cpu.c | 2 +-
+diff --git a/tcg/optimize.c b/tcg/optimize.c
  target/lm32/cpu.c         | 2 +-
  target/s390x/cpu.c        | 2 +-
  target/xtensa/cpu.c       | 2 +-
 files changed, 9 insertions(+), 9 deletions(-)
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
+ } TempOptInfo;
-                      MMUAccessType access_type, int mmu_idx,
-                      bool probe, uintptr_t retaddr);
+ typedef struct OptContext {
-+    /** @debug_excp_handler: Callback for handling debug exceptions */
++    TCGContext *tcg;
-+    void (*debug_excp_handler)(CPUState *cpu);
+     TCGTempSet temps_used;
+ } OptContext;
- } TcgCpuOperations;
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
-  * @gdb_write_register: Callback for letting GDB write a register.
+ }
-  * @debug_check_watchpoint: Callback: return true if the architectural
-  *       watchpoint whose address has matched should really fire.
+-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-- * @debug_excp_handler: Callback for handling debug exceptions.
++static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-  * @write_elf64_note: Callback for writing a CPU-specific ELF note to a
+ {
-  * 64-bit VM coredump.
+     TCGTemp *dst_ts = arg_temp(dst);
-  * @write_elf32_qemunote: Callback for writing a CPU- and QEMU-specific ELF
+     TCGTemp *src_ts = arg_temp(src);
-@@ -XXX,XX +XXX,XX @@ struct CPUClass {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-     int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
+     TCGOpcode new_op;
-     int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
-     bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
+     if (ts_are_copies(dst_ts, src_ts)) {
--    void (*debug_excp_handler)(CPUState *cpu);
+-        tcg_op_remove(s, op);
++        tcg_op_remove(ctx->tcg, op);
-     int (*write_elf64_note)(WriteCoreDumpFunction f, CPUState *cpu,
+         return;
                              int cpuid, void *opaque);
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static inline void cpu_handle_debug_exception(CPUState *cpu)
          }
      }
--    if (cc->debug_excp_handler) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 -        cc->debug_excp_handler(cpu);
 +    if (cc->tcg_ops.debug_excp_handler) {
 +        cc->tcg_ops.debug_excp_handler(cpu);
      }
  }
-diff --git a/target/arm/cpu.c b/target/arm/cpu.c
+-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-index XXXXXXX..XXXXXXX 100644
+-                             TCGOp *op, TCGArg dst, uint64_t val)
---- a/target/arm/cpu.c
++static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-+++ b/target/arm/cpu.c
++                             TCGArg dst, uint64_t val)
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
+ {
-     cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
+     TCGType type;
-     cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
--    cc->debug_excp_handler = arm_debug_excp_handler;
+     /* Convert movi to mov with constant temp. */
-+    cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
+     tv = tcg_constant_internal(type, val);
-     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
+     init_ts_info(ctx, tv);
-     cc->do_unaligned_access = arm_cpu_do_unaligned_access;
+-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
- #if !defined(CONFIG_USER_ONLY)
++    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
      cc->tcg_ops.initialize = tcg_x86_init;
      cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->debug_excp_handler = breakpoint_handler;
 +    cc->tcg_ops.debug_excp_handler = breakpoint_handler;
  #endif
  }
-diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
-index XXXXXXX..XXXXXXX 100644
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
---- a/target/lm32/cpu.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+++ b/target/lm32/cpu.c
+ {
-@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
+     int nb_temps, nb_globals, i;
- #endif
+     TCGOp *op, *op_next, *prev_mb = NULL;
-     cc->gdb_num_core_regs = 32 + 7;
+-    OptContext ctx = {};
-     cc->gdb_stop_before_watchpoint = true;
++    OptContext ctx = { .tcg = s };
--    cc->debug_excp_handler = lm32_debug_excp_handler;
-+    cc->tcg_ops.debug_excp_handler = lm32_debug_excp_handler;
+     /* Array VALS has an element for each temp.
-     cc->disas_set_info = lm32_cpu_disas_set_info;
+        If this temp holds a constant then its value is kept in VALS' element.
-     cc->tcg_ops.initialize = lm32_translate_init;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- }
+         CASE_OP_32_64(rotr):
-diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
+             if (arg_is_const(op->args[1])
-index XXXXXXX..XXXXXXX 100644
+                 && arg_info(op->args[1])->val == 0) {
---- a/target/s390x/cpu.c
+-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
-+++ b/target/s390x/cpu.c
++                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
+                 continue;
-     cc->write_elf64_note = s390_cpu_write_elf64_note;
+             }
- #ifdef CONFIG_TCG
+             break;
-     cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    cc->debug_excp_handler = s390x_cpu_debug_excp_handler;
+             if (!arg_is_const(op->args[1])
-+    cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
+                 && arg_is_const(op->args[2])
-     cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
+                 && arg_info(op->args[2])->val == 0) {
- #endif
+-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
- #endif
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
+                 continue;
-index XXXXXXX..XXXXXXX 100644
+             }
---- a/target/xtensa/cpu.c
+             break;
-+++ b/target/xtensa/cpu.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
+             if (!arg_is_const(op->args[1])
-     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
+                 && arg_is_const(op->args[2])
-     cc->do_transaction_failed = xtensa_cpu_do_transaction_failed;
+                 && arg_info(op->args[2])->val == -1) {
- #endif
+-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
--    cc->debug_excp_handler = xtensa_breakpoint_handler;
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+    cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
+                 continue;
-     cc->disas_set_info = xtensa_cpu_disas_set_info;
+             }
-     cc->tcg_ops.initialize = xtensa_translate_init;
+             break;
-     dc->vmsd = &vmstate_xtensa_cpu;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(or):
          CASE_OP_32_64_VEC(and):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             allocator where needed and possible.  Also detect copies. */
          switch (opc) {
          CASE_OP_32_64_VEC(mov):
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 .25.1

-New patch
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
+This will expose the variable to subroutines that
+will be broken out of tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 11 ++++++-----
+file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+ typedef struct OptContext {
+     TCGContext *tcg;
++    TCGOp *prev_mb;
+     TCGTempSet temps_used;
+ } OptContext;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+-    TCGOp *op, *op_next, *prev_mb = NULL;
++    TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+     /* Array VALS has an element for each temp.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+-        if (prev_mb) {
++        if (ctx.prev_mb) {
+             switch (opc) {
+             case INDEX_op_mb:
+                 /* Merge two barriers of the same type into one,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                  * barrier.  This is stricter than specified but for
+                  * the purposes of TCG is better than not optimizing.
+                  */
+-                prev_mb->args[0] |= op->args[0];
++                ctx.prev_mb->args[0] |= op->args[0];
+                 tcg_op_remove(s, op);
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i64:
+             case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+-                prev_mb = NULL;
++                ctx.prev_mb = NULL;
+                 break;
+             }
+         } else if (opc == INDEX_op_mb) {
+-            prev_mb = op;
++            ctx.prev_mb = op;
+         }
+     }
+ }
+--
+.25.1

-[PULL 11/46] tcg/tci: Inline tci_write_reg32 into all callers
+[PULL 11/56] tcg/optimize: Split out init_arguments
-For a 64-bit TCI, the upper bits of a 32-bit operation are
+There was no real reason for calls to have separate code here.
-undefined (much like a native ppc64 32-bit operation).  It
+Unify init for calls vs non-calls using the call path, which
-simplifies everything if we don't force-extend the result.
+handles TCG_CALL_DUMMY_ARG.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 66 +++++++++++++++++++++++++------------------------------
+ tcg/optimize.c | 25 +++++++++++--------------
-file changed, 30 insertions(+), 36 deletions(-)
+file changed, 11 insertions(+), 14 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-     regs[index] = value;
+     }
  }
--static void
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
 -tci_write_reg32(tcg_target_ulong *regs, TCGReg index, uint32_t value)
 -{
--    tci_write_reg(regs, index, value);
+-    init_ts_info(ctx, arg_temp(arg));
 -}
 -
- #if TCG_TARGET_REG_BITS == 32
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
- static void tci_write_reg64(tcg_target_ulong *regs, uint32_t high_index,
+ {
-                             uint32_t low_index, uint64_t value)
+     TCGTemp *i, *g, *l;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
-             t1 = tci_read_r32(regs, &tb_ptr);
+     return false;
-             t2 = tci_read_ri32(regs, &tb_ptr);
+ }
-             condition = *tb_ptr++;
--            tci_write_reg32(regs, t0, tci_compare32(t1, t2, condition));
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
-+            tci_write_reg(regs, t0, tci_compare32(t1, t2, condition));
++{
-             break;
++    for (int i = 0; i < nb_args; i++) {
- #if TCG_TARGET_REG_BITS == 32
++        TCGTemp *ts = arg_temp(op->args[i]);
-         case INDEX_op_setcond2_i32:
++        if (ts) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++            init_ts_info(ctx, ts);
-             tmp64 = tci_read_r64(regs, &tb_ptr);
++        }
-             v64 = tci_read_ri64(regs, &tb_ptr);
++    }
-             condition = *tb_ptr++;
++}
--            tci_write_reg32(regs, t0, tci_compare64(tmp64, v64, condition));
++
-+            tci_write_reg(regs, t0, tci_compare64(tmp64, v64, condition));
+ /* Propagate constants and copies, fold constant expressions. */
-             break;
+ void tcg_optimize(TCGContext *s)
- #elif TCG_TARGET_REG_BITS == 64
+ {
-         case INDEX_op_setcond_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         if (opc == INDEX_op_call) {
-         case INDEX_op_mov_i32:
+             nb_oargs = TCGOP_CALLO(op);
-             t0 = *tb_ptr++;
+             nb_iargs = TCGOP_CALLI(op);
-             t1 = tci_read_r32(regs, &tb_ptr);
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
--            tci_write_reg32(regs, t0, t1);
+-                TCGTemp *ts = arg_temp(op->args[i]);
-+            tci_write_reg(regs, t0, t1);
+-                if (ts) {
-             break;
+-                    init_ts_info(&ctx, ts);
-         case INDEX_op_tci_movi_i32:
+-                }
-             t0 = *tb_ptr++;
+-            }
-             t1 = tci_read_i32(&tb_ptr);
+         } else {
--            tci_write_reg32(regs, t0, t1);
+             nb_oargs = def->nb_oargs;
-+            tci_write_reg(regs, t0, t1);
+             nb_iargs = def->nb_iargs;
-             break;
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&ctx, op->args[i]);
-             /* Load/store operations (32 bit). */
+-            }
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         }
-             t0 = *tb_ptr++;
++        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-             t1 = tci_read_r(regs, &tb_ptr);
-             t2 = tci_read_s32(&tb_ptr);
+         /* Do copy propagation */
--            tci_write_reg32(regs, t0, *(uint32_t *)(t1 + t2));
+         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 +            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
              break;
          case INDEX_op_st8_i32:
              t0 = tci_read_r8(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 + t2);
 +            tci_write_reg(regs, t0, t1 + t2);
              break;
          case INDEX_op_sub_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 - t2);
 +            tci_write_reg(regs, t0, t1 - t2);
              break;
          case INDEX_op_mul_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 * t2);
 +            tci_write_reg(regs, t0, t1 * t2);
              break;
  #if TCG_TARGET_HAS_div_i32
          case INDEX_op_div_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, (int32_t)t1 / (int32_t)t2);
 +            tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2);
              break;
          case INDEX_op_divu_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 / t2);
 +            tci_write_reg(regs, t0, t1 / t2);
              break;
          case INDEX_op_rem_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, (int32_t)t1 % (int32_t)t2);
 +            tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2);
              break;
          case INDEX_op_remu_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 % t2);
 +            tci_write_reg(regs, t0, t1 % t2);
              break;
  #elif TCG_TARGET_HAS_div2_i32
          case INDEX_op_div2_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 & t2);
 +            tci_write_reg(regs, t0, t1 & t2);
              break;
          case INDEX_op_or_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 | t2);
 +            tci_write_reg(regs, t0, t1 | t2);
              break;
          case INDEX_op_xor_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 ^ t2);
 +            tci_write_reg(regs, t0, t1 ^ t2);
              break;
              /* Shift/rotate operations (32 bit). */
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 << (t2 & 31));
 +            tci_write_reg(regs, t0, t1 << (t2 & 31));
              break;
          case INDEX_op_shr_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1 >> (t2 & 31));
 +            tci_write_reg(regs, t0, t1 >> (t2 & 31));
              break;
          case INDEX_op_sar_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, ((int32_t)t1 >> (t2 & 31)));
 +            tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31)));
              break;
  #if TCG_TARGET_HAS_rot_i32
          case INDEX_op_rotl_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, rol32(t1, t2 & 31));
 +            tci_write_reg(regs, t0, rol32(t1, t2 & 31));
              break;
          case INDEX_op_rotr_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
              t2 = tci_read_ri32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, ror32(t1, t2 & 31));
 +            tci_write_reg(regs, t0, ror32(t1, t2 & 31));
              break;
  #endif
  #if TCG_TARGET_HAS_deposit_i32
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              tmp16 = *tb_ptr++;
              tmp8 = *tb_ptr++;
              tmp32 = (((1 << tmp8) - 1) << tmp16);
 -            tci_write_reg32(regs, t0, (t1 & ~tmp32) | ((t2 << tmp16) & tmp32));
 +            tci_write_reg(regs, t0, (t1 & ~tmp32) | ((t2 << tmp16) & tmp32));
              break;
  #endif
          case INDEX_op_brcond_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_ext8s_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r8s(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1);
 +            tci_write_reg(regs, t0, t1);
              break;
  #endif
  #if TCG_TARGET_HAS_ext16s_i32
          case INDEX_op_ext16s_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r16s(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1);
 +            tci_write_reg(regs, t0, t1);
              break;
  #endif
  #if TCG_TARGET_HAS_ext8u_i32
          case INDEX_op_ext8u_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r8(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1);
 +            tci_write_reg(regs, t0, t1);
              break;
  #endif
  #if TCG_TARGET_HAS_ext16u_i32
          case INDEX_op_ext16u_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r16(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, t1);
 +            tci_write_reg(regs, t0, t1);
              break;
  #endif
  #if TCG_TARGET_HAS_bswap16_i32
          case INDEX_op_bswap16_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r16(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, bswap16(t1));
 +            tci_write_reg(regs, t0, bswap16(t1));
              break;
  #endif
  #if TCG_TARGET_HAS_bswap32_i32
          case INDEX_op_bswap32_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, bswap32(t1));
 +            tci_write_reg(regs, t0, bswap32(t1));
              break;
  #endif
  #if TCG_TARGET_HAS_not_i32
          case INDEX_op_not_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, ~t1);
 +            tci_write_reg(regs, t0, ~t1);
              break;
  #endif
  #if TCG_TARGET_HAS_neg_i32
          case INDEX_op_neg_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg32(regs, t0, -t1);
 +            tci_write_reg(regs, t0, -t1);
              break;
  #endif
  #if TCG_TARGET_REG_BITS == 64
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_write_reg32(regs, t0, *(uint32_t *)(t1 + t2));
 +            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
              break;
          case INDEX_op_ld32s_i64:
              t0 = *tb_ptr++;
 --
 .25.1

-[PULL 20/46] tcg/tci: Move stack bounds check to compile-time
+[PULL 12/56] tcg/optimize: Split out copy_propagate
-The existing check was incomplete:
+Continue splitting tcg_optimize.
 (1) Only applied to two of the 7 stores, and not to the loads at all.
 (2) Only checked the upper, but not the lower bound of the stack.
-Doing this at compile time means that we don't need to do it
-at runtime as well.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c                |  2 --
+ tcg/optimize.c | 22 ++++++++++++++--------
- tcg/tci/tcg-target.c.inc | 13 +++++++++++++
+file changed, 14 insertions(+), 8 deletions(-)
 files changed, 13 insertions(+), 2 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
              t0 = tci_read_r32(regs, &tb_ptr);
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_assert(t1 != sp_value || (int32_t)t2 < 0);
              *(uint32_t *)(t1 + t2) = t0;
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = tci_read_r64(regs, &tb_ptr);
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_assert(t1 != sp_value || (int32_t)t2 < 0);
              *(uint64_t *)(t1 + t2) = t0;
              break;
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tci_out_label(TCGContext *s, TCGLabel *label)
      }
  }
-+static void stack_bounds_check(TCGReg base, target_long offset)
++static void copy_propagate(OptContext *ctx, TCGOp *op,
 +                           int nb_oargs, int nb_iargs)
 +{
-+    if (base == TCG_REG_CALL_STACK) {
++    TCGContext *s = ctx->tcg;
-+        tcg_debug_assert(offset < 0);
++
-+        tcg_debug_assert(offset >= -(CPU_TEMP_BUF_NLONGS * sizeof(long)));
++    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        if (ts && ts_is_copy(ts)) {
 +            op->args[i] = temp_arg(find_better_copy(s, ts));
 +        }
 +    }
 +}
 +
- static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
+ /* Propagate constants and copies, fold constant expressions. */
-                        intptr_t arg2)
+ void tcg_optimize(TCGContext *s)
  {
-     uint8_t *old_code_ptr = s->code_ptr;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+
+             nb_iargs = def->nb_iargs;
-+    stack_bounds_check(arg1, arg2);
+         }
-     if (type == TCG_TYPE_I32) {
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-         tcg_out_op_t(s, INDEX_op_ld_i32);
+-
-         tcg_out_r(s, ret);
+-        /* Do copy propagation */
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-     case INDEX_op_st16_i64:
+-            TCGTemp *ts = arg_temp(op->args[i]);
-     case INDEX_op_st32_i64:
+-            if (ts && ts_is_copy(ts)) {
-     case INDEX_op_st_i64:
+-                op->args[i] = temp_arg(find_better_copy(s, ts));
-+        stack_bounds_check(args[1], args[2]);
+-            }
-         tcg_out_r(s, args[0]);
+-        }
-         tcg_out_r(s, args[1]);
++        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
-         tcg_debug_assert(args[2] == (int32_t)args[2]);
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
+         /* For commutative operations make constant second argument */
-                        intptr_t arg2)
+         switch (opc) {
  {
      uint8_t *old_code_ptr = s->code_ptr;
 +
 +    stack_bounds_check(arg1, arg2);
      if (type == TCG_TYPE_I32) {
          tcg_out_op_t(s, INDEX_op_st_i32);
          tcg_out_r(s, arg);
 --
 .25.1

-New patch
+[PULL 13/56] tcg/optimize: Split out fold_call
+Calls are special in that they have a variable number
+of arguments, and need to be able to clobber globals.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
+file changed, 41 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static bool fold_call(OptContext *ctx, TCGOp *op)
++{
++    TCGContext *s = ctx->tcg;
++    int nb_oargs = TCGOP_CALLO(op);
++    int nb_iargs = TCGOP_CALLI(op);
++    int flags, i;
++
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
++    copy_propagate(ctx, op, nb_oargs, nb_iargs);
++
++    /* If the function reads or writes globals, reset temp data. */
++    flags = tcg_call_flags(op);
++    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
++        int nb_globals = s->nb_globals;
++
++        for (i = 0; i < nb_globals; i++) {
++            if (test_bit(i, ctx->temps_used.l)) {
++                reset_ts(&ctx->tcg->temps[i]);
++            }
++        }
++    }
++
++    /* Reset temp data for outputs. */
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++    }
++
++    /* Stop optimizing MB across calls. */
++    ctx->prev_mb = NULL;
++    return true;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+-    int nb_temps, nb_globals, i;
++    int nb_temps, i;
+     TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+        available through the doubly linked circular list. */
+     nb_temps = s->nb_temps;
+-    nb_globals = s->nb_globals;
+-
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+-        const TCGOpDef *def = &tcg_op_defs[opc];
++        const TCGOpDef *def;
+-        /* Count the arguments, and initialize the temps that are
+-           going to be used */
++        /* Calls are special. */
+         if (opc == INDEX_op_call) {
+-            nb_oargs = TCGOP_CALLO(op);
+-            nb_iargs = TCGOP_CALLI(op);
+-        } else {
+-            nb_oargs = def->nb_oargs;
+-            nb_iargs = def->nb_iargs;
++            fold_call(&ctx, op);
++            continue;
+         }
++
++        def = &tcg_op_defs[opc];
++        nb_oargs = def->nb_oargs;
++        nb_iargs = def->nb_iargs;
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
+-            if (opc == INDEX_op_call &&
+-                !(tcg_call_flags(op)
+-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+-                for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, ctx.temps_used.l)) {
+-                        reset_ts(&s->temps[i]);
+-                    }
+-                }
+-            }
+-
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i32:
+             case INDEX_op_qemu_st8_i32:
+             case INDEX_op_qemu_st_i64:
+-            case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+                 ctx.prev_mb = NULL;
+                 break;
+--
+.25.1

-[PULL 22/46] tcg/tci: Use g_assert_not_reached
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
-Three TODO instances are never-happen cases.
+Rather than try to keep these up-to-date across folding,
-Other uses of tcg_abort are also indicating unreachable cases.
+re-read nb_oargs at the end, after re-reading the opcode.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+A couple of asserts need dropping, but that will take care
-Reviewed-by: Stefan Weil <sw@weilnetz.de>
+of itself as we split the function further.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 15 +++++++--------
+ tcg/optimize.c | 14 ++++----------
-file changed, 7 insertions(+), 8 deletions(-)
+file changed, 4 insertions(+), 10 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static bool tci_compare32(uint32_t u0, uint32_t u1, TCGCond condition)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         result = (u0 > u1);
-         break;
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-     default:
+         uint64_t z_mask, partmask, affected, tmp;
--        TODO();
+-        int nb_oargs, nb_iargs;
-+        g_assert_not_reached();
+         TCGOpcode opc = op->opc;
-     }
+         const TCGOpDef *def;
-     return result;
- }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
+         }
-         result = (u0 > u1);
-         break;
+         def = &tcg_op_defs[opc];
-     default:
+-        nb_oargs = def->nb_oargs;
--        TODO();
+-        nb_iargs = def->nb_iargs;
-+        g_assert_not_reached();
+-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-     }
+-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
-     return result;
++        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
- }
++        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-                 tmp32 = qemu_ld_beul;
+         /* For commutative operations make constant second argument */
-                 break;
+         switch (opc) {
-             default:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--                tcg_abort();
-+                g_assert_not_reached();
+         CASE_OP_32_64(qemu_ld):
-             }
+             {
-             tci_write_reg(regs, t0, tmp32);
+-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
-             break;
++                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+                 MemOp mop = get_memop(oi);
-                 tmp64 = qemu_ld_beq;
+                 if (!(mop & MO_SIGN)) {
-                 break;
+                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-             default:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--                tcg_abort();
+         }
-+                g_assert_not_reached();
-             }
+         if (partmask == 0) {
-             tci_write_reg(regs, t0, tmp64);
+-            tcg_debug_assert(nb_oargs == 1);
-             if (TCG_TARGET_REG_BITS == 32) {
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             continue;
-                 qemu_st_bel(t0);
+         }
-                 break;
+         if (affected == 0) {
-             default:
+-            tcg_debug_assert(nb_oargs == 1);
--                tcg_abort();
+             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+                g_assert_not_reached();
+             continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              } else if (args_are_copies(op->args[1], op->args[2])) {
                  op->opc = INDEX_op_dup_vec;
                  TCGOP_VECE(op) = MO_32;
 -                nb_iargs = 1;
              }
              break;
-         case INDEX_op_qemu_st_i64:
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                 qemu_st_beq(tmp64);
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
-                 break;
+                                  ? INDEX_op_setcond_i32
-             default:
+                                  : INDEX_op_setcond_i64);
--                tcg_abort();
+-                nb_iargs = 2;
 +                g_assert_not_reached();
              }
              break;
-         case INDEX_op_mb:
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-             smp_mb();
+         if (def->flags & TCG_OPF_BB_END) {
-             break;
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-         default:
+         } else {
--            TODO();
++            int nb_oargs = def->nb_oargs;
--            break;
+             for (i = 0; i < nb_oargs; i++) {
-+            g_assert_not_reached();
+                 reset_temp(op->args[i]);
-         }
+                 /* Save the corresponding known-zero bits mask for the
          tci_assert(tb_ptr == old_code_ptr + op_size);
      }
 --
 .25.1

-[PULL 38/46] cpu: move cc->transaction_failed to tcg_ops
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
-From: Claudio Fontana <cfontana@suse.de>
+Return -1 instead of 2 for failure, so that we can
+use comparisons against 0 for all cases.
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 [claudio: wrap target code around CONFIG_TCG and !CONFIG_USER_ONLY]
 avoiding its use in headers used by common_ss code (should be poisoned).
 Note: need to be careful with the use of CONFIG_USER_ONLY,
 Message-Id: <20210204163931.7358-11-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h     | 28 +++++++++++++---------------
+ tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
- hw/mips/jazz.c            |  9 +++++++--
+file changed, 74 insertions(+), 71 deletions(-)
- target/alpha/cpu.c        |  2 +-
- target/arm/cpu.c          |  4 ++--
+diff --git a/tcg/optimize.c b/tcg/optimize.c
  target/m68k/cpu.c         |  2 +-
  target/microblaze/cpu.c   |  2 +-
  target/mips/cpu.c         |  4 +++-
  target/riscv/cpu.c        |  2 +-
  target/riscv/cpu_helper.c |  2 +-
  target/sparc/cpu.c        |  2 +-
  target/xtensa/cpu.c       |  2 +-
  target/xtensa/helper.c    |  4 ++--
 files changed, 34 insertions(+), 29 deletions(-)
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
      /** @debug_excp_handler: Callback for handling debug exceptions */
      void (*debug_excp_handler)(CPUState *cpu);
 +    /**
 +     * @do_transaction_failed: Callback for handling failed memory transactions
 +     * (ie bus faults or external aborts; not MMU faults)
 +     */
 +    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
 +                                  unsigned size, MMUAccessType access_type,
 +                                  int mmu_idx, MemTxAttrs attrs,
 +                                  MemTxResult response, uintptr_t retaddr);
  } TcgCpuOperations;
  /**
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
   * @has_work: Callback for checking if there is work to do.
   * @do_unaligned_access: Callback for unaligned access handling, if
   * the target defines #TARGET_ALIGNED_ONLY.
 - * @do_transaction_failed: Callback for handling failed memory transactions
 - * (ie bus faults or external aborts; not MMU faults)
   * @virtio_is_big_endian: Callback to return %true if a CPU which supports
   * runtime configurable endianness is currently big-endian. Non-configurable
   * CPUs can use the default implementation of this method. This method should
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
                                  MMUAccessType access_type,
                                  int mmu_idx, uintptr_t retaddr);
 -    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
 -                                  unsigned size, MMUAccessType access_type,
 -                                  int mmu_idx, MemTxAttrs attrs,
 -                                  MemTxResult response, uintptr_t retaddr);
      bool (*virtio_is_big_endian)(CPUState *cpu);
      int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
                             uint8_t *buf, int len, bool is_write);
@@ -XXX,XX +XXX,XX @@ CPUState *cpu_by_arch_id(int64_t id);
  void cpu_interrupt(CPUState *cpu, int mask);
 -#ifdef NEED_CPU_H
 -
 -#ifdef CONFIG_SOFTMMU
  static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
                                          MMUAccessType access_type,
                                          int mmu_idx, uintptr_t retaddr)
@@ -XXX,XX +XXX,XX @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
  {
      CPUClass *cc = CPU_GET_CLASS(cpu);
 -    if (!cpu->ignore_memory_transaction_failures && cc->do_transaction_failed) {
 -        cc->do_transaction_failed(cpu, physaddr, addr, size, access_type,
 -                                  mmu_idx, attrs, response, retaddr);
 +    if (!cpu->ignore_memory_transaction_failures &&
 +        cc->tcg_ops.do_transaction_failed) {
 +        cc->tcg_ops.do_transaction_failed(cpu, physaddr, addr, size,
 +                                          access_type, mmu_idx, attrs,
 +                                          response, retaddr);
      }
  }
--#endif
--
+-/* Return 2 if the condition can't be simplified, and the result
--#endif /* NEED_CPU_H */
+-   of the condition (0 or 1) if it can */
+-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
- /**
+-                                       TCGArg y, TCGCond c)
-  * cpu_set_pc:
++/*
-diff --git a/hw/mips/jazz.c b/hw/mips/jazz.c
++ * Return -1 if the condition can't be simplified,
-index XXXXXXX..XXXXXXX 100644
++ * and the result of the condition (0 or 1) if it can.
---- a/hw/mips/jazz.c
++ */
-+++ b/hw/mips/jazz.c
++static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
-@@ -XXX,XX +XXX,XX @@ static const MemoryRegionOps dma_dummy_ops = {
++                                    TCGArg y, TCGCond c)
- #define MAGNUM_BIOS_SIZE_MAX 0x7e000
+ {
- #define MAGNUM_BIOS_SIZE                                                       \
+     uint64_t xv = arg_info(x)->val;
-         (BIOS_SIZE < MAGNUM_BIOS_SIZE_MAX ? BIOS_SIZE : MAGNUM_BIOS_SIZE_MAX)
+     uint64_t yv = arg_info(y)->val;
-+
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-+#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
+         case TCG_COND_GEU:
- static void (*real_do_transaction_failed)(CPUState *cpu, hwaddr physaddr,
+             return 1;
-                                           vaddr addr, unsigned size,
+         default:
-                                           MMUAccessType access_type,
+-            return 2;
-@@ -XXX,XX +XXX,XX @@ static void mips_jazz_do_transaction_failed(CPUState *cs, hwaddr physaddr,
++            return -1;
-     (*real_do_transaction_failed)(cs, physaddr, addr, size, access_type,
+         }
-                                   mmu_idx, attrs, response, retaddr);
+     }
 -    return 2;
 +    return -1;
  }
-+#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
+-/* Return 2 if the condition can't be simplified, and the result
- static void mips_jazz_init(MachineState *machine,
+-   of the condition (0 or 1) if it can */
-                            enum jazz_model_e jazz_model)
+-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
-@@ -XXX,XX +XXX,XX @@ static void mips_jazz_init(MachineState *machine,
++/*
-      * memory region that catches all memory accesses, as we do on Malta.
++ * Return -1 if the condition can't be simplified,
-      */
++ * and the result of the condition (0 or 1) if it can.
-     cc = CPU_GET_CLASS(cpu);
++ */
--    real_do_transaction_failed = cc->do_transaction_failed;
++static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--    cc->do_transaction_failed = mips_jazz_do_transaction_failed;
+ {
-+#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
+     TCGArg al = p1[0], ah = p1[1];
-+    real_do_transaction_failed = cc->tcg_ops.do_transaction_failed;
+     TCGArg bl = p2[0], bh = p2[1];
-+    cc->tcg_ops.do_transaction_failed = mips_jazz_do_transaction_failed;
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
-+#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
+     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
+         return do_constant_folding_cond_eq(c);
-     /* allocate RAM */
+     }
-     memory_region_add_subregion(address_space, 0, machine->ram);
+-    return 2;
-diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
++    return -1;
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/cpu.c
 +++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_write_register = alpha_cpu_gdb_write_register;
      cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->do_transaction_failed = alpha_cpu_do_transaction_failed;
 +    cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
      cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
      cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
      dc->vmsd = &vmstate_alpha_cpu;
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
      cc->debug_check_watchpoint = arm_debug_check_watchpoint;
      cc->do_unaligned_access = arm_cpu_do_unaligned_access;
  #if !defined(CONFIG_USER_ONLY)
 -    cc->do_transaction_failed = arm_cpu_do_transaction_failed;
 +    cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
      cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
      cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
  #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 -#endif
 +#endif /* CONFIG_TCG */
  }
- #ifdef CONFIG_KVM
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
-diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-index XXXXXXX..XXXXXXX 100644
+             break;
---- a/target/m68k/cpu.c
-+++ b/target/m68k/cpu.c
+         CASE_OP_32_64(setcond):
-@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
+-            tmp = do_constant_folding_cond(opc, op->args[1],
-     cc->gdb_write_register = m68k_cpu_gdb_write_register;
+-                                           op->args[2], op->args[3]);
-     cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
+-            if (tmp != 2) {
- #if defined(CONFIG_SOFTMMU)
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
--    cc->do_transaction_failed = m68k_cpu_transaction_failed;
++            i = do_constant_folding_cond(opc, op->args[1],
-+    cc->tcg_ops.do_transaction_failed = m68k_cpu_transaction_failed;
++                                         op->args[2], op->args[3]);
-     cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
++            if (i >= 0) {
-     dc->vmsd = &vmstate_m68k_cpu;
++                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
- #endif
+                 continue;
-diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
+             }
-index XXXXXXX..XXXXXXX 100644
+             break;
---- a/target/microblaze/cpu.c
-+++ b/target/microblaze/cpu.c
+         CASE_OP_32_64(brcond):
-@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
+-            tmp = do_constant_folding_cond(opc, op->args[0],
-     cc->gdb_write_register = mb_cpu_gdb_write_register;
+-                                           op->args[1], op->args[2]);
-     cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
+-            switch (tmp) {
- #ifndef CONFIG_USER_ONLY
+-            case 0:
--    cc->do_transaction_failed = mb_cpu_transaction_failed;
++            i = do_constant_folding_cond(opc, op->args[0],
-+    cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
++                                         op->args[1], op->args[2]);
-     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
++            if (i == 0) {
-     dc->vmsd = &vmstate_mb_cpu;
+                 tcg_op_remove(s, op);
- #endif
+                 continue;
-diff --git a/target/mips/cpu.c b/target/mips/cpu.c
+-            case 1:
-index XXXXXXX..XXXXXXX 100644
++            } else if (i > 0) {
---- a/target/mips/cpu.c
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-+++ b/target/mips/cpu.c
+                 op->opc = opc = INDEX_op_br;
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
+                 op->args[0] = op->args[3];
-     cc->gdb_read_register = mips_cpu_gdb_read_register;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     cc->gdb_write_register = mips_cpu_gdb_write_register;
+             break;
- #ifndef CONFIG_USER_ONLY
--    cc->do_transaction_failed = mips_cpu_do_transaction_failed;
+         CASE_OP_32_64(movcond):
-     cc->do_unaligned_access = mips_cpu_do_unaligned_access;
+-            tmp = do_constant_folding_cond(opc, op->args[1],
-     cc->get_phys_page_debug = mips_cpu_get_phys_page_debug;
+-                                           op->args[2], op->args[5]);
-     cc->vmsd = &vmstate_mips_cpu;
+-            if (tmp != 2) {
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
-     cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
++            i = do_constant_folding_cond(opc, op->args[1],
-     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
++                                         op->args[2], op->args[5]);
-     cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
++            if (i >= 0) {
-+#ifndef CONFIG_USER_ONLY
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-+    cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
+                 continue;
-+#endif /* CONFIG_USER_ONLY */
+             }
- #endif /* CONFIG_TCG */
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     cc->gdb_num_core_regs = 73;
+             break;
-diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
-index XXXXXXX..XXXXXXX 100644
+         case INDEX_op_brcond2_i32:
---- a/target/riscv/cpu.c
+-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-+++ b/target/riscv/cpu.c
+-                                            op->args[4]);
-@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
+-            if (tmp == 0) {
-     cc->gdb_stop_before_watchpoint = true;
++            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-     cc->disas_set_info = riscv_cpu_disas_set_info;
++                                          op->args[4]);
- #ifndef CONFIG_USER_ONLY
++            if (i == 0) {
--    cc->do_transaction_failed = riscv_cpu_do_transaction_failed;
+             do_brcond_false:
-+    cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
+                 tcg_op_remove(s, op);
-     cc->do_unaligned_access = riscv_cpu_do_unaligned_access;
+                 continue;
-     cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
+             }
-     /* For now, mark unmigratable: */
+-            if (tmp == 1) {
-diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
++            if (i > 0) {
-index XXXXXXX..XXXXXXX 100644
+             do_brcond_true:
---- a/target/riscv/cpu_helper.c
+                 op->opc = opc = INDEX_op_br;
-+++ b/target/riscv/cpu_helper.c
+                 op->args[0] = op->args[5];
-@@ -XXX,XX +XXX,XX @@ void riscv_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     env->badaddr = addr;
+             if (op->args[4] == TCG_COND_EQ) {
-     riscv_raise_exception(env, cs->exception_index, retaddr);
+                 /* Simplify EQ comparisons where one of the pairs
- }
+                    can be simplified.  */
--#endif
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-+#endif /* !CONFIG_USER_ONLY */
+-                                               op->args[0], op->args[2],
+-                                               TCG_COND_EQ);
- bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
+-                if (tmp == 0) {
-                         MMUAccessType access_type, int mmu_idx,
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
++                                             op->args[0], op->args[2],
-index XXXXXXX..XXXXXXX 100644
++                                             TCG_COND_EQ);
---- a/target/sparc/cpu.c
++                if (i == 0) {
-+++ b/target/sparc/cpu.c
+                     goto do_brcond_false;
-@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
+-                } else if (tmp == 1) {
-     cc->gdb_write_register = sparc_cpu_gdb_write_register;
++                } else if (i > 0) {
-     cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
+                     goto do_brcond_high;
- #ifndef CONFIG_USER_ONLY
+                 }
--    cc->do_transaction_failed = sparc_cpu_do_transaction_failed;
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-+    cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
+-                                               op->args[1], op->args[3],
-     cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
+-                                               TCG_COND_EQ);
-     cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
+-                if (tmp == 0) {
-     cc->vmsd = &vmstate_sparc_cpu;
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
++                                             op->args[1], op->args[3],
-index XXXXXXX..XXXXXXX 100644
++                                             TCG_COND_EQ);
---- a/target/xtensa/cpu.c
++                if (i == 0) {
-+++ b/target/xtensa/cpu.c
+                     goto do_brcond_false;
-@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
+-                } else if (tmp != 1) {
- #ifndef CONFIG_USER_ONLY
++                } else if (i < 0) {
-     cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
+                     break;
-     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
+                 }
--    cc->do_transaction_failed = xtensa_cpu_do_transaction_failed;
+             do_brcond_low:
-+    cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #endif
+             if (op->args[4] == TCG_COND_NE) {
-     cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
+                 /* Simplify NE comparisons where one of the pairs
-     cc->disas_set_info = xtensa_cpu_disas_set_info;
+                    can be simplified.  */
-diff --git a/target/xtensa/helper.c b/target/xtensa/helper.c
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-index XXXXXXX..XXXXXXX 100644
+-                                               op->args[0], op->args[2],
---- a/target/xtensa/helper.c
+-                                               TCG_COND_NE);
-+++ b/target/xtensa/helper.c
+-                if (tmp == 0) {
-@@ -XXX,XX +XXX,XX @@ bool xtensa_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-     cpu_loop_exit_restore(cs, retaddr);
++                                             op->args[0], op->args[2],
- }
++                                             TCG_COND_NE);
++                if (i == 0) {
--#else
+                     goto do_brcond_high;
-+#else /* !CONFIG_USER_ONLY */
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
- void xtensa_cpu_do_unaligned_access(CPUState *cs,
+                     goto do_brcond_true;
-                                     vaddr addr, MMUAccessType access_type,
+                 }
-@@ -XXX,XX +XXX,XX @@ void xtensa_runstall(CPUXtensaState *env, bool runstall)
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-         qemu_cpu_kick(cpu);
+-                                               op->args[1], op->args[3],
-     }
+-                                               TCG_COND_NE);
- }
+-                if (tmp == 0) {
--#endif
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-+#endif /* !CONFIG_USER_ONLY */
++                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
              }
              break;
          case INDEX_op_setcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                            op->args[5]);
 -            if (tmp != 2) {
 +            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 +                                          op->args[5]);
 +            if (i >= 0) {
              do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_const;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
              }
 --
 .25.1

-[PULL 28/46] tcg/tci: Remove TCG_CONST
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
-Restrict all operands to registers.  All constants will be forced
+This will allow callers to tail call to these functions
-into registers by the middle-end.  Removing the difference in how
+and return true indicating processing complete.
 immediate integers were encoded will allow more code to be shared
 between 32-bit and 64-bit operations.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target-con-set.h |   6 +-
+ tcg/optimize.c | 9 +++++----
- tcg/tci/tcg-target.h         |   3 -
+file changed, 5 insertions(+), 4 deletions(-)
  tcg/tci.c                    | 189 +++++++++++++----------------------
  tcg/tci/tcg-target.c.inc     |  85 ++++------------
 files changed, 89 insertions(+), 194 deletions(-)
-diff --git a/tcg/tci/tcg-target-con-set.h b/tcg/tci/tcg-target-con-set.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target-con-set.h
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target-con-set.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-  * tcg-target-con-str.h; the constraint combination is inclusive or.
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
   */
  C_O0_I2(r, r)
 -C_O0_I2(r, ri)
  C_O0_I3(r, r, r)
 -C_O0_I4(r, r, ri, ri)
  C_O0_I4(r, r, r, r)
  C_O1_I1(r, r)
  C_O1_I2(r, 0, r)
 -C_O1_I2(r, ri, ri)
  C_O1_I2(r, r, r)
 -C_O1_I2(r, r, ri)
 -C_O1_I4(r, r, r, ri, ri)
 +C_O1_I4(r, r, r, r, r)
  C_O2_I1(r, r, r)
  C_O2_I2(r, r, r, r)
  C_O2_I4(r, r, r, r, r, r)
 diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.h
 +++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
      TCG_AREG0 = TCG_REG_R14,
      TCG_REG_CALL_STACK = TCG_REG_R15,
 -
 -    /* Special value UINT8_MAX is used by TCI to encode constant values. */
 -    TCG_CONST = UINT8_MAX
  } TCGReg;
  /* Used for function call generation. */
 diff --git a/tcg/tci.c b/tcg/tci.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci.c
 +++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ tci_read_ulong(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
      return taddr;
  }
--/* Read indexed register or constant (native size) from bytecode. */
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--static tcg_target_ulong
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 -tci_read_ri(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
 -{
 -    tcg_target_ulong value;
 -    TCGReg r = **tb_ptr;
 -    *tb_ptr += 1;
 -    if (r == TCG_CONST) {
 -        value = tci_read_i(tb_ptr);
 -    } else {
 -        value = tci_read_reg(regs, r);
 -    }
 -    return value;
 -}
 -
 -/* Read indexed register or constant (32 bit) from bytecode. */
 -static uint32_t tci_read_ri32(const tcg_target_ulong *regs,
 -                              const uint8_t **tb_ptr)
 -{
 -    uint32_t value;
 -    TCGReg r = **tb_ptr;
 -    *tb_ptr += 1;
 -    if (r == TCG_CONST) {
 -        value = tci_read_i32(tb_ptr);
 -    } else {
 -        value = tci_read_reg32(regs, r);
 -    }
 -    return value;
 -}
 -
 -#if TCG_TARGET_REG_BITS == 32
 -/* Read two indexed registers or constants (2 * 32 bit) from bytecode. */
 -static uint64_t tci_read_ri64(const tcg_target_ulong *regs,
 -                              const uint8_t **tb_ptr)
 -{
 -    uint32_t low = tci_read_ri32(regs, tb_ptr);
 -    return tci_uint64(tci_read_ri32(regs, tb_ptr), low);
 -}
 -#elif TCG_TARGET_REG_BITS == 64
 -/* Read indexed register or constant (64 bit) from bytecode. */
 -static uint64_t tci_read_ri64(const tcg_target_ulong *regs,
 -                              const uint8_t **tb_ptr)
 -{
 -    uint64_t value;
 -    TCGReg r = **tb_ptr;
 -    *tb_ptr += 1;
 -    if (r == TCG_CONST) {
 -        value = tci_read_i64(tb_ptr);
 -    } else {
 -        value = tci_read_reg64(regs, r);
 -    }
 -    return value;
 -}
 -#endif
 -
  static tcg_target_ulong tci_read_label(const uint8_t **tb_ptr)
  {
-     tcg_target_ulong label = tci_read_i(tb_ptr);
+     TCGTemp *dst_ts = arg_temp(dst);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+     TCGTemp *src_ts = arg_temp(src);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-         switch (opc) {
-         case INDEX_op_call:
+     if (ts_are_copies(dst_ts, src_ts)) {
--            t0 = tci_read_ri(regs, &tb_ptr);
+         tcg_op_remove(ctx->tcg, op);
-+            t0 = tci_read_i(&tb_ptr);
+-        return;
-             tci_tb_ptr = (uintptr_t)tb_ptr;
++        return true;
- #if TCG_TARGET_REG_BITS == 32
+     }
-             tmp64 = ((helper_function)t0)(tci_read_reg(regs, TCG_REG_R0),
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+     reset_ts(dst_ts);
-         case INDEX_op_setcond_i32:
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-             t0 = *tb_ptr++;
+         di->is_const = si->is_const;
-             t1 = tci_read_r32(regs, &tb_ptr);
+         di->val = si->val;
--            t2 = tci_read_ri32(regs, &tb_ptr);
+     }
-+            t2 = tci_read_r32(regs, &tb_ptr);
++    return true;
              condition = *tb_ptr++;
              tci_write_reg(regs, t0, tci_compare32(t1, t2, condition));
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_setcond2_i32:
              t0 = *tb_ptr++;
              tmp64 = tci_read_r64(regs, &tb_ptr);
 -            v64 = tci_read_ri64(regs, &tb_ptr);
 +            v64 = tci_read_r64(regs, &tb_ptr);
              condition = *tb_ptr++;
              tci_write_reg(regs, t0, tci_compare64(tmp64, v64, condition));
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_setcond_i64:
              t0 = *tb_ptr++;
              t1 = tci_read_r64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              condition = *tb_ptr++;
              tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_add_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 + t2);
              break;
          case INDEX_op_sub_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 - t2);
              break;
          case INDEX_op_mul_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 * t2);
              break;
          case INDEX_op_div_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2);
              break;
          case INDEX_op_divu_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 / t2);
              break;
          case INDEX_op_rem_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2);
              break;
          case INDEX_op_remu_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 % t2);
              break;
          case INDEX_op_and_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 & t2);
              break;
          case INDEX_op_or_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 | t2);
              break;
          case INDEX_op_xor_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 ^ t2);
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_shl_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 << (t2 & 31));
              break;
          case INDEX_op_shr_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 >> (t2 & 31));
              break;
          case INDEX_op_sar_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31)));
              break;
  #if TCG_TARGET_HAS_rot_i32
          case INDEX_op_rotl_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, rol32(t1, t2 & 31));
              break;
          case INDEX_op_rotr_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 -            t2 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
 +            t2 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg(regs, t0, ror32(t1, t2 & 31));
              break;
  #endif
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #endif
          case INDEX_op_brcond_i32:
              t0 = tci_read_r32(regs, &tb_ptr);
 -            t1 = tci_read_ri32(regs, &tb_ptr);
 +            t1 = tci_read_r32(regs, &tb_ptr);
              condition = *tb_ptr++;
              label = tci_read_label(&tb_ptr);
              if (tci_compare32(t0, t1, condition)) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              break;
          case INDEX_op_brcond2_i32:
              tmp64 = tci_read_r64(regs, &tb_ptr);
 -            v64 = tci_read_ri64(regs, &tb_ptr);
 +            v64 = tci_read_r64(regs, &tb_ptr);
              condition = *tb_ptr++;
              label = tci_read_label(&tb_ptr);
              if (tci_compare64(tmp64, v64, condition)) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_add_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 + t2);
              break;
          case INDEX_op_sub_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 - t2);
              break;
          case INDEX_op_mul_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 * t2);
              break;
          case INDEX_op_div_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2);
              break;
          case INDEX_op_divu_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2);
              break;
          case INDEX_op_rem_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2);
              break;
          case INDEX_op_remu_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
              break;
          case INDEX_op_and_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 & t2);
              break;
          case INDEX_op_or_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 | t2);
              break;
          case INDEX_op_xor_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 ^ t2);
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_shl_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 << (t2 & 63));
              break;
          case INDEX_op_shr_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 >> (t2 & 63));
              break;
          case INDEX_op_sar_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63)));
              break;
  #if TCG_TARGET_HAS_rot_i64
          case INDEX_op_rotl_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, rol64(t1, t2 & 63));
              break;
          case INDEX_op_rotr_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 -            t2 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
 +            t2 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg(regs, t0, ror64(t1, t2 & 63));
              break;
  #endif
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #endif
          case INDEX_op_brcond_i64:
              t0 = tci_read_r64(regs, &tb_ptr);
 -            t1 = tci_read_ri64(regs, &tb_ptr);
 +            t1 = tci_read_r64(regs, &tb_ptr);
              condition = *tb_ptr++;
              label = tci_read_label(&tb_ptr);
              if (tci_compare64(t0, t1, condition)) {
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_rem_i64:
      case INDEX_op_remu_i32:
      case INDEX_op_remu_i64:
 -        return C_O1_I2(r, r, r);
 -
      case INDEX_op_add_i32:
      case INDEX_op_add_i64:
      case INDEX_op_sub_i32:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_rotl_i64:
      case INDEX_op_rotr_i32:
      case INDEX_op_rotr_i64:
 -        /* TODO: Does R, RI, RI result in faster code than R, R, RI? */
 -        return C_O1_I2(r, ri, ri);
 +    case INDEX_op_setcond_i32:
 +    case INDEX_op_setcond_i64:
 +        return C_O1_I2(r, r, r);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return C_O0_I2(r, ri);
 -
 -    case INDEX_op_setcond_i32:
 -    case INDEX_op_setcond_i64:
 -        return C_O1_I2(r, r, ri);
 +        return C_O0_I2(r, r);
  #if TCG_TARGET_REG_BITS == 32
      /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
      case INDEX_op_sub2_i32:
          return C_O2_I4(r, r, r, r, r, r);
      case INDEX_op_brcond2_i32:
 -        return C_O0_I4(r, r, ri, ri);
 +        return C_O0_I4(r, r, r, r);
      case INDEX_op_mulu2_i32:
          return C_O2_I2(r, r, r, r);
      case INDEX_op_setcond2_i32:
 -        return C_O1_I4(r, r, r, ri, ri);
 +        return C_O1_I4(r, r, r, r, r);
  #endif
      case INDEX_op_qemu_ld_i32:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_r(TCGContext *s, TCGArg t0)
      tcg_out8(s, t0);
  }
--/* Write register or constant (native size). */
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
--static void tcg_out_ri(TCGContext *s, int const_arg, TCGArg arg)
++static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
--{
+                              TCGArg dst, uint64_t val)
 -    if (const_arg) {
 -        tcg_debug_assert(const_arg == 1);
 -        tcg_out8(s, TCG_CONST);
 -        tcg_out_i(s, arg);
 -    } else {
 -        tcg_out_r(s, arg);
 -    }
 -}
 -
 -/* Write register or constant (32 bit). */
 -static void tcg_out_ri32(TCGContext *s, int const_arg, TCGArg arg)
 -{
 -    if (const_arg) {
 -        tcg_debug_assert(const_arg == 1);
 -        tcg_out8(s, TCG_CONST);
 -        tcg_out32(s, arg);
 -    } else {
 -        tcg_out_r(s, arg);
 -    }
 -}
 -
 -#if TCG_TARGET_REG_BITS == 64
 -/* Write register or constant (64 bit). */
 -static void tcg_out_ri64(TCGContext *s, int const_arg, TCGArg arg)
 -{
 -    if (const_arg) {
 -        tcg_debug_assert(const_arg == 1);
 -        tcg_out8(s, TCG_CONST);
 -        tcg_out64(s, arg);
 -    } else {
 -        tcg_out_r(s, arg);
 -    }
 -}
 -#endif
 -
  /* Write label. */
  static void tci_out_label(TCGContext *s, TCGLabel *label)
  {
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
- {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-     uint8_t *old_code_ptr = s->code_ptr;
+     /* Convert movi to mov with constant temp. */
-     tcg_out_op_t(s, INDEX_op_call);
+     tv = tcg_constant_internal(type, val);
--    tcg_out_ri(s, 1, (uintptr_t)arg);
+     init_ts_info(ctx, tv);
-+    tcg_out_i(s, (uintptr_t)arg);
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-     old_code_ptr[1] = s->code_ptr - old_code_ptr;
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      case INDEX_op_setcond_i32:
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
 -        tcg_out_ri32(s, const_args[2], args[2]);
 +        tcg_out_r(s, args[2]);
          tcg_out8(s, args[3]);   /* condition */
          break;
  #if TCG_TARGET_REG_BITS == 32
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
          tcg_out_r(s, args[2]);
 -        tcg_out_ri32(s, const_args[3], args[3]);
 -        tcg_out_ri32(s, const_args[4], args[4]);
 +        tcg_out_r(s, args[3]);
 +        tcg_out_r(s, args[4]);
          tcg_out8(s, args[5]);   /* condition */
          break;
  #elif TCG_TARGET_REG_BITS == 64
      case INDEX_op_setcond_i64:
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
 -        tcg_out_ri64(s, const_args[2], args[2]);
 +        tcg_out_r(s, args[2]);
          tcg_out8(s, args[3]);   /* condition */
          break;
  #endif
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_rotl_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
      case INDEX_op_rotr_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
          tcg_out_r(s, args[0]);
 -        tcg_out_ri32(s, const_args[1], args[1]);
 -        tcg_out_ri32(s, const_args[2], args[2]);
 +        tcg_out_r(s, args[1]);
 +        tcg_out_r(s, args[2]);
          break;
      case INDEX_op_deposit_i32:  /* Optional (TCG_TARGET_HAS_deposit_i32). */
          tcg_out_r(s, args[0]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
      case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
          tcg_out_r(s, args[0]);
 -        tcg_out_ri64(s, const_args[1], args[1]);
 -        tcg_out_ri64(s, const_args[2], args[2]);
 +        tcg_out_r(s, args[1]);
 +        tcg_out_r(s, args[2]);
          break;
      case INDEX_op_deposit_i64:  /* Optional (TCG_TARGET_HAS_deposit_i64). */
          tcg_out_r(s, args[0]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          break;
      case INDEX_op_brcond_i64:
          tcg_out_r(s, args[0]);
 -        tcg_out_ri64(s, const_args[1], args[1]);
 +        tcg_out_r(s, args[1]);
          tcg_out8(s, args[2]);           /* condition */
          tci_out_label(s, arg_label(args[3]));
          break;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_rem_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
      case INDEX_op_remu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
          tcg_out_r(s, args[0]);
 -        tcg_out_ri32(s, const_args[1], args[1]);
 -        tcg_out_ri32(s, const_args[2], args[2]);
 +        tcg_out_r(s, args[1]);
 +        tcg_out_r(s, args[2]);
          break;
  #if TCG_TARGET_REG_BITS == 32
      case INDEX_op_add2_i32:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_brcond2_i32:
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
 -        tcg_out_ri32(s, const_args[2], args[2]);
 -        tcg_out_ri32(s, const_args[3], args[3]);
 +        tcg_out_r(s, args[2]);
 +        tcg_out_r(s, args[3]);
          tcg_out8(s, args[4]);           /* condition */
          tci_out_label(s, arg_label(args[5]));
          break;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
  #endif
      case INDEX_op_brcond_i32:
          tcg_out_r(s, args[0]);
 -        tcg_out_ri32(s, const_args[1], args[1]);
 +        tcg_out_r(s, args[1]);
          tcg_out8(s, args[2]);           /* condition */
          tci_out_label(s, arg_label(args[3]));
          break;
 --
 .25.1

-[PULL 40/46] physmem: make watchpoint checking code TCG-only
+[PULL 17/56] tcg/optimize: Split out finish_folding
-From: Claudio Fontana <cfontana@suse.de>
+Copy z_mask into OptContext, for writeback to the
 first output within the new function.
-cpu_check_watchpoint, watchpoint_address_matches are TCG-only.
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Message-Id: <20210204163931.7358-13-cfontana@suse.de>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- softmmu/physmem.c | 141 +++++++++++++++++++++++-----------------------
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
-file changed, 72 insertions(+), 69 deletions(-)
+file changed, 33 insertions(+), 16 deletions(-)
-diff --git a/softmmu/physmem.c b/softmmu/physmem.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/softmmu/physmem.c
+--- a/tcg/optimize.c
-+++ b/softmmu/physmem.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      TCGContext *tcg;
      TCGOp *prev_mb;
      TCGTempSet temps_used;
 +
 +    /* In flight values from optimization. */
 +    uint64_t z_mask;
  } OptContext;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
      }
  }
-+#ifdef CONFIG_TCG
++static void finish_folding(OptContext *ctx, TCGOp *op)
  /* Return true if this watchpoint address matches the specified
   * access (ie the address range covered by the watchpoint overlaps
   * partially or completely with the address range covered by the
@@ -XXX,XX +XXX,XX @@ int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
      return ret;
  }
 +/* Generate a debug exception if a watchpoint has been hit.  */
 +void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
 +                          MemTxAttrs attrs, int flags, uintptr_t ra)
 +{
-+    CPUClass *cc = CPU_GET_CLASS(cpu);
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
-+    CPUWatchpoint *wp;
++    int i, nb_oargs;
 +
-+    assert(tcg_enabled());
++    /*
-+    if (cpu->watchpoint_hit) {
++     * For an opcode that ends a BB, reset all temp data.
-+        /*
++     * We do no cross-BB optimization.
-+         * We re-entered the check after replacing the TB.
++     */
-+         * Now raise the debug interrupt so that it will
++    if (def->flags & TCG_OPF_BB_END) {
-+         * trigger after the current instruction.
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-+         */
++        ctx->prev_mb = NULL;
 +        qemu_mutex_lock_iothread();
 +        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
 +        qemu_mutex_unlock_iothread();
 +        return;
 +    }
 +
-+    addr = cc->adjust_watchpoint_address(cpu, addr, len);
++    nb_oargs = def->nb_oargs;
-+    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
++    for (i = 0; i < nb_oargs; i++) {
-+        if (watchpoint_address_matches(wp, addr, len)
++        reset_temp(op->args[i]);
-+            && (wp->flags & flags)) {
++        /*
-+            if (replay_running_debug()) {
++         * Save the corresponding known-zero bits mask for the
-+                /*
++         * first output argument (only one supported so far).
-+                 * Don't process the watchpoints when we are
++         */
-+                 * in a reverse debugging operation.
++        if (i == 0) {
-+                 */
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +                replay_breakpoint();
 +                return;
 +            }
 +            if (flags == BP_MEM_READ) {
 +                wp->flags |= BP_WATCHPOINT_HIT_READ;
 +            } else {
 +                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
 +            }
 +            wp->hitaddr = MAX(addr, wp->vaddr);
 +            wp->hitattrs = attrs;
 +            if (!cpu->watchpoint_hit) {
 +                if (wp->flags & BP_CPU &&
 +                    !cc->debug_check_watchpoint(cpu, wp)) {
 +                    wp->flags &= ~BP_WATCHPOINT_HIT;
 +                    continue;
 +                }
 +                cpu->watchpoint_hit = wp;
 +
 +                mmap_lock();
 +                tb_check_watchpoint(cpu, ra);
 +                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
 +                    cpu->exception_index = EXCP_DEBUG;
 +                    mmap_unlock();
 +                    cpu_loop_exit_restore(cpu, ra);
 +                } else {
 +                    /* Force execution of one insn next time.  */
 +                    cpu->cflags_next_tb = 1 | curr_cflags();
 +                    mmap_unlock();
 +                    if (ra) {
 +                        cpu_restore_state(cpu, ra, true);
 +                    }
 +                    cpu_loop_exit_noexc(cpu);
 +                }
 +            }
 +        } else {
 +            wp->flags &= ~BP_WATCHPOINT_HIT;
 +        }
 +    }
 +}
 +
-+#endif /* CONFIG_TCG */
+ static bool fold_call(OptContext *ctx, TCGOp *op)
 +
  /* Called from RCU critical section */
  static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
  {
-@@ -XXX,XX +XXX,XX @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
+     TCGContext *s = ctx->tcg;
-     return block->offset + offset;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- }
+             partmask &= 0xffffffffu;
+             affected &= 0xffffffffu;
--/* Generate a debug exception if a watchpoint has been hit.  */
+         }
--void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
++        ctx.z_mask = z_mask;
--                          MemTxAttrs attrs, int flags, uintptr_t ra)
--{
+         if (partmask == 0) {
--    CPUClass *cc = CPU_GET_CLASS(cpu);
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
--    CPUWatchpoint *wp;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--
+             break;
--    assert(tcg_enabled());
+         }
--    if (cpu->watchpoint_hit) {
--        /*
+-        /* Some of the folding above can change opc. */
--         * We re-entered the check after replacing the TB.
+-        opc = op->opc;
--         * Now raise the debug interrupt so that it will
+-        def = &tcg_op_defs[opc];
--         * trigger after the current instruction.
+-        if (def->flags & TCG_OPF_BB_END) {
--         */
+-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
--        qemu_mutex_lock_iothread();
+-        } else {
--        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
+-            int nb_oargs = def->nb_oargs;
--        qemu_mutex_unlock_iothread();
+-            for (i = 0; i < nb_oargs; i++) {
--        return;
+-                reset_temp(op->args[i]);
--    }
+-                /* Save the corresponding known-zero bits mask for the
--
+-                   first output argument (only one supported so far). */
--    addr = cc->adjust_watchpoint_address(cpu, addr, len);
+-                if (i == 0) {
--    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+-                    arg_info(op->args[i])->z_mask = z_mask;
 -        if (watchpoint_address_matches(wp, addr, len)
 -            && (wp->flags & flags)) {
 -            if (replay_running_debug()) {
 -                /*
 -                 * Don't process the watchpoints when we are
 -                 * in a reverse debugging operation.
 -                 */
 -                replay_breakpoint();
 -                return;
 -            }
 -            if (flags == BP_MEM_READ) {
 -                wp->flags |= BP_WATCHPOINT_HIT_READ;
 -            } else {
 -                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
 -            }
 -            wp->hitaddr = MAX(addr, wp->vaddr);
 -            wp->hitattrs = attrs;
 -            if (!cpu->watchpoint_hit) {
 -                if (wp->flags & BP_CPU &&
 -                    !cc->debug_check_watchpoint(cpu, wp)) {
 -                    wp->flags &= ~BP_WATCHPOINT_HIT;
 -                    continue;
 -                }
 -                cpu->watchpoint_hit = wp;
 -
 -                mmap_lock();
 -                tb_check_watchpoint(cpu, ra);
 -                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
 -                    cpu->exception_index = EXCP_DEBUG;
 -                    mmap_unlock();
 -                    cpu_loop_exit_restore(cpu, ra);
 -                } else {
 -                    /* Force execution of one insn next time.  */
 -                    cpu->cflags_next_tb = 1 | curr_cflags();
 -                    mmap_unlock();
 -                    if (ra) {
 -                        cpu_restore_state(cpu, ra, true);
 -                    }
 -                    cpu_loop_exit_noexc(cpu);
 -                }
 -            }
--        } else {
--            wp->flags &= ~BP_WATCHPOINT_HIT;
 -        }
--    }
++        finish_folding(&ctx, op);
--}
--
+         /* Eliminate duplicate and redundant fence instructions.  */
- static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+         if (ctx.prev_mb) {
                                   MemTxAttrs attrs, void *buf, hwaddr len);
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
 --
 .25.1

-New patch
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 ++++++---
+file changed, 6 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
++        bool done = false;
+         /* Calls are special. */
+         if (opc == INDEX_op_call) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+            allocator where needed and possible.  Also detect copies. */
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            continue;
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
++            break;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        finish_folding(&ctx, op);
++        if (!done) {
++            finish_folding(&ctx, op);
++        }
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-New patch
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
+This puts the separate mb optimization into the same framework
+as the others.  While fold_qemu_{ld,st} are currently identical,
+that won't last as more code gets moved.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
+file changed, 51 insertions(+), 38 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_mb(OptContext *ctx, TCGOp *op)
++{
++    /* Eliminate duplicate and redundant fence instructions.  */
++    if (ctx->prev_mb) {
++        /*
++         * Merge two barriers of the same type into one,
++         * or a weaker barrier into a stronger one,
++         * or two weaker barriers into a stronger one.
++         *   mb X; mb Y => mb X|Y
++         *   mb; strl => mb; st
++         *   ldaq; mb => ld; mb
++         *   ldaq; strl => ld; mb; st
++         * Other combinations are also merged into a strong
++         * barrier.  This is stricter than specified but for
++         * the purposes of TCG is better than not optimizing.
++         */
++        ctx->prev_mb->args[0] |= op->args[0];
++        tcg_op_remove(ctx->tcg, op);
++    } else {
++        ctx->prev_mb = op;
++    }
++    return true;
++}
++
++static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
++static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
++        case INDEX_op_mb:
++            done = fold_mb(&ctx, op);
++            break;
++        case INDEX_op_qemu_ld_i32:
++        case INDEX_op_qemu_ld_i64:
++            done = fold_qemu_ld(&ctx, op);
++            break;
++        case INDEX_op_qemu_st_i32:
++        case INDEX_op_qemu_st8_i32:
++        case INDEX_op_qemu_st_i64:
++            done = fold_qemu_st(&ctx, op);
++            break;
++
+         default:
+             break;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (!done) {
+             finish_folding(&ctx, op);
+         }
+-
+-        /* Eliminate duplicate and redundant fence instructions.  */
+-        if (ctx.prev_mb) {
+-            switch (opc) {
+-            case INDEX_op_mb:
+-                /* Merge two barriers of the same type into one,
+-                 * or a weaker barrier into a stronger one,
+-                 * or two weaker barriers into a stronger one.
+-                 *   mb X; mb Y => mb X|Y
+-                 *   mb; strl => mb; st
+-                 *   ldaq; mb => ld; mb
+-                 *   ldaq; strl => ld; mb; st
+-                 * Other combinations are also merged into a strong
+-                 * barrier.  This is stricter than specified but for
+-                 * the purposes of TCG is better than not optimizing.
+-                 */
+-                ctx.prev_mb->args[0] |= op->args[0];
+-                tcg_op_remove(s, op);
+-                break;
+-
+-            default:
+-                /* Opcodes that end the block stop the optimization.  */
+-                if ((def->flags & TCG_OPF_BB_END) == 0) {
+-                    break;
+-                }
+-                /* fallthru */
+-            case INDEX_op_qemu_ld_i32:
+-            case INDEX_op_qemu_ld_i64:
+-            case INDEX_op_qemu_st_i32:
+-            case INDEX_op_qemu_st8_i32:
+-            case INDEX_op_qemu_st_i64:
+-                /* Opcodes that touch guest memory stop the optimization.  */
+-                ctx.prev_mb = NULL;
+-                break;
+-            }
+-        } else if (opc == INDEX_op_mb) {
+-            ctx.prev_mb = op;
+-        }
+     }
+ }
+--
+.25.1

-[PULL 43/46] cpu: tcg_ops: move to tcg-cpu-ops.h, keep a pointer in CPUClass
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
-From: Claudio Fontana <cfontana@suse.de>
+Split out a whole bunch of placeholder functions, which are
+currently identical.  That won't last as more code gets moved.
-we cannot in principle make the TCG Operations field definitions
-conditional on CONFIG_TCG in code that is included by both common_ss
+Use CASE_OP_32_64_VEC for some logical operators that previously
-and specific_ss modules.
+missed the addition of vectors.
-Therefore, what we can do safely to restrict the TCG fields to TCG-only
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-builds, is to move all tcg cpu operations into a separate header file,
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 which is only included by TCG, target-specific code.
 This leaves just a NULL pointer in the cpu.h for the non-TCG builds.
 This also tidies up the code in all targets a bit, having all TCG cpu
 operations neatly contained by a dedicated data struct.
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Message-Id: <20210204163931.7358-16-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           | 103 ++------------------------------
+ tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
- include/hw/core/tcg-cpu-ops.h   |  97 ++++++++++++++++++++++++++++++
+file changed, 219 insertions(+), 52 deletions(-)
- target/arm/internals.h          |   6 ++
- accel/tcg/cpu-exec.c            |  27 +++++----
+diff --git a/tcg/optimize.c b/tcg/optimize.c
  accel/tcg/cputlb.c              |  35 +++++++++--
  accel/tcg/user-exec.c           |   9 +--
  hw/mips/jazz.c                  |   7 ++-
  softmmu/physmem.c               |  13 ++--
  target/alpha/cpu.c              |  21 +++++--
  target/arm/cpu.c                |  41 ++++++++-----
  target/arm/cpu64.c              |   7 +--
  target/arm/cpu_tcg.c            |  28 +++++++--
  target/avr/cpu.c                |  19 ++++--
  target/avr/helper.c             |   5 +-
  target/cris/cpu.c               |  43 ++++++++-----
  target/cris/helper.c            |   5 +-
  target/hppa/cpu.c               |  22 ++++---
  target/i386/tcg/tcg-cpu.c       |  26 ++++----
  target/lm32/cpu.c               |  19 ++++--
  target/m68k/cpu.c               |  19 ++++--
  target/microblaze/cpu.c         |  25 +++++---
  target/mips/cpu.c               |  36 +++++++----
  target/moxie/cpu.c              |  15 ++++-
  target/nios2/cpu.c              |  18 ++++--
  target/openrisc/cpu.c           |  17 ++++--
  target/riscv/cpu.c              |  23 ++++---
  target/rx/cpu.c                 |  20 +++++--
  target/s390x/cpu.c              |  33 ++++++----
  target/sh4/cpu.c                |  21 +++++--
  target/sparc/cpu.c              |  25 +++++---
  target/tilegx/cpu.c             |  17 ++++--
  target/tricore/cpu.c            |  12 +++-
  target/unicore32/cpu.c          |  17 ++++--
  target/xtensa/cpu.c             |  23 ++++---
  target/ppc/translate_init.c.inc |  33 ++++++----
  MAINTAINERS                     |   1 +
 files changed, 582 insertions(+), 306 deletions(-)
  create mode 100644 include/hw/core/tcg-cpu-ops.h
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct CPUWatchpoint CPUWatchpoint;
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
  struct TranslationBlock;
 -/**
 - * struct TcgCpuOperations: TCG operations specific to a CPU class
 - */
 -typedef struct TcgCpuOperations {
 -    /**
 -     * @initialize: Initialize TCG state
 -     *
 -     * Called when the first CPU is realized.
 -     */
 -    void (*initialize)(void);
 -    /**
 -     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
 -     *
 -     * This is called when we abandon execution of a TB before starting it,
 -     * and must set all parts of the CPU state which the previous TB in the
 -     * chain may not have updated.
 -     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
 -     *
 -     * If more state needs to be restored, the target must implement a
 -     * function to restore all the state, and register it here.
 -     */
 -    void (*synchronize_from_tb)(CPUState *cpu,
 -                                const struct TranslationBlock *tb);
 -    /** @cpu_exec_enter: Callback for cpu_exec preparation */
 -    void (*cpu_exec_enter)(CPUState *cpu);
 -    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
 -    void (*cpu_exec_exit)(CPUState *cpu);
 -    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
 -    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
 -    /** @do_interrupt: Callback for interrupt handling. */
 -    void (*do_interrupt)(CPUState *cpu);
 -    /**
 -     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
 -     *
 -     * For system mode, if the access is valid, call tlb_set_page
 -     * and return true; if the access is invalid, and probe is
 -     * true, return false; otherwise raise an exception and do
 -     * not return.  For user-only mode, always raise an exception
 -     * and do not return.
 -     */
 -    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
 -                     MMUAccessType access_type, int mmu_idx,
 -                     bool probe, uintptr_t retaddr);
 -    /** @debug_excp_handler: Callback for handling debug exceptions */
 -    void (*debug_excp_handler)(CPUState *cpu);
 -
 -    /**
 -     * @do_transaction_failed: Callback for handling failed memory transactions
 -     * (ie bus faults or external aborts; not MMU faults)
 -     */
 -    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
 -                                  unsigned size, MMUAccessType access_type,
 -                                  int mmu_idx, MemTxAttrs attrs,
 -                                  MemTxResult response, uintptr_t retaddr);
 -    /**
 -     * @do_unaligned_access: Callback for unaligned access handling
 -     */
 -    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
 -                                MMUAccessType access_type,
 -                                int mmu_idx, uintptr_t retaddr);
 -    /**
 -     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
 -     */
 -    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 -
 -    /**
 -     * @debug_check_watchpoint: return true if the architectural
 -     * watchpoint whose address has matched should really fire, used by ARM
 -     */
 -    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
 -
 -} TcgCpuOperations;
 +/* see tcg-cpu-ops.h */
 +struct TCGCPUOps;
  /**
   * CPUClass:
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      int gdb_num_core_regs;
      bool gdb_stop_before_watchpoint;
 -    TcgCpuOperations tcg_ops;
 +    /* when TCG is not available, this pointer is NULL */
 +    struct TCGCPUOps *tcg_ops;
  };
  /*
@@ -XXX,XX +XXX,XX @@ CPUState *cpu_by_arch_id(int64_t id);
  void cpu_interrupt(CPUState *cpu, int mask);
 -static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
 -                                        MMUAccessType access_type,
 -                                        int mmu_idx, uintptr_t retaddr)
 -{
 -    CPUClass *cc = CPU_GET_CLASS(cpu);
 -
 -    cc->tcg_ops.do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
 -}
 -
 -static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
 -                                          vaddr addr, unsigned size,
 -                                          MMUAccessType access_type,
 -                                          int mmu_idx, MemTxAttrs attrs,
 -                                          MemTxResult response,
 -                                          uintptr_t retaddr)
 -{
 -    CPUClass *cc = CPU_GET_CLASS(cpu);
 -
 -    if (!cpu->ignore_memory_transaction_failures &&
 -        cc->tcg_ops.do_transaction_failed) {
 -        cc->tcg_ops.do_transaction_failed(cpu, physaddr, addr, size,
 -                                          access_type, mmu_idx, attrs,
 -                                          response, retaddr);
 -    }
 -}
 -
  /**
   * cpu_set_pc:
   * @cpu: The CPU to set the program counter for.
 diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/hw/core/tcg-cpu-ops.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * TCG CPU-specific operations
 + *
 + * Copyright 2021 SUSE LLC
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef TCG_CPU_OPS_H
 +#define TCG_CPU_OPS_H
 +
 +#include "hw/core/cpu.h"
 +
 +struct TCGCPUOps {
 +    /**
 +     * @initialize: Initialize TCG state
 +     *
 +     * Called when the first CPU is realized.
 +     */
 +    void (*initialize)(void);
 +    /**
 +     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
 +     *
 +     * This is called when we abandon execution of a TB before starting it,
 +     * and must set all parts of the CPU state which the previous TB in the
 +     * chain may not have updated.
 +     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
 +     *
 +     * If more state needs to be restored, the target must implement a
 +     * function to restore all the state, and register it here.
 +     */
 +    void (*synchronize_from_tb)(CPUState *cpu,
 +                                const struct TranslationBlock *tb);
 +    /** @cpu_exec_enter: Callback for cpu_exec preparation */
 +    void (*cpu_exec_enter)(CPUState *cpu);
 +    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
 +    void (*cpu_exec_exit)(CPUState *cpu);
 +    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
 +    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
 +    /**
 +     * @do_interrupt: Callback for interrupt handling.
 +     *
 +     * Note: this is SOFTMMU-only in general, but an x86 hack in
 +     * accel/tcg/cpu-exec.c also relies on it, so it cannot be placed
 +     * in the SOFTMMU-only section of this struct.
 +     */
 +    void (*do_interrupt)(CPUState *cpu);
 +    /**
 +     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
 +     *
 +     * For system mode, if the access is valid, call tlb_set_page
 +     * and return true; if the access is invalid, and probe is
 +     * true, return false; otherwise raise an exception and do
 +     * not return.  For user-only mode, always raise an exception
 +     * and do not return.
 +     */
 +    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
 +                     MMUAccessType access_type, int mmu_idx,
 +                     bool probe, uintptr_t retaddr);
 +    /** @debug_excp_handler: Callback for handling debug exceptions */
 +    void (*debug_excp_handler)(CPUState *cpu);
 +
 +#ifdef NEED_CPU_H
 +#ifdef CONFIG_SOFTMMU
 +    /**
 +     * @do_transaction_failed: Callback for handling failed memory transactions
 +     * (i.e. bus faults or external aborts; not MMU faults)
 +     */
 +    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
 +                                  unsigned size, MMUAccessType access_type,
 +                                  int mmu_idx, MemTxAttrs attrs,
 +                                  MemTxResult response, uintptr_t retaddr);
 +    /**
 +     * @do_unaligned_access: Callback for unaligned access handling
 +     */
 +    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
 +                                MMUAccessType access_type,
 +                                int mmu_idx, uintptr_t retaddr);
 +
 +    /**
 +     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
 +     */
 +    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 +
 +    /**
 +     * @debug_check_watchpoint: return true if the architectural
 +     * watchpoint whose address has matched should really fire, used by ARM
 +     */
 +    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
 +
 +#endif /* CONFIG_SOFTMMU */
 +#endif /* NEED_CPU_H */
 +
 +};
 +
 +#endif /* TCG_CPU_OPS_H */
 diff --git a/target/arm/internals.h b/target/arm/internals.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/internals.h
 +++ b/target/arm/internals.h
@@ -XXX,XX +XXX,XX @@ static inline int r14_bank_number(int mode)
  void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu);
  void arm_translate_init(void);
 +#ifdef CONFIG_TCG
 +void arm_cpu_synchronize_from_tb(CPUState *cs,
 +                                 const struct TranslationBlock *tb);
 +#endif /* CONFIG_TCG */
 +
 +
  enum arm_fprounding {
      FPROUNDING_TIEEVEN,
      FPROUNDING_POSINF,
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
  #include "qemu/qemu-print.h"
  #include "cpu.h"
 +#include "hw/core/tcg-cpu-ops.h"
  #include "trace.h"
  #include "disas/disas.h"
  #include "exec/exec-all.h"
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
                                 TARGET_FMT_lx "] %s\n",
                                 last_tb->tc.ptr, last_tb->pc,
                                 lookup_symbol(last_tb->pc));
 -        if (cc->tcg_ops.synchronize_from_tb) {
 -            cc->tcg_ops.synchronize_from_tb(cpu, last_tb);
 +        if (cc->tcg_ops->synchronize_from_tb) {
 +            cc->tcg_ops->synchronize_from_tb(cpu, last_tb);
          } else {
              assert(cc->set_pc);
              cc->set_pc(cpu, last_tb->pc);
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_enter(CPUState *cpu)
  {
      CPUClass *cc = CPU_GET_CLASS(cpu);
 -    if (cc->tcg_ops.cpu_exec_enter) {
 -        cc->tcg_ops.cpu_exec_enter(cpu);
 +    if (cc->tcg_ops->cpu_exec_enter) {
 +        cc->tcg_ops->cpu_exec_enter(cpu);
      }
  }
-@@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu)
++/*
 + * The fold_* functions return true when processing is complete,
 + * usually by folding the operation to a constant or to a copy,
 + * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
 + * like collect information about the value produced, for use in
 + * optimizing a subsequent operation.
 + *
 + * These first fold_* functions are all helpers, used by other
 + * folders for more specific operations.
 + */
 +
 +static bool fold_const1(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = do_constant_folding(op->opc, t, 0);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_const2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = do_constant_folding(op->opc, t1, t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
 +/*
 + * These outermost fold_<op> functions are sorted alphabetically.
 + */
 +
 +static bool fold_add(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
-     CPUClass *cc = CPU_GET_CLASS(cpu);
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 -    if (cc->tcg_ops.cpu_exec_exit) {
 -        cc->tcg_ops.cpu_exec_exit(cpu);
 +    if (cc->tcg_ops->cpu_exec_exit) {
 +        cc->tcg_ops->cpu_exec_exit(cpu);
      }
  }
@@ -XXX,XX +XXX,XX @@ static inline void cpu_handle_debug_exception(CPUState *cpu)
          }
      }
 -    if (cc->tcg_ops.debug_excp_handler) {
 -        cc->tcg_ops.debug_excp_handler(cpu);
 +    if (cc->tcg_ops->debug_excp_handler) {
 +        cc->tcg_ops->debug_excp_handler(cpu);
      }
  }
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
             loop */
  #if defined(TARGET_I386)
          CPUClass *cc = CPU_GET_CLASS(cpu);
 -        cc->tcg_ops.do_interrupt(cpu);
 +        cc->tcg_ops->do_interrupt(cpu);
  #endif
          *ret = cpu->exception_index;
          cpu->exception_index = -1;
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
          if (replay_exception()) {
              CPUClass *cc = CPU_GET_CLASS(cpu);
              qemu_mutex_lock_iothread();
 -            cc->tcg_ops.do_interrupt(cpu);
 +            cc->tcg_ops->do_interrupt(cpu);
              qemu_mutex_unlock_iothread();
              cpu->exception_index = -1;
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
             True when it is, and we should restart on a new TB,
             and via longjmp via cpu_loop_exit.  */
          else {
 -            if (cc->tcg_ops.cpu_exec_interrupt &&
 -                cc->tcg_ops.cpu_exec_interrupt(cpu, interrupt_request)) {
 +            if (cc->tcg_ops->cpu_exec_interrupt &&
 +                cc->tcg_ops->cpu_exec_interrupt(cpu, interrupt_request)) {
                  if (need_replay_interrupt(interrupt_request)) {
                      replay_interrupt();
                  }
@@ -XXX,XX +XXX,XX @@ void tcg_exec_realizefn(CPUState *cpu, Error **errp)
      CPUClass *cc = CPU_GET_CLASS(cpu);
      if (!tcg_target_initialized) {
 -        cc->tcg_ops.initialize();
 +        cc->tcg_ops->initialize();
          tcg_target_initialized = true;
      }
      tlb_init(cpu);
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/main-loop.h"
  #include "cpu.h"
 +#include "hw/core/tcg-cpu-ops.h"
  #include "exec/exec-all.h"
  #include "exec/memory.h"
  #include "exec/address-spaces.h"
@@ -XXX,XX +XXX,XX @@ static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
       * This is not a probe, so only valid return is success; failure
       * should result in exception + longjmp to the cpu loop.
       */
 -    ok = cc->tcg_ops.tlb_fill(cpu, addr, size,
 -                              access_type, mmu_idx, false, retaddr);
 +    ok = cc->tcg_ops->tlb_fill(cpu, addr, size,
 +                               access_type, mmu_idx, false, retaddr);
      assert(ok);
  }
 +static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
 +                                        MMUAccessType access_type,
 +                                        int mmu_idx, uintptr_t retaddr)
 +{
 +    CPUClass *cc = CPU_GET_CLASS(cpu);
 +
 +    cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
 +}
 +
 +static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
 +                                          vaddr addr, unsigned size,
 +                                          MMUAccessType access_type,
 +                                          int mmu_idx, MemTxAttrs attrs,
 +                                          MemTxResult response,
 +                                          uintptr_t retaddr)
 +{
 +    CPUClass *cc = CPU_GET_CLASS(cpu);
 +
 +    if (!cpu->ignore_memory_transaction_failures &&
 +        cc->tcg_ops->do_transaction_failed) {
 +        cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
 +                                           access_type, mmu_idx, attrs,
 +                                           response, retaddr);
 +    }
 +}
 +
  static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                           int mmu_idx, target_ulong addr, uintptr_t retaddr,
                           MMUAccessType access_type, MemOp op)
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
              CPUState *cs = env_cpu(env);
              CPUClass *cc = CPU_GET_CLASS(cs);
 -            if (!cc->tcg_ops.tlb_fill(cs, addr, fault_size, access_type,
 -                                      mmu_idx, nonfault, retaddr)) {
 +            if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
 +                                       mmu_idx, nonfault, retaddr)) {
                  /* Non-faulting page table read failed.  */
                  *phost = NULL;
                  return TLB_INVALID_MASK;
 diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/user-exec.c
 +++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
  #include "cpu.h"
 +#include "hw/core/tcg-cpu-ops.h"
  #include "disas/disas.h"
  #include "exec/exec-all.h"
  #include "tcg/tcg.h"
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
      clear_helper_retaddr();
      cc = CPU_GET_CLASS(cpu);
 -    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type,
 -                         MMU_USER_IDX, false, pc);
 +    cc->tcg_ops->tlb_fill(cpu, address, 0, access_type,
 +                          MMU_USER_IDX, false, pc);
      g_assert_not_reached();
  }
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
          } else {
              CPUState *cpu = env_cpu(env);
              CPUClass *cc = CPU_GET_CLASS(cpu);
 -            cc->tcg_ops.tlb_fill(cpu, addr, fault_size, access_type,
 -                                 MMU_USER_IDX, false, ra);
 +            cc->tcg_ops->tlb_fill(cpu, addr, fault_size, access_type,
 +                                  MMU_USER_IDX, false, ra);
              g_assert_not_reached();
          }
      }
 diff --git a/hw/mips/jazz.c b/hw/mips/jazz.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/mips/jazz.c
 +++ b/hw/mips/jazz.c
@@ -XXX,XX +XXX,XX @@
  #include "qapi/error.h"
  #include "qemu/error-report.h"
  #include "qemu/help_option.h"
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +#endif /* CONFIG_TCG */
  enum jazz_model_e {
      JAZZ_MAGNUM,
@@ -XXX,XX +XXX,XX @@ static void mips_jazz_init(MachineState *machine,
       */
      cc = CPU_GET_CLASS(cpu);
  #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
 -    real_do_transaction_failed = cc->tcg_ops.do_transaction_failed;
 -    cc->tcg_ops.do_transaction_failed = mips_jazz_do_transaction_failed;
 +    real_do_transaction_failed = cc->tcg_ops->do_transaction_failed;
 +    cc->tcg_ops->do_transaction_failed = mips_jazz_do_transaction_failed;
  #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
      /* allocate RAM */
 diff --git a/softmmu/physmem.c b/softmmu/physmem.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/physmem.c
 +++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/cutils.h"
  #include "qemu/cacheflush.h"
  #include "cpu.h"
 +
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +#endif /* CONFIG_TCG */
 +
  #include "exec/exec-all.h"
  #include "exec/target_page.h"
  #include "hw/qdev-core.h"
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
          return;
      }
 -    if (cc->tcg_ops.adjust_watchpoint_address) {
 +    if (cc->tcg_ops->adjust_watchpoint_address) {
          /* this is currently used only by ARM BE32 */
 -        addr = cc->tcg_ops.adjust_watchpoint_address(cpu, addr, len);
 +        addr = cc->tcg_ops->adjust_watchpoint_address(cpu, addr, len);
      }
      QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
          if (watchpoint_address_matches(wp, addr, len)
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
              wp->hitaddr = MAX(addr, wp->vaddr);
              wp->hitattrs = attrs;
              if (!cpu->watchpoint_hit) {
 -                if (wp->flags & BP_CPU && cc->tcg_ops.debug_check_watchpoint &&
 -                    !cc->tcg_ops.debug_check_watchpoint(cpu, wp)) {
 +                if (wp->flags & BP_CPU && cc->tcg_ops->debug_check_watchpoint &&
 +                    !cc->tcg_ops->debug_check_watchpoint(cpu, wp)) {
                      wp->flags &= ~BP_WATCHPOINT_HIT;
                      continue;
                  }
 diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/cpu.c
 +++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_initfn(Object *obj)
  #endif
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps alpha_tcg_ops = {
 +    .initialize = alpha_translate_init,
 +    .cpu_exec_interrupt = alpha_cpu_exec_interrupt,
 +    .tlb_fill = alpha_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = alpha_cpu_do_interrupt,
 +    .do_transaction_failed = alpha_cpu_do_transaction_failed,
 +    .do_unaligned_access = alpha_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void alpha_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = alpha_cpu_class_by_name;
      cc->has_work = alpha_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = alpha_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
      cc->dump_state = alpha_cpu_dump_state;
      cc->set_pc = alpha_cpu_set_pc;
      cc->gdb_read_register = alpha_cpu_gdb_read_register;
      cc->gdb_write_register = alpha_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = alpha_cpu_do_unaligned_access;
      cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
      dc->vmsd = &vmstate_alpha_cpu;
  #endif
      cc->disas_set_info = alpha_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = alpha_translate_init;
 +    cc->tcg_ops = &alpha_tcg_ops;
      cc->gdb_num_core_regs = 67;
  }
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@
  #include "qapi/error.h"
  #include "qapi/visitor.h"
  #include "cpu.h"
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +#endif /* CONFIG_TCG */
  #include "internals.h"
  #include "exec/exec-all.h"
  #include "hw/qdev-properties.h"
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
  }
  #ifdef CONFIG_TCG
 -static void arm_cpu_synchronize_from_tb(CPUState *cs,
 -                                        const TranslationBlock *tb)
 +void arm_cpu_synchronize_from_tb(CPUState *cs,
 +                                 const TranslationBlock *tb)
  {
      ARMCPU *cpu = ARM_CPU(cs);
      CPUARMState *env = &cpu->env;
@@ -XXX,XX +XXX,XX @@ bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
   found:
      cs->exception_index = excp_idx;
      env->exception.target_el = target_el;
 -    cc->tcg_ops.do_interrupt(cs);
 +    cc->tcg_ops->do_interrupt(cs);
      return true;
  }
-@@ -XXX,XX +XXX,XX @@ static gchar *arm_gdb_arch_name(CPUState *cs)
++static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-     return g_strdup("arm");
++{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_divide(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_eqv(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_exts(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_extu(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
  static bool fold_mb(OptContext *ctx, TCGOp *op)
  {
      /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
      return true;
  }
-+#ifdef CONFIG_TCG
++static bool fold_mul(OptContext *ctx, TCGOp *op)
-+static struct TCGCPUOps arm_tcg_ops = {
++{
-+    .initialize = arm_translate_init,
++    return fold_const2(ctx, op);
-+    .synchronize_from_tb = arm_cpu_synchronize_from_tb,
++}
-+    .cpu_exec_interrupt = arm_cpu_exec_interrupt,
++
-+    .tlb_fill = arm_cpu_tlb_fill,
++static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-+    .debug_excp_handler = arm_debug_excp_handler,
++{
-+
++    return fold_const2(ctx, op);
-+#if !defined(CONFIG_USER_ONLY)
++}
-+    .do_interrupt = arm_cpu_do_interrupt,
++
-+    .do_transaction_failed = arm_cpu_do_transaction_failed,
++static bool fold_nand(OptContext *ctx, TCGOp *op)
-+    .do_unaligned_access = arm_cpu_do_unaligned_access,
++{
-+    .adjust_watchpoint_address = arm_adjust_watchpoint_address,
++    return fold_const2(ctx, op);
-+    .debug_check_watchpoint = arm_debug_check_watchpoint,
++}
-+#endif /* !CONFIG_USER_ONLY */
++
-+};
++static bool fold_neg(OptContext *ctx, TCGOp *op)
-+#endif /* CONFIG_TCG */
++{
-+
++    return fold_const1(ctx, op);
- static void arm_cpu_class_init(ObjectClass *oc, void *data)
++}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
-     ARMCPUClass *acc = ARM_CPU_CLASS(oc);
+     /* Opcodes that touch guest memory stop the mb optimization.  */
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
      cc->gdb_get_dynamic_xml = arm_gdb_get_dynamic_xml;
      cc->gdb_stop_before_watchpoint = true;
      cc->disas_set_info = arm_disas_set_info;
 +
  #ifdef CONFIG_TCG
 -    cc->tcg_ops.initialize = arm_translate_init;
 -    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
 -    cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
 -    cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
 -    cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
 -#if !defined(CONFIG_USER_ONLY)
 -    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 -    cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
 -    cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
 -    cc->tcg_ops.debug_check_watchpoint = arm_debug_check_watchpoint;
 -#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 +    cc->tcg_ops = &arm_tcg_ops;
  #endif /* CONFIG_TCG */
  }
 diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu64.c
 +++ b/target/arm/cpu64.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qapi/error.h"
  #include "cpu.h"
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +#endif /* CONFIG_TCG */
  #include "qemu/module.h"
  #if !defined(CONFIG_USER_ONLY)
  #include "hw/loader.h"
@@ -XXX,XX +XXX,XX @@ static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
  {
      CPUClass *cc = CPU_CLASS(oc);
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
 -#endif /* CONFIG_TCG */
 -
      cc->gdb_read_register = aarch64_cpu_gdb_read_register;
      cc->gdb_write_register = aarch64_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 34;
 diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu_tcg.c
 +++ b/target/arm/cpu_tcg.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "cpu.h"
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +#endif /* CONFIG_TCG */
  #include "internals.h"
  /* CPU models. These are not needed for the AArch64 linux-user build. */
@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
      if (interrupt_request & CPU_INTERRUPT_HARD
          && (armv7m_nvic_can_take_pending_exception(env->nvic))) {
          cs->exception_index = EXCP_IRQ;
 -        cc->tcg_ops.do_interrupt(cs);
 +        cc->tcg_ops->do_interrupt(cs);
          ret = true;
      }
      return ret;
@@ -XXX,XX +XXX,XX @@ static void pxa270c5_initfn(Object *obj)
      cpu->reset_sctlr = 0x00000078;
  }
 +#ifdef CONFIG_TCG
 +static struct TCGCPUOps arm_v7m_tcg_ops = {
 +    .initialize = arm_translate_init,
 +    .synchronize_from_tb = arm_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt,
 +    .tlb_fill = arm_cpu_tlb_fill,
 +    .debug_excp_handler = arm_debug_excp_handler,
 +
 +#if !defined(CONFIG_USER_ONLY)
 +    .do_interrupt = arm_v7m_cpu_do_interrupt,
 +    .do_transaction_failed = arm_cpu_do_transaction_failed,
 +    .do_unaligned_access = arm_cpu_do_unaligned_access,
 +    .adjust_watchpoint_address = arm_adjust_watchpoint_address,
 +    .debug_check_watchpoint = arm_debug_check_watchpoint,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +#endif /* CONFIG_TCG */
 +
  static void arm_v7m_class_init(ObjectClass *oc, void *data)
  {
      ARMCPUClass *acc = ARM_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
      acc->info = data;
  #ifdef CONFIG_TCG
 -    cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
 -#ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_interrupt = arm_v7m_cpu_do_interrupt;
 -#endif
 +    cc->tcg_ops = &arm_v7m_tcg_ops;
  #endif /* CONFIG_TCG */
      cc->gdb_core_xml_file = "arm-m-profile.xml";
 diff --git a/target/avr/cpu.c b/target/avr/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/cpu.c
 +++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_dump_state(CPUState *cs, FILE *f, int flags)
      qemu_fprintf(f, "\n");
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps avr_tcg_ops = {
 +    .initialize = avr_cpu_tcg_init,
 +    .synchronize_from_tb = avr_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = avr_cpu_exec_interrupt,
 +    .tlb_fill = avr_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = avr_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void avr_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = avr_cpu_class_by_name;
      cc->has_work = avr_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = avr_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
      cc->dump_state = avr_cpu_dump_state;
      cc->set_pc = avr_cpu_set_pc;
      cc->memory_rw_debug = avr_cpu_memory_rw_debug;
      cc->get_phys_page_debug = avr_cpu_get_phys_page_debug;
 -    cc->tcg_ops.tlb_fill = avr_cpu_tlb_fill;
      cc->vmsd = &vms_avr_cpu;
      cc->disas_set_info = avr_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = avr_cpu_tcg_init;
 -    cc->tcg_ops.synchronize_from_tb = avr_cpu_synchronize_from_tb;
      cc->gdb_read_register = avr_cpu_gdb_read_register;
      cc->gdb_write_register = avr_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 35;
      cc->gdb_core_xml_file = "avr-cpu.xml";
 +    cc->tcg_ops = &avr_tcg_ops;
  }
  /*
 diff --git a/target/avr/helper.c b/target/avr/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/helper.c
 +++ b/target/avr/helper.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "cpu.h"
 +#include "hw/core/tcg-cpu-ops.h"
  #include "exec/exec-all.h"
  #include "exec/address-spaces.h"
  #include "exec/helper-proto.h"
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
      if (interrupt_request & CPU_INTERRUPT_RESET) {
          if (cpu_interrupts_enabled(env)) {
              cs->exception_index = EXCP_RESET;
 -            cc->tcg_ops.do_interrupt(cs);
 +            cc->tcg_ops->do_interrupt(cs);
              cs->interrupt_request &= ~CPU_INTERRUPT_RESET;
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          if (cpu_interrupts_enabled(env) && env->intsrc != 0) {
              int index = ctz32(env->intsrc);
              cs->exception_index = EXCP_INT(index);
 -            cc->tcg_ops.do_interrupt(cs);
 +            cc->tcg_ops->do_interrupt(cs);
              env->intsrc &= env->intsrc - 1; /* clear the interrupt */
              cs->interrupt_request &= ~CPU_INTERRUPT_HARD;
 diff --git a/target/cris/cpu.c b/target/cris/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/cpu.c
 +++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_initfn(Object *obj)
  #endif
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps crisv10_tcg_ops = {
 +    .initialize = cris_initialize_crisv10_tcg,
 +    .cpu_exec_interrupt = cris_cpu_exec_interrupt,
 +    .tlb_fill = cris_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = crisv10_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
 +static struct TCGCPUOps crisv32_tcg_ops = {
 +    .initialize = cris_initialize_tcg,
 +    .cpu_exec_interrupt = cris_cpu_exec_interrupt,
 +    .tlb_fill = cris_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = cris_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void crisv8_cpu_class_init(ObjectClass *oc, void *data)
  {
      CPUClass *cc = CPU_CLASS(oc);
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 8;
 -    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops = &crisv10_tcg_ops;
  }
  static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 9;
 -    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops = &crisv10_tcg_ops;
  }
  static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 10;
 -    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops = &crisv10_tcg_ops;
  }
  static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 11;
 -    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops = &crisv10_tcg_ops;
  }
  static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 17;
 -    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops = &crisv10_tcg_ops;
  }
  static void crisv32_cpu_class_init(ObjectClass *oc, void *data)
  {
 +    CPUClass *cc = CPU_CLASS(oc);
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 32;
 +    cc->tcg_ops = &crisv32_tcg_ops;
  }
  static void cris_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = cris_cpu_class_by_name;
      cc->has_work = cris_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = cris_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
      cc->dump_state = cris_cpu_dump_state;
      cc->set_pc = cris_cpu_set_pc;
      cc->gdb_read_register = cris_cpu_gdb_read_register;
      cc->gdb_write_register = cris_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = cris_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = cris_cpu_get_phys_page_debug;
      dc->vmsd = &vmstate_cris_cpu;
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_stop_before_watchpoint = true;
      cc->disas_set_info = cris_disas_set_info;
 -    cc->tcg_ops.initialize = cris_initialize_tcg;
  }
  #define DEFINE_CRIS_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/cris/helper.c b/target/cris/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/helper.c
 +++ b/target/cris/helper.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "cpu.h"
 +#include "hw/core/tcg-cpu-ops.h"
  #include "mmu.h"
  #include "qemu/host-utils.h"
  #include "exec/exec-all.h"
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          && (env->pregs[PR_CCS] & I_FLAG)
          && !env->locked_irq) {
          cs->exception_index = EXCP_IRQ;
 -        cc->tcg_ops.do_interrupt(cs);
 +        cc->tcg_ops->do_interrupt(cs);
          ret = true;
      }
      if (interrupt_request & CPU_INTERRUPT_NMI) {
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          }
          if ((env->pregs[PR_CCS] & m_flag_archval)) {
              cs->exception_index = EXCP_NMI;
 -            cc->tcg_ops.do_interrupt(cs);
 +            cc->tcg_ops->do_interrupt(cs);
              ret = true;
          }
      }
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *hppa_cpu_class_by_name(const char *cpu_model)
      return object_class_by_name(TYPE_HPPA_CPU);
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps hppa_tcg_ops = {
 +    .initialize = hppa_translate_init,
 +    .synchronize_from_tb = hppa_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = hppa_cpu_exec_interrupt,
 +    .tlb_fill = hppa_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = hppa_cpu_do_interrupt,
 +    .do_unaligned_access = hppa_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void hppa_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = hppa_cpu_class_by_name;
      cc->has_work = hppa_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = hppa_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
      cc->dump_state = hppa_cpu_dump_state;
      cc->set_pc = hppa_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
      cc->gdb_read_register = hppa_cpu_gdb_read_register;
      cc->gdb_write_register = hppa_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
 -    cc->tcg_ops.do_unaligned_access = hppa_cpu_do_unaligned_access;
      dc->vmsd = &vmstate_hppa_cpu;
  #endif
      cc->disas_set_info = hppa_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = hppa_translate_init;
 -
      cc->gdb_num_core_regs = 128;
 +    cc->tcg_ops = &hppa_tcg_ops;
  }
  static const TypeInfo hppa_cpu_type_info = {
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
      cpu->env.eip = tb->pc - tb->cs_base;
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps x86_tcg_ops = {
 +    .initialize = tcg_x86_init,
 +    .synchronize_from_tb = x86_cpu_synchronize_from_tb,
 +    .cpu_exec_enter = x86_cpu_exec_enter,
 +    .cpu_exec_exit = x86_cpu_exec_exit,
 +    .cpu_exec_interrupt = x86_cpu_exec_interrupt,
 +    .do_interrupt = x86_cpu_do_interrupt,
 +    .tlb_fill = x86_cpu_tlb_fill,
 +#ifndef CONFIG_USER_ONLY
 +    .debug_excp_handler = breakpoint_handler,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  void tcg_cpu_common_class_init(CPUClass *cc)
  {
 -    cc->tcg_ops.do_interrupt = x86_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
 -    cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
 -    cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
 -    cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
 -    cc->tcg_ops.initialize = tcg_x86_init;
 -    cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
 -#ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.debug_excp_handler = breakpoint_handler;
 -#endif
 +    cc->tcg_ops = &x86_tcg_ops;
  }
 diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/lm32/cpu.c
 +++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *lm32_cpu_class_by_name(const char *cpu_model)
      return oc;
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps lm32_tcg_ops = {
 +    .initialize = lm32_translate_init,
 +    .cpu_exec_interrupt = lm32_cpu_exec_interrupt,
 +    .tlb_fill = lm32_cpu_tlb_fill,
 +    .debug_excp_handler = lm32_debug_excp_handler,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = lm32_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void lm32_cpu_class_init(ObjectClass *oc, void *data)
  {
      LM32CPUClass *lcc = LM32_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = lm32_cpu_class_by_name;
      cc->has_work = lm32_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = lm32_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
      cc->dump_state = lm32_cpu_dump_state;
      cc->set_pc = lm32_cpu_set_pc;
      cc->gdb_read_register = lm32_cpu_gdb_read_register;
      cc->gdb_write_register = lm32_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = lm32_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = lm32_cpu_get_phys_page_debug;
      cc->vmsd = &vmstate_lm32_cpu;
  #endif
      cc->gdb_num_core_regs = 32 + 7;
      cc->gdb_stop_before_watchpoint = true;
 -    cc->tcg_ops.debug_excp_handler = lm32_debug_excp_handler;
      cc->disas_set_info = lm32_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = lm32_translate_init;
 +    cc->tcg_ops = &lm32_tcg_ops;
  }
  #define DEFINE_LM32_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.c
 +++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_m68k_cpu = {
  };
  #endif
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps m68k_tcg_ops = {
 +    .initialize = m68k_tcg_init,
 +    .cpu_exec_interrupt = m68k_cpu_exec_interrupt,
 +    .tlb_fill = m68k_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = m68k_cpu_do_interrupt,
 +    .do_transaction_failed = m68k_cpu_transaction_failed,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void m68k_cpu_class_init(ObjectClass *c, void *data)
  {
      M68kCPUClass *mcc = M68K_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = m68k_cpu_class_by_name;
      cc->has_work = m68k_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = m68k_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
      cc->dump_state = m68k_cpu_dump_state;
      cc->set_pc = m68k_cpu_set_pc;
      cc->gdb_read_register = m68k_cpu_gdb_read_register;
      cc->gdb_write_register = m68k_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
  #if defined(CONFIG_SOFTMMU)
 -    cc->tcg_ops.do_transaction_failed = m68k_cpu_transaction_failed;
      cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
      dc->vmsd = &vmstate_m68k_cpu;
  #endif
      cc->disas_set_info = m68k_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = m68k_tcg_init;
      cc->gdb_num_core_regs = 18;
 +    cc->tcg_ops = &m68k_tcg_ops;
  }
  static void m68k_cpu_class_init_cf_core(ObjectClass *c, void *data)
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *mb_cpu_class_by_name(const char *cpu_model)
      return object_class_by_name(TYPE_MICROBLAZE_CPU);
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps mb_tcg_ops = {
 +    .initialize = mb_tcg_init,
 +    .synchronize_from_tb = mb_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = mb_cpu_exec_interrupt,
 +    .tlb_fill = mb_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = mb_cpu_do_interrupt,
 +    .do_transaction_failed = mb_cpu_transaction_failed,
 +    .do_unaligned_access = mb_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void mb_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = mb_cpu_class_by_name;
      cc->has_work = mb_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
 +
      cc->dump_state = mb_cpu_dump_state;
      cc->set_pc = mb_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
      cc->gdb_read_register = mb_cpu_gdb_read_register;
      cc->gdb_write_register = mb_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
 +
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = mb_cpu_do_unaligned_access;
      cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
      dc->vmsd = &vmstate_mb_cpu;
  #endif
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_num_core_regs = 32 + 27;
      cc->disas_set_info = mb_disas_set_info;
 -    cc->tcg_ops.initialize = mb_tcg_init;
 +    cc->tcg_ops = &mb_tcg_ops;
  }
  static const TypeInfo mb_cpu_type_info = {
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property mips_cpu_properties[] = {
      DEFINE_PROP_END_OF_LIST()
  };
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +/*
 + * NB: cannot be const, as some elements are changed for specific
 + * mips hardware (see hw/mips/jazz.c).
 + */
 +static struct TCGCPUOps mips_tcg_ops = {
 +    .initialize = mips_tcg_init,
 +    .synchronize_from_tb = mips_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = mips_cpu_exec_interrupt,
 +    .tlb_fill = mips_cpu_tlb_fill,
 +
 +#if !defined(CONFIG_USER_ONLY)
 +    .do_interrupt = mips_cpu_do_interrupt,
 +    .do_transaction_failed = mips_cpu_do_transaction_failed,
 +    .do_unaligned_access = mips_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +#endif /* CONFIG_TCG */
 +
  static void mips_cpu_class_init(ObjectClass *c, void *data)
  {
      MIPSCPUClass *mcc = MIPS_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->vmsd = &vmstate_mips_cpu;
  #endif
      cc->disas_set_info = mips_cpu_disas_set_info;
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.initialize = mips_tcg_init;
 -    cc->tcg_ops.do_interrupt = mips_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
 -    cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
 -    cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
 -#ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = mips_cpu_do_unaligned_access;
 -
 -#endif /* CONFIG_USER_ONLY */
 -#endif /* CONFIG_TCG */
 -
      cc->gdb_num_core_regs = 73;
      cc->gdb_stop_before_watchpoint = true;
 +#ifdef CONFIG_TCG
 +    cc->tcg_ops = &mips_tcg_ops;
 +#endif /* CONFIG_TCG */
  }
  static const TypeInfo mips_cpu_type_info = {
 diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/moxie/cpu.c
 +++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *moxie_cpu_class_by_name(const char *cpu_model)
      return oc;
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps moxie_tcg_ops = {
 +    .initialize = moxie_translate_init,
 +    .tlb_fill = moxie_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = moxie_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void moxie_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = moxie_cpu_class_by_name;
      cc->has_work = moxie_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = moxie_cpu_do_interrupt;
      cc->dump_state = moxie_cpu_dump_state;
      cc->set_pc = moxie_cpu_set_pc;
 -    cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = moxie_cpu_get_phys_page_debug;
      cc->vmsd = &vmstate_moxie_cpu;
  #endif
      cc->disas_set_info = moxie_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = moxie_translate_init;
 +    cc->tcg_ops = &moxie_tcg_ops;
  }
  static void moxielite_initfn(Object *obj)
 diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/cpu.c
 +++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property nios2_properties[] = {
      DEFINE_PROP_END_OF_LIST(),
  };
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps nios2_tcg_ops = {
 +    .initialize = nios2_tcg_init,
 +    .cpu_exec_interrupt = nios2_cpu_exec_interrupt,
 +    .tlb_fill = nios2_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = nios2_cpu_do_interrupt,
 +    .do_unaligned_access = nios2_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
  static void nios2_cpu_class_init(ObjectClass *oc, void *data)
  {
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = nios2_cpu_class_by_name;
      cc->has_work = nios2_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = nios2_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
      cc->dump_state = nios2_cpu_dump_state;
      cc->set_pc = nios2_cpu_set_pc;
      cc->disas_set_info = nios2_cpu_disas_set_info;
 -    cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_unaligned_access = nios2_cpu_do_unaligned_access;
      cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
  #endif
      cc->gdb_read_register = nios2_cpu_gdb_read_register;
      cc->gdb_write_register = nios2_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 49;
 -    cc->tcg_ops.initialize = nios2_tcg_init;
 +    cc->tcg_ops = &nios2_tcg_ops;
  }
  static const TypeInfo nios2_cpu_type_info = {
 diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/cpu.c
 +++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_any_initfn(Object *obj)
                        | (IMMUCFGR_NTS & (ctz32(TLB_SIZE) << 2));
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps openrisc_tcg_ops = {
 +    .initialize = openrisc_translate_init,
 +    .cpu_exec_interrupt = openrisc_cpu_exec_interrupt,
 +    .tlb_fill = openrisc_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = openrisc_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
  {
      OpenRISCCPUClass *occ = OPENRISC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = openrisc_cpu_class_by_name;
      cc->has_work = openrisc_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = openrisc_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
      cc->dump_state = openrisc_cpu_dump_state;
      cc->set_pc = openrisc_cpu_set_pc;
      cc->gdb_read_register = openrisc_cpu_gdb_read_register;
      cc->gdb_write_register = openrisc_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = openrisc_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = openrisc_cpu_get_phys_page_debug;
      dc->vmsd = &vmstate_openrisc_cpu;
  #endif
      cc->gdb_num_core_regs = 32 + 3;
 -    cc->tcg_ops.initialize = openrisc_translate_init;
      cc->disas_set_info = openrisc_disas_set_info;
 +    cc->tcg_ops = &openrisc_tcg_ops;
  }
  /* Sort alphabetically by type name, except for "any". */
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static const char *riscv_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)
      return NULL;
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps riscv_tcg_ops = {
 +    .initialize = riscv_translate_init,
 +    .synchronize_from_tb = riscv_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = riscv_cpu_exec_interrupt,
 +    .tlb_fill = riscv_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = riscv_cpu_do_interrupt,
 +    .do_transaction_failed = riscv_cpu_do_transaction_failed,
 +    .do_unaligned_access = riscv_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void riscv_cpu_class_init(ObjectClass *c, void *data)
  {
      RISCVCPUClass *mcc = RISCV_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = riscv_cpu_class_by_name;
      cc->has_work = riscv_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = riscv_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
      cc->dump_state = riscv_cpu_dump_state;
      cc->set_pc = riscv_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
      cc->gdb_read_register = riscv_cpu_gdb_read_register;
      cc->gdb_write_register = riscv_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 33;
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->gdb_stop_before_watchpoint = true;
      cc->disas_set_info = riscv_cpu_disas_set_info;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = riscv_cpu_do_unaligned_access;
      cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
      /* For now, mark unmigratable: */
      cc->vmsd = &vmstate_riscv_cpu;
  #endif
      cc->gdb_arch_name = riscv_gdb_arch_name;
      cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
 -    cc->tcg_ops.initialize = riscv_translate_init;
 -    cc->tcg_ops.tlb_fill = riscv_cpu_tlb_fill;
 +    cc->tcg_ops = &riscv_tcg_ops;
      device_class_set_props(dc, riscv_cpu_properties);
  }
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_init(Object *obj)
      qdev_init_gpio_in(DEVICE(cpu), rx_cpu_set_irq, 2);
  }
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps rx_tcg_ops = {
 +    .initialize = rx_translate_init,
 +    .synchronize_from_tb = rx_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = rx_cpu_exec_interrupt,
 +    .tlb_fill = rx_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = rx_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void rx_cpu_class_init(ObjectClass *klass, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(klass);
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
      cc->class_by_name = rx_cpu_class_by_name;
      cc->has_work = rx_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = rx_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
      cc->dump_state = rx_cpu_dump_state;
      cc->set_pc = rx_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
 +
      cc->gdb_read_register = rx_cpu_gdb_read_register;
      cc->gdb_write_register = rx_cpu_gdb_write_register;
      cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
      cc->disas_set_info = rx_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = rx_translate_init;
 -    cc->tcg_ops.tlb_fill = rx_cpu_tlb_fill;
      cc->gdb_num_core_regs = 26;
      cc->gdb_core_xml_file = "rx-core.xml";
 +    cc->tcg_ops = &rx_tcg_ops;
  }
  static const TypeInfo rx_cpu_info = {
 diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_reset_full(DeviceState *dev)
      return s390_cpu_reset(s, S390_CPU_RESET_CLEAR);
  }
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps s390_tcg_ops = {
 +    .initialize = s390x_translate_init,
 +    .tlb_fill = s390_cpu_tlb_fill,
 +
 +#if !defined(CONFIG_USER_ONLY)
 +    .cpu_exec_interrupt = s390_cpu_exec_interrupt,
 +    .do_interrupt = s390_cpu_do_interrupt,
 +    .debug_excp_handler = s390x_cpu_debug_excp_handler,
 +    .do_unaligned_access = s390x_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +#endif /* CONFIG_TCG */
 +
  static void s390_cpu_class_init(ObjectClass *oc, void *data)
  {
      S390CPUClass *scc = S390_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
      scc->reset = s390_cpu_reset;
      cc->class_by_name = s390_cpu_class_by_name,
      cc->has_work = s390_cpu_has_work;
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.do_interrupt = s390_cpu_do_interrupt;
 -#endif
      cc->dump_state = s390_cpu_dump_state;
      cc->set_pc = s390_cpu_set_pc;
      cc->gdb_read_register = s390_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
      cc->vmsd = &vmstate_s390_cpu;
      cc->get_crash_info = s390_cpu_get_crash_info;
      cc->write_elf64_note = s390_cpu_write_elf64_note;
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
 -    cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
 -    cc->tcg_ops.do_unaligned_access = s390x_cpu_do_unaligned_access;
 -#endif
  #endif
      cc->disas_set_info = s390_cpu_disas_set_info;
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.initialize = s390x_translate_init;
 -    cc->tcg_ops.tlb_fill = s390_cpu_tlb_fill;
 -#endif
 -
      cc->gdb_num_core_regs = S390_NUM_CORE_REGS;
      cc->gdb_core_xml_file = "s390x-core64.xml";
      cc->gdb_arch_name = s390_gdb_arch_name;
      s390_cpu_model_class_register_props(oc);
 +
 +#ifdef CONFIG_TCG
 +    cc->tcg_ops = &s390_tcg_ops;
 +#endif /* CONFIG_TCG */
  }
  static const TypeInfo s390_cpu_type_info = {
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_sh_cpu = {
      .unmigratable = 1,
  };
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps superh_tcg_ops = {
 +    .initialize = sh4_translate_init,
 +    .synchronize_from_tb = superh_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = superh_cpu_exec_interrupt,
 +    .tlb_fill = superh_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = superh_cpu_do_interrupt,
 +    .do_unaligned_access = superh_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void superh_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = superh_cpu_class_by_name;
      cc->has_work = superh_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = superh_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
      cc->dump_state = superh_cpu_dump_state;
      cc->set_pc = superh_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
      cc->gdb_read_register = superh_cpu_gdb_read_register;
      cc->gdb_write_register = superh_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_unaligned_access = superh_cpu_do_unaligned_access;
      cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
  #endif
      cc->disas_set_info = superh_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = sh4_translate_init;
      cc->gdb_num_core_regs = 59;
      dc->vmsd = &vmstate_sh_cpu;
 +    cc->tcg_ops = &superh_tcg_ops;
  }
  #define DEFINE_SUPERH_CPU_TYPE(type_name, cinit, initfn) \
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property sparc_cpu_properties[] = {
      DEFINE_PROP_END_OF_LIST()
  };
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps sparc_tcg_ops = {
 +    .initialize = sparc_tcg_init,
 +    .synchronize_from_tb = sparc_cpu_synchronize_from_tb,
 +    .cpu_exec_interrupt = sparc_cpu_exec_interrupt,
 +    .tlb_fill = sparc_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = sparc_cpu_do_interrupt,
 +    .do_transaction_failed = sparc_cpu_do_transaction_failed,
 +    .do_unaligned_access = sparc_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +#endif /* CONFIG_TCG */
 +
  static void sparc_cpu_class_init(ObjectClass *oc, void *data)
  {
      SPARCCPUClass *scc = SPARC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = sparc_cpu_class_by_name;
      cc->parse_features = sparc_cpu_parse_features;
      cc->has_work = sparc_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = sparc_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
      cc->dump_state = sparc_cpu_dump_state;
  #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
      cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
  #endif
      cc->set_pc = sparc_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
      cc->gdb_read_register = sparc_cpu_gdb_read_register;
      cc->gdb_write_register = sparc_cpu_gdb_write_register;
 -    cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
 -    cc->tcg_ops.do_unaligned_access = sparc_cpu_do_unaligned_access;
      cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
      cc->vmsd = &vmstate_sparc_cpu;
  #endif
      cc->disas_set_info = cpu_sparc_disas_set_info;
 -    cc->tcg_ops.initialize = sparc_tcg_init;
  #if defined(TARGET_SPARC64) && !defined(TARGET_ABI32)
      cc->gdb_num_core_regs = 86;
  #else
      cc->gdb_num_core_regs = 72;
  #endif
 +    cc->tcg_ops = &sparc_tcg_ops;
  }
  static const TypeInfo sparc_cpu_type_info = {
 diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tilegx/cpu.c
 +++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static bool tilegx_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
      return false;
  }
-+#include "hw/core/tcg-cpu-ops.h"
++static bool fold_remainder(OptContext *ctx, TCGOp *op)
-+
++{
-+static struct TCGCPUOps tilegx_tcg_ops = {
++    return fold_const2(ctx, op);
-+    .initialize = tilegx_tcg_init,
++}
-+    .cpu_exec_interrupt = tilegx_cpu_exec_interrupt,
++
-+    .tlb_fill = tilegx_cpu_tlb_fill,
++static bool fold_shift(OptContext *ctx, TCGOp *op)
-+
++{
-+#ifndef CONFIG_USER_ONLY
++    return fold_const2(ctx, op);
-+    .do_interrupt = tilegx_cpu_do_interrupt,
++}
-+#endif /* !CONFIG_USER_ONLY */
++
-+};
++static bool fold_sub(OptContext *ctx, TCGOp *op)
-+
++{
- static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
++    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_xor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
-     DeviceClass *dc = DEVICE_CLASS(oc);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
+             }
+             break;
-     cc->class_by_name = tilegx_cpu_class_by_name;
-     cc->has_work = tilegx_cpu_has_work;
+-        CASE_OP_32_64(not):
--    cc->tcg_ops.do_interrupt = tilegx_cpu_do_interrupt;
+-        CASE_OP_32_64(neg):
--    cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
+-        CASE_OP_32_64(ext8s):
-     cc->dump_state = tilegx_cpu_dump_state;
+-        CASE_OP_32_64(ext8u):
-     cc->set_pc = tilegx_cpu_set_pc;
+-        CASE_OP_32_64(ext16s):
--    cc->tcg_ops.tlb_fill = tilegx_cpu_tlb_fill;
+-        CASE_OP_32_64(ext16u):
-     cc->gdb_num_core_regs = 0;
+-        CASE_OP_32_64(ctpop):
--    cc->tcg_ops.initialize = tilegx_tcg_init;
+-        case INDEX_op_ext32s_i64:
-+    cc->tcg_ops = &tilegx_tcg_ops;
+-        case INDEX_op_ext32u_i64:
- }
+-        case INDEX_op_ext_i32_i64:
+-        case INDEX_op_extu_i32_i64:
- static const TypeInfo tilegx_cpu_type_info = {
+-        case INDEX_op_extrl_i64_i32:
-diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
+-        case INDEX_op_extrh_i64_i32:
-index XXXXXXX..XXXXXXX 100644
+-            if (arg_is_const(op->args[1])) {
---- a/target/tricore/cpu.c
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-+++ b/target/tricore/cpu.c
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-@@ -XXX,XX +XXX,XX @@ static void tc27x_initfn(Object *obj)
+-                continue;
-     set_feature(&cpu->env, TRICORE_FEATURE_161);
+-            }
- }
+-            break;
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps tricore_tcg_ops = {
 +    .initialize = tricore_tcg_init,
 +    .synchronize_from_tb = tricore_cpu_synchronize_from_tb,
 +    .tlb_fill = tricore_cpu_tlb_fill,
 +};
 +
  static void tricore_cpu_class_init(ObjectClass *c, void *data)
  {
      TriCoreCPUClass *mcc = TRICORE_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
      cc->dump_state = tricore_cpu_dump_state;
      cc->set_pc = tricore_cpu_set_pc;
 -    cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
      cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
 -    cc->tcg_ops.initialize = tricore_tcg_init;
 -    cc->tcg_ops.tlb_fill = tricore_cpu_tlb_fill;
 +    cc->tcg_ops = &tricore_tcg_ops;
  }
  #define DEFINE_TRICORE_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/unicore32/cpu.c
 +++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_uc32_cpu = {
      .unmigratable = 1,
  };
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps uc32_tcg_ops = {
 +    .initialize = uc32_translate_init,
 +    .cpu_exec_interrupt = uc32_cpu_exec_interrupt,
 +    .tlb_fill = uc32_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = uc32_cpu_do_interrupt,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void uc32_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = uc32_cpu_class_by_name;
      cc->has_work = uc32_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = uc32_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
      cc->dump_state = uc32_cpu_dump_state;
      cc->set_pc = uc32_cpu_set_pc;
 -    cc->tcg_ops.tlb_fill = uc32_cpu_tlb_fill;
      cc->get_phys_page_debug = uc32_cpu_get_phys_page_debug;
 -    cc->tcg_ops.initialize = uc32_translate_init;
      dc->vmsd = &vmstate_uc32_cpu;
 +    cc->tcg_ops = &uc32_tcg_ops;
  }
  #define DEFINE_UNICORE32_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.c
 +++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_xtensa_cpu = {
      .unmigratable = 1,
  };
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps xtensa_tcg_ops = {
 +    .initialize = xtensa_translate_init,
 +    .cpu_exec_interrupt = xtensa_cpu_exec_interrupt,
 +    .tlb_fill = xtensa_cpu_tlb_fill,
 +    .debug_excp_handler = xtensa_breakpoint_handler,
 +
 +#ifndef CONFIG_USER_ONLY
 +    .do_interrupt = xtensa_cpu_do_interrupt,
 +    .do_transaction_failed = xtensa_cpu_do_transaction_failed,
 +    .do_unaligned_access = xtensa_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +
  static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
  {
      DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = xtensa_cpu_class_by_name;
      cc->has_work = xtensa_cpu_has_work;
 -    cc->tcg_ops.do_interrupt = xtensa_cpu_do_interrupt;
 -    cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
      cc->dump_state = xtensa_cpu_dump_state;
      cc->set_pc = xtensa_cpu_set_pc;
      cc->gdb_read_register = xtensa_cpu_gdb_read_register;
      cc->gdb_write_register = xtensa_cpu_gdb_write_register;
      cc->gdb_stop_before_watchpoint = true;
 -    cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.do_unaligned_access = xtensa_cpu_do_unaligned_access;
      cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
 -    cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
  #endif
 -    cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
      cc->disas_set_info = xtensa_cpu_disas_set_info;
 -    cc->tcg_ops.initialize = xtensa_translate_init;
      dc->vmsd = &vmstate_xtensa_cpu;
 +    cc->tcg_ops = &xtensa_tcg_ops;
  }
  static const TypeInfo xtensa_cpu_type_info = {
 diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate_init.c.inc
 +++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static Property ppc_cpu_properties[] = {
      DEFINE_PROP_END_OF_LIST(),
  };
 +#ifdef CONFIG_TCG
 +#include "hw/core/tcg-cpu-ops.h"
 +
 +static struct TCGCPUOps ppc_tcg_ops = {
 +  .initialize = ppc_translate_init,
 +  .cpu_exec_interrupt = ppc_cpu_exec_interrupt,
 +  .tlb_fill = ppc_cpu_tlb_fill,
 +
 +#ifndef CONFIG_USER_ONLY
 +  .do_interrupt = ppc_cpu_do_interrupt,
 +  .cpu_exec_enter = ppc_cpu_exec_enter,
 +  .cpu_exec_exit = ppc_cpu_exec_exit,
 +  .do_unaligned_access = ppc_cpu_do_unaligned_access,
 +#endif /* !CONFIG_USER_ONLY */
 +};
 +#endif /* CONFIG_TCG */
 +
  static void ppc_cpu_class_init(ObjectClass *oc, void *data)
  {
      PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
  #ifndef CONFIG_USER_ONLY
      cc->virtio_is_big_endian = ppc_cpu_is_big_endian;
  #endif
 -#ifdef CONFIG_TCG
 -    cc->tcg_ops.initialize = ppc_translate_init;
 -    cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
 -    cc->tcg_ops.do_interrupt = ppc_cpu_do_interrupt;
 -    cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
 -#ifndef CONFIG_USER_ONLY
 -    cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
 -    cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
 -    cc->tcg_ops.do_unaligned_access = ppc_cpu_do_unaligned_access;
 -#endif /* !CONFIG_USER_ONLY */
 -#endif /* CONFIG_TCG */
 -
-     cc->disas_set_info = ppc_disas_set_info;
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
-     dc->fw_name = "PowerPC,UNKNOWN";
+         case INDEX_op_bswap64_i64:
-+
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+#ifdef CONFIG_TCG
+             }
-+    cc->tcg_ops = &ppc_tcg_ops;
+             break;
-+#endif /* CONFIG_TCG */
- }
+-        CASE_OP_32_64(add):
+-        CASE_OP_32_64(sub):
- static const TypeInfo ppc_cpu_type_info = {
+-        CASE_OP_32_64(mul):
-diff --git a/MAINTAINERS b/MAINTAINERS
+-        CASE_OP_32_64(or):
-index XXXXXXX..XXXXXXX 100644
+-        CASE_OP_32_64(and):
---- a/MAINTAINERS
+-        CASE_OP_32_64(xor):
-+++ b/MAINTAINERS
+-        CASE_OP_32_64(shl):
-@@ -XXX,XX +XXX,XX @@ F: include/exec/helper*.h
+-        CASE_OP_32_64(shr):
- F: include/exec/tb-hash.h
+-        CASE_OP_32_64(sar):
- F: include/sysemu/cpus.h
+-        CASE_OP_32_64(rotl):
- F: include/sysemu/tcg.h
+-        CASE_OP_32_64(rotr):
-+F: include/hw/core/tcg-cpu-ops.h
+-        CASE_OP_32_64(andc):
+-        CASE_OP_32_64(orc):
- FPU emulation
+-        CASE_OP_32_64(eqv):
- M: Aurelien Jarno <aurelien@aurel32.net>
+-        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -        CASE_OP_32_64(div):
 -        CASE_OP_32_64(divu):
 -        CASE_OP_32_64(rem):
 -        CASE_OP_32_64(remu):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
 -                                          arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 +        default:
 +            break;
 +
 +        /* ---------------------------------------------------------- */
 +        /* Sorted alphabetically by opcode as much as possible. */
 +
 +        CASE_OP_32_64_VEC(add):
 +            done = fold_add(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(and):
 +            done = fold_and(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(andc):
 +            done = fold_andc(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ctpop):
 +            done = fold_ctpop(&ctx, op);
 +            break;
 +        CASE_OP_32_64(div):
 +        CASE_OP_32_64(divu):
 +            done = fold_divide(&ctx, op);
 +            break;
 +        CASE_OP_32_64(eqv):
 +            done = fold_eqv(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8s):
 +        CASE_OP_32_64(ext16s):
 +        case INDEX_op_ext32s_i64:
 +        case INDEX_op_ext_i32_i64:
 +            done = fold_exts(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8u):
 +        CASE_OP_32_64(ext16u):
 +        case INDEX_op_ext32u_i64:
 +        case INDEX_op_extu_i32_i64:
 +        case INDEX_op_extrl_i64_i32:
 +        case INDEX_op_extrh_i64_i32:
 +            done = fold_extu(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64(mul):
 +            done = fold_mul(&ctx, op);
 +            break;
 +        CASE_OP_32_64(mulsh):
 +        CASE_OP_32_64(muluh):
 +            done = fold_mul_highpart(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nand):
 +            done = fold_nand(&ctx, op);
 +            break;
 +        CASE_OP_32_64(neg):
 +            done = fold_neg(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nor):
 +            done = fold_nor(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(not):
 +            done = fold_not(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(or):
 +            done = fold_or(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(orc):
 +            done = fold_orc(&ctx, op);
 +            break;
          case INDEX_op_qemu_ld_i32:
          case INDEX_op_qemu_ld_i64:
              done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_qemu_st_i64:
              done = fold_qemu_st(&ctx, op);
              break;
 -
 -        default:
 +        CASE_OP_32_64(rem):
 +        CASE_OP_32_64(remu):
 +            done = fold_remainder(&ctx, op);
 +            break;
 +        CASE_OP_32_64(rotl):
 +        CASE_OP_32_64(rotr):
 +        CASE_OP_32_64(sar):
 +        CASE_OP_32_64(shl):
 +        CASE_OP_32_64(shr):
 +            done = fold_shift(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(sub):
 +            done = fold_sub(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(xor):
 +            done = fold_xor(&ctx, op);
              break;
          }
 --
 .25.1

-New patch
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
+ 1 file changed, 72 insertions(+), 73 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_setcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
++            goto do_setcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            goto do_setcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
++                                     op->args[4], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            op->args[2] = op->args[3];
++            op->args[3] = cond;
++            op->opc = INDEX_op_setcond_i32;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_setcond_high:
++        op->args[1] = op->args[2];
++        op->args[2] = op->args[4];
++        op->args[3] = cond;
++        op->opc = INDEX_op_setcond_i32;
++        break;
++    }
++    return false;
++
++ do_setcond_const:
++    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_setcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                          op->args[5]);
+-            if (i >= 0) {
+-            do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0
+-                 && arg_is_const(op->args[4])
+-                 && arg_info(op->args[4])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_setcond_high:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_const;
+-                } else if (i > 0) {
+-                    goto do_setcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_setcond_low:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[2] = op->args[3];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_low;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        case INDEX_op_setcond2_i32:
++            done = fold_setcond2(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+ 1 file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+ 1 file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+ 1 file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-[PULL 24/46] tcg/tci: Implement 64-bit division
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
-Trivially implemented like other arithmetic.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Tested via check-tcg and the ppc64 target.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.h     |  4 ++--
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
- tcg/tci.c                | 28 ++++++++++++++++++++++------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
  tcg/tci/tcg-target.c.inc | 10 ++++------
 files changed, 28 insertions(+), 14 deletions(-)
-diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.h
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
- #define TCG_TARGET_HAS_extract_i64      0
+     return fold_const2(ctx, op);
- #define TCG_TARGET_HAS_sextract_i64     0
+ }
- #define TCG_TARGET_HAS_extract2_i64     0
--#define TCG_TARGET_HAS_div_i64          0
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
--#define TCG_TARGET_HAS_rem_i64          0
++{
-+#define TCG_TARGET_HAS_div_i64          1
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-+#define TCG_TARGET_HAS_rem_i64          1
++        uint32_t a = arg_info(op->args[2])->val;
- #define TCG_TARGET_HAS_ext8s_i64        1
++        uint32_t b = arg_info(op->args[3])->val;
- #define TCG_TARGET_HAS_ext16s_i64       1
++        uint64_t r = (uint64_t)a * b;
- #define TCG_TARGET_HAS_ext32s_i64       1
++        TCGArg rl, rh;
-diff --git a/tcg/tci.c b/tcg/tci.c
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
-index XXXXXXX..XXXXXXX 100644
++
---- a/tcg/tci.c
++        rl = op->args[0];
-+++ b/tcg/tci.c
++        rh = op->args[1];
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-             t2 = tci_read_ri64(regs, &tb_ptr);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
-             tci_write_reg(regs, t0, t1 * t2);
++        return true;
 +    }
 +    return false;
 +}
 +
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--#if TCG_TARGET_HAS_div_i64
-         case INDEX_op_div_i64:
+-        case INDEX_op_mulu2_i32:
--        case INDEX_op_divu_i64:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
--        case INDEX_op_rem_i64:
+-                uint32_t a = arg_info(op->args[2])->val;
--        case INDEX_op_remu_i64:
+-                uint32_t b = arg_info(op->args[3])->val;
--            TODO();
+-                uint64_t r = (uint64_t)a * b;
-+            t0 = *tb_ptr++;
+-                TCGArg rl, rh;
-+            t1 = tci_read_ri64(regs, &tb_ptr);
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-+            t2 = tci_read_ri64(regs, &tb_ptr);
+-
-+            tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2);
+-                rl = op->args[0];
 -                rh = op->args[1];
 -                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(muluh):
              done = fold_mul_highpart(&ctx, op);
              break;
 +        case INDEX_op_mulu2_i32:
 +            done = fold_mulu2_i32(&ctx, op);
 +            break;
-+        case INDEX_op_divu_i64:
+         CASE_OP_32_64(nand):
-+            t0 = *tb_ptr++;
+             done = fold_nand(&ctx, op);
 +            t1 = tci_read_ri64(regs, &tb_ptr);
 +            t2 = tci_read_ri64(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2);
 +            break;
 +        case INDEX_op_rem_i64:
 +            t0 = *tb_ptr++;
 +            t1 = tci_read_ri64(regs, &tb_ptr);
 +            t2 = tci_read_ri64(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2);
 +            break;
 +        case INDEX_op_remu_i64:
 +            t0 = *tb_ptr++;
 +            t1 = tci_read_ri64(regs, &tb_ptr);
 +            t2 = tci_read_ri64(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
              break;
--#endif
-         case INDEX_op_and_i64:
-             t0 = *tb_ptr++;
-             t1 = tci_read_ri64(regs, &tb_ptr);
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
-+++ b/tcg/tci/tcg-target.c.inc
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
-     case INDEX_op_sar_i64:
-     case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
-     case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
-+    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
-+    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
-+    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
-+    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
-         tcg_out_r(s, args[0]);
-         tcg_out_ri64(s, const_args[1], args[1]);
-         tcg_out_ri64(s, const_args[2], args[2]);
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
-         tcg_debug_assert(args[4] <= UINT8_MAX);
-         tcg_out8(s, args[4]);
-         break;
--    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
--    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
--    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
--    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
--        TODO();
--        break;
-     case INDEX_op_brcond_i64:
-         tcg_out_r(s, args[0]);
-         tcg_out_ri64(s, const_args[1], args[1]);
 --
 .25.1

-[PULL 23/46] tcg/tci: Remove dead code for TCG_TARGET_HAS_div2_*
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
-We do not simultaneously support div and div2 -- it's one
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
-or the other.  TCI is already using div, so remove div2.
+which will not be simple wrappers forever.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c                | 12 ------------
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
- tcg/tci/tcg-target.c.inc |  8 --------
+file changed, 44 insertions(+), 26 deletions(-)
 files changed, 20 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_ri32(regs, &tb_ptr);
+     return fold_const2(ctx, op);
-             tci_write_reg(regs, t0, t1 * t2);
+ }
 +static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
 +{
 +    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
 +        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
 +        uint32_t al = arg_info(op->args[2])->val;
 +        uint32_t ah = arg_info(op->args[3])->val;
 +        uint32_t bl = arg_info(op->args[4])->val;
 +        uint32_t bh = arg_info(op->args[5])->val;
 +        uint64_t a = ((uint64_t)ah << 32) | al;
 +        uint64_t b = ((uint64_t)bh << 32) | bl;
 +        TCGArg rl, rh;
 +        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +
 +        if (add) {
 +            a += b;
 +        } else {
 +            a -= b;
 +        }
 +
 +        rl = op->args[0];
 +        rh = op->args[1];
 +        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 +        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +        return true;
 +    }
 +    return false;
 +}
 +
 +static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_addsub2_i32(ctx, op, true);
 +}
 +
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
      return fold_const2(ctx, op);
  }
 +static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_addsub2_i32(ctx, op, false);
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--#if TCG_TARGET_HAS_div_i32
-         case INDEX_op_div_i32:
+-        case INDEX_op_add2_i32:
-             t0 = *tb_ptr++;
+-        case INDEX_op_sub2_i32:
-             t1 = tci_read_ri32(regs, &tb_ptr);
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-             t2 = tci_read_ri32(regs, &tb_ptr);
+-                uint32_t al = arg_info(op->args[2])->val;
-             tci_write_reg(regs, t0, t1 % t2);
+-                uint32_t ah = arg_info(op->args[3])->val;
 -                uint32_t bl = arg_info(op->args[4])->val;
 -                uint32_t bh = arg_info(op->args[5])->val;
 -                uint64_t a = ((uint64_t)ah << 32) | al;
 -                uint64_t b = ((uint64_t)bh << 32) | bl;
 -                TCGArg rl, rh;
 -                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 -
 -                if (opc == INDEX_op_add2_i32) {
 -                    a += b;
 -                } else {
 -                    a -= b;
 -                }
 -
 -                rl = op->args[0];
 -                rh = op->args[1];
 -                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
 -                continue;
 -            }
 -            break;
          default:
              break;
--#elif TCG_TARGET_HAS_div2_i32
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        case INDEX_op_div2_i32:
+         CASE_OP_32_64_VEC(add):
--        case INDEX_op_divu2_i32:
+             done = fold_add(&ctx, op);
 -            TODO();
 -            break;
 -#endif
          case INDEX_op_and_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_ri32(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_remu_i64:
              TODO();
              break;
--#elif TCG_TARGET_HAS_div2_i64
++        case INDEX_op_add2_i32:
--        case INDEX_op_div2_i64:
++            done = fold_add2_i32(&ctx, op);
--        case INDEX_op_divu2_i64:
++            break;
--            TODO();
+         CASE_OP_32_64_VEC(and):
--            break;
+             done = fold_and(&ctx, op);
- #endif
+             break;
-         case INDEX_op_and_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-             t0 = *tb_ptr++;
+         CASE_OP_32_64_VEC(sub):
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+             done = fold_sub(&ctx, op);
-index XXXXXXX..XXXXXXX 100644
+             break;
---- a/tcg/tci/tcg-target.c.inc
++        case INDEX_op_sub2_i32:
-+++ b/tcg/tci/tcg-target.c.inc
++            done = fold_sub2_i32(&ctx, op);
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
++            break;
-     case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
+         CASE_OP_32_64_VEC(xor):
-         TODO();
+             done = fold_xor(&ctx, op);
-         break;
+             break;
 -    case INDEX_op_div2_i64:     /* Optional (TCG_TARGET_HAS_div2_i64). */
 -    case INDEX_op_divu2_i64:    /* Optional (TCG_TARGET_HAS_div2_i64). */
 -        TODO();
 -        break;
      case INDEX_op_brcond_i64:
          tcg_out_r(s, args[0]);
          tcg_out_ri64(s, const_args[1], args[1]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          tcg_out_ri32(s, const_args[1], args[1]);
          tcg_out_ri32(s, const_args[2], args[2]);
          break;
 -    case INDEX_op_div2_i32:     /* Optional (TCG_TARGET_HAS_div2_i32). */
 -    case INDEX_op_divu2_i32:    /* Optional (TCG_TARGET_HAS_div2_i32). */
 -        TODO();
 -        break;
  #if TCG_TARGET_REG_BITS == 32
      case INDEX_op_add2_i32:
      case INDEX_op_sub2_i32:
 --
 .25.1

-[PULL 21/46] tcg/tci: Merge INDEX_op_{st_i32,st32_i64}
+[PULL 27/56] tcg/optimize: Split out fold_movcond
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 7 +------
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
-file changed, 1 insertion(+), 6 deletions(-)
+file changed, 31 insertions(+), 25 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-             *(uint16_t *)(t1 + t2) = t0;
+     return true;
  }
 +static bool fold_movcond(OptContext *ctx, TCGOp *op)
 +{
 +    TCGOpcode opc = op->opc;
 +    TCGCond cond = op->args[5];
 +    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +
 +    if (i >= 0) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
 +    }
 +
 +    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
 +        uint64_t tv = arg_info(op->args[3])->val;
 +        uint64_t fv = arg_info(op->args[4])->val;
 +
 +        opc = (opc == INDEX_op_movcond_i32
 +               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +
 +        if (tv == 1 && fv == 0) {
 +            op->opc = opc;
 +            op->args[3] = cond;
 +        } else if (fv == 1 && tv == 0) {
 +            op->opc = opc;
 +            op->args[3] = tcg_invert_cond(cond);
 +        }
 +    }
 +    return false;
 +}
 +
  static bool fold_mul(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
-         case INDEX_op_st_i32:
-+        CASE_64(st32)
+-        CASE_OP_32_64(movcond):
-             t0 = tci_read_r32(regs, &tb_ptr);
+-            i = do_constant_folding_cond(opc, op->args[1],
-             t1 = tci_read_r(regs, &tb_ptr);
+-                                         op->args[2], op->args[5]);
-             t2 = tci_read_s32(&tb_ptr);
+-            if (i >= 0) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-             t2 = tci_read_s32(&tb_ptr);
+-                continue;
-             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
+-            }
 -            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
 -                uint64_t tv = arg_info(op->args[3])->val;
 -                uint64_t fv = arg_info(op->args[4])->val;
 -                TCGCond cond = op->args[5];
 -
 -                if (fv == 1 && tv == 0) {
 -                    cond = tcg_invert_cond(cond);
 -                } else if (!(tv == 1 && fv == 0)) {
 -                    break;
 -                }
 -                op->args[3] = cond;
 -                op->opc = opc = (opc == INDEX_op_movcond_i32
 -                                 ? INDEX_op_setcond_i32
 -                                 : INDEX_op_setcond_i64);
 -            }
 -            break;
 -
 -
          default:
              break;
--        case INDEX_op_st32_i64:
--            t0 = tci_read_r32(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            t1 = tci_read_r(regs, &tb_ptr);
+         case INDEX_op_mb:
--            t2 = tci_read_s32(&tb_ptr);
+             done = fold_mb(&ctx, op);
--            *(uint32_t *)(t1 + t2) = t0;
+             break;
--            break;
++        CASE_OP_32_64(movcond):
-         case INDEX_op_st_i64:
++            done = fold_movcond(&ctx, op);
-             t0 = tci_read_r64(regs, &tb_ptr);
++            break;
-             t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(mul):
              done = fold_mul(&ctx, op);
              break;
 --
 .25.1

-[PULL 19/46] tcg/tci: Merge INDEX_op_st16_{i32,i64}
+[PULL 28/56] tcg/optimize: Split out fold_extract2
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 8 +-------
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
-file changed, 1 insertion(+), 7 deletions(-)
+file changed, 22 insertions(+), 17 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_s32(&tb_ptr);
+     return fold_const2(ctx, op);
-             *(uint8_t *)(t1 + t2) = t0;
+ }
 +static bool fold_extract2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t v1 = arg_info(op->args[1])->val;
 +        uint64_t v2 = arg_info(op->args[2])->val;
 +        int shr = op->args[3];
 +
 +        if (op->opc == INDEX_op_extract2_i64) {
 +            v1 >>= shr;
 +            v2 <<= 64 - shr;
 +        } else {
 +            v1 = (uint32_t)v1 >> shr;
 +            v2 = (int32_t)v2 << (32 - shr);
 +        }
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
 +    }
 +    return false;
 +}
 +
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
      return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--        case INDEX_op_st16_i32:
-+        CASE_32_64(st16)
+-        CASE_OP_32_64(extract2):
-             t0 = tci_read_r16(regs, &tb_ptr);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-             t1 = tci_read_r(regs, &tb_ptr);
+-                uint64_t v1 = arg_info(op->args[1])->val;
-             t2 = tci_read_s32(&tb_ptr);
+-                uint64_t v2 = arg_info(op->args[2])->val;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                int shr = op->args[3];
-             t2 = tci_read_s32(&tb_ptr);
+-
-             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
+-                if (opc == INDEX_op_extract2_i64) {
 -                    tmp = (v1 >> shr) | (v2 << (64 - shr));
 -                } else {
 -                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
 -                                    ((uint32_t)v2 << (32 - shr)));
 -                }
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
--        case INDEX_op_st16_i64:
--            t0 = tci_read_r16(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(eqv):
--            t2 = tci_read_s32(&tb_ptr);
+             done = fold_eqv(&ctx, op);
--            *(uint16_t *)(t1 + t2) = t0;
+             break;
--            break;
++        CASE_OP_32_64(extract2):
-         case INDEX_op_st32_i64:
++            done = fold_extract2(&ctx, op);
-             t0 = tci_read_r32(regs, &tb_ptr);
++            break;
-             t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(ext8s):
          CASE_OP_32_64(ext16s):
          case INDEX_op_ext32s_i64:
 --
 .25.1

-[PULL 18/46] tcg/tci: Merge INDEX_op_st8_{i32,i64}
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 8 +-------
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
-file changed, 1 insertion(+), 7 deletions(-)
+file changed, 30 insertions(+), 18 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_s32(&tb_ptr);
+     return fold_const2(ctx, op);
-             tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
+ }
 +static bool fold_extract(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = extract64(t, op->args[2], op->args[3]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
  {
      if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
      return tcg_opt_gen_movi(ctx, op, op->args[0], i);
  }
 +static bool fold_sextract(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = sextract64(t, op->args[2], op->args[3]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--        case INDEX_op_st8_i32:
-+        CASE_32_64(st8)
+-        CASE_OP_32_64(extract):
-             t0 = tci_read_r8(regs, &tb_ptr);
+-            if (arg_is_const(op->args[1])) {
-             t1 = tci_read_r(regs, &tb_ptr);
+-                tmp = extract64(arg_info(op->args[1])->val,
-             t2 = tci_read_s32(&tb_ptr);
+-                                op->args[2], op->args[3]);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-             t2 = tci_read_s32(&tb_ptr);
+-                continue;
-             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
+-            }
 -            break;
 -
 -        CASE_OP_32_64(sextract):
 -            if (arg_is_const(op->args[1])) {
 -                tmp = sextract64(arg_info(op->args[1])->val,
 -                                 op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
--        case INDEX_op_st8_i64:
--            t0 = tci_read_r8(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(eqv):
--            t2 = tci_read_s32(&tb_ptr);
+             done = fold_eqv(&ctx, op);
--            *(uint8_t *)(t1 + t2) = t0;
+             break;
--            break;
++        CASE_OP_32_64(extract):
-         case INDEX_op_st16_i64:
++            done = fold_extract(&ctx, op);
-             t0 = tci_read_r16(regs, &tb_ptr);
++            break;
-             t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(extract2):
              done = fold_extract2(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_setcond2_i32:
              done = fold_setcond2(&ctx, op);
              break;
 +        CASE_OP_32_64(sextract):
 +            done = fold_sextract(&ctx, op);
 +            break;
          CASE_OP_32_64_VEC(sub):
              done = fold_sub(&ctx, op);
              break;
 --
 .25.1

-[PULL 17/46] tcg/tci: Merge INDEX_op_{ld_i32,ld32u_i64}
+[PULL 30/56] tcg/optimize: Split out fold_deposit
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 7 +------
+ tcg/optimize.c | 25 +++++++++++++++----------
-file changed, 1 insertion(+), 6 deletions(-)
+file changed, 15 insertions(+), 10 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-             tci_write_reg(regs, t0, *(int16_t *)(t1 + t2));
+     return fold_const1(ctx, op);
  }
 +static bool fold_deposit(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = deposit64(t1, op->args[3], op->args[4], t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
  static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
-         case INDEX_op_ld_i32:
-+        CASE_64(ld32u)
+-        CASE_OP_32_64(deposit):
-             t0 = *tb_ptr++;
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-             t1 = tci_read_r(regs, &tb_ptr);
+-                tmp = deposit64(arg_info(op->args[1])->val,
-             t2 = tci_read_s32(&tb_ptr);
+-                                op->args[3], op->args[4],
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                                arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-             /* Load/store operations (64 bit). */
+-                continue;
+-            }
 -        case INDEX_op_ld32u_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            t2 = tci_read_s32(&tb_ptr);
 -            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
 -            break;
-         case INDEX_op_ld32s_i64:
+-
-             t0 = *tb_ptr++;
+         default:
-             t1 = tci_read_r(regs, &tb_ptr);
+             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(ctpop):
              done = fold_ctpop(&ctx, op);
              break;
 +        CASE_OP_32_64(deposit):
 +            done = fold_deposit(&ctx, op);
 +            break;
          CASE_OP_32_64(div):
          CASE_OP_32_64(divu):
              done = fold_divide(&ctx, op);
 --
 .25.1

-[PULL 37/46] cpu: move cc->do_interrupt to tcg_ops
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
-From: Claudio Fontana <cfontana@suse.de>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20210204163931.7358-10-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           |  4 ++--
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
- accel/tcg/cpu-exec.c            |  4 ++--
+file changed, 18 insertions(+), 14 deletions(-)
  target/alpha/cpu.c              |  2 +-
  target/arm/cpu.c                |  4 ++--
  target/arm/cpu_tcg.c            |  9 ++++-----
  target/avr/cpu.c                |  2 +-
  target/avr/helper.c             |  4 ++--
  target/cris/cpu.c               | 12 ++++++------
  target/cris/helper.c            |  4 ++--
  target/hppa/cpu.c               |  2 +-
  target/i386/tcg/tcg-cpu.c       |  2 +-
  target/lm32/cpu.c               |  2 +-
  target/m68k/cpu.c               |  2 +-
  target/microblaze/cpu.c         |  2 +-
  target/mips/cpu.c               |  4 ++--
  target/moxie/cpu.c              |  2 +-
  target/nios2/cpu.c              |  2 +-
  target/openrisc/cpu.c           |  2 +-
  target/riscv/cpu.c              |  2 +-
  target/rx/cpu.c                 |  2 +-
  target/s390x/cpu.c              |  2 +-
  target/sh4/cpu.c                |  2 +-
  target/sparc/cpu.c              |  2 +-
  target/tilegx/cpu.c             |  2 +-
  target/unicore32/cpu.c          |  2 +-
  target/xtensa/cpu.c             |  2 +-
  target/ppc/translate_init.c.inc |  2 +-
 files changed, 41 insertions(+), 42 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
      void (*cpu_exec_exit)(CPUState *cpu);
      /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
      bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
 +    /** @do_interrupt: Callback for interrupt handling. */
 +    void (*do_interrupt)(CPUState *cpu);
      /**
       * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
       *
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
   * @parse_features: Callback to parse command line arguments.
   * @reset_dump_flags: #CPUDumpFlags to use for reset logging.
   * @has_work: Callback for checking if there is work to do.
 - * @do_interrupt: Callback for interrupt handling.
   * @do_unaligned_access: Callback for unaligned access handling, if
   * the target defines #TARGET_ALIGNED_ONLY.
   * @do_transaction_failed: Callback for handling failed memory transactions
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      int reset_dump_flags;
      bool (*has_work)(CPUState *cpu);
 -    void (*do_interrupt)(CPUState *cpu);
      void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
                                  MMUAccessType access_type,
                                  int mmu_idx, uintptr_t retaddr);
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
             loop */
  #if defined(TARGET_I386)
          CPUClass *cc = CPU_GET_CLASS(cpu);
 -        cc->do_interrupt(cpu);
 +        cc->tcg_ops.do_interrupt(cpu);
  #endif
          *ret = cpu->exception_index;
          cpu->exception_index = -1;
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
          if (replay_exception()) {
              CPUClass *cc = CPU_GET_CLASS(cpu);
              qemu_mutex_lock_iothread();
 -            cc->do_interrupt(cpu);
 +            cc->tcg_ops.do_interrupt(cpu);
              qemu_mutex_unlock_iothread();
              cpu->exception_index = -1;
 diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/cpu.c
 +++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = alpha_cpu_class_by_name;
      cc->has_work = alpha_cpu_has_work;
 -    cc->do_interrupt = alpha_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = alpha_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
      cc->dump_state = alpha_cpu_dump_state;
      cc->set_pc = alpha_cpu_set_pc;
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
   found:
      cs->exception_index = excp_idx;
      env->exception.target_el = target_el;
 -    cc->do_interrupt(cs);
 +    cc->tcg_ops.do_interrupt(cs);
      return true;
  }
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
-     cc->gdb_read_register = arm_cpu_gdb_read_register;
++{
-     cc->gdb_write_register = arm_cpu_gdb_write_register;
++    if (arg_is_const(op->args[1])) {
- #ifndef CONFIG_USER_ONLY
++        uint64_t t = arg_info(op->args[1])->val;
--    cc->do_interrupt = arm_cpu_do_interrupt;
++
-     cc->get_phys_page_attrs_debug = arm_cpu_get_phys_page_attrs_debug;
++        if (t != 0) {
-     cc->asidx_from_attrs = arm_asidx_from_attrs;
++            t = do_constant_folding(op->opc, t, 0);
-     cc->vmsd = &vmstate_arm_cpu;
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
++        }
- #if !defined(CONFIG_USER_ONLY)
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
-     cc->do_transaction_failed = arm_cpu_do_transaction_failed;
++    }
-     cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
++    return false;
-+    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
++}
- #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
++
- #endif
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
- }
+ {
-diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
+     return fold_const1(ctx, op);
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
---- a/target/arm/cpu_tcg.c
+             }
-+++ b/target/arm/cpu_tcg.c
+             break;
-@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
-     if (interrupt_request & CPU_INTERRUPT_HARD
+-        CASE_OP_32_64(clz):
-         && (armv7m_nvic_can_take_pending_exception(env->nvic))) {
+-        CASE_OP_32_64(ctz):
-         cs->exception_index = EXCP_IRQ;
+-            if (arg_is_const(op->args[1])) {
--        cc->do_interrupt(cs);
+-                TCGArg v = arg_info(op->args[1])->val;
-+        cc->tcg_ops.do_interrupt(cs);
+-                if (v != 0) {
-         ret = true;
+-                    tmp = do_constant_folding(opc, v, 0);
-     }
+-                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-     return ret;
+-                } else {
-@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
+-                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
-     CPUClass *cc = CPU_CLASS(oc);
+-                }
+-                continue;
-     acc->info = data;
+-            }
--#ifndef CONFIG_USER_ONLY
+-            break;
 -    cc->do_interrupt = arm_v7m_cpu_do_interrupt;
 -#endif
 -
- #ifdef CONFIG_TCG
+         default:
-     cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
+             break;
-+#ifndef CONFIG_USER_ONLY
-+    cc->tcg_ops.do_interrupt = arm_v7m_cpu_do_interrupt;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+#endif
+         case INDEX_op_brcond2_i32:
- #endif /* CONFIG_TCG */
+             done = fold_brcond2(&ctx, op);
+             break;
-     cc->gdb_core_xml_file = "arm-m-profile.xml";
++        CASE_OP_32_64(clz):
-diff --git a/target/avr/cpu.c b/target/avr/cpu.c
++        CASE_OP_32_64(ctz):
-index XXXXXXX..XXXXXXX 100644
++            done = fold_count_zeros(&ctx, op);
---- a/target/avr/cpu.c
++            break;
-+++ b/target/avr/cpu.c
+         CASE_OP_32_64(ctpop):
-@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
+             done = fold_ctpop(&ctx, op);
-     cc->class_by_name = avr_cpu_class_by_name;
+             break;
      cc->has_work = avr_cpu_has_work;
 -    cc->do_interrupt = avr_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = avr_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
      cc->dump_state = avr_cpu_dump_state;
      cc->set_pc = avr_cpu_set_pc;
 diff --git a/target/avr/helper.c b/target/avr/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/helper.c
 +++ b/target/avr/helper.c
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
      if (interrupt_request & CPU_INTERRUPT_RESET) {
          if (cpu_interrupts_enabled(env)) {
              cs->exception_index = EXCP_RESET;
 -            cc->do_interrupt(cs);
 +            cc->tcg_ops.do_interrupt(cs);
              cs->interrupt_request &= ~CPU_INTERRUPT_RESET;
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          if (cpu_interrupts_enabled(env) && env->intsrc != 0) {
              int index = ctz32(env->intsrc);
              cs->exception_index = EXCP_INT(index);
 -            cc->do_interrupt(cs);
 +            cc->tcg_ops.do_interrupt(cs);
              env->intsrc &= env->intsrc - 1; /* clear the interrupt */
              cs->interrupt_request &= ~CPU_INTERRUPT_HARD;
 diff --git a/target/cris/cpu.c b/target/cris/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/cpu.c
 +++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void crisv8_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 8;
 -    cc->do_interrupt = crisv10_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
      cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
@@ -XXX,XX +XXX,XX @@ static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 9;
 -    cc->do_interrupt = crisv10_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
      cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
@@ -XXX,XX +XXX,XX @@ static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 10;
 -    cc->do_interrupt = crisv10_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
      cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
@@ -XXX,XX +XXX,XX @@ static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 11;
 -    cc->do_interrupt = crisv10_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
      cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
@@ -XXX,XX +XXX,XX @@ static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
      CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
      ccc->vr = 17;
 -    cc->do_interrupt = crisv10_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
      cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = cris_cpu_class_by_name;
      cc->has_work = cris_cpu_has_work;
 -    cc->do_interrupt = cris_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = cris_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
      cc->dump_state = cris_cpu_dump_state;
      cc->set_pc = cris_cpu_set_pc;
 diff --git a/target/cris/helper.c b/target/cris/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/helper.c
 +++ b/target/cris/helper.c
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          && (env->pregs[PR_CCS] & I_FLAG)
          && !env->locked_irq) {
          cs->exception_index = EXCP_IRQ;
 -        cc->do_interrupt(cs);
 +        cc->tcg_ops.do_interrupt(cs);
          ret = true;
      }
      if (interrupt_request & CPU_INTERRUPT_NMI) {
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
          }
          if ((env->pregs[PR_CCS] & m_flag_archval)) {
              cs->exception_index = EXCP_NMI;
 -            cc->do_interrupt(cs);
 +            cc->tcg_ops.do_interrupt(cs);
              ret = true;
          }
      }
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = hppa_cpu_class_by_name;
      cc->has_work = hppa_cpu_has_work;
 -    cc->do_interrupt = hppa_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = hppa_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
      cc->dump_state = hppa_cpu_dump_state;
      cc->set_pc = hppa_cpu_set_pc;
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
  void tcg_cpu_common_class_init(CPUClass *cc)
  {
 -    cc->do_interrupt = x86_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = x86_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
      cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
      cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
 diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/lm32/cpu.c
 +++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = lm32_cpu_class_by_name;
      cc->has_work = lm32_cpu_has_work;
 -    cc->do_interrupt = lm32_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = lm32_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
      cc->dump_state = lm32_cpu_dump_state;
      cc->set_pc = lm32_cpu_set_pc;
 diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.c
 +++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = m68k_cpu_class_by_name;
      cc->has_work = m68k_cpu_has_work;
 -    cc->do_interrupt = m68k_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = m68k_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
      cc->dump_state = m68k_cpu_dump_state;
      cc->set_pc = m68k_cpu_set_pc;
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = mb_cpu_class_by_name;
      cc->has_work = mb_cpu_has_work;
 -    cc->do_interrupt = mb_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
      cc->do_unaligned_access = mb_cpu_do_unaligned_access;
      cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
      cc->dump_state = mb_cpu_dump_state;
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = mips_cpu_class_by_name;
      cc->has_work = mips_cpu_has_work;
 -    cc->do_interrupt = mips_cpu_do_interrupt;
      cc->dump_state = mips_cpu_dump_state;
      cc->set_pc = mips_cpu_set_pc;
      cc->gdb_read_register = mips_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->disas_set_info = mips_cpu_disas_set_info;
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = mips_tcg_init;
 +    cc->tcg_ops.do_interrupt = mips_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
      cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
      cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
 -#endif
 +#endif /* CONFIG_TCG */
      cc->gdb_num_core_regs = 73;
      cc->gdb_stop_before_watchpoint = true;
 diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/moxie/cpu.c
 +++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = moxie_cpu_class_by_name;
      cc->has_work = moxie_cpu_has_work;
 -    cc->do_interrupt = moxie_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = moxie_cpu_do_interrupt;
      cc->dump_state = moxie_cpu_dump_state;
      cc->set_pc = moxie_cpu_set_pc;
      cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
 diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/cpu.c
 +++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = nios2_cpu_class_by_name;
      cc->has_work = nios2_cpu_has_work;
 -    cc->do_interrupt = nios2_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = nios2_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
      cc->dump_state = nios2_cpu_dump_state;
      cc->set_pc = nios2_cpu_set_pc;
 diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/cpu.c
 +++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = openrisc_cpu_class_by_name;
      cc->has_work = openrisc_cpu_has_work;
 -    cc->do_interrupt = openrisc_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = openrisc_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
      cc->dump_state = openrisc_cpu_dump_state;
      cc->set_pc = openrisc_cpu_set_pc;
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = riscv_cpu_class_by_name;
      cc->has_work = riscv_cpu_has_work;
 -    cc->do_interrupt = riscv_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = riscv_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
      cc->dump_state = riscv_cpu_dump_state;
      cc->set_pc = riscv_cpu_set_pc;
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
      cc->class_by_name = rx_cpu_class_by_name;
      cc->has_work = rx_cpu_has_work;
 -    cc->do_interrupt = rx_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = rx_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
      cc->dump_state = rx_cpu_dump_state;
      cc->set_pc = rx_cpu_set_pc;
 diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = s390_cpu_class_by_name,
      cc->has_work = s390_cpu_has_work;
  #ifdef CONFIG_TCG
 -    cc->do_interrupt = s390_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = s390_cpu_do_interrupt;
  #endif
      cc->dump_state = s390_cpu_dump_state;
      cc->set_pc = s390_cpu_set_pc;
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = superh_cpu_class_by_name;
      cc->has_work = superh_cpu_has_work;
 -    cc->do_interrupt = superh_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = superh_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
      cc->dump_state = superh_cpu_dump_state;
      cc->set_pc = superh_cpu_set_pc;
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = sparc_cpu_class_by_name;
      cc->parse_features = sparc_cpu_parse_features;
      cc->has_work = sparc_cpu_has_work;
 -    cc->do_interrupt = sparc_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = sparc_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
      cc->dump_state = sparc_cpu_dump_state;
  #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
 diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tilegx/cpu.c
 +++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = tilegx_cpu_class_by_name;
      cc->has_work = tilegx_cpu_has_work;
 -    cc->do_interrupt = tilegx_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = tilegx_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
      cc->dump_state = tilegx_cpu_dump_state;
      cc->set_pc = tilegx_cpu_set_pc;
 diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/unicore32/cpu.c
 +++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = uc32_cpu_class_by_name;
      cc->has_work = uc32_cpu_has_work;
 -    cc->do_interrupt = uc32_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = uc32_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
      cc->dump_state = uc32_cpu_dump_state;
      cc->set_pc = uc32_cpu_set_pc;
 diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.c
 +++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = xtensa_cpu_class_by_name;
      cc->has_work = xtensa_cpu_has_work;
 -    cc->do_interrupt = xtensa_cpu_do_interrupt;
 +    cc->tcg_ops.do_interrupt = xtensa_cpu_do_interrupt;
      cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
      cc->dump_state = xtensa_cpu_dump_state;
      cc->set_pc = xtensa_cpu_set_pc;
 diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate_init.c.inc
 +++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = ppc_cpu_class_by_name;
      cc->has_work = ppc_cpu_has_work;
 -    cc->do_interrupt = ppc_cpu_do_interrupt;
      cc->dump_state = ppc_cpu_dump_state;
      cc->dump_statistics = ppc_cpu_dump_statistics;
      cc->set_pc = ppc_cpu_set_pc;
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = ppc_translate_init;
      cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
 +    cc->tcg_ops.do_interrupt = ppc_cpu_do_interrupt;
      cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
 --
 .25.1

-[PULL 16/46] tcg/tci: Merge INDEX_op_ld16s_{i32,i64}
+[PULL 32/56] tcg/optimize: Split out fold_bswap
-Eliminating a TODO for ld16s_i64.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 5 +----
+ tcg/optimize.c | 27 ++++++++++++++++-----------
-file changed, 1 insertion(+), 4 deletions(-)
+file changed, 16 insertions(+), 11 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_s32(&tb_ptr);
+     return false;
-             tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
+ }
 +static bool fold_bswap(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t = arg_info(op->args[1])->val;
 +
 +        t = do_constant_folding(op->opc, t, op->args[2]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
      TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--        case INDEX_op_ld16s_i32:
-+        CASE_32_64(ld16s)
+-        CASE_OP_32_64(bswap16):
-             t0 = *tb_ptr++;
+-        CASE_OP_32_64(bswap32):
-             t1 = tci_read_r(regs, &tb_ptr);
+-        case INDEX_op_bswap64_i64:
-             t2 = tci_read_s32(&tb_ptr);
+-            if (arg_is_const(op->args[1])) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          op->args[2]);
-             /* Load/store operations (64 bit). */
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
--        case INDEX_op_ld16s_i64:
+-            }
 -            TODO();
 -            break;
-         case INDEX_op_ld32u_i64:
+-
-             t0 = *tb_ptr++;
+         default:
-             t1 = tci_read_r(regs, &tb_ptr);
+             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_brcond2_i32:
              done = fold_brcond2(&ctx, op);
              break;
 +        CASE_OP_32_64(bswap16):
 +        CASE_OP_32_64(bswap32):
 +        case INDEX_op_bswap64_i64:
 +            done = fold_bswap(&ctx, op);
 +            break;
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              done = fold_count_zeros(&ctx, op);
 --
 .25.1

-[PULL 15/46] tcg/tci: Merge INDEX_op_ld16u_{i32,i64}
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
-Eliminating a TODO for ld16u_i32.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 13 +++++--------
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
-file changed, 5 insertions(+), 8 deletions(-)
+file changed, 31 insertions(+), 22 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_s32(&tb_ptr);
+     return fold_const2(ctx, op);
-             tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
+ }
 +static bool fold_dup(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t = arg_info(op->args[1])->val;
 +        t = dup_const(TCGOP_VECE(op), t);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_dup2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
 +                               arg_info(op->args[2])->val);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        op->opc = INDEX_op_dup_vec;
 +        TCGOP_VECE(op) = MO_32;
 +    }
 +    return false;
 +}
 +
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              break;
--        case INDEX_op_ld16u_i32:
--            TODO();
+-        case INDEX_op_dup_vec:
-+        CASE_32_64(ld16u)
+-            if (arg_is_const(op->args[1])) {
-+            t0 = *tb_ptr++;
+-                tmp = arg_info(op->args[1])->val;
-+            t1 = tci_read_r(regs, &tb_ptr);
+-                tmp = dup_const(TCGOP_VECE(op), tmp);
-+            t2 = tci_read_s32(&tb_ptr);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-+            tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
+-                continue;
 -            }
 -            break;
 -
 -        case INDEX_op_dup2_vec:
 -            assert(TCG_TARGET_REG_BITS == 32);
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0],
 -                                 deposit64(arg_info(op->args[1])->val, 32, 32,
 -                                           arg_info(op->args[2])->val));
 -                continue;
 -            } else if (args_are_copies(op->args[1], op->args[2])) {
 -                op->opc = INDEX_op_dup_vec;
 -                TCGOP_VECE(op) = MO_32;
 -            }
 -            break;
 -
          default:
              break;
-         case INDEX_op_ld16s_i32:
-             t0 = *tb_ptr++;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
-             /* Load/store operations (64 bit). */
+             break;
++        case INDEX_op_dup_vec:
--        case INDEX_op_ld16u_i64:
++            done = fold_dup(&ctx, op);
--            t0 = *tb_ptr++;
++            break;
--            t1 = tci_read_r(regs, &tb_ptr);
++        case INDEX_op_dup2_vec:
--            t2 = tci_read_s32(&tb_ptr);
++            done = fold_dup2(&ctx, op);
--            tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
++            break;
--            break;
+         CASE_OP_32_64(eqv):
-         case INDEX_op_ld16s_i64:
+             done = fold_eqv(&ctx, op);
              TODO();
              break;
 --
 .25.1

-[PULL 14/46] tcg/tci: Merge INDEX_op_ld8s_{i32,i64}
+[PULL 34/56] tcg/optimize: Split out fold_mov
-Eliminating a TODO for ld8s_i32.
+This is the final entry in the main switch that was in a
 different form.  After this, we have the option to convert
 the switch into a function dispatch table.
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 13 +++++--------
+ tcg/optimize.c | 27 ++++++++++++++-------------
-file changed, 5 insertions(+), 8 deletions(-)
+file changed, 14 insertions(+), 13 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-             t2 = tci_read_s32(&tb_ptr);
+     return true;
-             tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
+ }
 +static bool fold_mov(OptContext *ctx, TCGOp *op)
 +{
 +    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +}
 +
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
--        case INDEX_op_ld8s_i32:
+         }
--            TODO();
-+        CASE_32_64(ld8s)
+-        /* Propagate constants through copy operations and do constant
-+            t0 = *tb_ptr++;
+-           folding.  Constants will be substituted to arguments by register
-+            t1 = tci_read_r(regs, &tb_ptr);
+-           allocator where needed and possible.  Also detect copies. */
-+            t2 = tci_read_s32(&tb_ptr);
++        /*
-+            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
++         * Process each opcode.
 +         * Sorted alphabetically by opcode as much as possible.
 +         */
          switch (opc) {
 -        CASE_OP_32_64_VEC(mov):
 -            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            break;
 -
 -        default:
 -            break;
 -
 -        /* ---------------------------------------------------------- */
 -        /* Sorted alphabetically by opcode as much as possible. */
 -
          CASE_OP_32_64_VEC(add):
              done = fold_add(&ctx, op);
              break;
-         case INDEX_op_ld16u_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-             TODO();
+         case INDEX_op_mb:
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             done = fold_mb(&ctx, op);
+             break;
-             /* Load/store operations (64 bit). */
++        CASE_OP_32_64_VEC(mov):
++            done = fold_mov(&ctx, op);
--        case INDEX_op_ld8s_i64:
++            break;
--            t0 = *tb_ptr++;
+         CASE_OP_32_64(movcond):
--            t1 = tci_read_r(regs, &tb_ptr);
+             done = fold_movcond(&ctx, op);
--            t2 = tci_read_s32(&tb_ptr);
+             break;
--            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            break;
+         CASE_OP_32_64_VEC(xor):
-         case INDEX_op_ld16u_i64:
+             done = fold_xor(&ctx, op);
-             t0 = *tb_ptr++;
+             break;
-             t1 = tci_read_r(regs, &tb_ptr);
++        default:
 +            break;
          }
          if (!done) {
 --
 .25.1

-[PULL 29/46] cpu: Introduce TCGCpuOperations struct
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
-From: Eduardo Habkost <ehabkost@redhat.com>
+Pull the "op r, a, a => movi r, 0" optimization into a function,
 and use it in the outer opcode fold functions.
-The TCG-specific CPU methods will be moved to a separate struct,
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-to make it easier to move accel-specific code outside generic CPU
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 code in the future.  Start by moving tcg_initialize().
 The new CPUClass.tcg_opts field may eventually become a pointer,
 but keep it an embedded struct for now, to make code conversion
 easier.
 Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
 [claudio: move TCGCpuOperations inside include/hw/core/cpu.h]
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210204163931.7358-2-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           | 16 +++++++++++++++-
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
- cpu.c                           |  6 +++++-
+file changed, 24 insertions(+), 17 deletions(-)
  target/alpha/cpu.c              |  2 +-
  target/arm/cpu.c                |  2 +-
  target/avr/cpu.c                |  2 +-
  target/cris/cpu.c               | 12 ++++++------
  target/hppa/cpu.c               |  2 +-
  target/i386/tcg/tcg-cpu.c       |  2 +-
  target/lm32/cpu.c               |  2 +-
  target/m68k/cpu.c               |  2 +-
  target/microblaze/cpu.c         |  2 +-
  target/mips/cpu.c               |  2 +-
  target/moxie/cpu.c              |  2 +-
  target/nios2/cpu.c              |  2 +-
  target/openrisc/cpu.c           |  2 +-
  target/riscv/cpu.c              |  2 +-
  target/rx/cpu.c                 |  2 +-
  target/s390x/cpu.c              |  2 +-
  target/sh4/cpu.c                |  2 +-
  target/sparc/cpu.c              |  2 +-
  target/tilegx/cpu.c             |  2 +-
  target/tricore/cpu.c            |  2 +-
  target/unicore32/cpu.c          |  2 +-
  target/xtensa/cpu.c             |  2 +-
  target/ppc/translate_init.c.inc |  2 +-
 files changed, 48 insertions(+), 30 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct CPUWatchpoint CPUWatchpoint;
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
+     return false;
- struct TranslationBlock;
+ }
-+/**
++/* If the binary operation has both arguments equal, fold to @i. */
-+ * struct TcgCpuOperations: TCG operations specific to a CPU class
++static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-+ */
++{
-+typedef struct TcgCpuOperations {
++    if (args_are_copies(op->args[1], op->args[2])) {
-+    /**
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
-+     * @initialize: Initalize TCG state
++    }
-+     *
++    return false;
-+     * Called when the first CPU is realized.
++}
 +     */
 +    void (*initialize)(void);
 +
-+} TcgCpuOperations;
-+
- /**
-  * CPUClass:
-  * @class_by_name: Callback to map -cpu command line model name to an
-@@ -XXX,XX +XXX,XX @@ struct CPUClass {
-     void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
-     vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
--    void (*tcg_initialize)(void);
-     const char *deprecation_note;
-     /* Keep non-pointer data at the end to minimize holes.  */
-     int gdb_num_core_regs;
-     bool gdb_stop_before_watchpoint;
-+
-+    TcgCpuOperations tcg_ops;
- };
  /*
-diff --git a/cpu.c b/cpu.c
+  * These outermost fold_<op> functions are sorted alphabetically.
-index XXXXXXX..XXXXXXX 100644
+  */
---- a/cpu.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-+++ b/cpu.c
-@@ -XXX,XX +XXX,XX @@ void cpu_exec_initfn(CPUState *cpu)
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
  void cpu_exec_realizefn(CPUState *cpu, Error **errp)
  {
-     CPUClass *cc = CPU_GET_CLASS(cpu);
+-    return fold_const2(ctx, op);
-+#ifdef CONFIG_TCG
++    if (fold_const2(ctx, op) ||
-     static bool tcg_target_initialized;
++        fold_xx_to_i(ctx, op, 0)) {
-+#endif /* CONFIG_TCG */
++        return true;
++    }
-     cpu_list_add(cpu);
++    return false;
 +#ifdef CONFIG_TCG
      if (tcg_enabled() && !tcg_target_initialized) {
          tcg_target_initialized = true;
 -        cc->tcg_initialize();
 +        cc->tcg_ops.initialize();
      }
 +#endif /* CONFIG_TCG */
      tlb_init(cpu);
      qemu_plugin_vcpu_init_hook(cpu);
 diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/cpu.c
 +++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
      dc->vmsd = &vmstate_alpha_cpu;
  #endif
      cc->disas_set_info = alpha_cpu_disas_set_info;
 -    cc->tcg_initialize = alpha_translate_init;
 +    cc->tcg_ops.initialize = alpha_translate_init;
      cc->gdb_num_core_regs = 67;
  }
-diff --git a/target/arm/cpu.c b/target/arm/cpu.c
-index XXXXXXX..XXXXXXX 100644
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
---- a/target/arm/cpu.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+++ b/target/arm/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
-     cc->gdb_stop_before_watchpoint = true;
+ {
-     cc->disas_set_info = arm_disas_set_info;
+-    return fold_const2(ctx, op);
- #ifdef CONFIG_TCG
++    if (fold_const2(ctx, op) ||
--    cc->tcg_initialize = arm_translate_init;
++        fold_xx_to_i(ctx, op, 0)) {
-+    cc->tcg_ops.initialize = arm_translate_init;
++        return true;
-     cc->tlb_fill = arm_cpu_tlb_fill;
++    }
-     cc->debug_excp_handler = arm_debug_excp_handler;
++    return false;
      cc->debug_check_watchpoint = arm_debug_check_watchpoint;
 diff --git a/target/avr/cpu.c b/target/avr/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/cpu.c
 +++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
      cc->tlb_fill = avr_cpu_tlb_fill;
      cc->vmsd = &vms_avr_cpu;
      cc->disas_set_info = avr_cpu_disas_set_info;
 -    cc->tcg_initialize = avr_cpu_tcg_init;
 +    cc->tcg_ops.initialize = avr_cpu_tcg_init;
      cc->synchronize_from_tb = avr_cpu_synchronize_from_tb;
      cc->gdb_read_register = avr_cpu_gdb_read_register;
      cc->gdb_write_register = avr_cpu_gdb_write_register;
 diff --git a/target/cris/cpu.c b/target/cris/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/cpu.c
 +++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void crisv8_cpu_class_init(ObjectClass *oc, void *data)
      ccc->vr = 8;
      cc->do_interrupt = crisv10_cpu_do_interrupt;
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
- static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
+ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
-     ccc->vr = 9;
-     cc->do_interrupt = crisv10_cpu_do_interrupt;
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
-     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
+ {
--    cc->tcg_initialize = cris_initialize_crisv10_tcg;
+-    return fold_const2(ctx, op);
-+    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
++    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
- static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
+ /* Propagate constants and copies, fold constant expressions. */
-@@ -XXX,XX +XXX,XX @@ static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     ccc->vr = 10;
+             break;
-     cc->do_interrupt = crisv10_cpu_do_interrupt;
+         }
-     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
--    cc->tcg_initialize = cris_initialize_crisv10_tcg;
+-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-+    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+-        switch (opc) {
- }
+-        CASE_OP_32_64_VEC(andc):
+-        CASE_OP_32_64_VEC(sub):
- static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
+-        CASE_OP_32_64_VEC(xor):
-@@ -XXX,XX +XXX,XX @@ static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
+-            if (args_are_copies(op->args[1], op->args[2])) {
-     ccc->vr = 11;
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-     cc->do_interrupt = crisv10_cpu_do_interrupt;
+-                continue;
-     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
+-            }
--    cc->tcg_initialize = cris_initialize_crisv10_tcg;
+-            break;
-+    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+-        default:
- }
+-            break;
+-        }
- static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
+-
-@@ -XXX,XX +XXX,XX @@ static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
+         /*
-     ccc->vr = 17;
+          * Process each opcode.
-     cc->do_interrupt = crisv10_cpu_do_interrupt;
+          * Sorted alphabetically by opcode as much as possible.
      cc->gdb_read_register = crisv10_cpu_gdb_read_register;
 -    cc->tcg_initialize = cris_initialize_crisv10_tcg;
 +    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
  }
  static void crisv32_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_stop_before_watchpoint = true;
      cc->disas_set_info = cris_disas_set_info;
 -    cc->tcg_initialize = cris_initialize_tcg;
 +    cc->tcg_ops.initialize = cris_initialize_tcg;
  }
  #define DEFINE_CRIS_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
  #endif
      cc->do_unaligned_access = hppa_cpu_do_unaligned_access;
      cc->disas_set_info = hppa_cpu_disas_set_info;
 -    cc->tcg_initialize = hppa_translate_init;
 +    cc->tcg_ops.initialize = hppa_translate_init;
      cc->gdb_num_core_regs = 128;
  }
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
      cc->synchronize_from_tb = x86_cpu_synchronize_from_tb;
      cc->cpu_exec_enter = x86_cpu_exec_enter;
      cc->cpu_exec_exit = x86_cpu_exec_exit;
 -    cc->tcg_initialize = tcg_x86_init;
 +    cc->tcg_ops.initialize = tcg_x86_init;
      cc->tlb_fill = x86_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->debug_excp_handler = breakpoint_handler;
 diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/lm32/cpu.c
 +++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_stop_before_watchpoint = true;
      cc->debug_excp_handler = lm32_debug_excp_handler;
      cc->disas_set_info = lm32_cpu_disas_set_info;
 -    cc->tcg_initialize = lm32_translate_init;
 +    cc->tcg_ops.initialize = lm32_translate_init;
  }
  #define DEFINE_LM32_CPU_TYPE(cpu_model, initfn) \
 diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.c
 +++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
      dc->vmsd = &vmstate_m68k_cpu;
  #endif
      cc->disas_set_info = m68k_cpu_disas_set_info;
 -    cc->tcg_initialize = m68k_tcg_init;
 +    cc->tcg_ops.initialize = m68k_tcg_init;
      cc->gdb_num_core_regs = 18;
  }
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_num_core_regs = 32 + 27;
      cc->disas_set_info = mb_disas_set_info;
 -    cc->tcg_initialize = mb_tcg_init;
 +    cc->tcg_ops.initialize = mb_tcg_init;
  }
  static const TypeInfo mb_cpu_type_info = {
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
  #endif
      cc->disas_set_info = mips_cpu_disas_set_info;
  #ifdef CONFIG_TCG
 -    cc->tcg_initialize = mips_tcg_init;
 +    cc->tcg_ops.initialize = mips_tcg_init;
      cc->tlb_fill = mips_cpu_tlb_fill;
  #endif
 diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/moxie/cpu.c
 +++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
      cc->vmsd = &vmstate_moxie_cpu;
  #endif
      cc->disas_set_info = moxie_cpu_disas_set_info;
 -    cc->tcg_initialize = moxie_translate_init;
 +    cc->tcg_ops.initialize = moxie_translate_init;
  }
  static void moxielite_initfn(Object *obj)
 diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/cpu.c
 +++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
      cc->gdb_read_register = nios2_cpu_gdb_read_register;
      cc->gdb_write_register = nios2_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 49;
 -    cc->tcg_initialize = nios2_tcg_init;
 +    cc->tcg_ops.initialize = nios2_tcg_init;
  }
  static const TypeInfo nios2_cpu_type_info = {
 diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/cpu.c
 +++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
      dc->vmsd = &vmstate_openrisc_cpu;
  #endif
      cc->gdb_num_core_regs = 32 + 3;
 -    cc->tcg_initialize = openrisc_translate_init;
 +    cc->tcg_ops.initialize = openrisc_translate_init;
      cc->disas_set_info = openrisc_disas_set_info;
  }
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->gdb_arch_name = riscv_gdb_arch_name;
      cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
  #ifdef CONFIG_TCG
 -    cc->tcg_initialize = riscv_translate_init;
 +    cc->tcg_ops.initialize = riscv_translate_init;
      cc->tlb_fill = riscv_cpu_tlb_fill;
  #endif
      device_class_set_props(dc, riscv_cpu_properties);
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
      cc->gdb_write_register = rx_cpu_gdb_write_register;
      cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
      cc->disas_set_info = rx_cpu_disas_set_info;
 -    cc->tcg_initialize = rx_translate_init;
 +    cc->tcg_ops.initialize = rx_translate_init;
      cc->tlb_fill = rx_cpu_tlb_fill;
      cc->gdb_num_core_regs = 26;
 diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
  #endif
      cc->disas_set_info = s390_cpu_disas_set_info;
  #ifdef CONFIG_TCG
 -    cc->tcg_initialize = s390x_translate_init;
 +    cc->tcg_ops.initialize = s390x_translate_init;
      cc->tlb_fill = s390_cpu_tlb_fill;
  #endif
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
      cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
  #endif
      cc->disas_set_info = superh_cpu_disas_set_info;
 -    cc->tcg_initialize = sh4_translate_init;
 +    cc->tcg_ops.initialize = sh4_translate_init;
      cc->gdb_num_core_regs = 59;
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
      cc->vmsd = &vmstate_sparc_cpu;
  #endif
      cc->disas_set_info = cpu_sparc_disas_set_info;
 -    cc->tcg_initialize = sparc_tcg_init;
 +    cc->tcg_ops.initialize = sparc_tcg_init;
  #if defined(TARGET_SPARC64) && !defined(TARGET_ABI32)
      cc->gdb_num_core_regs = 86;
 diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tilegx/cpu.c
 +++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
      cc->set_pc = tilegx_cpu_set_pc;
      cc->tlb_fill = tilegx_cpu_tlb_fill;
      cc->gdb_num_core_regs = 0;
 -    cc->tcg_initialize = tilegx_tcg_init;
 +    cc->tcg_ops.initialize = tilegx_tcg_init;
  }
  static const TypeInfo tilegx_cpu_type_info = {
 diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/cpu.c
 +++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
      cc->set_pc = tricore_cpu_set_pc;
      cc->synchronize_from_tb = tricore_cpu_synchronize_from_tb;
      cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
 -    cc->tcg_initialize = tricore_tcg_init;
 +    cc->tcg_ops.initialize = tricore_tcg_init;
      cc->tlb_fill = tricore_cpu_tlb_fill;
  }
 diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/unicore32/cpu.c
 +++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
      cc->set_pc = uc32_cpu_set_pc;
      cc->tlb_fill = uc32_cpu_tlb_fill;
      cc->get_phys_page_debug = uc32_cpu_get_phys_page_debug;
 -    cc->tcg_initialize = uc32_translate_init;
 +    cc->tcg_ops.initialize = uc32_translate_init;
      dc->vmsd = &vmstate_uc32_cpu;
  }
 diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.c
 +++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
  #endif
      cc->debug_excp_handler = xtensa_breakpoint_handler;
      cc->disas_set_info = xtensa_cpu_disas_set_info;
 -    cc->tcg_initialize = xtensa_translate_init;
 +    cc->tcg_ops.initialize = xtensa_translate_init;
      dc->vmsd = &vmstate_xtensa_cpu;
  }
 diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate_init.c.inc
 +++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
      cc->virtio_is_big_endian = ppc_cpu_is_big_endian;
  #endif
  #ifdef CONFIG_TCG
 -    cc->tcg_initialize = ppc_translate_init;
 +    cc->tcg_ops.initialize = ppc_translate_init;
      cc->tlb_fill = ppc_cpu_tlb_fill;
  #endif
  #ifndef CONFIG_USER_ONLY
 --
 .25.1

-[PULL 32/46] cpu: Move synchronize_from_tb() to tcg_ops
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
-From: Eduardo Habkost <ehabkost@redhat.com>
+Pull the "op r, a, a => mov r, a" optimization into a function,
 and use it in the outer opcode fold functions.
-Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-[claudio: wrapped target code in CONFIG_TCG, reworded comments]
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210204163931.7358-5-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h     | 22 +++++++++++++---------
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
- accel/tcg/cpu-exec.c      |  4 ++--
+file changed, 24 insertions(+), 15 deletions(-)
  target/arm/cpu.c          |  4 +++-
  target/avr/cpu.c          |  2 +-
  target/hppa/cpu.c         |  2 +-
  target/i386/tcg/tcg-cpu.c |  2 +-
  target/microblaze/cpu.c   |  2 +-
  target/mips/cpu.c         |  4 +++-
  target/riscv/cpu.c        |  2 +-
  target/rx/cpu.c           |  2 +-
  target/sh4/cpu.c          |  2 +-
  target/sparc/cpu.c        |  2 +-
  target/tricore/cpu.c      |  2 +-
 files changed, 30 insertions(+), 22 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-      * Called when the first CPU is realized.
+     return false;
       */
      void (*initialize)(void);
 +    /**
 +     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
 +     *
 +     * This is called when we abandon execution of a TB before starting it,
 +     * and must set all parts of the CPU state which the previous TB in the
 +     * chain may not have updated.
 +     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
 +     *
 +     * If more state needs to be restored, the target must implement a
 +     * function to restore all the state, and register it here.
 +     */
 +    void (*synchronize_from_tb)(CPUState *cpu,
 +                                const struct TranslationBlock *tb);
  } TcgCpuOperations;
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
   *       If the target behaviour here is anything other than "set
   *       the PC register to the value passed in" then the target must
   *       also implement the synchronize_from_tb hook.
 - * @synchronize_from_tb: Callback for synchronizing state from a TCG
 - *       #TranslationBlock. This is called when we abandon execution
 - *       of a TB before starting it, and must set all parts of the CPU
 - *       state which the previous TB in the chain may not have updated.
 - *       This always includes at least the program counter; some targets
 - *       will need to do more. If this hook is not implemented then the
 - *       default is to call @set_pc(tb->pc).
   * @tlb_fill: Callback for handling a softmmu tlb miss or user-only
   *       address fault.  For system mode, if the access is valid, call
   *       tlb_set_page and return true; if the access is invalid, and
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      void (*get_memory_mapping)(CPUState *cpu, MemoryMappingList *list,
                                 Error **errp);
      void (*set_pc)(CPUState *cpu, vaddr value);
 -    void (*synchronize_from_tb)(CPUState *cpu,
 -                                const struct TranslationBlock *tb);
      bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
                       MMUAccessType access_type, int mmu_idx,
                       bool probe, uintptr_t retaddr);
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
                                 TARGET_FMT_lx "] %s\n",
                                 last_tb->tc.ptr, last_tb->pc,
                                 lookup_symbol(last_tb->pc));
 -        if (cc->synchronize_from_tb) {
 -            cc->synchronize_from_tb(cpu, last_tb);
 +        if (cc->tcg_ops.synchronize_from_tb) {
 +            cc->tcg_ops.synchronize_from_tb(cpu, last_tb);
          } else {
              assert(cc->set_pc);
              cc->set_pc(cpu, last_tb->pc);
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
      }
  }
-+#ifdef CONFIG_TCG
++/* If the binary operation has both arguments equal, fold to identity. */
- static void arm_cpu_synchronize_from_tb(CPUState *cs,
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
-                                         const TranslationBlock *tb)
++{
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
  /*
   * These outermost fold_<op> functions are sorted alphabetically.
 + *
 + * The ordering of the transformations should be:
 + *   1) those that produce a constant
 + *   2) those that produce a copy
 + *   3) those that produce information about the result value.
   */
  static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_synchronize_from_tb(CPUState *cs,
+-    return fold_const2(ctx, op);
-         env->regs[15] = tb->pc;
++    if (fold_const2(ctx, op) ||
-     }
++        fold_xx_to_x(ctx, op)) {
 +        return true;
 +    }
 +    return false;
  }
-+#endif /* CONFIG_TCG */
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
- static bool arm_cpu_has_work(CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
+-    return fold_const2(ctx, op);
-     cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
++    if (fold_const2(ctx, op) ||
-     cc->dump_state = arm_cpu_dump_state;
++        fold_xx_to_x(ctx, op)) {
-     cc->set_pc = arm_cpu_set_pc;
++        return true;
--    cc->synchronize_from_tb = arm_cpu_synchronize_from_tb;
++    }
-     cc->gdb_read_register = arm_cpu_gdb_read_register;
++    return false;
      cc->gdb_write_register = arm_cpu_gdb_write_register;
  #ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
      cc->disas_set_info = arm_disas_set_info;
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = arm_translate_init;
 +    cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
      cc->tlb_fill = arm_cpu_tlb_fill;
      cc->debug_excp_handler = arm_debug_excp_handler;
      cc->debug_check_watchpoint = arm_debug_check_watchpoint;
 diff --git a/target/avr/cpu.c b/target/avr/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/cpu.c
 +++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
      cc->vmsd = &vms_avr_cpu;
      cc->disas_set_info = avr_cpu_disas_set_info;
      cc->tcg_ops.initialize = avr_cpu_tcg_init;
 -    cc->synchronize_from_tb = avr_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = avr_cpu_synchronize_from_tb;
      cc->gdb_read_register = avr_cpu_gdb_read_register;
      cc->gdb_write_register = avr_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 35;
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
      cc->cpu_exec_interrupt = hppa_cpu_exec_interrupt;
      cc->dump_state = hppa_cpu_dump_state;
      cc->set_pc = hppa_cpu_set_pc;
 -    cc->synchronize_from_tb = hppa_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
      cc->gdb_read_register = hppa_cpu_gdb_read_register;
      cc->gdb_write_register = hppa_cpu_gdb_write_register;
      cc->tlb_fill = hppa_cpu_tlb_fill;
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
  {
      cc->do_interrupt = x86_cpu_do_interrupt;
      cc->cpu_exec_interrupt = x86_cpu_exec_interrupt;
 -    cc->synchronize_from_tb = x86_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
      cc->cpu_exec_enter = x86_cpu_exec_enter;
      cc->cpu_exec_exit = x86_cpu_exec_exit;
      cc->tcg_ops.initialize = tcg_x86_init;
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->cpu_exec_interrupt = mb_cpu_exec_interrupt;
      cc->dump_state = mb_cpu_dump_state;
      cc->set_pc = mb_cpu_set_pc;
 -    cc->synchronize_from_tb = mb_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
      cc->gdb_read_register = mb_cpu_gdb_read_register;
      cc->gdb_write_register = mb_cpu_gdb_write_register;
      cc->tlb_fill = mb_cpu_tlb_fill;
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_set_pc(CPUState *cs, vaddr value)
      }
  }
-+#ifdef CONFIG_TCG
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
- static void mips_cpu_synchronize_from_tb(CPUState *cs,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                                          const TranslationBlock *tb)
+             break;
- {
+         }
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_synchronize_from_tb(CPUState *cs,
-     env->hflags &= ~MIPS_HFLAG_BMASK;
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-     env->hflags |= tb->flags & MIPS_HFLAG_BMASK;
+-        switch (opc) {
- }
+-        CASE_OP_32_64_VEC(or):
-+#endif /* CONFIG_TCG */
+-        CASE_OP_32_64_VEC(and):
+-            if (args_are_copies(op->args[1], op->args[2])) {
- static bool mips_cpu_has_work(CPUState *cs)
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
- {
+-                continue;
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
+-            }
-     cc->cpu_exec_interrupt = mips_cpu_exec_interrupt;
+-            break;
-     cc->dump_state = mips_cpu_dump_state;
+-        default:
-     cc->set_pc = mips_cpu_set_pc;
+-            break;
--    cc->synchronize_from_tb = mips_cpu_synchronize_from_tb;
+-        }
-     cc->gdb_read_register = mips_cpu_gdb_read_register;
+-
-     cc->gdb_write_register = mips_cpu_gdb_write_register;
+         /*
- #ifndef CONFIG_USER_ONLY
+          * Process each opcode.
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
+          * Sorted alphabetically by opcode as much as possible.
      cc->disas_set_info = mips_cpu_disas_set_info;
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = mips_tcg_init;
 +    cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
      cc->tlb_fill = mips_cpu_tlb_fill;
  #endif
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->cpu_exec_interrupt = riscv_cpu_exec_interrupt;
      cc->dump_state = riscv_cpu_dump_state;
      cc->set_pc = riscv_cpu_set_pc;
 -    cc->synchronize_from_tb = riscv_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
      cc->gdb_read_register = riscv_cpu_gdb_read_register;
      cc->gdb_write_register = riscv_cpu_gdb_write_register;
      cc->gdb_num_core_regs = 33;
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
      cc->cpu_exec_interrupt = rx_cpu_exec_interrupt;
      cc->dump_state = rx_cpu_dump_state;
      cc->set_pc = rx_cpu_set_pc;
 -    cc->synchronize_from_tb = rx_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
      cc->gdb_read_register = rx_cpu_gdb_read_register;
      cc->gdb_write_register = rx_cpu_gdb_write_register;
      cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
      cc->cpu_exec_interrupt = superh_cpu_exec_interrupt;
      cc->dump_state = superh_cpu_dump_state;
      cc->set_pc = superh_cpu_set_pc;
 -    cc->synchronize_from_tb = superh_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
      cc->gdb_read_register = superh_cpu_gdb_read_register;
      cc->gdb_write_register = superh_cpu_gdb_write_register;
      cc->tlb_fill = superh_cpu_tlb_fill;
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
      cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
  #endif
      cc->set_pc = sparc_cpu_set_pc;
 -    cc->synchronize_from_tb = sparc_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
      cc->gdb_read_register = sparc_cpu_gdb_read_register;
      cc->gdb_write_register = sparc_cpu_gdb_write_register;
      cc->tlb_fill = sparc_cpu_tlb_fill;
 diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/cpu.c
 +++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
      cc->dump_state = tricore_cpu_dump_state;
      cc->set_pc = tricore_cpu_set_pc;
 -    cc->synchronize_from_tb = tricore_cpu_synchronize_from_tb;
 +    cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
      cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
      cc->tcg_ops.initialize = tricore_tcg_init;
      cc->tlb_fill = tricore_cpu_tlb_fill;
 --
 .25.1

-[PULL 42/46] cpu: move debug_check_watchpoint to tcg_ops
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
-From: Claudio Fontana <cfontana@suse.de>
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
 and use it in the outer opcode fold functions.
-commit 568496c0c0f1 ("cpu: Add callback to check architectural") and
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-commit 3826121d9298 ("target-arm: Implement checking of fired")
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 introduced an ARM-specific hack for cpu_check_watchpoint.
 Make debug_check_watchpoint optional, and move it to tcg_ops.
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210204163931.7358-15-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h | 9 ++++++---
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
- accel/tcg/user-exec.c | 3 ++-
+file changed, 20 insertions(+), 18 deletions(-)
  hw/core/cpu.c         | 9 ---------
  softmmu/physmem.c     | 4 ++--
  target/arm/cpu.c      | 4 ++--
 files changed, 12 insertions(+), 17 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-      */
+     return false;
-     vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
+ }
-+    /**
++/* If the binary operation has second argument @i, fold to @i. */
-+     * @debug_check_watchpoint: return true if the architectural
++static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-+     * watchpoint whose address has matched should really fire, used by ARM
++{
-+     */
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
-+    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
- } TcgCpuOperations;
+ /* If the binary operation has both arguments equal, fold to @i. */
+ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
- /**
+ {
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-  *       a memory access with the specified memory transaction attributes.
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-  * @gdb_read_register: Callback for letting GDB read a register.
+ {
-  * @gdb_write_register: Callback for letting GDB write a register.
+     if (fold_const2(ctx, op) ||
-- * @debug_check_watchpoint: Callback: return true if the architectural
++        fold_xi_to_i(ctx, op, 0) ||
-- *       watchpoint whose address has matched should really fire.
+         fold_xx_to_x(ctx, op)) {
-  * @write_elf64_note: Callback for writing a CPU-specific ELF note to a
+         return true;
-  * 64-bit VM coredump.
+     }
-  * @write_elf32_qemunote: Callback for writing a CPU- and QEMU-specific ELF
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ struct CPUClass {
-     int (*asidx_from_attrs)(CPUState *cpu, MemTxAttrs attrs);
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
-     int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
+ {
-     int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
+-    return fold_const2(ctx, op);
--    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0)) {
-     int (*write_elf64_note)(WriteCoreDumpFunction f, CPUState *cpu,
++        return true;
-                             int cpuid, void *opaque);
++    }
-diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
++    return false;
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/user-exec.c
 +++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
      clear_helper_retaddr();
      cc = CPU_GET_CLASS(cpu);
 -    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
 +    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type,
 +                         MMU_USER_IDX, false, pc);
      g_assert_not_reached();
  }
-diff --git a/hw/core/cpu.c b/hw/core/cpu.c
+ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+ {
---- a/hw/core/cpu.c
+-    return fold_const2(ctx, op);
-+++ b/hw/core/cpu.c
++    if (fold_const2(ctx, op) ||
-@@ -XXX,XX +XXX,XX @@ static int cpu_common_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg)
++        fold_xi_to_i(ctx, op, 0)) {
-     return 0;
++        return true;
 +    }
 +    return false;
  }
--static bool cpu_common_debug_check_watchpoint(CPUState *cpu, CPUWatchpoint *wp)
+ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
--{
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    /* If no extra check is required, QEMU watchpoint match can be considered
+             continue;
--     * as an architectural match.
+         }
--     */
--    return true;
+-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
--}
+-        switch (opc) {
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            if (arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
- static bool cpu_common_virtio_is_big_endian(CPUState *cpu)
+         /*
- {
+          * Process each opcode.
-     return target_words_bigendian();
+          * Sorted alphabetically by opcode as much as possible.
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
      k->gdb_read_register = cpu_common_gdb_read_register;
      k->gdb_write_register = cpu_common_gdb_write_register;
      k->virtio_is_big_endian = cpu_common_virtio_is_big_endian;
 -    k->debug_check_watchpoint = cpu_common_debug_check_watchpoint;
      set_bit(DEVICE_CATEGORY_CPU, dc->categories);
      dc->realize = cpu_common_realizefn;
      dc->unrealize = cpu_common_unrealizefn;
 diff --git a/softmmu/physmem.c b/softmmu/physmem.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/physmem.c
 +++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
              wp->hitaddr = MAX(addr, wp->vaddr);
              wp->hitattrs = attrs;
              if (!cpu->watchpoint_hit) {
 -                if (wp->flags & BP_CPU &&
 -                    !cc->debug_check_watchpoint(cpu, wp)) {
 +                if (wp->flags & BP_CPU && cc->tcg_ops.debug_check_watchpoint &&
 +                    !cc->tcg_ops.debug_check_watchpoint(cpu, wp)) {
                      wp->flags &= ~BP_WATCHPOINT_HIT;
                      continue;
                  }
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
      cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
      cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
      cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
 -    cc->debug_check_watchpoint = arm_debug_check_watchpoint;
  #if !defined(CONFIG_USER_ONLY)
 +    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
      cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
      cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
      cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
 -    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 +    cc->tcg_ops.debug_check_watchpoint = arm_debug_check_watchpoint;
  #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
  #endif /* CONFIG_TCG */
  }
 --
 .25.1

-[PULL 07/46] tcg/tci: Implement INDEX_op_ld8s_i64
+[PULL 38/56] tcg/optimize: Add type to OptContext
-From: Stefan Weil <sw@weilnetz.de>
+Compute the type of the operation early.
-That TCG opcode is used by debian-buster (arm64) running ffmpeg:
+There are at least 4 places that used a def->flags ladder
+to determine the type of the operation being optimized.
-    qemu-aarch64 /usr/bin/ffmpeg -i theora.mkv theora.webm
+There were two places that assumed !TCG_OPF_64BIT means
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+TCG_TYPE_I32, and so could potentially compute incorrect
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+results for vector operations.
-Reported-by: Alex Bennée <alex.bennee@linaro.org>
-Signed-off-by: Stefan Weil <sw@weilnetz.de>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210128020425.2055454-1-sw@weilnetz.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 5 ++++-
+ tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
-file changed, 4 insertions(+), 1 deletion(-)
+file changed, 89 insertions(+), 60 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-             tci_write_reg8(regs, t0, *(uint8_t *)(t1 + t2));
-             break;
+     /* In flight values from optimization. */
-         case INDEX_op_ld8s_i64:
+     uint64_t z_mask;
--            TODO();
++    TCGType type;
-+            t0 = *tb_ptr++;
+ } OptContext;
-+            t1 = tci_read_r(regs, &tb_ptr);
-+            t2 = tci_read_s32(&tb_ptr);
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-+            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-             break;
+ {
-         case INDEX_op_ld16u_i64:
+     TCGTemp *dst_ts = arg_temp(dst);
-             t0 = *tb_ptr++;
+     TCGTemp *src_ts = arg_temp(src);
 -    const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
      uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      reset_ts(dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
 -    def = &tcg_op_defs[op->opc];
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        new_op = INDEX_op_mov_vec;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        new_op = INDEX_op_mov_i64;
 -    } else {
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
          new_op = INDEX_op_mov_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        new_op = INDEX_op_mov_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      op->opc = new_op;
 -    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
      op->args[0] = dst;
      op->args[1] = src;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      }
  }
 -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 +static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
 +                                    uint64_t x, uint64_t y)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op];
      uint64_t res = do_constant_folding_2(op, x, y);
 -    if (!(def->flags & TCG_OPF_64BIT)) {
 +    if (type == TCG_TYPE_I32) {
          res = (int32_t)res;
      }
      return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
   * Return -1 if the condition can't be simplified,
   * and the result of the condition (0 or 1) if it can.
   */
 -static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
      if (arg_is_const(x) && arg_is_const(y)) {
 -        const TCGOpDef *def = &tcg_op_defs[op];
 -        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
 -        if (def->flags & TCG_OPF_64BIT) {
 -            return do_constant_folding_cond_64(xv, yv, c);
 -        } else {
 +        switch (type) {
 +        case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
 +        case TCG_TYPE_I64:
 +            return do_constant_folding_cond_64(xv, yv, c);
 +        default:
 +            /* Only scalar comparisons are optimizable */
 +            return -1;
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, 0);
 +        t = do_constant_folding(op->opc, ctx->type, t, 0);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
 -        t1 = do_constant_folding(op->opc, t1, t2);
 +        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
 +        case TCG_TYPE_I64:
 +            opc = INDEX_op_setcond_i64;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          if (tv == 1 && fv == 0) {
              op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
          /* For commutative operations make constant second argument */
          switch (opc) {
          CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Proceed with possible constant folding. */
                      break;
                  }
 -                if (opc == INDEX_op_sub_i32) {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      neg_op = INDEX_op_neg_i32;
                      have_neg = TCG_TARGET_HAS_neg_i32;
 -                } else if (opc == INDEX_op_sub_i64) {
 +                    break;
 +                case TCG_TYPE_I64:
                      neg_op = INDEX_op_neg_i64;
                      have_neg = TCG_TARGET_HAS_neg_i64;
 -                } else if (TCG_TARGET_HAS_neg_vec) {
 -                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -                    unsigned vece = TCGOP_VECE(op);
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
 -                } else {
                      break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    neg_op = INDEX_op_neg_vec;
 +                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 +                                                   TCGOP_VECE(op)) > 0;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_neg) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGOpcode not_op;
                  bool have_not;
 -                if (def->flags & TCG_OPF_VECTOR) {
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 .25.1

-[PULL 06/46] tcg/tci: Implement INDEX_op_ld16s_i32
+[PULL 39/56] tcg/optimize: Split out fold_to_not
-From: Stefan Weil <sw@weilnetz.de>
+Split out the conditional conversion from a more complex logical
+operation to a simple NOT.  Create a couple more helpers to make
-That TCG opcode is used by debian-buster (arm64) running ffmpeg:
+this easy for the outer-most logical operations.
-    qemu-aarch64 /usr/bin/ffmpeg -i theora.mkv theora.webm
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reported-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Stefan Weil <sw@weilnetz.de>
 Message-Id: <20210128024814.2056958-1-sw@weilnetz.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 5 ++++-
+ tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
-file changed, 4 insertions(+), 1 deletion(-)
+file changed, 86 insertions(+), 72 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-             TODO();
+     return false;
  }
 +/*
 + * Convert @op to NOT, if NOT is supported by the host.
 + * Return true if the conversion is successful, which will still
 + * indicate that the processing is complete.
 + */
 +static bool fold_not(OptContext *ctx, TCGOp *op);
 +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
 +{
 +    TCGOpcode not_op;
 +    bool have_not;
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        not_op = INDEX_op_not_i32;
 +        have_not = TCG_TARGET_HAS_not_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        not_op = INDEX_op_not_i64;
 +        have_not = TCG_TARGET_HAS_not_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        not_op = INDEX_op_not_vec;
 +        have_not = TCG_TARGET_HAS_not_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              }
              break;
-         case INDEX_op_ld16s_i32:
+-        CASE_OP_32_64_VEC(xor):
--            TODO();
+-        CASE_OP_32_64(nand):
-+            t0 = *tb_ptr++;
+-            if (!arg_is_const(op->args[1])
-+            t1 = tci_read_r(regs, &tb_ptr);
+-                && arg_is_const(op->args[2])
-+            t2 = tci_read_s32(&tb_ptr);
+-                && arg_info(op->args[2])->val == -1) {
-+            tci_write_reg(regs, t0, *(int16_t *)(t1 + t2));
+-                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64(nor):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(andc):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == -1) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        try_not:
 -            {
 -                TCGOpcode not_op;
 -                bool have_not;
 -
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    not_op = INDEX_op_not_i32;
 -                    have_not = TCG_TARGET_HAS_not_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
-         case INDEX_op_ld_i32:
+         }
              t0 = *tb_ptr++;
 --
 .25.1

-[PULL 34/46] cpu: Move tlb_fill to tcg_ops
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
-From: Eduardo Habkost <ehabkost@redhat.com>
+Even though there is only one user, place this more complex
 conversion into its own helper.
-[claudio: wrapped target code in CONFIG_TCG]
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20210204163931.7358-7-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           | 21 ++++++++++++---------
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
- accel/tcg/cputlb.c              |  7 ++++---
+file changed, 47 insertions(+), 42 deletions(-)
  accel/tcg/user-exec.c           |  6 +++---
  target/alpha/cpu.c              |  2 +-
  target/arm/cpu.c                |  2 +-
  target/avr/cpu.c                |  2 +-
  target/cris/cpu.c               |  2 +-
  target/hppa/cpu.c               |  2 +-
  target/i386/tcg/tcg-cpu.c       |  2 +-
  target/lm32/cpu.c               |  2 +-
  target/m68k/cpu.c               |  2 +-
  target/microblaze/cpu.c         |  2 +-
  target/mips/cpu.c               |  2 +-
  target/moxie/cpu.c              |  2 +-
  target/nios2/cpu.c              |  2 +-
  target/openrisc/cpu.c           |  2 +-
  target/riscv/cpu.c              |  2 +-
  target/rx/cpu.c                 |  2 +-
  target/s390x/cpu.c              |  2 +-
  target/sh4/cpu.c                |  2 +-
  target/sparc/cpu.c              |  2 +-
  target/tilegx/cpu.c             |  2 +-
  target/tricore/cpu.c            |  2 +-
  target/unicore32/cpu.c          |  2 +-
  target/xtensa/cpu.c             |  2 +-
  target/ppc/translate_init.c.inc |  2 +-
 files changed, 42 insertions(+), 38 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
-     void (*cpu_exec_exit)(CPUState *cpu);
-     /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
-     bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
+ {
-+    /**
+-    return fold_const1(ctx, op);
-+     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
++    if (fold_const1(ctx, op)) {
-+     *
++        return true;
-+     * For system mode, if the access is valid, call tlb_set_page
++    }
-+     * and return true; if the access is invalid, and probe is
++    /*
-+     * true, return false; otherwise raise an exception and do
++     * Because of fold_sub_to_neg, we want to always return true,
-+     * not return.  For user-only mode, always raise an exception
++     * via finish_folding.
 +     * and do not return.
 +     */
-+    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
++    finish_folding(ctx, op);
-+                     MMUAccessType access_type, int mmu_idx,
++    return true;
 +                     bool probe, uintptr_t retaddr);
  } TcgCpuOperations;
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
   *       If the target behaviour here is anything other than "set
   *       the PC register to the value passed in" then the target must
   *       also implement the synchronize_from_tb hook.
 - * @tlb_fill: Callback for handling a softmmu tlb miss or user-only
 - *       address fault.  For system mode, if the access is valid, call
 - *       tlb_set_page and return true; if the access is invalid, and
 - *       probe is true, return false; otherwise raise an exception and
 - *       do not return.  For user-only mode, always raise an exception
 - *       and do not return.
   * @get_phys_page_debug: Callback for obtaining a physical address.
   * @get_phys_page_attrs_debug: Callback for obtaining a physical address and the
   *       associated memory transaction attributes to use for the access.
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      void (*get_memory_mapping)(CPUState *cpu, MemoryMappingList *list,
                                 Error **errp);
      void (*set_pc)(CPUState *cpu, vaddr value);
 -    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
 -                     MMUAccessType access_type, int mmu_idx,
 -                     bool probe, uintptr_t retaddr);
      hwaddr (*get_phys_page_debug)(CPUState *cpu, vaddr addr);
      hwaddr (*get_phys_page_attrs_debug)(CPUState *cpu, vaddr addr,
                                          MemTxAttrs *attrs);
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
       * This is not a probe, so only valid return is success; failure
       * should result in exception + longjmp to the cpu loop.
       */
 -    ok = cc->tlb_fill(cpu, addr, size, access_type, mmu_idx, false, retaddr);
 +    ok = cc->tcg_ops.tlb_fill(cpu, addr, size,
 +                              access_type, mmu_idx, false, retaddr);
      assert(ok);
  }
-@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
-             CPUState *cs = env_cpu(env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-             CPUClass *cc = CPU_GET_CLASS(cs);
+     return fold_const2(ctx, op);
 -            if (!cc->tlb_fill(cs, addr, fault_size, access_type,
 -                              mmu_idx, nonfault, retaddr)) {
 +            if (!cc->tcg_ops.tlb_fill(cs, addr, fault_size, access_type,
 +                                      mmu_idx, nonfault, retaddr)) {
                  /* Non-faulting page table read failed.  */
                  *phost = NULL;
                  return TLB_INVALID_MASK;
 diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/user-exec.c
 +++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
      clear_helper_retaddr();
      cc = CPU_GET_CLASS(cpu);
 -    cc->tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
 +    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
      g_assert_not_reached();
  }
-@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
++static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
-         } else {
++{
-             CPUState *cpu = env_cpu(env);
++    TCGOpcode neg_op;
-             CPUClass *cc = CPU_GET_CLASS(cpu);
++    bool have_neg;
--            cc->tlb_fill(cpu, addr, fault_size, access_type,
++
--                         MMU_USER_IDX, false, ra);
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
-+            cc->tcg_ops.tlb_fill(cpu, addr, fault_size, access_type,
++        return false;
-+                                 MMU_USER_IDX, false, ra);
++    }
-             g_assert_not_reached();
++
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        neg_op = INDEX_op_neg_i32;
 +        have_neg = TCG_TARGET_HAS_neg_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        neg_op = INDEX_op_neg_i64;
 +        have_neg = TCG_TARGET_HAS_neg_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        neg_op = INDEX_op_neg_vec;
 +        have_neg = (TCG_TARGET_HAS_neg_vec &&
 +                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_neg) {
 +        op->opc = neg_op;
 +        op->args[1] = op->args[2];
 +        return fold_neg(ctx, op);
 +    }
 +    return false;
 +}
 +
  static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_sub_to_neg(ctx, op)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  continue;
              }
              break;
 -        CASE_OP_32_64_VEC(sub):
 -            {
 -                TCGOpcode neg_op;
 -                bool have_neg;
 -
 -                if (arg_is_const(op->args[2])) {
 -                    /* Proceed with possible constant folding. */
 -                    break;
 -                }
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    neg_op = INDEX_op_neg_i32;
 -                    have_neg = TCG_TARGET_HAS_neg_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    neg_op = INDEX_op_neg_i64;
 -                    have_neg = TCG_TARGET_HAS_neg_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 -                                                   TCGOP_VECE(op)) > 0;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_neg) {
 -                    break;
 -                }
 -                if (arg_is_const(op->args[1])
 -                    && arg_info(op->args[1])->val == 0) {
 -                    op->opc = neg_op;
 -                    reset_temp(op->args[0]);
 -                    op->args[1] = op->args[2];
 -                    continue;
 -                }
 -            }
 -            break;
          default:
              break;
          }
-     }
-diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/alpha/cpu.c
-+++ b/target/alpha/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
-     cc->set_pc = alpha_cpu_set_pc;
-     cc->gdb_read_register = alpha_cpu_gdb_read_register;
-     cc->gdb_write_register = alpha_cpu_gdb_write_register;
--    cc->tlb_fill = alpha_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_transaction_failed = alpha_cpu_do_transaction_failed;
-     cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
-diff --git a/target/arm/cpu.c b/target/arm/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/cpu.c
-+++ b/target/arm/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.initialize = arm_translate_init;
-     cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
-     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
--    cc->tlb_fill = arm_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
-     cc->debug_excp_handler = arm_debug_excp_handler;
-     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
-     cc->do_unaligned_access = arm_cpu_do_unaligned_access;
-diff --git a/target/avr/cpu.c b/target/avr/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/avr/cpu.c
-+++ b/target/avr/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
-     cc->set_pc = avr_cpu_set_pc;
-     cc->memory_rw_debug = avr_cpu_memory_rw_debug;
-     cc->get_phys_page_debug = avr_cpu_get_phys_page_debug;
--    cc->tlb_fill = avr_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = avr_cpu_tlb_fill;
-     cc->vmsd = &vms_avr_cpu;
-     cc->disas_set_info = avr_cpu_disas_set_info;
-     cc->tcg_ops.initialize = avr_cpu_tcg_init;
-diff --git a/target/cris/cpu.c b/target/cris/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/cris/cpu.c
-+++ b/target/cris/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
-     cc->set_pc = cris_cpu_set_pc;
-     cc->gdb_read_register = cris_cpu_gdb_read_register;
-     cc->gdb_write_register = cris_cpu_gdb_write_register;
--    cc->tlb_fill = cris_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = cris_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->get_phys_page_debug = cris_cpu_get_phys_page_debug;
-     dc->vmsd = &vmstate_cris_cpu;
-diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/hppa/cpu.c
-+++ b/target/hppa/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
-     cc->gdb_read_register = hppa_cpu_gdb_read_register;
-     cc->gdb_write_register = hppa_cpu_gdb_write_register;
--    cc->tlb_fill = hppa_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
-     dc->vmsd = &vmstate_hppa_cpu;
-diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/i386/tcg/tcg-cpu.c
-+++ b/target/i386/tcg/tcg-cpu.c
-@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
-     cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
-     cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
-     cc->tcg_ops.initialize = tcg_x86_init;
--    cc->tlb_fill = x86_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->debug_excp_handler = breakpoint_handler;
- #endif
-diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/lm32/cpu.c
-+++ b/target/lm32/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
-     cc->set_pc = lm32_cpu_set_pc;
-     cc->gdb_read_register = lm32_cpu_gdb_read_register;
-     cc->gdb_write_register = lm32_cpu_gdb_write_register;
--    cc->tlb_fill = lm32_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = lm32_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->get_phys_page_debug = lm32_cpu_get_phys_page_debug;
-     cc->vmsd = &vmstate_lm32_cpu;
-diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/m68k/cpu.c
-+++ b/target/m68k/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
-     cc->set_pc = m68k_cpu_set_pc;
-     cc->gdb_read_register = m68k_cpu_gdb_read_register;
-     cc->gdb_write_register = m68k_cpu_gdb_write_register;
--    cc->tlb_fill = m68k_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
- #if defined(CONFIG_SOFTMMU)
-     cc->do_transaction_failed = m68k_cpu_transaction_failed;
-     cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
-diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/microblaze/cpu.c
-+++ b/target/microblaze/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
-     cc->gdb_read_register = mb_cpu_gdb_read_register;
-     cc->gdb_write_register = mb_cpu_gdb_write_register;
--    cc->tlb_fill = mb_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_transaction_failed = mb_cpu_transaction_failed;
-     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
-diff --git a/target/mips/cpu.c b/target/mips/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/cpu.c
-+++ b/target/mips/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
-     cc->tcg_ops.initialize = mips_tcg_init;
-     cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
-     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
--    cc->tlb_fill = mips_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
- #endif
-     cc->gdb_num_core_regs = 73;
-diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/moxie/cpu.c
-+++ b/target/moxie/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
-     cc->do_interrupt = moxie_cpu_do_interrupt;
-     cc->dump_state = moxie_cpu_dump_state;
-     cc->set_pc = moxie_cpu_set_pc;
--    cc->tlb_fill = moxie_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->get_phys_page_debug = moxie_cpu_get_phys_page_debug;
-     cc->vmsd = &vmstate_moxie_cpu;
-diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/nios2/cpu.c
-+++ b/target/nios2/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
-     cc->dump_state = nios2_cpu_dump_state;
-     cc->set_pc = nios2_cpu_set_pc;
-     cc->disas_set_info = nios2_cpu_disas_set_info;
--    cc->tlb_fill = nios2_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_unaligned_access = nios2_cpu_do_unaligned_access;
-     cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
-diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/openrisc/cpu.c
-+++ b/target/openrisc/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
-     cc->set_pc = openrisc_cpu_set_pc;
-     cc->gdb_read_register = openrisc_cpu_gdb_read_register;
-     cc->gdb_write_register = openrisc_cpu_gdb_write_register;
--    cc->tlb_fill = openrisc_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = openrisc_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->get_phys_page_debug = openrisc_cpu_get_phys_page_debug;
-     dc->vmsd = &vmstate_openrisc_cpu;
-diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/cpu.c
-+++ b/target/riscv/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
-     cc->gdb_arch_name = riscv_gdb_arch_name;
-     cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
-     cc->tcg_ops.initialize = riscv_translate_init;
--    cc->tlb_fill = riscv_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = riscv_cpu_tlb_fill;
-     device_class_set_props(dc, riscv_cpu_properties);
- }
-diff --git a/target/rx/cpu.c b/target/rx/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/rx/cpu.c
-+++ b/target/rx/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
-     cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
-     cc->disas_set_info = rx_cpu_disas_set_info;
-     cc->tcg_ops.initialize = rx_translate_init;
--    cc->tlb_fill = rx_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = rx_cpu_tlb_fill;
-     cc->gdb_num_core_regs = 26;
-     cc->gdb_core_xml_file = "rx-core.xml";
-diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/cpu.c
-+++ b/target/s390x/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
-     cc->disas_set_info = s390_cpu_disas_set_info;
- #ifdef CONFIG_TCG
-     cc->tcg_ops.initialize = s390x_translate_init;
--    cc->tlb_fill = s390_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = s390_cpu_tlb_fill;
- #endif
-     cc->gdb_num_core_regs = S390_NUM_CORE_REGS;
-diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/sh4/cpu.c
-+++ b/target/sh4/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
-     cc->gdb_read_register = superh_cpu_gdb_read_register;
-     cc->gdb_write_register = superh_cpu_gdb_write_register;
--    cc->tlb_fill = superh_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_unaligned_access = superh_cpu_do_unaligned_access;
-     cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
-diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/sparc/cpu.c
-+++ b/target/sparc/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
-     cc->gdb_read_register = sparc_cpu_gdb_read_register;
-     cc->gdb_write_register = sparc_cpu_gdb_write_register;
--    cc->tlb_fill = sparc_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_transaction_failed = sparc_cpu_do_transaction_failed;
-     cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
-diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/tilegx/cpu.c
-+++ b/target/tilegx/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
-     cc->dump_state = tilegx_cpu_dump_state;
-     cc->set_pc = tilegx_cpu_set_pc;
--    cc->tlb_fill = tilegx_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = tilegx_cpu_tlb_fill;
-     cc->gdb_num_core_regs = 0;
-     cc->tcg_ops.initialize = tilegx_tcg_init;
- }
-diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/tricore/cpu.c
-+++ b/target/tricore/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
-     cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
-     cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
-     cc->tcg_ops.initialize = tricore_tcg_init;
--    cc->tlb_fill = tricore_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = tricore_cpu_tlb_fill;
- }
- #define DEFINE_TRICORE_CPU_TYPE(cpu_model, initfn) \
-diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/unicore32/cpu.c
-+++ b/target/unicore32/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
-     cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
-     cc->dump_state = uc32_cpu_dump_state;
-     cc->set_pc = uc32_cpu_set_pc;
--    cc->tlb_fill = uc32_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = uc32_cpu_tlb_fill;
-     cc->get_phys_page_debug = uc32_cpu_get_phys_page_debug;
-     cc->tcg_ops.initialize = uc32_translate_init;
-     dc->vmsd = &vmstate_uc32_cpu;
-diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/xtensa/cpu.c
-+++ b/target/xtensa/cpu.c
-@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
-     cc->gdb_read_register = xtensa_cpu_gdb_read_register;
-     cc->gdb_write_register = xtensa_cpu_gdb_write_register;
-     cc->gdb_stop_before_watchpoint = true;
--    cc->tlb_fill = xtensa_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
-     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
-diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
-index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/translate_init.c.inc
-+++ b/target/ppc/translate_init.c.inc
-@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
- #ifdef CONFIG_TCG
-     cc->tcg_ops.initialize = ppc_translate_init;
-     cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
--    cc->tlb_fill = ppc_cpu_tlb_fill;
-+    cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
- #ifndef CONFIG_USER_ONLY
-     cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
-     cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
 --
 .25.1

-[PULL 39/46] cpu: move do_unaligned_access to tcg_ops
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
-From: Claudio Fontana <cfontana@suse.de>
+Pull the "op r, a, i => mov r, a" optimization into a function,
 and use them in the outer-most logical operations.
-make it consistently SOFTMMU-only.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 [claudio: make the field presence in cpu.h unconditional, removing the ifdefs]
 Message-Id: <20210204163931.7358-12-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           | 13 +++++++------
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
- target/alpha/cpu.c              |  2 +-
+file changed, 26 insertions(+), 35 deletions(-)
  target/arm/cpu.c                |  2 +-
  target/hppa/cpu.c               |  4 +++-
  target/microblaze/cpu.c         |  2 +-
  target/mips/cpu.c               |  3 ++-
  target/nios2/cpu.c              |  2 +-
  target/riscv/cpu.c              |  2 +-
  target/s390x/cpu.c              |  2 +-
  target/s390x/excp_helper.c      |  2 +-
  target/sh4/cpu.c                |  2 +-
  target/sparc/cpu.c              |  2 +-
  target/xtensa/cpu.c             |  2 +-
  target/ppc/translate_init.c.inc |  2 +-
 files changed, 23 insertions(+), 19 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-                                   unsigned size, MMUAccessType access_type,
+     return false;
-                                   int mmu_idx, MemTxAttrs attrs,
+ }
-                                   MemTxResult response, uintptr_t retaddr);
-+    /**
++/* If the binary operation has second argument @i, fold to identity. */
-+     * @do_unaligned_access: Callback for unaligned access handling
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
-+     */
++{
-+    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
-+                                MMUAccessType access_type,
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-+                                int mmu_idx, uintptr_t retaddr);
++    }
- } TcgCpuOperations;
++    return false;
++}
- /**
++
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+ /* If the binary operation has second argument @i, fold to NOT. */
-  * @parse_features: Callback to parse command line arguments.
+ static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
   * @reset_dump_flags: #CPUDumpFlags to use for reset logging.
   * @has_work: Callback for checking if there is work to do.
 - * @do_unaligned_access: Callback for unaligned access handling, if
 - * the target defines #TARGET_ALIGNED_ONLY.
   * @virtio_is_big_endian: Callback to return %true if a CPU which supports
   * runtime configurable endianness is currently big-endian. Non-configurable
   * CPUs can use the default implementation of this method. This method should
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      int reset_dump_flags;
      bool (*has_work)(CPUState *cpu);
 -    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
 -                                MMUAccessType access_type,
 -                                int mmu_idx, uintptr_t retaddr);
      bool (*virtio_is_big_endian)(CPUState *cpu);
      int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
                             uint8_t *buf, int len, bool is_write);
@@ -XXX,XX +XXX,XX @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
  {
-     CPUClass *cc = CPU_GET_CLASS(cpu);
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
--    cc->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+    cc->tcg_ops.do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
+ {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
- static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
+ static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
-diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+ {
---- a/target/alpha/cpu.c
+     if (fold_const2(ctx, op) ||
-+++ b/target/alpha/cpu.c
+         fold_xi_to_i(ctx, op, 0) ||
-@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
++        fold_xi_to_x(ctx, op, -1) ||
-     cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
+         fold_xx_to_x(ctx, op)) {
- #ifndef CONFIG_USER_ONLY
+         return true;
-     cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
+     }
--    cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+    cc->tcg_ops.do_unaligned_access = alpha_cpu_do_unaligned_access;
+ {
-     cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
+     if (fold_const2(ctx, op) ||
-     dc->vmsd = &vmstate_alpha_cpu;
+         fold_xx_to_i(ctx, op, 0) ||
- #endif
++        fold_xi_to_x(ctx, op, 0) ||
-diff --git a/target/arm/cpu.c b/target/arm/cpu.c
+         fold_ix_to_not(ctx, op, -1)) {
-index XXXXXXX..XXXXXXX 100644
+         return true;
---- a/target/arm/cpu.c
+     }
-+++ b/target/arm/cpu.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-     cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
+ {
-     cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
+     if (fold_const2(ctx, op) ||
-     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
++        fold_xi_to_x(ctx, op, -1) ||
--    cc->do_unaligned_access = arm_cpu_do_unaligned_access;
+         fold_xi_to_not(ctx, op, 0)) {
- #if !defined(CONFIG_USER_ONLY)
+         return true;
-     cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
+     }
-+    cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
-     cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
+ static bool fold_or(OptContext *ctx, TCGOp *op)
-     cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
+ {
- #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
+     if (fold_const2(ctx, op) ||
-diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
++        fold_xi_to_x(ctx, op, 0) ||
-index XXXXXXX..XXXXXXX 100644
+         fold_xx_to_x(ctx, op)) {
---- a/target/hppa/cpu.c
+         return true;
-+++ b/target/hppa/cpu.c
+     }
-@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_disas_set_info(CPUState *cs, disassemble_info *info)
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
-     info->print_insn = print_insn_hppa;
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
-+#ifndef CONFIG_USER_ONLY
+ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
- static void hppa_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
                                           MMUAccessType access_type,
                                           int mmu_idx, uintptr_t retaddr)
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
      cpu_loop_exit_restore(cs, retaddr);
  }
 +#endif /* CONFIG_USER_ONLY */
  static void hppa_cpu_realizefn(DeviceState *dev, Error **errp)
  {
-@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
+     if (fold_const2(ctx, op) ||
-     cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
+         fold_xx_to_i(ctx, op, 0) ||
- #ifndef CONFIG_USER_ONLY
++        fold_xi_to_x(ctx, op, 0) ||
-     cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
+         fold_sub_to_neg(ctx, op)) {
-+    cc->tcg_ops.do_unaligned_access = hppa_cpu_do_unaligned_access;
+         return true;
      dc->vmsd = &vmstate_hppa_cpu;
  #endif
 -    cc->do_unaligned_access = hppa_cpu_do_unaligned_access;
      cc->disas_set_info = hppa_cpu_disas_set_info;
      cc->tcg_ops.initialize = hppa_translate_init;
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = mb_cpu_class_by_name;
      cc->has_work = mb_cpu_has_work;
      cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
 -    cc->do_unaligned_access = mb_cpu_do_unaligned_access;
      cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
      cc->dump_state = mb_cpu_dump_state;
      cc->set_pc = mb_cpu_set_pc;
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
 +    cc->tcg_ops.do_unaligned_access = mb_cpu_do_unaligned_access;
      cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
      dc->vmsd = &vmstate_mb_cpu;
  #endif
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->gdb_read_register = mips_cpu_gdb_read_register;
      cc->gdb_write_register = mips_cpu_gdb_write_register;
  #ifndef CONFIG_USER_ONLY
 -    cc->do_unaligned_access = mips_cpu_do_unaligned_access;
      cc->get_phys_page_debug = mips_cpu_get_phys_page_debug;
      cc->vmsd = &vmstate_mips_cpu;
  #endif
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
      cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
 +    cc->tcg_ops.do_unaligned_access = mips_cpu_do_unaligned_access;
 +
  #endif /* CONFIG_USER_ONLY */
  #endif /* CONFIG_TCG */
 diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/cpu.c
 +++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
      cc->disas_set_info = nios2_cpu_disas_set_info;
      cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 -    cc->do_unaligned_access = nios2_cpu_do_unaligned_access;
 +    cc->tcg_ops.do_unaligned_access = nios2_cpu_do_unaligned_access;
      cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
  #endif
      cc->gdb_read_register = nios2_cpu_gdb_read_register;
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->disas_set_info = riscv_cpu_disas_set_info;
  #ifndef CONFIG_USER_ONLY
      cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
 -    cc->do_unaligned_access = riscv_cpu_do_unaligned_access;
 +    cc->tcg_ops.do_unaligned_access = riscv_cpu_do_unaligned_access;
      cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
      /* For now, mark unmigratable: */
      cc->vmsd = &vmstate_riscv_cpu;
 diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
  #ifdef CONFIG_TCG
      cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
      cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
 -    cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
 +    cc->tcg_ops.do_unaligned_access = s390x_cpu_do_unaligned_access;
  #endif
  #endif
      cc->disas_set_info = s390_cpu_disas_set_info;
 diff --git a/target/s390x/excp_helper.c b/target/s390x/excp_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/excp_helper.c
 +++ b/target/s390x/excp_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(monitor_call)(CPUS390XState *env, uint64_t monitor_code,
      }
- }
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
--#endif /* CONFIG_USER_ONLY */
+     if (fold_const2(ctx, op) ||
-+#endif /* !CONFIG_USER_ONLY */
+         fold_xx_to_i(ctx, op, 0) ||
-diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
++        fold_xi_to_x(ctx, op, 0) ||
-index XXXXXXX..XXXXXXX 100644
+         fold_xi_to_not(ctx, op, -1)) {
---- a/target/sh4/cpu.c
+         return true;
-+++ b/target/sh4/cpu.c
+     }
-@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     cc->gdb_write_register = superh_cpu_gdb_write_register;
+             break;
-     cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
+         }
- #ifndef CONFIG_USER_ONLY
--    cc->do_unaligned_access = superh_cpu_do_unaligned_access;
+-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-+    cc->tcg_ops.do_unaligned_access = superh_cpu_do_unaligned_access;
+-        switch (opc) {
-     cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
+-        CASE_OP_32_64_VEC(add):
- #endif
+-        CASE_OP_32_64_VEC(sub):
-     cc->disas_set_info = superh_cpu_disas_set_info;
+-        CASE_OP_32_64_VEC(or):
-diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
+-        CASE_OP_32_64_VEC(xor):
-index XXXXXXX..XXXXXXX 100644
+-        CASE_OP_32_64_VEC(andc):
---- a/target/sparc/cpu.c
+-        CASE_OP_32_64(shl):
-+++ b/target/sparc/cpu.c
+-        CASE_OP_32_64(shr):
-@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
+-        CASE_OP_32_64(sar):
-     cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
+-        CASE_OP_32_64(rotl):
- #ifndef CONFIG_USER_ONLY
+-        CASE_OP_32_64(rotr):
-     cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
+-            if (!arg_is_const(op->args[1])
--    cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
+-                && arg_is_const(op->args[2])
-+    cc->tcg_ops.do_unaligned_access = sparc_cpu_do_unaligned_access;
+-                && arg_info(op->args[2])->val == 0) {
-     cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-     cc->vmsd = &vmstate_sparc_cpu;
+-                continue;
- #endif
+-            }
-diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
+-            break;
-index XXXXXXX..XXXXXXX 100644
+-        CASE_OP_32_64_VEC(and):
---- a/target/xtensa/cpu.c
+-        CASE_OP_32_64_VEC(orc):
-+++ b/target/xtensa/cpu.c
+-        CASE_OP_32_64(eqv):
-@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
+-            if (!arg_is_const(op->args[1])
-     cc->gdb_stop_before_watchpoint = true;
+-                && arg_is_const(op->args[2])
-     cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
+-                && arg_info(op->args[2])->val == -1) {
- #ifndef CONFIG_USER_ONLY
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
--    cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
+-                continue;
-+    cc->tcg_ops.do_unaligned_access = xtensa_cpu_do_unaligned_access;
+-            }
-     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
+-            break;
-     cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
+-        default:
- #endif
+-            break;
-diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
+-        }
-index XXXXXXX..XXXXXXX 100644
+-
---- a/target/ppc/translate_init.c.inc
+         /* Simplify using known-zero bits. Currently only ops with a single
-+++ b/target/ppc/translate_init.c.inc
+            output argument is supported. */
-@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
+         z_mask = -1;
      cc->set_pc = ppc_cpu_set_pc;
      cc->gdb_read_register = ppc_cpu_gdb_read_register;
      cc->gdb_write_register = ppc_cpu_gdb_write_register;
 -    cc->do_unaligned_access = ppc_cpu_do_unaligned_access;
  #ifndef CONFIG_USER_ONLY
      cc->get_phys_page_debug = ppc_cpu_get_phys_page_debug;
      cc->vmsd = &vmstate_ppc_cpu;
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
  #ifndef CONFIG_USER_ONLY
      cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
      cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
 +    cc->tcg_ops.do_unaligned_access = ppc_cpu_do_unaligned_access;
  #endif /* !CONFIG_USER_ONLY */
  #endif /* CONFIG_TCG */
 --
 .25.1

-[PULL 10/46] tcg/tci: Inline tci_write_reg16 into the only caller
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+and use it in fold_shift.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 10 +---------
+ tcg/optimize.c | 28 ++++++++++------------------
-file changed, 1 insertion(+), 9 deletions(-)
+file changed, 10 insertions(+), 18 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
-     regs[index] = value;
+     return false;
  }
--#if TCG_TARGET_REG_BITS == 64
++/* If the binary operation has first argument @i, fold to @i. */
--static void
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
--tci_write_reg16(tcg_target_ulong *regs, TCGReg index, uint16_t value)
++{
--{
++    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
--    tci_write_reg(regs, index, value);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
--}
++    }
--#endif
++    return false;
 +}
 +
  /* If the binary operation has first argument @i, fold to NOT. */
  static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
 -           and "sub r, 0, a => neg r, a" case.  */
 -        switch (opc) {
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
- static void
+         /* Simplify using known-zero bits. Currently only ops with a single
- tci_write_reg32(tcg_target_ulong *regs, TCGReg index, uint32_t value)
+            output argument is supported. */
- {
+         z_mask = -1;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_write_reg16(regs, t0, *(uint16_t *)(t1 + t2));
 +            tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
              break;
          case INDEX_op_ld16s_i64:
              TODO();
 --
 .25.1

-[PULL 04/46] configure: Fix --enable-tcg-interpreter
+[PULL 43/56] tcg/optimize: Split out fold_masks
-The configure option was backward, and we failed to
+Move all of the known-zero optimizations into the per-opcode
-pass the value on to meson.
+functions.  Use fold_masks when there is a possibility of the
 result being determined, and simply set ctx->z_mask otherwise.
-Fixes: 23a77b2d18b ("build-system: clean up TCG/TCI configury")
-Tested-by: Stefan Weil <sw@weilnetz.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- configure | 5 +++--
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
-file changed, 3 insertions(+), 2 deletions(-)
+file changed, 294 insertions(+), 251 deletions(-)
-diff --git a/configure b/configure
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-index XXXXXXX..XXXXXXX 100755
+index XXXXXXX..XXXXXXX 100644
---- a/configure
+--- a/tcg/optimize.c
-+++ b/configure
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ for opt do
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-   ;;
+     TCGTempSet temps_used;
-   --enable-whpx) whpx="enabled"
-   ;;
+     /* In flight values from optimization. */
--  --disable-tcg-interpreter) tcg_interpreter="true"
+-    uint64_t z_mask;
-+  --disable-tcg-interpreter) tcg_interpreter="false"
++    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
-   ;;
++    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
--  --enable-tcg-interpreter) tcg_interpreter="false"
+     TCGType type;
-+  --enable-tcg-interpreter) tcg_interpreter="true"
+ } OptContext;
-   ;;
-   --disable-cap-ng)  cap_ng="disabled"
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-   ;;
+     return false;
-@@ -XXX,XX +XXX,XX @@ NINJA=$ninja $meson setup \
+ }
-         -Dvhost_user_blk_server=$vhost_user_blk_server \
-         -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \
++static bool fold_masks(OptContext *ctx, TCGOp *op)
-         $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \
++{
-+    -Dtcg_interpreter=$tcg_interpreter \
++    uint64_t a_mask = ctx->a_mask;
-         $cross_arg \
++    uint64_t z_mask = ctx->z_mask;
-         "$PWD" "$source_path"
++
++    /*
 +     * 32-bit ops generate 32-bit results.  For the result is zero test
 +     * below, we can ignore high bits, but for further optimizations we
 +     * need to record that the high bits contain garbage.
 +     */
 +    if (ctx->type == TCG_TYPE_I32) {
 +        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
 +        a_mask &= MAKE_64BIT_MASK(0, 32);
 +        z_mask &= MAKE_64BIT_MASK(0, 32);
 +    }
 +
 +    if (z_mask == 0) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
 +    }
 +    if (a_mask == 0) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
  /*
   * Convert @op to NOT, if NOT is supported by the host.
   * Return true f the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1, z2;
 +
      if (fold_const2(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +    z2 = arg_info(op->args[2])->z_mask;
 +    ctx->z_mask = z1 & z2;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer affected bits from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        ctx->a_mask = z1 & ~z2;
 +    }
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1;
 +
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer anything from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
 +        ctx->a_mask = z1 & ~z2;
 +        z1 &= z2;
 +    }
 +    ctx->z_mask = z1;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, sign;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
          t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    switch (op->opc) {
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
 +        z_mask = bswap16(z_mask);
 +        sign = INT16_MIN;
 +        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
 +        z_mask = bswap32(z_mask);
 +        sign = INT32_MIN;
 +        break;
 +    case INDEX_op_bswap64_i64:
 +        z_mask = bswap64(z_mask);
 +        sign = INT64_MIN;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
      }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        z_mask = 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        z_mask = 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
      }
 +
 +    ctx->z_mask = 1;
      return false;
   do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 +    int64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
          t = sextract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0 && z_mask >= 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +
 +    if (arg_is_const(op->args[2])) {
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
 -
 -        /* 32-bit ops generate 32-bit results.  For the result is zero test
 -           below, we can ignore high bits, but for further optimizations we
 -           need to record that the high bits contain garbage.  */
 -        partmask = z_mask;
 -        if (ctx.type == TCG_TYPE_I32) {
 -            z_mask |= ~(tcg_target_ulong)0xffffffffu;
 -            partmask &= 0xffffffffu;
 -            affected &= 0xffffffffu;
 -        }
 -        ctx.z_mask = z_mask;
 -
 -        if (partmask == 0) {
 -            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -            continue;
 -        }
 -        if (affected == 0) {
 -            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            continue;
 -        }
 +        /* Assume all bits affected, and no bits known zero. */
 +        ctx.a_mask = -1;
 +        ctx.z_mask = -1;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32u_i64:
 +            done = fold_tcg_ld(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 --
 .25.1

-[PULL 09/46] tcg/tci: Inline tci_write_reg8 into its callers
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+and muls2_i64.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 9 ++-------
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
-file changed, 2 insertions(+), 7 deletions(-)
+file changed, 35 insertions(+), 9 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-     regs[index] = value;
+     return false;
  }
--static void tci_write_reg8(tcg_target_ulong *regs, TCGReg index, uint8_t value)
+-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
--{
++static bool fold_multiply2(OptContext *ctx, TCGOp *op)
--    tci_write_reg(regs, index, value);
+ {
--}
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
--
+-        uint32_t a = arg_info(op->args[2])->val;
- #if TCG_TARGET_REG_BITS == 64
+-        uint32_t b = arg_info(op->args[3])->val;
- static void
+-        uint64_t r = (uint64_t)a * b;
- tci_write_reg16(tcg_target_ulong *regs, TCGReg index, uint16_t value)
++        uint64_t a = arg_info(op->args[2])->val;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        uint64_t b = arg_info(op->args[3])->val;
-             t0 = *tb_ptr++;
++        uint64_t h, l;
-             t1 = tci_read_r(regs, &tb_ptr);
+         TCGArg rl, rh;
-             t2 = tci_read_s32(&tb_ptr);
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
--            tci_write_reg8(regs, t0, *(uint8_t *)(t1 + t2));
++        TCGOp *op2;
-+            tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
++
 +        switch (op->opc) {
 +        case INDEX_op_mulu2_i32:
 +            l = (uint64_t)(uint32_t)a * (uint32_t)b;
 +            h = (int32_t)(l >> 32);
 +            l = (int32_t)l;
 +            break;
 +        case INDEX_op_muls2_i32:
 +            l = (int64_t)(int32_t)a * (int32_t)b;
 +            h = l >> 32;
 +            l = (int32_t)l;
 +            break;
 +        case INDEX_op_mulu2_i64:
 +            mulu64(&l, &h, a, b);
 +            break;
 +        case INDEX_op_muls2_i64:
 +            muls64(&l, &h, a, b);
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, l);
 +        tcg_opt_gen_movi(ctx, op2, rh, h);
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(muluh):
              done = fold_mul_highpart(&ctx, op);
              break;
-         case INDEX_op_ld8s_i32:
+-        case INDEX_op_mulu2_i32:
-             TODO();
+-            done = fold_mulu2_i32(&ctx, op);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        CASE_OP_32_64(muls2):
-             t0 = *tb_ptr++;
++        CASE_OP_32_64(mulu2):
-             t1 = tci_read_r(regs, &tb_ptr);
++            done = fold_multiply2(&ctx, op);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_write_reg8(regs, t0, *(uint8_t *)(t1 + t2));
 +            tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
              break;
-         case INDEX_op_ld8s_i64:
+         CASE_OP_32_64(nand):
-             t0 = *tb_ptr++;
+             done = fold_nand(&ctx, op);
 --
 .25.1

-[PULL 31/46] accel/tcg: split TCG-only code from cpu_exec_realizefn
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-From: Claudio Fontana <cfontana@suse.de>
+Rename to fold_addsub2.
+Use Int128 to implement the wider operation.
 move away TCG-only code, make it compile only on TCG.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-[claudio: moved the prototypes from hw/core/cpu.h to exec/cpu-all.h]
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Message-Id: <20210204163931.7358-4-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-all.h | 11 +++++--
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
- include/hw/core/cpu.h  |  2 ++
+file changed, 44 insertions(+), 21 deletions(-)
  accel/tcg/cpu-exec.c   | 28 +++++++++++++++++
  cpu.c                  | 70 ++++++++++++++++++++----------------------
  hw/core/cpu.c          |  6 +++-
 files changed, 77 insertions(+), 40 deletions(-)
-diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline bool tlb_hit(target_ulong tlb_addr, target_ulong addr)
+@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 +#include "qemu/int128.h"
  #include "tcg/tcg-op.h"
  #include "tcg-internal.h"
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
      return false;
  }
- #ifdef CONFIG_TCG
+-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
-+/* accel/tcg/cpu-exec.c */
++static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
- void dump_drift_info(void);
+ {
-+/* accel/tcg/translate-all.c */
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
- void dump_exec_info(void);
+         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
- void dump_opcount_info(void);
+-        uint32_t al = arg_info(op->args[2])->val;
- #endif /* CONFIG_TCG */
+-        uint32_t ah = arg_info(op->args[3])->val;
+-        uint32_t bl = arg_info(op->args[4])->val;
- #endif /* !CONFIG_USER_ONLY */
+-        uint32_t bh = arg_info(op->args[5])->val;
+-        uint64_t a = ((uint64_t)ah << 32) | al;
-+#ifdef CONFIG_TCG
+-        uint64_t b = ((uint64_t)bh << 32) | bl;
-+/* accel/tcg/cpu-exec.c */
++        uint64_t al = arg_info(op->args[2])->val;
-+int cpu_exec(CPUState *cpu);
++        uint64_t ah = arg_info(op->args[3])->val;
-+void tcg_exec_realizefn(CPUState *cpu, Error **errp);
++        uint64_t bl = arg_info(op->args[4])->val;
-+void tcg_exec_unrealizefn(CPUState *cpu);
++        uint64_t bh = arg_info(op->args[5])->val;
-+#endif /* CONFIG_TCG */
+         TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 -        if (add) {
 -            a += b;
 +        if (ctx->type == TCG_TYPE_I32) {
 +            uint64_t a = deposit64(al, 32, 32, ah);
 +            uint64_t b = deposit64(bl, 32, 32, bh);
 +
- /* Returns: 0 on success, -1 on error */
++            if (add) {
- int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
++                a += b;
-                         void *ptr, target_ulong len, bool is_write);
++            } else {
++                a -= b;
--int cpu_exec(CPUState *cpu);
++            }
 -
  /**
   * cpu_set_cpustate_pointers(cpu)
   * @cpu: The cpu object
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/core/cpu.h
 +++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx);
  void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
      GCC_FMT_ATTR(2, 3);
 +
-+/* $(top_srcdir)/cpu.c */
++            al = sextract64(a, 0, 32);
- void cpu_exec_initfn(CPUState *cpu);
++            ah = sextract64(a, 32, 32);
- void cpu_exec_realizefn(CPUState *cpu, Error **errp);
+         } else {
- void cpu_exec_unrealizefn(CPUState *cpu);
+-            a -= b;
-diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
++            Int128 a = int128_make128(al, ah);
-index XXXXXXX..XXXXXXX 100644
++            Int128 b = int128_make128(bl, bh);
---- a/accel/tcg/cpu-exec.c
++
-+++ b/accel/tcg/cpu-exec.c
++            if (add) {
-@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
++                a = int128_add(a, b);
-     return ret;
++            } else {
 +                a = int128_sub(a, b);
 +            }
 +
 +            al = int128_getlo(a);
 +            ah = int128_gethi(a);
          }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, al);
 +        tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
      return false;
  }
-+void tcg_exec_realizefn(CPUState *cpu, Error **errp)
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-+{
++static bool fold_add2(OptContext *ctx, TCGOp *op)
 +    static bool tcg_target_initialized;
 +    CPUClass *cc = CPU_GET_CLASS(cpu);
 +
 +    if (!tcg_target_initialized) {
 +        cc->tcg_ops.initialize();
 +        tcg_target_initialized = true;
 +    }
 +    tlb_init(cpu);
 +    qemu_plugin_vcpu_init_hook(cpu);
 +
 +#ifndef CONFIG_USER_ONLY
 +    tcg_iommu_init_notifier_list(cpu);
 +#endif /* !CONFIG_USER_ONLY */
 +}
 +
 +/* undo the initializations in reverse order */
 +void tcg_exec_unrealizefn(CPUState *cpu)
 +{
 +#ifndef CONFIG_USER_ONLY
 +    tcg_iommu_free_notifier_list(cpu);
 +#endif /* !CONFIG_USER_ONLY */
 +
 +    qemu_plugin_vcpu_exit_hook(cpu);
 +    tlb_destroy(cpu);
 +}
 +
  #ifndef CONFIG_USER_ONLY
  void dump_drift_info(void)
 diff --git a/cpu.c b/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/cpu.c
 +++ b/cpu.c
@@ -XXX,XX +XXX,XX @@ const VMStateDescription vmstate_cpu_common = {
  };
  #endif
 -void cpu_exec_unrealizefn(CPUState *cpu)
 +void cpu_exec_realizefn(CPUState *cpu, Error **errp)
  {
-     CPUClass *cc = CPU_GET_CLASS(cpu);
+-    return fold_addsub2_i32(ctx, op, true);
++    return fold_addsub2(ctx, op, true);
 -    tlb_destroy(cpu);
 -    cpu_list_remove(cpu);
 +    cpu_list_add(cpu);
 +
 +#ifdef CONFIG_TCG
 +    /* NB: errp parameter is unused currently */
 +    if (tcg_enabled()) {
 +        tcg_exec_realizefn(cpu, errp);
 +    }
 +#endif /* CONFIG_TCG */
 +
 +#ifdef CONFIG_USER_ONLY
 +    assert(cc->vmsd == NULL);
 +#else
 +    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 +        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 +    }
 +    if (cc->vmsd != NULL) {
 +        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 +    }
 +#endif /* CONFIG_USER_ONLY */
 +}
 +
 +void cpu_exec_unrealizefn(CPUState *cpu)
 +{
 +    CPUClass *cc = CPU_GET_CLASS(cpu);
  #ifdef CONFIG_USER_ONLY
      assert(cc->vmsd == NULL);
@@ -XXX,XX +XXX,XX @@ void cpu_exec_unrealizefn(CPUState *cpu)
      if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
          vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
      }
 -    tcg_iommu_free_notifier_list(cpu);
  #endif
 +#ifdef CONFIG_TCG
 +    /* NB: errp parameter is unused currently */
 +    if (tcg_enabled()) {
 +        tcg_exec_unrealizefn(cpu);
 +    }
 +#endif /* CONFIG_TCG */
 +
 +    cpu_list_remove(cpu);
  }
- void cpu_exec_initfn(CPUState *cpu)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ void cpu_exec_initfn(CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
- #endif
+     return false;
  }
--void cpu_exec_realizefn(CPUState *cpu, Error **errp)
+-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
--{
++static bool fold_sub2(OptContext *ctx, TCGOp *op)
 -    CPUClass *cc = CPU_GET_CLASS(cpu);
 -#ifdef CONFIG_TCG
 -    static bool tcg_target_initialized;
 -#endif /* CONFIG_TCG */
 -
 -    cpu_list_add(cpu);
 -
 -#ifdef CONFIG_TCG
 -    if (tcg_enabled() && !tcg_target_initialized) {
 -        tcg_target_initialized = true;
 -        cc->tcg_ops.initialize();
 -    }
 -#endif /* CONFIG_TCG */
 -    tlb_init(cpu);
 -
 -    qemu_plugin_vcpu_init_hook(cpu);
 -
 -#ifdef CONFIG_USER_ONLY
 -    assert(cc->vmsd == NULL);
 -#else /* !CONFIG_USER_ONLY */
 -    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 -        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 -    }
 -    if (cc->vmsd != NULL) {
 -        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 -    }
 -
 -    tcg_iommu_init_notifier_list(cpu);
 -#endif
 -}
 -
  const char *parse_cpu_option(const char *cpu_option)
  {
-     ObjectClass *oc;
+-    return fold_addsub2_i32(ctx, op, false);
-diff --git a/hw/core/cpu.c b/hw/core/cpu.c
++    return fold_addsub2(ctx, op, false);
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/core/cpu.c
 +++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static bool cpu_common_virtio_is_big_endian(CPUState *cpu)
      return target_words_bigendian();
  }
-+/*
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
-+ * XXX the following #if is always true because this is a common_ss
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+ * module, so target CONFIG_* is never defined.
+         CASE_OP_32_64_VEC(add):
-+ */
+             done = fold_add(&ctx, op);
- #if !defined(CONFIG_USER_ONLY)
+             break;
- GuestPanicInformation *cpu_get_crash_info(CPUState *cpu)
+-        case INDEX_op_add2_i32:
- {
+-            done = fold_add2_i32(&ctx, op);
-@@ -XXX,XX +XXX,XX @@ static void cpu_common_realizefn(DeviceState *dev, Error **errp)
++        CASE_OP_32_64(add2):
- static void cpu_common_unrealizefn(DeviceState *dev)
++            done = fold_add2(&ctx, op);
- {
+             break;
-     CPUState *cpu = CPU(dev);
+         CASE_OP_32_64_VEC(and):
-+
+             done = fold_and(&ctx, op);
-     /* NOTE: latest generic point before the cpu is fully unrealized */
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     trace_fini_vcpu(cpu);
+         CASE_OP_32_64_VEC(sub):
--    qemu_plugin_vcpu_exit_hook(cpu);
+             done = fold_sub(&ctx, op);
-     cpu_exec_unrealizefn(cpu);
+             break;
- }
+-        case INDEX_op_sub2_i32:
+-            done = fold_sub2_i32(&ctx, op);
 +        CASE_OP_32_64(sub2):
 +            done = fold_sub2(&ctx, op);
              break;
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
 --
 .25.1

-[PULL 46/46] accel: introduce AccelCPUClass extending CPUClass
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
-From: Claudio Fontana <cfontana@suse.de>
+Most of these are handled by creating a fold_const2_commutative
+to handle all of the binary operators.  The rest were already
-add a new optional interface to CPUClass, which allows accelerators
+handled on a case-by-case basis in the switch, and have their
-to extend the CPUClass with additional accelerator-specific
+own fold function in which to place the call.
-initializations.
+We now have only one major switch on TCGOpcode.
-This will allow to separate the target cpu code that is specific
-to each accelerator, and register it automatically with object
+Introduce NO_DEST and a block comment for swap_commutative in
-hierarchy lookup depending on accelerator code availability,
+order to make the handling of brcond and movcond opcodes cleaner.
-as part of the accel_init_interfaces() initialization step.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Message-Id: <20210204163931.7358-19-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/accel-cpu.h | 38 ++++++++++++++++++++++++++++++++
+ tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
- include/hw/core/cpu.h       |  4 ++++
+file changed, 70 insertions(+), 72 deletions(-)
- accel/accel-common.c        | 44 +++++++++++++++++++++++++++++++++++++
- MAINTAINERS                 |  1 +
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-files changed, 87 insertions(+)
+index XXXXXXX..XXXXXXX 100644
- create mode 100644 include/hw/core/accel-cpu.h
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
-diff --git a/include/hw/core/accel-cpu.h b/include/hw/core/accel-cpu.h
+@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
-new file mode 100644
+     return -1;
-index XXXXXXX..XXXXXXX
+ }
---- /dev/null
-+++ b/include/hw/core/accel-cpu.h
++/**
-@@ -XXX,XX +XXX,XX @@
++ * swap_commutative:
-+/*
++ * @dest: TCGArg of the destination argument, or NO_DEST.
-+ * Accelerator interface, specializes CPUClass
++ * @p1: first paired argument
-+ * This header is used only by target-specific code.
++ * @p2: second paired argument
 + *
-+ * Copyright 2021 SUSE LLC
++ * If *@p1 is a constant and *@p2 is not, swap.
-+ *
++ * If *@p2 matches @dest, swap.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * Return true if a swap was performed.
 + * See the COPYING file in the top-level directory.
 + */
 +
-+#ifndef ACCEL_CPU_H
++#define NO_DEST  temp_arg(NULL)
-+#define ACCEL_CPU_H
++
-+
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
-+/*
+ {
-+ * This header is used to define new accelerator-specific target-specific
+     TCGArg a1 = *p1, a2 = *p2;
-+ * accelerator cpu subclasses.
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-+ * It uses CPU_RESOLVING_TYPE, so this is clearly target-specific.
+     return false;
 + *
 + * Do not try to use for any other purpose than the implementation of new
 + * subclasses in target/, or the accel implementation itself in accel/
 + */
 +
 +#define TYPE_ACCEL_CPU "accel-" CPU_RESOLVING_TYPE
 +#define ACCEL_CPU_NAME(name) (name "-" TYPE_ACCEL_CPU)
 +typedef struct AccelCPUClass AccelCPUClass;
 +DECLARE_CLASS_CHECKERS(AccelCPUClass, ACCEL_CPU, TYPE_ACCEL_CPU)
 +
 +typedef struct AccelCPUClass {
 +    /*< private >*/
 +    ObjectClass parent_class;
 +    /*< public >*/
 +
 +    void (*cpu_class_init)(CPUClass *cc);
 +    void (*cpu_instance_init)(CPUState *cpu);
 +    void (*cpu_realizefn)(CPUState *cpu, Error **errp);
 +} AccelCPUClass;
 +
 +#endif /* ACCEL_CPU_H */
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/core/cpu.h
 +++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock;
  /* see tcg-cpu-ops.h */
  struct TCGCPUOps;
 +/* see accel-cpu.h */
 +struct AccelCPUClass;
 +
  /**
   * CPUClass:
   * @class_by_name: Callback to map -cpu command line model name to an
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      /* Keep non-pointer data at the end to minimize holes.  */
      int gdb_num_core_regs;
      bool gdb_stop_before_watchpoint;
 +    struct AccelCPUClass *accel_cpu;
      /* when TCG is not available, this pointer is NULL */
      struct TCGCPUOps *tcg_ops;
 diff --git a/accel/accel-common.c b/accel/accel-common.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/accel-common.c
 +++ b/accel/accel-common.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/accel.h"
 +#include "cpu.h"
 +#include "hw/core/accel-cpu.h"
 +
  #ifndef CONFIG_USER_ONLY
  #include "accel-softmmu.h"
  #endif /* !CONFIG_USER_ONLY */
@@ -XXX,XX +XXX,XX @@ AccelClass *accel_find(const char *opt_name)
      return ac;
  }
-+static void accel_init_cpu_int_aux(ObjectClass *klass, void *opaque)
++static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 +{
-+    CPUClass *cc = CPU_CLASS(klass);
++    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-+    AccelCPUClass *accel_cpu = opaque;
++    return fold_const2(ctx, op);
 +
 +    cc->accel_cpu = accel_cpu;
 +    if (accel_cpu->cpu_class_init) {
 +        accel_cpu->cpu_class_init(cc);
 +    }
 +}
 +
-+/* initialize the arch-specific accel CpuClass interfaces */
+ static bool fold_masks(OptContext *ctx, TCGOp *op)
-+static void accel_init_cpu_interfaces(AccelClass *ac)
+ {
-+{
+     uint64_t a_mask = ctx->a_mask;
-+    const char *ac_name; /* AccelClass name */
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
-+    char *acc_name;      /* AccelCPUClass name */
-+    ObjectClass *acc;    /* AccelCPUClass */
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+
+ {
-+    ac_name = object_class_get_name(OBJECT_CLASS(ac));
+-    if (fold_const2(ctx, op) ||
-+    g_assert(ac_name != NULL);
++    if (fold_const2_commutative(ctx, op) ||
-+
+         fold_xi_to_x(ctx, op, 0)) {
-+    acc_name = g_strdup_printf("%s-%s", ac_name, CPU_RESOLVING_TYPE);
+         return true;
-+    acc = object_class_by_name(acc_name);
+     }
-+    g_free(acc_name);
+@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
-+
-+    if (acc) {
+ static bool fold_add2(OptContext *ctx, TCGOp *op)
-+        object_class_foreach(accel_init_cpu_int_aux,
+ {
-+                             CPU_RESOLVING_TYPE, false, acc);
++    /* Note that the high and low parts may be independently swapped. */
-+    }
++    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-+}
++    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 +
- void accel_init_interfaces(AccelClass *ac)
+     return fold_addsub2(ctx, op, true);
  {
  #ifndef CONFIG_USER_ONLY
      accel_init_ops_interfaces(ac);
  #endif /* !CONFIG_USER_ONLY */
 +
 +    accel_init_cpu_interfaces(ac);
  }
-+static const TypeInfo accel_cpu_type = {
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-+    .name = TYPE_ACCEL_CPU,
+ {
-+    .parent = TYPE_OBJECT,
+     uint64_t z1, z2;
-+    .abstract = true,
-+    .class_size = sizeof(AccelCPUClass),
+-    if (fold_const2(ctx, op) ||
-+};
++    if (fold_const2_commutative(ctx, op) ||
-+
+         fold_xi_to_i(ctx, op, 0) ||
- static void register_accel_types(void)
+         fold_xi_to_x(ctx, op, -1) ||
- {
+         fold_xx_to_x(ctx, op)) {
-     type_register_static(&accel_type);
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+    type_register_static(&accel_cpu_type);
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
- }
+ {
+     TCGCond cond = op->args[2];
- type_init(register_accel_types);
+-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
-diff --git a/MAINTAINERS b/MAINTAINERS
++    int i;
-index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
++    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
-+++ b/MAINTAINERS
++        op->args[2] = cond = tcg_swap_cond(cond);
-@@ -XXX,XX +XXX,XX @@ R: Paolo Bonzini <pbonzini@redhat.com>
++    }
- S: Maintained
++
- F: include/qemu/accel.h
++    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
- F: include/sysemu/accel-ops.h
+     if (i == 0) {
-+F: include/hw/core/accel-cpu.h
+         tcg_op_remove(ctx->tcg, op);
- F: accel/accel-*.c
+         return true;
- F: accel/Makefile.objs
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
- F: accel/stubs/Makefile.objs
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* For commutative operations make constant second argument */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 -            break;
 -        CASE_OP_32_64(brcond):
 -            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
 -                op->args[2] = tcg_swap_cond(op->args[2]);
 -            }
 -            break;
 -        CASE_OP_32_64(setcond):
 -            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 -                op->args[3] = tcg_swap_cond(op->args[3]);
 -            }
 -            break;
 -        CASE_OP_32_64(movcond):
 -            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            /* For movcond, we canonicalize the "false" input reg to match
 -               the destination reg so that the tcg backend can implement
 -               a "move if true" operation.  */
 -            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 -                op->args[5] = tcg_invert_cond(op->args[5]);
 -            }
 -            break;
 -        CASE_OP_32_64(add2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 -            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 -            break;
 -        CASE_OP_32_64(mulu2):
 -        CASE_OP_32_64(muls2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 -            break;
 -        case INDEX_op_brcond2_i32:
 -            if (swap_commutative2(&op->args[0], &op->args[2])) {
 -                op->args[4] = tcg_swap_cond(op->args[4]);
 -            }
 -            break;
 -        case INDEX_op_setcond2_i32:
 -            if (swap_commutative2(&op->args[1], &op->args[3])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Assume all bits affected, and no bits known zero. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 --
 .25.1

-[PULL 27/46] tcg/tci: Fix TCG_REG_R4 misusage
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-This was removed from tcg_target_reg_alloc_order and
+This "garbage" setting pre-dates the addition of the type
-tcg_target_call_iarg_regs on the assumption that it
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
-was the stack.  This was incorrectly copied from i386.
+and INDEX_op_extr{l,h}_i64_i32.
 For tci, the stack is R15.
-By adding R4 back to tcg_target_call_iarg_regs, adjust the other
+So now we have definitive points at which to adjust z_mask
-entries so that 6 (or 12) entries are still present in the array,
+to eliminate such bits from the 32-bit operands.
 and adjust the numbers in the interpreter.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c                | 8 ++++----
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
- tcg/tci/tcg-target.c.inc | 7 +------
+file changed, 16 insertions(+), 19 deletions(-)
 files changed, 5 insertions(+), 10 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-                                           tci_read_reg(regs, TCG_REG_R1),
+         ti->is_const = true;
-                                           tci_read_reg(regs, TCG_REG_R2),
+         ti->val = ts->val;
-                                           tci_read_reg(regs, TCG_REG_R3),
+         ti->z_mask = ts->val;
-+                                          tci_read_reg(regs, TCG_REG_R4),
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-                                           tci_read_reg(regs, TCG_REG_R5),
+-            /* High bits of a 32-bit quantity are garbage.  */
-                                           tci_read_reg(regs, TCG_REG_R6),
+-            ti->z_mask |= ~0xffffffffull;
-                                           tci_read_reg(regs, TCG_REG_R7),
+-        }
-                                           tci_read_reg(regs, TCG_REG_R8),
+     } else {
-                                           tci_read_reg(regs, TCG_REG_R9),
+         ti->is_const = false;
-                                           tci_read_reg(regs, TCG_REG_R10),
+         ti->z_mask = -1;
--                                          tci_read_reg(regs, TCG_REG_R11),
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--                                          tci_read_reg(regs, TCG_REG_R12));
+     TCGTemp *src_ts = arg_temp(src);
-+                                          tci_read_reg(regs, TCG_REG_R11));
+     TempOptInfo *di;
-             tci_write_reg(regs, TCG_REG_R0, tmp64);
+     TempOptInfo *si;
-             tci_write_reg(regs, TCG_REG_R1, tmp64 >> 32);
+-    uint64_t z_mask;
- #else
+     TCGOpcode new_op;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
-                                           tci_read_reg(regs, TCG_REG_R1),
+     if (ts_are_copies(dst_ts, src_ts)) {
-                                           tci_read_reg(regs, TCG_REG_R2),
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-                                           tci_read_reg(regs, TCG_REG_R3),
+     op->args[0] = dst;
--                                          tci_read_reg(regs, TCG_REG_R5),
+     op->args[1] = src;
--                                          tci_read_reg(regs, TCG_REG_R6));
-+                                          tci_read_reg(regs, TCG_REG_R4),
+-    z_mask = si->z_mask;
-+                                          tci_read_reg(regs, TCG_REG_R5));
+-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-             tci_write_reg(regs, TCG_REG_R0, tmp64);
+-        /* High bits of the destination are now garbage.  */
- #endif
+-        z_mask |= ~0xffffffffull;
-             break;
+-    }
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+-    di->z_mask = z_mask;
-index XXXXXXX..XXXXXXX 100644
++    di->z_mask = si->z_mask;
---- a/tcg/tci/tcg-target.c.inc
-+++ b/tcg/tci/tcg-target.c.inc
+     if (src_ts->type == dst_ts->type) {
-@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
+         TempOptInfo *ni = ts_info(si->next_copy);
-     TCG_REG_R1,
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-     TCG_REG_R2,
+ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-     TCG_REG_R3,
+                              TCGArg dst, uint64_t val)
--#if 0 /* used for TCG_REG_CALL_STACK */
+ {
-     TCG_REG_R4,
+-    /* Convert movi to mov with constant temp. */
--#endif
+-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
-     TCG_REG_R5,
++    TCGTemp *tv;
-     TCG_REG_R6,
-     TCG_REG_R7,
++    if (ctx->type == TCG_TYPE_I32) {
-@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_iarg_regs[] = {
++        val = (int32_t)val;
-     TCG_REG_R1,
++    }
-     TCG_REG_R2,
++
-     TCG_REG_R3,
++    /* Convert movi to mov with constant temp. */
--#if 0 /* used for TCG_REG_CALL_STACK */
++    tv = tcg_constant_internal(ctx->type, val);
-     TCG_REG_R4,
+     init_ts_info(ctx, tv);
--#endif
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-     TCG_REG_R5,
+ }
--    TCG_REG_R6,
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
- #if TCG_TARGET_REG_BITS == 32
+     uint64_t z_mask = ctx->z_mask;
-     /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
-+    TCG_REG_R6,
+     /*
-     TCG_REG_R7,
+-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     TCG_REG_R8,
+-     * below, we can ignore high bits, but for further optimizations we
-     TCG_REG_R9,
+-     * need to record that the high bits contain garbage.
-     TCG_REG_R10,
++     * 32-bit ops generate 32-bit results, which for the purpose of
-     TCG_REG_R11,
++     * simplifying tcg are sign-extended.  Certainly that's how we
--    TCG_REG_R12,
++     * represent our constants elsewhere.  Note that the bits will
- #endif
++     * be reset properly for a 64-bit value when encountering the
- };
++     * type changing opcodes.
+      */
      if (ctx->type == TCG_TYPE_I32) {
 -        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
 -        a_mask &= MAKE_64BIT_MASK(0, 32);
 -        z_mask &= MAKE_64BIT_MASK(0, 32);
 +        a_mask = (int32_t)a_mask;
 +        z_mask = (int32_t)z_mask;
 +        ctx->z_mask = z_mask;
      }
      if (z_mask == 0) {
 --
 .25.1

-[PULL 05/46] tcg/tci: Make tci_tb_ptr thread-local
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-Each thread must have its own pc, even under TCI.
+Recognize the constant function for or-complement.
 Remove the GETPC ifdef, because GETPC is always available for
 helpers, and thus is always required.  Move the assignment
 under INDEX_op_call, because the value is only visible when
 we make a call to a helper function.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Message-Id: <20210204014509.882821-6-richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h | 2 +-
+ tcg/optimize.c | 1 +
- tcg/tcg-common.c        | 4 ----
+file changed, 1 insertion(+)
  tcg/tci.c               | 7 +++----
 files changed, 4 insertions(+), 9 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
  /* GETPC is the true target of the return instruction that we'll execute.  */
  #if defined(CONFIG_TCG_INTERPRETER)
 -extern uintptr_t tci_tb_ptr;
 +extern __thread uintptr_t tci_tb_ptr;
  # define GETPC() tci_tb_ptr
  #else
  # define GETPC() \
 diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-common.c
 +++ b/tcg/tcg-common.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "tcg/tcg.h"
 -#if defined(CONFIG_TCG_INTERPRETER)
 -uintptr_t tci_tb_ptr;
 -#endif
 -
  TCGOpDef tcg_op_defs[] = {
  #define DEF(s, oargs, iargs, cargs, flags) \
           { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
 diff --git a/tcg/tci.c b/tcg/tci.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci.c
 +++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
                                      tcg_target_ulong, tcg_target_ulong);
  #endif
 +__thread uintptr_t tci_tb_ptr;
 +
  static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
  {
-     tci_assert(index < TCG_TARGET_NB_REGS);
+     if (fold_const2(ctx, op) ||
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        fold_xx_to_i(ctx, op, -1) ||
- #endif
+         fold_xi_to_x(ctx, op, -1) ||
-         TCGMemOpIdx oi;
+         fold_ix_to_not(ctx, op, 0)) {
+         return true;
 -#if defined(GETPC)
 -        tci_tb_ptr = (uintptr_t)tb_ptr;
 -#endif
 -
          /* Skip opcode and size entry. */
          tb_ptr += 2;
          switch (opc) {
          case INDEX_op_call:
              t0 = tci_read_ri(regs, &tb_ptr);
 +            tci_tb_ptr = (uintptr_t)tb_ptr;
  #if TCG_TARGET_REG_BITS == 32
              tmp64 = ((helper_function)t0)(tci_read_reg(regs, TCG_REG_R0),
                                            tci_read_reg(regs, TCG_REG_R1),
 --
 .25.1

-[PULL 36/46] target/arm: do not use cc->do_interrupt for KVM directly
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
-From: Claudio Fontana <cfontana@suse.de>
+Recognize the identity function for low-part multiply.
-cc->do_interrupt is in theory a TCG callback used in accel/tcg only,
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-to prepare the emulated architecture to take an interrupt as defined
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-in the hardware specifications,
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 but in reality the _do_interrupt style of functions in targets are
 also occasionally reused by KVM to prepare the architecture state in a
 similar way where userspace code has identified that it needs to
 deliver an exception to the guest.
 In the case of ARM, that includes:
 1) the vcpu thread got a SIGBUS indicating a memory error,
    and we need to deliver a Synchronous External Abort to the guest to
    let it know about the error.
 2) the kernel told us about a debug exception (breakpoint, watchpoint)
    but it is not for one of QEMU's own gdbstub breakpoints/watchpoints
    so it must be a breakpoint the guest itself has set up, therefore
    we need to deliver it to the guest.
 So in order to reuse code, the same arm_do_interrupt function is used.
 This is all fine, but we need to avoid calling it using the callback
 registered in CPUClass, since that one is now TCG-only.
 Fortunately this is easily solved by replacing calls to
 CPUClass::do_interrupt() with explicit calls to arm_do_interrupt().
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Cc: Peter Maydell <peter.maydell@linaro.org>
 Message-Id: <20210204163931.7358-9-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/helper.c | 4 ++++
+ tcg/optimize.c | 3 ++-
- target/arm/kvm64.c  | 6 ++----
+file changed, 2 insertions(+), 1 deletion(-)
 files changed, 6 insertions(+), 4 deletions(-)
-diff --git a/target/arm/helper.c b/target/arm/helper.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/helper.c
+--- a/tcg/optimize.c
-+++ b/target/arm/helper.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void handle_semihosting(CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-  * Do any appropriate logging, handle PSCI calls, and then hand off
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
   * to the AArch64-entry or AArch32-entry function depending on the
   * target exception level's register width.
 + *
 + * Note: this is used for both TCG (as the do_interrupt tcg op),
 + *       and KVM to re-inject guest debug exceptions, and to
 + *       inject a Synchronous-External-Abort.
   */
  void arm_cpu_do_interrupt(CPUState *cs)
  {
-diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
+     if (fold_const2(ctx, op) ||
-index XXXXXXX..XXXXXXX 100644
+-        fold_xi_to_i(ctx, op, 0)) {
---- a/target/arm/kvm64.c
++        fold_xi_to_i(ctx, op, 0) ||
-+++ b/target/arm/kvm64.c
++        fold_xi_to_x(ctx, op, 1)) {
-@@ -XXX,XX +XXX,XX @@ static void kvm_inject_arm_sea(CPUState *c)
+         return true;
- {
+     }
      ARMCPU *cpu = ARM_CPU(c);
      CPUARMState *env = &cpu->env;
 -    CPUClass *cc = CPU_GET_CLASS(c);
      uint32_t esr;
      bool same_el;
@@ -XXX,XX +XXX,XX @@ static void kvm_inject_arm_sea(CPUState *c)
      env->exception.syndrome = esr;
 -    cc->do_interrupt(c);
 +    arm_cpu_do_interrupt(c);
  }
  #define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
@@ -XXX,XX +XXX,XX @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
  {
      int hsr_ec = syn_get_ec(debug_exit->hsr);
      ARMCPU *cpu = ARM_CPU(cs);
 -    CPUClass *cc = CPU_GET_CLASS(cs);
      CPUARMState *env = &cpu->env;
      /* Ensure PC is synchronised */
@@ -XXX,XX +XXX,XX @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
      env->exception.vaddress = debug_exit->far;
      env->exception.target_el = 1;
      qemu_mutex_lock_iothread();
 -    cc->do_interrupt(cs);
 +    arm_cpu_do_interrupt(cs);
      qemu_mutex_unlock_iothread();
      return false;
 --
 .25.1

-[PULL 30/46] target/riscv: remove CONFIG_TCG, as it is always TCG
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
-From: Claudio Fontana <cfontana@suse.de>
+Recognize the identity function for division.
-for now only TCG is allowed as an accelerator for riscv,
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-so remove the CONFIG_TCG use.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210204163931.7358-3-cfontana@suse.de>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/riscv/cpu.c | 3 +--
+ tcg/optimize.c | 6 +++++-
-file changed, 1 insertion(+), 2 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/cpu.c
+--- a/tcg/optimize.c
-+++ b/target/riscv/cpu.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
- #endif
-     cc->gdb_arch_name = riscv_gdb_arch_name;
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
-     cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
+ {
--#ifdef CONFIG_TCG
+-    return fold_const2(ctx, op);
-     cc->tcg_ops.initialize = riscv_translate_init;
++    if (fold_const2(ctx, op) ||
-     cc->tlb_fill = riscv_cpu_tlb_fill;
++        fold_xi_to_x(ctx, op, 1)) {
--#endif
++        return true;
-+
++    }
-     device_class_set_props(dc, riscv_cpu_properties);
++    return false;
  }
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
 --
 .25.1

-[PULL 08/46] tcg/tci: Inline tci_write_reg32s into the only caller
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-Tested-by: Alex Bennée <alex.bennee@linaro.org>
+Recognize the constant function for remainder.
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 10 +---------
+ tcg/optimize.c | 6 +++++-
-file changed, 1 insertion(+), 9 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
-     regs[index] = value;
  static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
--#if TCG_TARGET_REG_BITS == 64
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 -static void
 -tci_write_reg32s(tcg_target_ulong *regs, TCGReg index, int32_t value)
 -{
 -    tci_write_reg(regs, index, value);
 -}
 -#endif
 -
  static void tci_write_reg8(tcg_target_ulong *regs, TCGReg index, uint8_t value)
  {
      tci_write_reg(regs, index, value);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
 -            tci_write_reg32s(regs, t0, *(int32_t *)(t1 + t2));
 +            tci_write_reg(regs, t0, *(int32_t *)(t1 + t2));
              break;
          case INDEX_op_ld_i64:
              t0 = *tb_ptr++;
 --
 .25.1

-[PULL 01/46] tcg/s390: Fix compare instruction from extended-immediate facility
+[PULL 52/56] tcg/optimize: Optimize sign extensions
-From: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Certain targets, like riscv, produce signed 32-bit results.
+This can lead to lots of redundant extensions as values are
-The code is currently comparing c2 to the type promotion of
+manipulated.
-uint32_t and int32_t. That is, the conversion rules are as:
+Begin by tracking only the obvious sign-extensions, and
-  (common_type) c2 == (common_type) (uint32_t)
+converting them to simple copies when possible.
-                        (is_unsigned
-                        ? (uint32_t)c2
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-                        : (uint32_t)(int32_t)c2)
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 In the signed case we lose the desired sign extensions because
 of the argument promotion rules of the ternary operator.
 Solve the problem by doing the round-trip parsing through the
 intermediate type and back to the desired common type (all at
 one expression).
 Fixes: a534bb15f30 ("tcg/s390: Use constant pool for cmpi")
 Tested-by: Richard W.M. Jones <rjones@redhat.com>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Reported-by: Miroslav Rezanina <mrezanin@redhat.com>
 Reported-by: Richard W.M. Jones <rjones@redhat.com>
 Suggested-by: David Hildenbrand <david@redhat.com>
 Suggested-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20210204182902.1742826-1-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/s390/tcg-target.c.inc | 2 +-
+ tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
-file changed, 1 insertion(+), 1 deletion(-)
+file changed, 102 insertions(+), 21 deletions(-)
-diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/s390/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/s390/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1,
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-                 op = (is_unsigned ? RIL_CLFI : RIL_CFI);
+     TCGTemp *next_copy;
-                 tcg_out_insn_RIL(s, op, r1, c2);
+     uint64_t val;
-                 goto exit;
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
--            } else if (c2 == (is_unsigned ? (uint32_t)c2 : (int32_t)c2)) {
++    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
-+            } else if (c2 == (is_unsigned ? (TCGArg)(uint32_t)c2 : (TCGArg)(int32_t)c2)) {
+ } TempOptInfo;
-                 op = (is_unsigned ? RIL_CLGFI : RIL_CGFI);
-                 tcg_out_insn_RIL(s, op, r1, c2);
+ typedef struct OptContext {
-                 goto exit;
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
 +    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
 +/* Calculate the smask for a specific value. */
 +static uint64_t smask_from_value(uint64_t value)
 +{
 +    int rep = clrsb64(value);
 +    return ~(~0ull >> rep);
 +}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
 +    rep -= 1;
 +    return ~(~0ull >> rep);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
 +        ti->s_mask = 0;
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[1] = src;
      di->z_mask = si->z_mask;
 +    di->s_mask = si->s_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
 -        reset_temp(op->args[i]);
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        reset_ts(ts);
          /*
 -         * Save the corresponding known-zero bits mask for the
 +         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
 +    uint64_t s_mask = ctx->s_mask;
      /*
       * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      if (ctx->type == TCG_TYPE_I32) {
          a_mask = (int32_t)a_mask;
          z_mask = (int32_t)z_mask;
 +        s_mask |= MAKE_64BIT_MASK(32, 32);
          ctx->z_mask = z_mask;
 +        ctx->s_mask = s_mask;
      }
      if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask, sign;
 +    uint64_t z_mask, s_mask, sign;
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
      z_mask = arg_info(op->args[1])->z_mask;
 +
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
 +    s_mask = smask_from_zmask(z_mask);
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
 +            s_mask = sign << 1;
          }
          break;
      default:
          /* The high bits are undefined: force all bits above the sign to 1. */
          z_mask |= sign << 1;
 +        s_mask = 0;
          break;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask_old, z_mask;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = extract64(t, op->args[2], op->args[3]);
 +        t = extract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0) {
 +    z_mask = extract64(z_mask_old, pos, len);
 +    if (pos == 0) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask_old, z_mask, sign;
 +    uint64_t s_mask_old, s_mask, z_mask, sign;
      bool type_change = false;
      if (fold_const1(ctx, op)) {
          return true;
      }
 -    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    s_mask_old = s_mask;
      switch (op->opc) {
      CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
      if (z_mask & sign) {
          z_mask |= sign;
 -    } else if (!type_change) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
      }
 +    s_mask |= sign << 1;
 +
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
 +    if (!type_change) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      if (!type_change) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
 -    if (!(mop & MO_SIGN) && width < 64) {
 -        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    if (width < 64) {
 +        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
 +        if (!(mop & MO_SIGN)) {
 +            ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +            ctx->s_mask <<= 1;
 +        }
      }
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 -    int64_t z_mask_old, z_mask;
 +    uint64_t z_mask, s_mask, s_mask_old;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = sextract64(t, op->args[2], op->args[3]);
 +        t = sextract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0 && z_mask >= 0) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
 -    }
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask, pos, len);
      ctx->z_mask = z_mask;
 +    s_mask_old = arg_info(op->args[1])->s_mask;
 +    s_mask = sextract64(s_mask_old, pos, len);
 +    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
 +    ctx->s_mask = s_mask;
 +
 +    if (pos == 0) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
 +
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  {
      /* We can't do any folding with a load, but we can record bits. */
      switch (op->opc) {
 +    CASE_OP_32_64(ld8s):
 +        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
 +        break;
      CASE_OP_32_64(ld8u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
 +        break;
 +    CASE_OP_32_64(ld16s):
 +        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
          break;
      CASE_OP_32_64(ld16u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
 +        break;
 +    case INDEX_op_ld32s_i64:
 +        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
          break;
      case INDEX_op_ld32u_i64:
          ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
          break;
      default:
          g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* Assume all bits affected, and no bits known zero. */
 +        /* Assume all bits affected, no bits known zero, no sign reps. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 +        ctx.s_mask = 0;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8s):
          CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16s):
          CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32s_i64:
          case INDEX_op_ld32u_i64:
              done = fold_tcg_ld(&ctx, op);
              break;
 --
 .25.1

-[PULL 33/46] cpu: Move cpu_exec_* to tcg_ops
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-From: Eduardo Habkost <ehabkost@redhat.com>
+Sign repetitions are perforce all identical, whether they are 1 or 0.
 Bitwise operations preserve the relative quantity of the repetitions.
-Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
-[claudio: wrapped target code in CONFIG_TCG]
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-Message-Id: <20210204163931.7358-6-cfontana@suse.de>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h           | 12 ++++++------
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
- accel/tcg/cpu-exec.c            | 12 ++++++------
+file changed, 29 insertions(+)
  target/alpha/cpu.c              |  2 +-
  target/arm/cpu.c                |  2 +-
  target/arm/cpu64.c              |  5 ++++-
  target/arm/cpu_tcg.c            |  7 ++++++-
  target/avr/cpu.c                |  2 +-
  target/cris/cpu.c               |  2 +-
  target/hppa/cpu.c               |  2 +-
  target/i386/tcg/tcg-cpu.c       |  6 +++---
  target/lm32/cpu.c               |  2 +-
  target/m68k/cpu.c               |  2 +-
  target/microblaze/cpu.c         |  2 +-
  target/mips/cpu.c               |  2 +-
  target/nios2/cpu.c              |  2 +-
  target/openrisc/cpu.c           |  2 +-
  target/riscv/cpu.c              |  2 +-
  target/rx/cpu.c                 |  2 +-
  target/s390x/cpu.c              |  2 +-
  target/sh4/cpu.c                |  2 +-
  target/sparc/cpu.c              |  2 +-
  target/tilegx/cpu.c             |  2 +-
  target/unicore32/cpu.c          |  2 +-
  target/xtensa/cpu.c             |  2 +-
  target/ppc/translate_init.c.inc | 16 ++++++++++------
 files changed, 54 insertions(+), 42 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-      */
+     z2 = arg_info(op->args[2])->z_mask;
-     void (*synchronize_from_tb)(CPUState *cpu,
+     ctx->z_mask = z1 & z2;
-                                 const struct TranslationBlock *tb);
-+    /** @cpu_exec_enter: Callback for cpu_exec preparation */
++    /*
-+    void (*cpu_exec_enter)(CPUState *cpu);
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
-+    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
++     * Bitwise operations preserve the relative quantity of the repetitions.
-+    void (*cpu_exec_exit)(CPUState *cpu);
++     */
-+    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
++                & arg_info(op->args[2])->s_mask;
++
- } TcgCpuOperations;
+     /*
+      * Known-zeros does not imply known-ones.  Therefore unless
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+      * arg2 is constant, we can't infer affected bits from it.
-  * @gdb_get_dynamic_xml: Callback to return dynamically generated XML for the
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
   *   gdb stub. Returns a pointer to the XML contents for the specified XML file
   *   or NULL if the CPU doesn't have a dynamically generated content for it.
 - * @cpu_exec_enter: Callback for cpu_exec preparation.
 - * @cpu_exec_exit: Callback for cpu_exec cleanup.
 - * @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec.
   * @disas_set_info: Setup architecture specific components of disassembly info
   * @adjust_watchpoint_address: Perform a target-specific adjustment to an
   * address before attempting to match it against watchpoints.
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      const char *gdb_core_xml_file;
      gchar * (*gdb_arch_name)(CPUState *cpu);
      const char * (*gdb_get_dynamic_xml)(CPUState *cpu, const char *xmlname);
 -    void (*cpu_exec_enter)(CPUState *cpu);
 -    void (*cpu_exec_exit)(CPUState *cpu);
 -    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
      void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
      vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_enter(CPUState *cpu)
  {
      CPUClass *cc = CPU_GET_CLASS(cpu);
 -    if (cc->cpu_exec_enter) {
 -        cc->cpu_exec_enter(cpu);
 +    if (cc->tcg_ops.cpu_exec_enter) {
 +        cc->tcg_ops.cpu_exec_enter(cpu);
      }
+     ctx->z_mask = z1;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
++                & arg_info(op->args[2])->s_mask;
+     return fold_masks(ctx, op);
  }
-@@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
- {
+         fold_xi_to_not(ctx, op, 0)) {
-     CPUClass *cc = CPU_GET_CLASS(cpu);
+         return true;
 -    if (cc->cpu_exec_exit) {
 -        cc->cpu_exec_exit(cpu);
 +    if (cc->tcg_ops.cpu_exec_exit) {
 +        cc->tcg_ops.cpu_exec_exit(cpu);
      }
++
++    ctx->s_mask = arg_info(op->args[1])->s_mask
++                & arg_info(op->args[2])->s_mask;
+     return false;
  }
-@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-            True when it is, and we should restart on a new TB,
-            and via longjmp via cpu_loop_exit.  */
+     ctx->z_mask = arg_info(op->args[3])->z_mask
-         else {
+                 | arg_info(op->args[4])->z_mask;
--            if (cc->cpu_exec_interrupt &&
++    ctx->s_mask = arg_info(op->args[3])->s_mask
--                cc->cpu_exec_interrupt(cpu, interrupt_request)) {
++                & arg_info(op->args[4])->s_mask;
-+            if (cc->tcg_ops.cpu_exec_interrupt &&
-+                cc->tcg_ops.cpu_exec_interrupt(cpu, interrupt_request)) {
+     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                 if (need_replay_interrupt(interrupt_request)) {
+         uint64_t tv = arg_info(op->args[3])->val;
-                     replay_interrupt();
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
-                 }
+         fold_xi_to_not(ctx, op, -1)) {
-diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
+         return true;
-index XXXXXXX..XXXXXXX 100644
+     }
 --- a/target/alpha/cpu.c
 +++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = alpha_cpu_class_by_name;
      cc->has_work = alpha_cpu_has_work;
      cc->do_interrupt = alpha_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = alpha_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
      cc->dump_state = alpha_cpu_dump_state;
      cc->set_pc = alpha_cpu_set_pc;
      cc->gdb_read_register = alpha_cpu_gdb_read_register;
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = arm_cpu_class_by_name;
      cc->has_work = arm_cpu_has_work;
 -    cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
      cc->dump_state = arm_cpu_dump_state;
      cc->set_pc = arm_cpu_set_pc;
      cc->gdb_read_register = arm_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
      cc->disas_set_info = arm_disas_set_info;
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = arm_translate_init;
 +    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
      cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
      cc->tlb_fill = arm_cpu_tlb_fill;
      cc->debug_excp_handler = arm_debug_excp_handler;
 diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu64.c
 +++ b/target/arm/cpu64.c
@@ -XXX,XX +XXX,XX @@ static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
  {
      CPUClass *cc = CPU_CLASS(oc);
 -    cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
 +#ifdef CONFIG_TCG
 +    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
 +#endif /* CONFIG_TCG */
 +
-     cc->gdb_read_register = aarch64_cpu_gdb_read_register;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-     cc->gdb_write_register = aarch64_cpu_gdb_write_register;
++                & arg_info(op->args[2])->s_mask;
-     cc->gdb_num_core_regs = 34;
+     return false;
-diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
+ }
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/cpu_tcg.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
-+++ b/target/arm/cpu_tcg.c
+         fold_xi_to_not(ctx, op, 0)) {
-@@ -XXX,XX +XXX,XX @@
+         return true;
  /* CPU models. These are not needed for the AArch64 linux-user build. */
  #if !defined(CONFIG_USER_ONLY) || !defined(TARGET_AARCH64)
 +#ifdef CONFIG_TCG
  static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
  {
      CPUClass *cc = CPU_GET_CLASS(cs);
@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
      }
-     return ret;
++
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
-+#endif /* CONFIG_TCG */
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
- static void arm926_initfn(Object *obj)
+         return true;
- {
+     }
-@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
-     cc->do_interrupt = arm_v7m_cpu_do_interrupt;
++    ctx->s_mask = arg_info(op->args[1])->s_mask;
  #endif
 -    cc->cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
 +#ifdef CONFIG_TCG
 +    cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
 +#endif /* CONFIG_TCG */
 +
-     cc->gdb_core_xml_file = "arm-m-profile.xml";
+     /* Because of fold_to_not, we want to always return true, via finish. */
      finish_folding(ctx, op);
      return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
-diff --git a/target/avr/cpu.c b/target/avr/cpu.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+         fold_ix_to_not(ctx, op, 0)) {
---- a/target/avr/cpu.c
+         return true;
-+++ b/target/avr/cpu.c
+     }
-@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
++
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-     cc->has_work = avr_cpu_has_work;
++                & arg_info(op->args[2])->s_mask;
-     cc->do_interrupt = avr_cpu_do_interrupt;
+     return false;
 -    cc->cpu_exec_interrupt = avr_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
      cc->dump_state = avr_cpu_dump_state;
      cc->set_pc = avr_cpu_set_pc;
      cc->memory_rw_debug = avr_cpu_memory_rw_debug;
 diff --git a/target/cris/cpu.c b/target/cris/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/cpu.c
 +++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = cris_cpu_class_by_name;
      cc->has_work = cris_cpu_has_work;
      cc->do_interrupt = cris_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = cris_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
      cc->dump_state = cris_cpu_dump_state;
      cc->set_pc = cris_cpu_set_pc;
      cc->gdb_read_register = cris_cpu_gdb_read_register;
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = hppa_cpu_class_by_name;
      cc->has_work = hppa_cpu_has_work;
      cc->do_interrupt = hppa_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = hppa_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
      cc->dump_state = hppa_cpu_dump_state;
      cc->set_pc = hppa_cpu_set_pc;
      cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
 diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
  void tcg_cpu_common_class_init(CPUClass *cc)
  {
      cc->do_interrupt = x86_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = x86_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
      cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
 -    cc->cpu_exec_enter = x86_cpu_exec_enter;
 -    cc->cpu_exec_exit = x86_cpu_exec_exit;
 +    cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
 +    cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
      cc->tcg_ops.initialize = tcg_x86_init;
      cc->tlb_fill = x86_cpu_tlb_fill;
  #ifndef CONFIG_USER_ONLY
 diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/lm32/cpu.c
 +++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = lm32_cpu_class_by_name;
      cc->has_work = lm32_cpu_has_work;
      cc->do_interrupt = lm32_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = lm32_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
      cc->dump_state = lm32_cpu_dump_state;
      cc->set_pc = lm32_cpu_set_pc;
      cc->gdb_read_register = lm32_cpu_gdb_read_register;
 diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.c
 +++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = m68k_cpu_class_by_name;
      cc->has_work = m68k_cpu_has_work;
      cc->do_interrupt = m68k_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = m68k_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
      cc->dump_state = m68k_cpu_dump_state;
      cc->set_pc = m68k_cpu_set_pc;
      cc->gdb_read_register = m68k_cpu_gdb_read_register;
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
      cc->has_work = mb_cpu_has_work;
      cc->do_interrupt = mb_cpu_do_interrupt;
      cc->do_unaligned_access = mb_cpu_do_unaligned_access;
 -    cc->cpu_exec_interrupt = mb_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
      cc->dump_state = mb_cpu_dump_state;
      cc->set_pc = mb_cpu_set_pc;
      cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = mips_cpu_class_by_name;
      cc->has_work = mips_cpu_has_work;
      cc->do_interrupt = mips_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = mips_cpu_exec_interrupt;
      cc->dump_state = mips_cpu_dump_state;
      cc->set_pc = mips_cpu_set_pc;
      cc->gdb_read_register = mips_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
      cc->disas_set_info = mips_cpu_disas_set_info;
  #ifdef CONFIG_TCG
      cc->tcg_ops.initialize = mips_tcg_init;
 +    cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
      cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
      cc->tlb_fill = mips_cpu_tlb_fill;
  #endif
 diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/cpu.c
 +++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = nios2_cpu_class_by_name;
      cc->has_work = nios2_cpu_has_work;
      cc->do_interrupt = nios2_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = nios2_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
      cc->dump_state = nios2_cpu_dump_state;
      cc->set_pc = nios2_cpu_set_pc;
      cc->disas_set_info = nios2_cpu_disas_set_info;
 diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/cpu.c
 +++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = openrisc_cpu_class_by_name;
      cc->has_work = openrisc_cpu_has_work;
      cc->do_interrupt = openrisc_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
      cc->dump_state = openrisc_cpu_dump_state;
      cc->set_pc = openrisc_cpu_set_pc;
      cc->gdb_read_register = openrisc_cpu_gdb_read_register;
 diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu.c
 +++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
      cc->class_by_name = riscv_cpu_class_by_name;
      cc->has_work = riscv_cpu_has_work;
      cc->do_interrupt = riscv_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = riscv_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
      cc->dump_state = riscv_cpu_dump_state;
      cc->set_pc = riscv_cpu_set_pc;
      cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
      cc->class_by_name = rx_cpu_class_by_name;
      cc->has_work = rx_cpu_has_work;
      cc->do_interrupt = rx_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = rx_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
      cc->dump_state = rx_cpu_dump_state;
      cc->set_pc = rx_cpu_set_pc;
      cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
 diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
      cc->get_crash_info = s390_cpu_get_crash_info;
      cc->write_elf64_note = s390_cpu_write_elf64_note;
  #ifdef CONFIG_TCG
 -    cc->cpu_exec_interrupt = s390_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
      cc->debug_excp_handler = s390x_cpu_debug_excp_handler;
      cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
  #endif
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = superh_cpu_class_by_name;
      cc->has_work = superh_cpu_has_work;
      cc->do_interrupt = superh_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = superh_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
      cc->dump_state = superh_cpu_dump_state;
      cc->set_pc = superh_cpu_set_pc;
      cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
      cc->parse_features = sparc_cpu_parse_features;
      cc->has_work = sparc_cpu_has_work;
      cc->do_interrupt = sparc_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = sparc_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
      cc->dump_state = sparc_cpu_dump_state;
  #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
      cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
 diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tilegx/cpu.c
 +++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = tilegx_cpu_class_by_name;
      cc->has_work = tilegx_cpu_has_work;
      cc->do_interrupt = tilegx_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
      cc->dump_state = tilegx_cpu_dump_state;
      cc->set_pc = tilegx_cpu_set_pc;
      cc->tlb_fill = tilegx_cpu_tlb_fill;
 diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/unicore32/cpu.c
 +++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = uc32_cpu_class_by_name;
      cc->has_work = uc32_cpu_has_work;
      cc->do_interrupt = uc32_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = uc32_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
      cc->dump_state = uc32_cpu_dump_state;
      cc->set_pc = uc32_cpu_set_pc;
      cc->tlb_fill = uc32_cpu_tlb_fill;
 diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.c
 +++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
      cc->class_by_name = xtensa_cpu_class_by_name;
      cc->has_work = xtensa_cpu_has_work;
      cc->do_interrupt = xtensa_cpu_do_interrupt;
 -    cc->cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
 +    cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
      cc->dump_state = xtensa_cpu_dump_state;
      cc->set_pc = xtensa_cpu_set_pc;
      cc->gdb_read_register = xtensa_cpu_gdb_read_register;
 diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate_init.c.inc
 +++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_reset(DeviceState *dev)
  }
- #ifndef CONFIG_USER_ONLY
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
-+
- static bool ppc_cpu_is_big_endian(CPUState *cs)
+     ctx->z_mask = arg_info(op->args[1])->z_mask
- {
+                 | arg_info(op->args[2])->z_mask;
-     PowerPCCPU *cpu = POWERPC_CPU(cs);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-@@ -XXX,XX +XXX,XX @@ static bool ppc_cpu_is_big_endian(CPUState *cs)
++                & arg_info(op->args[2])->s_mask;
-     return !msr_le;
+     return fold_masks(ctx, op);
  }
-+#ifdef CONFIG_TCG
- static void ppc_cpu_exec_enter(CPUState *cs)
- {
-     PowerPCCPU *cpu = POWERPC_CPU(cs);
-@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_exec_exit(CPUState *cs)
-         vhc->cpu_exec_exit(cpu->vhyp, cpu);
-     }
- }
--#endif
-+#endif /* CONFIG_TCG */
-+
-+#endif /* !CONFIG_USER_ONLY */
- static void ppc_cpu_instance_init(Object *obj)
- {
-@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
-     cc->class_by_name = ppc_cpu_class_by_name;
-     cc->has_work = ppc_cpu_has_work;
-     cc->do_interrupt = ppc_cpu_do_interrupt;
--    cc->cpu_exec_interrupt = ppc_cpu_exec_interrupt;
-     cc->dump_state = ppc_cpu_dump_state;
-     cc->dump_statistics = ppc_cpu_dump_statistics;
-     cc->set_pc = ppc_cpu_set_pc;
-@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
- #endif
- #ifdef CONFIG_TCG
-     cc->tcg_ops.initialize = ppc_translate_init;
-+    cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
-     cc->tlb_fill = ppc_cpu_tlb_fill;
--#endif
- #ifndef CONFIG_USER_ONLY
--    cc->cpu_exec_enter = ppc_cpu_exec_enter;
--    cc->cpu_exec_exit = ppc_cpu_exec_exit;
--#endif
-+    cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
-+    cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
-+#endif /* !CONFIG_USER_ONLY */
-+#endif /* CONFIG_TCG */
-     cc->disas_set_info = ppc_disas_set_info;
 --
 .25.1

-[PULL 41/46] cpu: move adjust_watchpoint_address to tcg_ops
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-From: Claudio Fontana <cfontana@suse.de>
+The result is either 0 or 1, which means that we have
 a 2 bit signed result, and thus 62 bits of sign.
 For clarity, use the smask_from_zmask function.
-commit 40612000599e ("arm: Correctly handle watchpoints for BE32 CPUs")
-introduced this ARM-specific, TCG-specific hack to adjust the address,
-before checking it with cpu_check_watchpoint.
-Make adjust_watchpoint_address optional and move it to tcg_ops.
-Signed-off-by: Claudio Fontana <cfontana@suse.de>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Message-Id: <20210204163931.7358-14-cfontana@suse.de>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h | 6 +++++-
+ tcg/optimize.c | 2 ++
- hw/core/cpu.c         | 6 ------
+file changed, 2 insertions(+)
  softmmu/physmem.c     | 5 ++++-
  target/arm/cpu.c      | 2 +-
 files changed, 10 insertions(+), 9 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
-     void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
+     }
-                                 MMUAccessType access_type,
-                                 int mmu_idx, uintptr_t retaddr);
+     ctx->z_mask = 1;
-+    /**
++    ctx->s_mask = smask_from_zmask(1);
-+     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
+     return false;
 +     */
 +    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 +
  } TcgCpuOperations;
  /**
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
      const char * (*gdb_get_dynamic_xml)(CPUState *cpu, const char *xmlname);
      void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
 -    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
      const char *deprecation_note;
      /* Keep non-pointer data at the end to minimize holes.  */
 diff --git a/hw/core/cpu.c b/hw/core/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/core/cpu.c
 +++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static int64_t cpu_common_get_arch_id(CPUState *cpu)
      return cpu->cpu_index;
  }
--static vaddr cpu_adjust_watchpoint_address(CPUState *cpu, vaddr addr, int len)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 -{
 -    return addr;
 -}
 -
  static Property cpu_common_props[] = {
  #ifndef CONFIG_USER_ONLY
      /* Create a memory property for softmmu CPU object,
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
      k->gdb_write_register = cpu_common_gdb_write_register;
      k->virtio_is_big_endian = cpu_common_virtio_is_big_endian;
      k->debug_check_watchpoint = cpu_common_debug_check_watchpoint;
 -    k->adjust_watchpoint_address = cpu_adjust_watchpoint_address;
      set_bit(DEVICE_CATEGORY_CPU, dc->categories);
      dc->realize = cpu_common_realizefn;
      dc->unrealize = cpu_common_unrealizefn;
 diff --git a/softmmu/physmem.c b/softmmu/physmem.c
 index XXXXXXX..XXXXXXX 100644
 --- a/softmmu/physmem.c
 +++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
          return;
      }
--    addr = cc->adjust_watchpoint_address(cpu, addr, len);
+     ctx->z_mask = 1;
-+    if (cc->tcg_ops.adjust_watchpoint_address) {
++    ctx->s_mask = smask_from_zmask(1);
-+        /* this is currently used only by ARM BE32 */
+     return false;
-+        addr = cc->tcg_ops.adjust_watchpoint_address(cpu, addr, len);
-+    }
+  do_setcond_const:
      QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
          if (watchpoint_address_matches(wp, addr, len)
              && (wp->flags & flags)) {
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
  #if !defined(CONFIG_USER_ONLY)
      cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
      cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
 -    cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
 +    cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
      cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
  #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
  #endif /* CONFIG_TCG */
 --
 .25.1

-[PULL 03/46] tcg/aarch64: Do not convert TCGArg to temps that are not temps
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-Fixes INDEX_op_rotli_vec for aarch64 host, where the 3rd
+The results are generally 6 bit unsigned values, though
-argument is an integer, not a temporary, which now tickles
+the count leading and trailing bits may produce any value
-an assert added in e89b28a6350.
+for a zero input.
-Previously, the value computed into v2 would be garbage for
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-rotli_vec, but as the value was unused it caused no harm.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.c.inc | 7 ++++---
+ tcg/optimize.c | 3 ++-
-file changed, 4 insertions(+), 3 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
-     v0 = temp_tcgv_vec(arg_temp(a0));
+         g_assert_not_reached();
-     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+     }
-     a2 = va_arg(va, TCGArg);
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
--    v2 = temp_tcgv_vec(arg_temp(a2));
+-
-+    va_end(va);
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
+     return false;
-     switch (opc) {
+ }
-     case INDEX_op_rotli_vec:
-@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
      case INDEX_op_shrv_vec:
      case INDEX_op_sarv_vec:
          /* Right shifts are negative left shifts for AArch64.  */
 +        v2 = temp_tcgv_vec(arg_temp(a2));
          t1 = tcg_temp_new_vec(type);
          tcg_gen_neg_vec(vece, t1, v2);
          opc = (opc == INDEX_op_shrv_vec
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
          break;
      case INDEX_op_rotlv_vec:
 +        v2 = temp_tcgv_vec(arg_temp(a2));
          t1 = tcg_temp_new_vec(type);
          c1 = tcg_constant_vec(type, vece, 8 << vece);
          tcg_gen_sub_vec(vece, t1, v2, c1);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
          break;
      case INDEX_op_rotrv_vec:
 +        v2 = temp_tcgv_vec(arg_temp(a2));
          t1 = tcg_temp_new_vec(type);
          t2 = tcg_temp_new_vec(type);
          c1 = tcg_constant_vec(type, vece, 8 << vece);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
      default:
          g_assert_not_reached();
      }
--
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
--    va_end(va);
+     return false;
  }
- static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 --
 .25.1

-[PULL 26/46] tcg/tci: Restrict TCG_TARGET_NB_REGS to 16
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-As noted in several comments, 8 regs is not enough for 32-bit
+For constant shifts, we can simply shift the s_mask.
-to perform calls, as currently implemented.  Shortly, we will
-rearrange the encoding which will make 32 regs impossible.
+For variable shifts, we know that sar does not reduce
 the s_mask, which helps for sequences like
     ext32s_i64  t, in
     sar_i64     t, t, v
     ext32s_i64  out, t
 allowing the final extend to be eliminated.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.h     | 32 +++++---------------------------
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
- tcg/tci/tcg-target.c.inc | 26 --------------------------
+file changed, 47 insertions(+), 3 deletions(-)
 files changed, 5 insertions(+), 53 deletions(-)
-diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.h
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
- #define TCG_TARGET_HAS_mulu2_i32        1
+     return ~(~0ull >> rep);
- #endif /* TCG_TARGET_REG_BITS == 64 */
+ }
--/* Number of registers available.
++/*
--   For 32 bit hosts, we need more than 8 registers (call arguments). */
++ * Recreate a properly left-aligned smask after manipulation.
--/* #define TCG_TARGET_NB_REGS 8 */
++ * Some bit-shuffling, particularly shifts and rotates, may
-+/* Number of registers available. */
++ * retain sign bits on the left, but may scatter disconnected
- #define TCG_TARGET_NB_REGS 16
++ * sign bits on the right.  Retain only what remains to the left.
--/* #define TCG_TARGET_NB_REGS 32 */
++ */
++static uint64_t smask_from_smask(int64_t smask)
- /* List of registers which are used by TCG. */
++{
- typedef enum {
++    /* Only the 1 bits are significant for smask */
-@@ -XXX,XX +XXX,XX @@ typedef enum {
++    return smask_from_zmask(~smask);
-     TCG_REG_R5,
++}
      TCG_REG_R6,
      TCG_REG_R7,
 -#if TCG_TARGET_NB_REGS >= 16
      TCG_REG_R8,
      TCG_REG_R9,
      TCG_REG_R10,
@@ -XXX,XX +XXX,XX @@ typedef enum {
      TCG_REG_R13,
      TCG_REG_R14,
      TCG_REG_R15,
 -#if TCG_TARGET_NB_REGS >= 32
 -    TCG_REG_R16,
 -    TCG_REG_R17,
 -    TCG_REG_R18,
 -    TCG_REG_R19,
 -    TCG_REG_R20,
 -    TCG_REG_R21,
 -    TCG_REG_R22,
 -    TCG_REG_R23,
 -    TCG_REG_R24,
 -    TCG_REG_R25,
 -    TCG_REG_R26,
 -    TCG_REG_R27,
 -    TCG_REG_R28,
 -    TCG_REG_R29,
 -    TCG_REG_R30,
 -    TCG_REG_R31,
 -#endif
 -#endif
 +
-+    TCG_AREG0 = TCG_REG_R14,
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-+    TCG_REG_CALL_STACK = TCG_REG_R15,
+ {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t s_mask, z_mask, sign;
 +
-     /* Special value UINT8_MAX is used by TCI to encode constant values. */
+     if (fold_const2(ctx, op) ||
-     TCG_CONST = UINT8_MAX
+         fold_ix_to_i(ctx, op, 0) ||
- } TCGReg;
+         fold_xi_to_x(ctx, op, 0)) {
+         return true;
--#define TCG_AREG0                       (TCG_TARGET_NB_REGS - 2)
+     }
--
- /* Used for function call generation. */
++    s_mask = arg_info(op->args[1])->s_mask;
--#define TCG_REG_CALL_STACK              (TCG_TARGET_NB_REGS - 1)
++    z_mask = arg_info(op->args[1])->z_mask;
- #define TCG_TARGET_CALL_STACK_OFFSET    0
++
- #define TCG_TARGET_STACK_ALIGN          16
+     if (arg_is_const(op->args[2])) {
+-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+-                                          arg_info(op->args[1])->z_mask,
-index XXXXXXX..XXXXXXX 100644
+-                                          arg_info(op->args[2])->val);
---- a/tcg/tci/tcg-target.c.inc
++        int sh = arg_info(op->args[2])->val;
-+++ b/tcg/tci/tcg-target.c.inc
++
-@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
++        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
-     TCG_REG_R5,
++
-     TCG_REG_R6,
++        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
-     TCG_REG_R7,
++        ctx->s_mask = smask_from_smask(s_mask);
--#if TCG_TARGET_NB_REGS >= 16
++
-     TCG_REG_R8,
+         return fold_masks(ctx, op);
-     TCG_REG_R9,
+     }
-     TCG_REG_R10,
++
-@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
++    switch (op->opc) {
-     TCG_REG_R13,
++    CASE_OP_32_64(sar):
-     TCG_REG_R14,
++        /*
-     TCG_REG_R15,
++         * Arithmetic right shift will not reduce the number of
--#endif
++         * input sign repetitions.
- };
++         */
++        ctx->s_mask = s_mask;
- #if MAX_OPC_PARAM_IARGS != 6
++        break;
-@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_iarg_regs[] = {
++    CASE_OP_32_64(shr):
- #if TCG_TARGET_REG_BITS == 32
++        /*
-     /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
++         * If the sign bit is known zero, then logical right shift
-     TCG_REG_R7,
++         * will not reduce the number of input sign repetitions.
--#if TCG_TARGET_NB_REGS >= 16
++         */
-     TCG_REG_R8,
++        sign = (s_mask & -s_mask) >> 1;
-     TCG_REG_R9,
++        if (!(z_mask & sign)) {
-     TCG_REG_R10,
++            ctx->s_mask = s_mask;
-     TCG_REG_R11,
++        }
-     TCG_REG_R12,
++        break;
--#else
++    default:
--# error Too few input registers available
++        break;
--#endif
++    }
- #endif
++
- };
+     return false;
+ }
@@ -XXX,XX +XXX,XX @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
      "r05",
      "r06",
      "r07",
 -#if TCG_TARGET_NB_REGS >= 16
      "r08",
      "r09",
      "r10",
@@ -XXX,XX +XXX,XX @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
      "r13",
      "r14",
      "r15",
 -#if TCG_TARGET_NB_REGS >= 32
 -    "r16",
 -    "r17",
 -    "r18",
 -    "r19",
 -    "r20",
 -    "r21",
 -    "r22",
 -    "r23",
 -    "r24",
 -    "r25",
 -    "r26",
 -    "r27",
 -    "r28",
 -    "r29",
 -    "r30",
 -    "r31"
 -#endif
 -#endif
  };
  #endif
 --
 .25.1

The following changes since commit d0dddab40e472ba62b5f43f11cc7dba085dabe71:

Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging (2021-02-05 15:27:02 +0000)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20210205

for you to fetch changes up to fb6916dd6ca8bb4b42d44baba9c67ecaf2279577:

accel: introduce AccelCPUClass extending CPUClass (2021-02-05 10:24:15 -1000)

----------------------------------------------------------------
TCGCPUOps cleanups (claudio)
tcg/s390 compare fix (phil)
tcg/aarch64 rotli_vec fix
tcg/tci cleanups and fixes

----------------------------------------------------------------
Claudio Fontana (13):
      target/riscv: remove CONFIG_TCG, as it is always TCG
      accel/tcg: split TCG-only code from cpu_exec_realizefn
      target/arm: do not use cc->do_interrupt for KVM directly
      cpu: move cc->do_interrupt to tcg_ops
      cpu: move cc->transaction_failed to tcg_ops
      cpu: move do_unaligned_access to tcg_ops
      physmem: make watchpoint checking code TCG-only
      cpu: move adjust_watchpoint_address to tcg_ops
      cpu: move debug_check_watchpoint to tcg_ops
      cpu: tcg_ops: move to tcg-cpu-ops.h, keep a pointer in CPUClass
      accel: extend AccelState and AccelClass to user-mode
      accel: replace struct CpusAccel with AccelOpsClass
      accel: introduce AccelCPUClass extending CPUClass

Eduardo Habkost (5):
      cpu: Introduce TCGCpuOperations struct
      cpu: Move synchronize_from_tb() to tcg_ops
      cpu: Move cpu_exec_* to tcg_ops
      cpu: Move tlb_fill to tcg_ops
      cpu: Move debug_excp_handler to tcg_ops

Philippe Mathieu-Daudé (2):
      tcg/s390: Fix compare instruction from extended-immediate facility
      exec/cpu-defs: Remove TCG backends dependency

Richard Henderson (24):
      tcg/aarch64: Do not convert TCGArg to temps that are not temps
      configure: Fix --enable-tcg-interpreter
      tcg/tci: Make tci_tb_ptr thread-local
      tcg/tci: Inline tci_write_reg32s into the only caller
      tcg/tci: Inline tci_write_reg8 into its callers
      tcg/tci: Inline tci_write_reg16 into the only caller
      tcg/tci: Inline tci_write_reg32 into all callers
      tcg/tci: Inline tci_write_reg64 into 64-bit callers
      tcg/tci: Merge INDEX_op_ld8u_{i32,i64}
      tcg/tci: Merge INDEX_op_ld8s_{i32,i64}
      tcg/tci: Merge INDEX_op_ld16u_{i32,i64}
      tcg/tci: Merge INDEX_op_ld16s_{i32,i64}
      tcg/tci: Merge INDEX_op_{ld_i32,ld32u_i64}
      tcg/tci: Merge INDEX_op_st8_{i32,i64}
      tcg/tci: Merge INDEX_op_st16_{i32,i64}
      tcg/tci: Move stack bounds check to compile-time
      tcg/tci: Merge INDEX_op_{st_i32,st32_i64}
      tcg/tci: Use g_assert_not_reached
      tcg/tci: Remove dead code for TCG_TARGET_HAS_div2_*
      tcg/tci: Implement 64-bit division
      tcg/tci: Remove TODO as unused
      tcg/tci: Restrict TCG_TARGET_NB_REGS to 16
      tcg/tci: Fix TCG_REG_R4 misusage
      tcg/tci: Remove TCG_CONST

Stefan Weil (2):
      tcg/tci: Implement INDEX_op_ld16s_i32
      tcg/tci: Implement INDEX_op_ld8s_i64

configure                                          |   5 +-
 accel/accel-softmmu.h                              |  15 +
 accel/kvm/kvm-cpus.h                               |   2 -
 .../{tcg-cpus-icount.h => tcg-accel-ops-icount.h}  |   2 +
 accel/tcg/tcg-accel-ops-mttcg.h                    |  19 +
 accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h}    |   0
 accel/tcg/{tcg-cpus.h => tcg-accel-ops.h}          |   6 +-
 include/exec/cpu-all.h                             |  11 +-
 include/exec/cpu-defs.h                            |   3 -
 include/exec/exec-all.h                            |   2 +-
 include/hw/boards.h                                |   2 +-
 include/hw/core/accel-cpu.h                        |  38 ++
 include/hw/core/cpu.h                              |  86 +---
 include/hw/core/tcg-cpu-ops.h                      |  97 +++++
 include/{sysemu => qemu}/accel.h                   |  16 +-
 include/sysemu/accel-ops.h                         |  45 ++
 include/sysemu/cpus.h                              |  26 +-
 include/sysemu/hvf.h                               |   2 +-
 include/sysemu/kvm.h                               |   2 +-
 include/sysemu/kvm_int.h                           |   2 +-
 target/arm/internals.h                             |   6 +
 target/i386/hax/{hax-cpus.h => hax-accel-ops.h}    |   2 -
 target/i386/hax/hax-windows.h                      |   2 +-
 target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h}    |   2 -
 target/i386/hvf/hvf-i386.h                         |   2 +-
 target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} |   2 -
 tcg/tci/tcg-target-con-set.h                       |   6 +-
 tcg/tci/tcg-target.h                               |  37 +-
 accel/accel-common.c                               | 105 +++++
 accel/{accel.c => accel-softmmu.c}                 |  61 ++-
 accel/accel-user.c                                 |  24 ++
 accel/kvm/{kvm-cpus.c => kvm-accel-ops.c}          |  28 +-
 accel/kvm/kvm-all.c                                |   2 -
 accel/qtest/qtest.c                                |  25 +-
 accel/tcg/cpu-exec.c                               |  53 ++-
 accel/tcg/cputlb.c                                 |  34 +-
 .../{tcg-cpus-icount.c => tcg-accel-ops-icount.c}  |  21 +-
 .../{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c}    |  14 +-
 accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c}    |  13 +-
 accel/tcg/{tcg-cpus.c => tcg-accel-ops.c}          |  47 +-
 accel/tcg/tcg-all.c                                |  19 +-
 accel/tcg/user-exec.c                              |   8 +-
 accel/xen/xen-all.c                                |  26 +-
 bsd-user/main.c                                    |  11 +-
 cpu.c                                              |  66 +--
 hw/core/cpu.c                                      |  21 +-
 hw/mips/jazz.c                                     |  12 +-
 linux-user/main.c                                  |   7 +-
 softmmu/cpus.c                                     |  12 +-
 softmmu/memory.c                                   |   2 +-
 softmmu/physmem.c                                  | 149 ++++---
 softmmu/qtest.c                                    |   2 +-
 softmmu/vl.c                                       |   9 +-
 target/alpha/cpu.c                                 |  21 +-
 target/arm/cpu.c                                   |  45 +-
 target/arm/cpu64.c                                 |   4 +-
 target/arm/cpu_tcg.c                               |  32 +-
 target/arm/helper.c                                |   4 +
 target/arm/kvm64.c                                 |   6 +-
 target/avr/cpu.c                                   |  19 +-
 target/avr/helper.c                                |   5 +-
 target/cris/cpu.c                                  |  43 +-
 target/cris/helper.c                               |   5 +-
 target/hppa/cpu.c                                  |  24 +-
 target/i386/hax/{hax-cpus.c => hax-accel-ops.c}    |  33 +-
 target/i386/hax/hax-all.c                          |   7 +-
 target/i386/hax/hax-mem.c                          |   2 +-
 target/i386/hax/hax-posix.c                        |   2 +-
 target/i386/hax/hax-windows.c                      |   2 +-
 target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c}    |  29 +-
 target/i386/hvf/hvf.c                              |   5 +-
 target/i386/hvf/x86_task.c                         |   2 +-
 target/i386/hvf/x86hvf.c                           |   2 +-
 target/i386/tcg/tcg-cpu.c                          |  26 +-
 target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} |  33 +-
 target/i386/whpx/whpx-all.c                        |   9 +-
 target/lm32/cpu.c                                  |  19 +-
 target/m68k/cpu.c                                  |  19 +-
 target/microblaze/cpu.c                            |  25 +-
 target/mips/cpu.c                                  |  35 +-
 target/moxie/cpu.c                                 |  15 +-
 target/nios2/cpu.c                                 |  18 +-
 target/openrisc/cpu.c                              |  17 +-
 target/riscv/cpu.c                                 |  26 +-
 target/riscv/cpu_helper.c                          |   2 +-
 target/rx/cpu.c                                    |  20 +-
 target/s390x/cpu.c                                 |  33 +-
 target/s390x/excp_helper.c                         |   2 +-
 target/sh4/cpu.c                                   |  21 +-
 target/sparc/cpu.c                                 |  25 +-
 target/tilegx/cpu.c                                |  17 +-
 target/tricore/cpu.c                               |  12 +-
 target/unicore32/cpu.c                             |  17 +-
 target/xtensa/cpu.c                                |  23 +-
 target/xtensa/helper.c                             |   4 +-
 tcg/tcg-common.c                                   |   4 -
 tcg/tci.c                                          | 479 ++++++++-------------
 target/ppc/translate_init.c.inc                    |  39 +-
 tcg/aarch64/tcg-target.c.inc                       |   7 +-
 tcg/s390/tcg-target.c.inc                          |   2 +-
 tcg/tci/tcg-target.c.inc                           | 149 ++-----
 MAINTAINERS                                        |   7 +-
 accel/kvm/meson.build                              |   2 +-
 accel/meson.build                                  |   4 +-
 accel/tcg/meson.build                              |  10 +-
 target/i386/hax/meson.build                        |   2 +-
 target/i386/hvf/meson.build                        |   2 +-
 target/i386/whpx/meson.build                       |   2 +-
 108 files changed, 1565 insertions(+), 1065 deletions(-)
 create mode 100644 accel/accel-softmmu.h
 rename accel/tcg/{tcg-cpus-icount.h => tcg-accel-ops-icount.h} (88%)
 create mode 100644 accel/tcg/tcg-accel-ops-mttcg.h
 rename accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} (100%)
 rename accel/tcg/{tcg-cpus.h => tcg-accel-ops.h} (72%)
 create mode 100644 include/hw/core/accel-cpu.h
 create mode 100644 include/hw/core/tcg-cpu-ops.h
 rename include/{sysemu => qemu}/accel.h (94%)
 create mode 100644 include/sysemu/accel-ops.h
 rename target/i386/hax/{hax-cpus.h => hax-accel-ops.h} (95%)
 rename target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h} (94%)
 rename target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} (96%)
 create mode 100644 accel/accel-common.c
 rename accel/{accel.c => accel-softmmu.c} (64%)
 create mode 100644 accel/accel-user.c
 rename accel/kvm/{kvm-cpus.c => kvm-accel-ops.c} (72%)
 rename accel/tcg/{tcg-cpus-icount.c => tcg-accel-ops-icount.c} (89%)
 rename accel/tcg/{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} (92%)
 rename accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} (97%)
 rename accel/tcg/{tcg-cpus.c => tcg-accel-ops.c} (63%)
 rename target/i386/hax/{hax-cpus.c => hax-accel-ops.c} (69%)
 rename target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c} (84%)
 rename target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} (71%)

From: Philippe Mathieu-Daudé <f4bug@amsat.org>

The code is currently comparing c2 to the type promotion of
uint32_t and int32_t. That is, the conversion rules are as follows:

(common_type) c2 == (common_type) (uint32_t)
                        (is_unsigned
                        ? (uint32_t)c2
                        : (uint32_t)(int32_t)c2)

In the signed case we lose the desired sign extensions because
of the argument promotion rules of the ternary operator.

Solve the problem by round-tripping the value through the
intermediate type and back to the desired common type (all in
one expression).

Fixes: a534bb15f30 ("tcg/s390: Use constant pool for cmpi")
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Miroslav Rezanina <mrezanin@redhat.com>
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20210204182902.1742826-1-f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1,
                 op = (is_unsigned ? RIL_CLFI : RIL_CFI);
                 tcg_out_insn_RIL(s, op, r1, c2);
                 goto exit;
-            } else if (c2 == (is_unsigned ? (uint32_t)c2 : (int32_t)c2)) {
+            } else if (c2 == (is_unsigned ? (TCGArg)(uint32_t)c2 : (TCGArg)(int32_t)c2)) {
                 op = (is_unsigned ? RIL_CLGFI : RIL_CGFI);
                 tcg_out_insn_RIL(s, op, r1, c2);
                 goto exit;
-- 
2.25.1

From: Philippe Mathieu-Daudé <f4bug@amsat.org>

"exec/cpu-defs.h" contains generic CPU definitions for the
TCG frontends (mostly related to TLB). TCG backend definitions
aren't relevant here.

See tcg/README description:

4) Backend

tcg-target.h contains the target specific definitions. tcg-target.c.inc
  contains the target specific code; it is #included by tcg/tcg.c, rather
  than being a standalone C file.

So far only "tcg/tcg.h" requires these headers.

Remove the "tcg-target.h" header dependency on TCG frontends, so we
don't have to rebuild all frontends when hacking a single backend.

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20210204191423.1754158-1-f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-defs.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/host-utils.h"
 #include "qemu/thread.h"
-#ifdef CONFIG_TCG
-#include "tcg-target.h"
-#endif
 #ifndef CONFIG_USER_ONLY
 #include "exec/hwaddr.h"
 #endif
-- 
2.25.1

Fixes INDEX_op_rotli_vec for aarch64 host, where the 3rd
argument is an integer, not a temporary, which now tickles
an assert added in e89b28a6350.

Previously, the value computed into v2 would be garbage for
rotli_vec, but as the value was unused it caused no harm.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     v0 = temp_tcgv_vec(arg_temp(a0));
     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
     a2 = va_arg(va, TCGArg);
-    v2 = temp_tcgv_vec(arg_temp(a2));
+    va_end(va);
 
     switch (opc) {
     case INDEX_op_rotli_vec:
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
         /* Right shifts are negative left shifts for AArch64.  */
+        v2 = temp_tcgv_vec(arg_temp(a2));
         t1 = tcg_temp_new_vec(type);
         tcg_gen_neg_vec(vece, t1, v2);
         opc = (opc == INDEX_op_shrv_vec
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
         break;
 
     case INDEX_op_rotlv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
         t1 = tcg_temp_new_vec(type);
         c1 = tcg_constant_vec(type, vece, 8 << vece);
         tcg_gen_sub_vec(vece, t1, v2, c1);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
         break;
 
     case INDEX_op_rotrv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
         c1 = tcg_constant_vec(type, vece, 8 << vece);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     default:
         g_assert_not_reached();
     }
-
-    va_end(va);
 }
 
 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
-- 
2.25.1

The configure option was backward, and we failed to
pass the value on to meson.

Fixes: 23a77b2d18b ("build-system: clean up TCG/TCI configury")
Tested-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 configure | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --enable-whpx) whpx="enabled"
   ;;
-  --disable-tcg-interpreter) tcg_interpreter="true"
+  --disable-tcg-interpreter) tcg_interpreter="false"
   ;;
-  --enable-tcg-interpreter) tcg_interpreter="false"
+  --enable-tcg-interpreter) tcg_interpreter="true"
   ;;
   --disable-cap-ng)  cap_ng="disabled"
   ;;
@@ -XXX,XX +XXX,XX @@ NINJA=$ninja $meson setup \
         -Dvhost_user_blk_server=$vhost_user_blk_server \
         -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \
         $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \
+	-Dtcg_interpreter=$tcg_interpreter \
         $cross_arg \
         "$PWD" "$source_path"
 
-- 
2.25.1

Each thread must have its own pc, even under TCI.

Remove the GETPC ifdef, because GETPC is always available for
helpers, and thus is always required.  Move the assignment
under INDEX_op_call, because the value is only visible when
we make a call to a helper function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20210204014509.882821-6-richard.henderson@linaro.org>
---
 include/exec/exec-all.h | 2 +-
 tcg/tcg-common.c        | 4 ----
 tcg/tci.c               | 7 +++----
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
 
 /* GETPC is the true target of the return instruction that we'll execute.  */
 #if defined(CONFIG_TCG_INTERPRETER)
-extern uintptr_t tci_tb_ptr;
+extern __thread uintptr_t tci_tb_ptr;
 # define GETPC() tci_tb_ptr
 #else
 # define GETPC() \
diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-common.c
+++ b/tcg/tcg-common.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "tcg/tcg.h"
 
-#if defined(CONFIG_TCG_INTERPRETER)
-uintptr_t tci_tb_ptr;
-#endif
-
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) \
          { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
                                     tcg_target_ulong, tcg_target_ulong);
 #endif
 
+__thread uintptr_t tci_tb_ptr;
+
 static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
 {
     tci_assert(index < TCG_TARGET_NB_REGS);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 #endif
         TCGMemOpIdx oi;
 
-#if defined(GETPC)
-        tci_tb_ptr = (uintptr_t)tb_ptr;
-#endif
-
         /* Skip opcode and size entry. */
         tb_ptr += 2;
 
         switch (opc) {
         case INDEX_op_call:
             t0 = tci_read_ri(regs, &tb_ptr);
+            tci_tb_ptr = (uintptr_t)tb_ptr;
 #if TCG_TARGET_REG_BITS == 32
             tmp64 = ((helper_function)t0)(tci_read_reg(regs, TCG_REG_R0),
                                           tci_read_reg(regs, TCG_REG_R1),
-- 
2.25.1

From: Stefan Weil <sw@weilnetz.de>

The ld16s_i32 TCG opcode is used by debian-buster (arm64) running ffmpeg:

qemu-aarch64 /usr/bin/ffmpeg -i theora.mkv theora.webm

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reported-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Message-Id: <20210128024814.2056958-1-sw@weilnetz.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             TODO();
             break;
         case INDEX_op_ld16s_i32:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_r(regs, &tb_ptr);
+            t2 = tci_read_s32(&tb_ptr);
+            tci_write_reg(regs, t0, *(int16_t *)(t1 + t2));
             break;
         case INDEX_op_ld_i32:
             t0 = *tb_ptr++;
-- 
2.25.1

From: Stefan Weil <sw@weilnetz.de>

The ld8s_i64 TCG opcode is used by debian-buster (arm64) running ffmpeg:

qemu-aarch64 /usr/bin/ffmpeg -i theora.mkv theora.webm

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reported-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Message-Id: <20210128020425.2055454-1-sw@weilnetz.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             tci_write_reg8(regs, t0, *(uint8_t *)(t1 + t2));
             break;
         case INDEX_op_ld8s_i64:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_r(regs, &tb_ptr);
+            t2 = tci_read_s32(&tb_ptr);
+            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
             break;
         case INDEX_op_ld16u_i64:
             t0 = *tb_ptr++;
-- 
2.25.1

For a 64-bit TCI, the upper bits of a 32-bit operation are
undefined (much like a native ppc64 32-bit operation).  It
simplifies everything if we don't force-extend the result.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 66 +++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 36 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
     regs[index] = value;
 }
 
-static void
-tci_write_reg32(tcg_target_ulong *regs, TCGReg index, uint32_t value)
-{
-    tci_write_reg(regs, index, value);
-}
-
 #if TCG_TARGET_REG_BITS == 32
 static void tci_write_reg64(tcg_target_ulong *regs, uint32_t high_index,
                             uint32_t low_index, uint64_t value)
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t1 = tci_read_r32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
             condition = *tb_ptr++;
-            tci_write_reg32(regs, t0, tci_compare32(t1, t2, condition));
+            tci_write_reg(regs, t0, tci_compare32(t1, t2, condition));
             break;
 #if TCG_TARGET_REG_BITS == 32
         case INDEX_op_setcond2_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             tmp64 = tci_read_r64(regs, &tb_ptr);
             v64 = tci_read_ri64(regs, &tb_ptr);
             condition = *tb_ptr++;
-            tci_write_reg32(regs, t0, tci_compare64(tmp64, v64, condition));
+            tci_write_reg(regs, t0, tci_compare64(tmp64, v64, condition));
             break;
 #elif TCG_TARGET_REG_BITS == 64
         case INDEX_op_setcond_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_mov_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
         case INDEX_op_tci_movi_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_i32(&tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 
             /* Load/store operations (32 bit). */
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg32(regs, t0, *(uint32_t *)(t1 + t2));
+            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
             break;
         case INDEX_op_st8_i32:
             t0 = tci_read_r8(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 + t2);
+            tci_write_reg(regs, t0, t1 + t2);
             break;
         case INDEX_op_sub_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 - t2);
+            tci_write_reg(regs, t0, t1 - t2);
             break;
         case INDEX_op_mul_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 * t2);
+            tci_write_reg(regs, t0, t1 * t2);
             break;
 #if TCG_TARGET_HAS_div_i32
         case INDEX_op_div_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, (int32_t)t1 / (int32_t)t2);
+            tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2);
             break;
         case INDEX_op_divu_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 / t2);
+            tci_write_reg(regs, t0, t1 / t2);
             break;
         case INDEX_op_rem_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, (int32_t)t1 % (int32_t)t2);
+            tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2);
             break;
         case INDEX_op_remu_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 % t2);
+            tci_write_reg(regs, t0, t1 % t2);
             break;
 #elif TCG_TARGET_HAS_div2_i32
         case INDEX_op_div2_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 & t2);
+            tci_write_reg(regs, t0, t1 & t2);
             break;
         case INDEX_op_or_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 | t2);
+            tci_write_reg(regs, t0, t1 | t2);
             break;
         case INDEX_op_xor_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 ^ t2);
+            tci_write_reg(regs, t0, t1 ^ t2);
             break;
 
             /* Shift/rotate operations (32 bit). */
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 << (t2 & 31));
+            tci_write_reg(regs, t0, t1 << (t2 & 31));
             break;
         case INDEX_op_shr_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1 >> (t2 & 31));
+            tci_write_reg(regs, t0, t1 >> (t2 & 31));
             break;
         case INDEX_op_sar_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, ((int32_t)t1 >> (t2 & 31)));
+            tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31)));
             break;
 #if TCG_TARGET_HAS_rot_i32
         case INDEX_op_rotl_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, rol32(t1, t2 & 31));
+            tci_write_reg(regs, t0, rol32(t1, t2 & 31));
             break;
         case INDEX_op_rotr_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
             t2 = tci_read_ri32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, ror32(t1, t2 & 31));
+            tci_write_reg(regs, t0, ror32(t1, t2 & 31));
             break;
 #endif
 #if TCG_TARGET_HAS_deposit_i32
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             tmp16 = *tb_ptr++;
             tmp8 = *tb_ptr++;
             tmp32 = (((1 << tmp8) - 1) << tmp16);
-            tci_write_reg32(regs, t0, (t1 & ~tmp32) | ((t2 << tmp16) & tmp32));
+            tci_write_reg(regs, t0, (t1 & ~tmp32) | ((t2 << tmp16) & tmp32));
             break;
 #endif
         case INDEX_op_brcond_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_ext8s_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r8s(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext16s_i32
         case INDEX_op_ext16s_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r16s(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext8u_i32
         case INDEX_op_ext8u_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r8(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext16u_i32
         case INDEX_op_ext16u_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r16(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_bswap16_i32
         case INDEX_op_bswap16_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r16(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, bswap16(t1));
+            tci_write_reg(regs, t0, bswap16(t1));
             break;
 #endif
 #if TCG_TARGET_HAS_bswap32_i32
         case INDEX_op_bswap32_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, bswap32(t1));
+            tci_write_reg(regs, t0, bswap32(t1));
             break;
 #endif
 #if TCG_TARGET_HAS_not_i32
         case INDEX_op_not_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, ~t1);
+            tci_write_reg(regs, t0, ~t1);
             break;
 #endif
 #if TCG_TARGET_HAS_neg_i32
         case INDEX_op_neg_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg32(regs, t0, -t1);
+            tci_write_reg(regs, t0, -t1);
             break;
 #endif
 #if TCG_TARGET_REG_BITS == 64
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg32(regs, t0, *(uint32_t *)(t1 + t2));
+            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
             break;
         case INDEX_op_ld32s_i64:
             t0 = *tb_ptr++;
-- 
2.25.1

Note that we had two functions named tci_write_reg64: a 32-bit version
which took two register numbers and a 64-bit version which was a
no-op wrapper for tci_write_reg.  After this, we are left with
only the 32-bit version.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 60 +++++++++++++++++++++++++------------------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ static void tci_write_reg64(tcg_target_ulong *regs, uint32_t high_index,
     tci_write_reg(regs, low_index, value);
     tci_write_reg(regs, high_index, value >> 32);
 }
-#elif TCG_TARGET_REG_BITS == 64
-static void
-tci_write_reg64(tcg_target_ulong *regs, TCGReg index, uint64_t value)
-{
-    tci_write_reg(regs, index, value);
-}
 #endif
 
 #if TCG_TARGET_REG_BITS == 32
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t1 = tci_read_r64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
             condition = *tb_ptr++;
-            tci_write_reg64(regs, t0, tci_compare64(t1, t2, condition));
+            tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
             break;
 #endif
         case INDEX_op_mov_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_mov_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
         case INDEX_op_tci_movi_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_i64(&tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 
             /* Load/store operations (64 bit). */
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg64(regs, t0, *(uint64_t *)(t1 + t2));
+            tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
             break;
         case INDEX_op_st8_i64:
             t0 = tci_read_r8(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 + t2);
+            tci_write_reg(regs, t0, t1 + t2);
             break;
         case INDEX_op_sub_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 - t2);
+            tci_write_reg(regs, t0, t1 - t2);
             break;
         case INDEX_op_mul_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 * t2);
+            tci_write_reg(regs, t0, t1 * t2);
             break;
 #if TCG_TARGET_HAS_div_i64
         case INDEX_op_div_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 & t2);
+            tci_write_reg(regs, t0, t1 & t2);
             break;
         case INDEX_op_or_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 | t2);
+            tci_write_reg(regs, t0, t1 | t2);
             break;
         case INDEX_op_xor_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 ^ t2);
+            tci_write_reg(regs, t0, t1 ^ t2);
             break;
 
             /* Shift/rotate operations (64 bit). */
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 << (t2 & 63));
+            tci_write_reg(regs, t0, t1 << (t2 & 63));
             break;
         case INDEX_op_shr_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1 >> (t2 & 63));
+            tci_write_reg(regs, t0, t1 >> (t2 & 63));
             break;
         case INDEX_op_sar_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, ((int64_t)t1 >> (t2 & 63)));
+            tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63)));
             break;
 #if TCG_TARGET_HAS_rot_i64
         case INDEX_op_rotl_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, rol64(t1, t2 & 63));
+            tci_write_reg(regs, t0, rol64(t1, t2 & 63));
             break;
         case INDEX_op_rotr_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
             t2 = tci_read_ri64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, ror64(t1, t2 & 63));
+            tci_write_reg(regs, t0, ror64(t1, t2 & 63));
             break;
 #endif
 #if TCG_TARGET_HAS_deposit_i64
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             tmp16 = *tb_ptr++;
             tmp8 = *tb_ptr++;
             tmp64 = (((1ULL << tmp8) - 1) << tmp16);
-            tci_write_reg64(regs, t0, (t1 & ~tmp64) | ((t2 << tmp16) & tmp64));
+            tci_write_reg(regs, t0, (t1 & ~tmp64) | ((t2 << tmp16) & tmp64));
             break;
 #endif
         case INDEX_op_brcond_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_ext8u_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r8(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext8s_i64
         case INDEX_op_ext8s_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r8s(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext16s_i64
         case INDEX_op_ext16s_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r16s(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext16u_i64
         case INDEX_op_ext16u_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r16(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #endif
 #if TCG_TARGET_HAS_ext32s_i64
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_ext_i32_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r32s(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #if TCG_TARGET_HAS_ext32u_i64
         case INDEX_op_ext32u_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_extu_i32_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, t1);
+            tci_write_reg(regs, t0, t1);
             break;
 #if TCG_TARGET_HAS_bswap16_i64
         case INDEX_op_bswap16_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r16(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, bswap16(t1));
+            tci_write_reg(regs, t0, bswap16(t1));
             break;
 #endif
 #if TCG_TARGET_HAS_bswap32_i64
         case INDEX_op_bswap32_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, bswap32(t1));
+            tci_write_reg(regs, t0, bswap32(t1));
             break;
 #endif
 #if TCG_TARGET_HAS_bswap64_i64
         case INDEX_op_bswap64_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, bswap64(t1));
+            tci_write_reg(regs, t0, bswap64(t1));
             break;
 #endif
 #if TCG_TARGET_HAS_not_i64
         case INDEX_op_not_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, ~t1);
+            tci_write_reg(regs, t0, ~t1);
             break;
 #endif
 #if TCG_TARGET_HAS_neg_i64
         case INDEX_op_neg_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r64(regs, &tb_ptr);
-            tci_write_reg64(regs, t0, -t1);
+            tci_write_reg(regs, t0, -t1);
             break;
 #endif
 #endif /* TCG_TARGET_REG_BITS == 64 */
-- 
2.25.1

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
 # define qemu_st_beq(X)  stq_be_p(g2h(taddr), X)
 #endif
 
+#if TCG_TARGET_REG_BITS == 64
+# define CASE_32_64(x) \
+        case glue(glue(INDEX_op_, x), _i64): \
+        case glue(glue(INDEX_op_, x), _i32):
+# define CASE_64(x) \
+        case glue(glue(INDEX_op_, x), _i64):
+#else
+# define CASE_32_64(x) \
+        case glue(glue(INDEX_op_, x), _i32):
+# define CASE_64(x)
+#endif
+
 /* Interpret pseudo code in tb. */
 /*
  * Disable CFI checks.
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (32 bit). */
 
-        case INDEX_op_ld8u_i32:
+        CASE_32_64(ld8u)
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (64 bit). */
 
-        case INDEX_op_ld8u_i64:
-            t0 = *tb_ptr++;
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
-            break;
         case INDEX_op_ld8s_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

Eliminating a TODO for ld8s_i32.

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint8_t *)(t1 + t2));
             break;
-        case INDEX_op_ld8s_i32:
-            TODO();
+        CASE_32_64(ld8s)
+            t0 = *tb_ptr++;
+            t1 = tci_read_r(regs, &tb_ptr);
+            t2 = tci_read_s32(&tb_ptr);
+            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
             break;
         case INDEX_op_ld16u_i32:
             TODO();
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (64 bit). */
 
-        case INDEX_op_ld8s_i64:
-            t0 = *tb_ptr++;
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
-            break;
         case INDEX_op_ld16u_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

Eliminating a TODO for ld16u_i32.

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(int8_t *)(t1 + t2));
             break;
-        case INDEX_op_ld16u_i32:
-            TODO();
+        CASE_32_64(ld16u)
+            t0 = *tb_ptr++;
+            t1 = tci_read_r(regs, &tb_ptr);
+            t2 = tci_read_s32(&tb_ptr);
+            tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
             break;
         case INDEX_op_ld16s_i32:
             t0 = *tb_ptr++;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (64 bit). */
 
-        case INDEX_op_ld16u_i64:
-            t0 = *tb_ptr++;
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
-            break;
         case INDEX_op_ld16s_i64:
             TODO();
             break;
-- 
2.25.1

Eliminating a TODO for ld16s_i64.

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint16_t *)(t1 + t2));
             break;
-        case INDEX_op_ld16s_i32:
+        CASE_32_64(ld16s)
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (64 bit). */
 
-        case INDEX_op_ld16s_i64:
-            TODO();
-            break;
         case INDEX_op_ld32u_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             tci_write_reg(regs, t0, *(int16_t *)(t1 + t2));
             break;
         case INDEX_op_ld_i32:
+        CASE_64(ld32u)
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
             /* Load/store operations (64 bit). */
 
-        case INDEX_op_ld32u_i64:
-            t0 = *tb_ptr++;
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
-            break;
         case INDEX_op_ld32s_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
             break;
-        case INDEX_op_st8_i32:
+        CASE_32_64(st8)
             t0 = tci_read_r8(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
             break;
-        case INDEX_op_st8_i64:
-            t0 = tci_read_r8(regs, &tb_ptr);
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            *(uint8_t *)(t1 + t2) = t0;
-            break;
         case INDEX_op_st16_i64:
             t0 = tci_read_r16(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             *(uint8_t *)(t1 + t2) = t0;
             break;
-        case INDEX_op_st16_i32:
+        CASE_32_64(st16)
             t0 = tci_read_r16(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
             break;
-        case INDEX_op_st16_i64:
-            t0 = tci_read_r16(regs, &tb_ptr);
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            *(uint16_t *)(t1 + t2) = t0;
-            break;
         case INDEX_op_st32_i64:
             t0 = tci_read_r32(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

The existing check was incomplete:
(1) Only applied to two of the 7 stores, and not to the loads at all.
(2) Only checked the upper, but not the lower bound of the stack.

Doing this at compile time means that we don't need to do it
at runtime as well.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c                |  2 --
 tcg/tci/tcg-target.c.inc | 13 +++++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = tci_read_r32(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
-            tci_assert(t1 != sp_value || (int32_t)t2 < 0);
             *(uint32_t *)(t1 + t2) = t0;
             break;
 
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t0 = tci_read_r64(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
-            tci_assert(t1 != sp_value || (int32_t)t2 < 0);
             *(uint64_t *)(t1 + t2) = t0;
             break;
 
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tci_out_label(TCGContext *s, TCGLabel *label)
     }
 }
 
+static void stack_bounds_check(TCGReg base, target_long offset)
+{
+    if (base == TCG_REG_CALL_STACK) {
+        tcg_debug_assert(offset < 0);
+        tcg_debug_assert(offset >= -(CPU_TEMP_BUF_NLONGS * sizeof(long)));
+    }
+}
+
 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
                        intptr_t arg2)
 {
     uint8_t *old_code_ptr = s->code_ptr;
+
+    stack_bounds_check(arg1, arg2);
     if (type == TCG_TYPE_I32) {
         tcg_out_op_t(s, INDEX_op_ld_i32);
         tcg_out_r(s, ret);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
+        stack_bounds_check(args[1], args[2]);
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
         tcg_debug_assert(args[2] == (int32_t)args[2]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
                        intptr_t arg2)
 {
     uint8_t *old_code_ptr = s->code_ptr;
+
+    stack_bounds_check(arg1, arg2);
     if (type == TCG_TYPE_I32) {
         tcg_out_op_t(s, INDEX_op_st_i32);
         tcg_out_r(s, arg);
-- 
2.25.1

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             *(uint16_t *)(t1 + t2) = t0;
             break;
         case INDEX_op_st_i32:
+        CASE_64(st32)
             t0 = tci_read_r32(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
             t2 = tci_read_s32(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_s32(&tb_ptr);
             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
             break;
-        case INDEX_op_st32_i64:
-            t0 = tci_read_r32(regs, &tb_ptr);
-            t1 = tci_read_r(regs, &tb_ptr);
-            t2 = tci_read_s32(&tb_ptr);
-            *(uint32_t *)(t1 + t2) = t0;
-            break;
         case INDEX_op_st_i64:
             t0 = tci_read_r64(regs, &tb_ptr);
             t1 = tci_read_r(regs, &tb_ptr);
-- 
2.25.1

Three TODO instances are never-happen cases.
Other uses of tcg_abort are also indicating unreachable cases.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ static bool tci_compare32(uint32_t u0, uint32_t u1, TCGCond condition)
         result = (u0 > u1);
         break;
     default:
-        TODO();
+        g_assert_not_reached();
     }
     return result;
 }
@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
         result = (u0 > u1);
         break;
     default:
-        TODO();
+        g_assert_not_reached();
     }
     return result;
 }
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                 tmp32 = qemu_ld_beul;
                 break;
             default:
-                tcg_abort();
+                g_assert_not_reached();
             }
             tci_write_reg(regs, t0, tmp32);
             break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                 tmp64 = qemu_ld_beq;
                 break;
             default:
-                tcg_abort();
+                g_assert_not_reached();
             }
             tci_write_reg(regs, t0, tmp64);
             if (TCG_TARGET_REG_BITS == 32) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                 qemu_st_bel(t0);
                 break;
             default:
-                tcg_abort();
+                g_assert_not_reached();
             }
             break;
         case INDEX_op_qemu_st_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                 qemu_st_beq(tmp64);
                 break;
             default:
-                tcg_abort();
+                g_assert_not_reached();
             }
             break;
         case INDEX_op_mb:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             smp_mb();
             break;
         default:
-            TODO();
-            break;
+            g_assert_not_reached();
         }
         tci_assert(tb_ptr == old_code_ptr + op_size);
     }
-- 
2.25.1

We do not simultaneously support div and div2 -- it's one
or the other.  TCI is already using div, so remove div2.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c                | 12 ------------
 tcg/tci/tcg-target.c.inc |  8 --------
 2 files changed, 20 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_ri32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 * t2);
             break;
-#if TCG_TARGET_HAS_div_i32
         case INDEX_op_div_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_ri32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 % t2);
             break;
-#elif TCG_TARGET_HAS_div2_i32
-        case INDEX_op_div2_i32:
-        case INDEX_op_divu2_i32:
-            TODO();
-            break;
-#endif
         case INDEX_op_and_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_remu_i64:
             TODO();
             break;
-#elif TCG_TARGET_HAS_div2_i64
-        case INDEX_op_div2_i64:
-        case INDEX_op_divu2_i64:
-            TODO();
-            break;
 #endif
         case INDEX_op_and_i64:
             t0 = *tb_ptr++;
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
         TODO();
         break;
-    case INDEX_op_div2_i64:     /* Optional (TCG_TARGET_HAS_div2_i64). */
-    case INDEX_op_divu2_i64:    /* Optional (TCG_TARGET_HAS_div2_i64). */
-        TODO();
-        break;
     case INDEX_op_brcond_i64:
         tcg_out_r(s, args[0]);
         tcg_out_ri64(s, const_args[1], args[1]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out_ri32(s, const_args[1], args[1]);
         tcg_out_ri32(s, const_args[2], args[2]);
         break;
-    case INDEX_op_div2_i32:     /* Optional (TCG_TARGET_HAS_div2_i32). */
-    case INDEX_op_divu2_i32:    /* Optional (TCG_TARGET_HAS_div2_i32). */
-        TODO();
-        break;
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-- 
2.25.1

Trivially implemented like other arithmetic.
Tested via check-tcg and the ppc64 target.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci/tcg-target.h     |  4 ++--
 tcg/tci.c                | 28 ++++++++++++++++++++++------
 tcg/tci/tcg-target.c.inc | 10 ++++------
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_TARGET_HAS_extract_i64      0
 #define TCG_TARGET_HAS_sextract_i64     0
 #define TCG_TARGET_HAS_extract2_i64     0
-#define TCG_TARGET_HAS_div_i64          0
-#define TCG_TARGET_HAS_rem_i64          0
+#define TCG_TARGET_HAS_div_i64          1
+#define TCG_TARGET_HAS_rem_i64          1
 #define TCG_TARGET_HAS_ext8s_i64        1
 #define TCG_TARGET_HAS_ext16s_i64       1
 #define TCG_TARGET_HAS_ext32s_i64       1
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             t2 = tci_read_ri64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 * t2);
             break;
-#if TCG_TARGET_HAS_div_i64
         case INDEX_op_div_i64:
-        case INDEX_op_divu_i64:
-        case INDEX_op_rem_i64:
-        case INDEX_op_remu_i64:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri64(regs, &tb_ptr);
+            t2 = tci_read_ri64(regs, &tb_ptr);
+            tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2);
+            break;
+        case INDEX_op_divu_i64:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri64(regs, &tb_ptr);
+            t2 = tci_read_ri64(regs, &tb_ptr);
+            tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2);
+            break;
+        case INDEX_op_rem_i64:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri64(regs, &tb_ptr);
+            t2 = tci_read_ri64(regs, &tb_ptr);
+            tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2);
+            break;
+        case INDEX_op_remu_i64:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri64(regs, &tb_ptr);
+            t2 = tci_read_ri64(regs, &tb_ptr);
+            tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
             break;
-#endif
         case INDEX_op_and_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(regs, &tb_ptr);
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_sar_i64:
     case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
     case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
+    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
+    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
         tcg_out_r(s, args[0]);
         tcg_out_ri64(s, const_args[1], args[1]);
         tcg_out_ri64(s, const_args[2], args[2]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_debug_assert(args[4] <= UINT8_MAX);
         tcg_out8(s, args[4]);
         break;
-    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
-    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
-    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
-    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
-        TODO();
-        break;
     case INDEX_op_brcond_i64:
         tcg_out_r(s, args[0]);
         tcg_out_ri64(s, const_args[1], args[1]);
-- 
2.25.1

As noted in several comments, 8 regs is not enough for 32-bit
to perform calls, as currently implemented.  Shortly, we will
rearrange the encoding which will make 32 regs impossible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci/tcg-target.h     | 32 +++++---------------------------
 tcg/tci/tcg-target.c.inc | 26 --------------------------
 2 files changed, 5 insertions(+), 53 deletions(-)

diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_TARGET_HAS_mulu2_i32        1
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
-/* Number of registers available.
-   For 32 bit hosts, we need more than 8 registers (call arguments). */
-/* #define TCG_TARGET_NB_REGS 8 */
+/* Number of registers available. */
 #define TCG_TARGET_NB_REGS 16
-/* #define TCG_TARGET_NB_REGS 32 */
 
 /* List of registers which are used by TCG. */
 typedef enum {
@@ -XXX,XX +XXX,XX @@ typedef enum {
     TCG_REG_R5,
     TCG_REG_R6,
     TCG_REG_R7,
-#if TCG_TARGET_NB_REGS >= 16
     TCG_REG_R8,
     TCG_REG_R9,
     TCG_REG_R10,
@@ -XXX,XX +XXX,XX @@ typedef enum {
     TCG_REG_R13,
     TCG_REG_R14,
     TCG_REG_R15,
-#if TCG_TARGET_NB_REGS >= 32
-    TCG_REG_R16,
-    TCG_REG_R17,
-    TCG_REG_R18,
-    TCG_REG_R19,
-    TCG_REG_R20,
-    TCG_REG_R21,
-    TCG_REG_R22,
-    TCG_REG_R23,
-    TCG_REG_R24,
-    TCG_REG_R25,
-    TCG_REG_R26,
-    TCG_REG_R27,
-    TCG_REG_R28,
-    TCG_REG_R29,
-    TCG_REG_R30,
-    TCG_REG_R31,
-#endif
-#endif
+
+    TCG_AREG0 = TCG_REG_R14,
+    TCG_REG_CALL_STACK = TCG_REG_R15,
+
     /* Special value UINT8_MAX is used by TCI to encode constant values. */
     TCG_CONST = UINT8_MAX
 } TCGReg;
 
-#define TCG_AREG0                       (TCG_TARGET_NB_REGS - 2)
-
 /* Used for function call generation. */
-#define TCG_REG_CALL_STACK              (TCG_TARGET_NB_REGS - 1)
 #define TCG_TARGET_CALL_STACK_OFFSET    0
 #define TCG_TARGET_STACK_ALIGN          16
 
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R5,
     TCG_REG_R6,
     TCG_REG_R7,
-#if TCG_TARGET_NB_REGS >= 16
     TCG_REG_R8,
     TCG_REG_R9,
     TCG_REG_R10,
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R13,
     TCG_REG_R14,
     TCG_REG_R15,
-#endif
 };
 
 #if MAX_OPC_PARAM_IARGS != 6
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_iarg_regs[] = {
 #if TCG_TARGET_REG_BITS == 32
     /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
     TCG_REG_R7,
-#if TCG_TARGET_NB_REGS >= 16
     TCG_REG_R8,
     TCG_REG_R9,
     TCG_REG_R10,
     TCG_REG_R11,
     TCG_REG_R12,
-#else
-# error Too few input registers available
-#endif
 #endif
 };
 
@@ -XXX,XX +XXX,XX @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "r05",
     "r06",
     "r07",
-#if TCG_TARGET_NB_REGS >= 16
     "r08",
     "r09",
     "r10",
@@ -XXX,XX +XXX,XX @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
     "r13",
     "r14",
     "r15",
-#if TCG_TARGET_NB_REGS >= 32
-    "r16",
-    "r17",
-    "r18",
-    "r19",
-    "r20",
-    "r21",
-    "r22",
-    "r23",
-    "r24",
-    "r25",
-    "r26",
-    "r27",
-    "r28",
-    "r29",
-    "r30",
-    "r31"
-#endif
-#endif
 };
 #endif
 
-- 
2.25.1

R4 was removed from tcg_target_reg_alloc_order and
tcg_target_call_iarg_regs on the assumption that it
was the stack.  This was incorrectly copied from i386.
For tci, the stack is R15.

Now that R4 is back in tcg_target_call_iarg_regs, adjust the other
entries so that 6 (or 12) entries are still present in the array,
and adjust the register numbers in the interpreter to match.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c                | 8 ++++----
 tcg/tci/tcg-target.c.inc | 7 +------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                                           tci_read_reg(regs, TCG_REG_R1),
                                           tci_read_reg(regs, TCG_REG_R2),
                                           tci_read_reg(regs, TCG_REG_R3),
+                                          tci_read_reg(regs, TCG_REG_R4),
                                           tci_read_reg(regs, TCG_REG_R5),
                                           tci_read_reg(regs, TCG_REG_R6),
                                           tci_read_reg(regs, TCG_REG_R7),
                                           tci_read_reg(regs, TCG_REG_R8),
                                           tci_read_reg(regs, TCG_REG_R9),
                                           tci_read_reg(regs, TCG_REG_R10),
-                                          tci_read_reg(regs, TCG_REG_R11),
-                                          tci_read_reg(regs, TCG_REG_R12));
+                                          tci_read_reg(regs, TCG_REG_R11));
             tci_write_reg(regs, TCG_REG_R0, tmp64);
             tci_write_reg(regs, TCG_REG_R1, tmp64 >> 32);
 #else
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
                                           tci_read_reg(regs, TCG_REG_R1),
                                           tci_read_reg(regs, TCG_REG_R2),
                                           tci_read_reg(regs, TCG_REG_R3),
-                                          tci_read_reg(regs, TCG_REG_R5),
-                                          tci_read_reg(regs, TCG_REG_R6));
+                                          tci_read_reg(regs, TCG_REG_R4),
+                                          tci_read_reg(regs, TCG_REG_R5));
             tci_write_reg(regs, TCG_REG_R0, tmp64);
 #endif
             break;
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R1,
     TCG_REG_R2,
     TCG_REG_R3,
-#if 0 /* used for TCG_REG_CALL_STACK */
     TCG_REG_R4,
-#endif
     TCG_REG_R5,
     TCG_REG_R6,
     TCG_REG_R7,
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_call_iarg_regs[] = {
     TCG_REG_R1,
     TCG_REG_R2,
     TCG_REG_R3,
-#if 0 /* used for TCG_REG_CALL_STACK */
     TCG_REG_R4,
-#endif
     TCG_REG_R5,
-    TCG_REG_R6,
 #if TCG_TARGET_REG_BITS == 32
     /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
+    TCG_REG_R6,
     TCG_REG_R7,
     TCG_REG_R8,
     TCG_REG_R9,
     TCG_REG_R10,
     TCG_REG_R11,
-    TCG_REG_R12,
 #endif
 };
 
-- 
2.25.1

Restrict all operands to registers.  All constants will be forced
into registers by the middle-end.  Removing the difference in how
immediate integers were encoded will allow more code to be shared
between 32-bit and 64-bit operations.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci/tcg-target-con-set.h |   6 +-
 tcg/tci/tcg-target.h         |   3 -
 tcg/tci.c                    | 189 +++++++++++++----------------------
 tcg/tci/tcg-target.c.inc     |  85 ++++------------
 4 files changed, 89 insertions(+), 194 deletions(-)

diff --git a/tcg/tci/tcg-target-con-set.h b/tcg/tci/tcg-target-con-set.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target-con-set.h
+++ b/tcg/tci/tcg-target-con-set.h
@@ -XXX,XX +XXX,XX @@
  * tcg-target-con-str.h; the constraint combination is inclusive or.
  */
 C_O0_I2(r, r)
-C_O0_I2(r, ri)
 C_O0_I3(r, r, r)
-C_O0_I4(r, r, ri, ri)
 C_O0_I4(r, r, r, r)
 C_O1_I1(r, r)
 C_O1_I2(r, 0, r)
-C_O1_I2(r, ri, ri)
 C_O1_I2(r, r, r)
-C_O1_I2(r, r, ri)
-C_O1_I4(r, r, r, ri, ri)
+C_O1_I4(r, r, r, r, r)
 C_O2_I1(r, r, r)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, r, r, r, r)
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
 
     TCG_AREG0 = TCG_REG_R14,
     TCG_REG_CALL_STACK = TCG_REG_R15,
-
-    /* Special value UINT8_MAX is used by TCI to encode constant values. */
-    TCG_CONST = UINT8_MAX
 } TCGReg;
 
 /* Used for function call generation. */
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ tci_read_ulong(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
     return taddr;
 }
 
-/* Read indexed register or constant (native size) from bytecode. */
-static tcg_target_ulong
-tci_read_ri(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
-{
-    tcg_target_ulong value;
-    TCGReg r = **tb_ptr;
-    *tb_ptr += 1;
-    if (r == TCG_CONST) {
-        value = tci_read_i(tb_ptr);
-    } else {
-        value = tci_read_reg(regs, r);
-    }
-    return value;
-}
-
-/* Read indexed register or constant (32 bit) from bytecode. */
-static uint32_t tci_read_ri32(const tcg_target_ulong *regs,
-                              const uint8_t **tb_ptr)
-{
-    uint32_t value;
-    TCGReg r = **tb_ptr;
-    *tb_ptr += 1;
-    if (r == TCG_CONST) {
-        value = tci_read_i32(tb_ptr);
-    } else {
-        value = tci_read_reg32(regs, r);
-    }
-    return value;
-}
-
-#if TCG_TARGET_REG_BITS == 32
-/* Read two indexed registers or constants (2 * 32 bit) from bytecode. */
-static uint64_t tci_read_ri64(const tcg_target_ulong *regs,
-                              const uint8_t **tb_ptr)
-{
-    uint32_t low = tci_read_ri32(regs, tb_ptr);
-    return tci_uint64(tci_read_ri32(regs, tb_ptr), low);
-}
-#elif TCG_TARGET_REG_BITS == 64
-/* Read indexed register or constant (64 bit) from bytecode. */
-static uint64_t tci_read_ri64(const tcg_target_ulong *regs,
-                              const uint8_t **tb_ptr)
-{
-    uint64_t value;
-    TCGReg r = **tb_ptr;
-    *tb_ptr += 1;
-    if (r == TCG_CONST) {
-        value = tci_read_i64(tb_ptr);
-    } else {
-        value = tci_read_reg64(regs, r);
-    }
-    return value;
-}
-#endif
-
 static tcg_target_ulong tci_read_label(const uint8_t **tb_ptr)
 {
     tcg_target_ulong label = tci_read_i(tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         switch (opc) {
         case INDEX_op_call:
-            t0 = tci_read_ri(regs, &tb_ptr);
+            t0 = tci_read_i(&tb_ptr);
             tci_tb_ptr = (uintptr_t)tb_ptr;
 #if TCG_TARGET_REG_BITS == 32
             tmp64 = ((helper_function)t0)(tci_read_reg(regs, TCG_REG_R0),
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_setcond_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_r32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             condition = *tb_ptr++;
             tci_write_reg(regs, t0, tci_compare32(t1, t2, condition));
             break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_setcond2_i32:
             t0 = *tb_ptr++;
             tmp64 = tci_read_r64(regs, &tb_ptr);
-            v64 = tci_read_ri64(regs, &tb_ptr);
+            v64 = tci_read_r64(regs, &tb_ptr);
             condition = *tb_ptr++;
             tci_write_reg(regs, t0, tci_compare64(tmp64, v64, condition));
             break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
         case INDEX_op_setcond_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_r64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             condition = *tb_ptr++;
             tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
             break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         case INDEX_op_add_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 + t2);
             break;
         case INDEX_op_sub_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 - t2);
             break;
         case INDEX_op_mul_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 * t2);
             break;
         case INDEX_op_div_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2);
             break;
         case INDEX_op_divu_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 / t2);
             break;
         case INDEX_op_rem_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2);
             break;
         case INDEX_op_remu_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 % t2);
             break;
         case INDEX_op_and_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 & t2);
             break;
         case INDEX_op_or_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 | t2);
             break;
         case INDEX_op_xor_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 ^ t2);
             break;
 
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         case INDEX_op_shl_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 << (t2 & 31));
             break;
         case INDEX_op_shr_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 >> (t2 & 31));
             break;
         case INDEX_op_sar_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31)));
             break;
 #if TCG_TARGET_HAS_rot_i32
         case INDEX_op_rotl_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, rol32(t1, t2 & 31));
             break;
         case INDEX_op_rotr_i32:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri32(regs, &tb_ptr);
-            t2 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
+            t2 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg(regs, t0, ror32(t1, t2 & 31));
             break;
 #endif
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 #endif
         case INDEX_op_brcond_i32:
             t0 = tci_read_r32(regs, &tb_ptr);
-            t1 = tci_read_ri32(regs, &tb_ptr);
+            t1 = tci_read_r32(regs, &tb_ptr);
             condition = *tb_ptr++;
             label = tci_read_label(&tb_ptr);
             if (tci_compare32(t0, t1, condition)) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             break;
         case INDEX_op_brcond2_i32:
             tmp64 = tci_read_r64(regs, &tb_ptr);
-            v64 = tci_read_ri64(regs, &tb_ptr);
+            v64 = tci_read_r64(regs, &tb_ptr);
             condition = *tb_ptr++;
             label = tci_read_label(&tb_ptr);
             if (tci_compare64(tmp64, v64, condition)) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         case INDEX_op_add_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 + t2);
             break;
         case INDEX_op_sub_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 - t2);
             break;
         case INDEX_op_mul_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 * t2);
             break;
         case INDEX_op_div_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2);
             break;
         case INDEX_op_divu_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2);
             break;
         case INDEX_op_rem_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2);
             break;
         case INDEX_op_remu_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
             break;
         case INDEX_op_and_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 & t2);
             break;
         case INDEX_op_or_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 | t2);
             break;
         case INDEX_op_xor_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 ^ t2);
             break;
 
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         case INDEX_op_shl_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 << (t2 & 63));
             break;
         case INDEX_op_shr_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, t1 >> (t2 & 63));
             break;
         case INDEX_op_sar_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63)));
             break;
 #if TCG_TARGET_HAS_rot_i64
         case INDEX_op_rotl_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, rol64(t1, t2 & 63));
             break;
         case INDEX_op_rotr_i64:
             t0 = *tb_ptr++;
-            t1 = tci_read_ri64(regs, &tb_ptr);
-            t2 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
+            t2 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg(regs, t0, ror64(t1, t2 & 63));
             break;
 #endif
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 #endif
         case INDEX_op_brcond_i64:
             t0 = tci_read_r64(regs, &tb_ptr);
-            t1 = tci_read_ri64(regs, &tb_ptr);
+            t1 = tci_read_r64(regs, &tb_ptr);
             condition = *tb_ptr++;
             label = tci_read_label(&tb_ptr);
             if (tci_compare64(t0, t1, condition)) {
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_rem_i64:
     case INDEX_op_remu_i32:
     case INDEX_op_remu_i64:
-        return C_O1_I2(r, r, r);
-
     case INDEX_op_add_i32:
     case INDEX_op_add_i64:
     case INDEX_op_sub_i32:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_rotl_i64:
     case INDEX_op_rotr_i32:
     case INDEX_op_rotr_i64:
-        /* TODO: Does R, RI, RI result in faster code than R, R, RI? */
-        return C_O1_I2(r, ri, ri);
+    case INDEX_op_setcond_i32:
+    case INDEX_op_setcond_i64:
+        return C_O1_I2(r, r, r);
 
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return C_O0_I2(r, ri);
-
-    case INDEX_op_setcond_i32:
-    case INDEX_op_setcond_i64:
-        return C_O1_I2(r, r, ri);
+        return C_O0_I2(r, r);
 
 #if TCG_TARGET_REG_BITS == 32
     /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
@@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sub2_i32:
         return C_O2_I4(r, r, r, r, r, r);
     case INDEX_op_brcond2_i32:
-        return C_O0_I4(r, r, ri, ri);
+        return C_O0_I4(r, r, r, r);
     case INDEX_op_mulu2_i32:
         return C_O2_I2(r, r, r, r);
     case INDEX_op_setcond2_i32:
-        return C_O1_I4(r, r, r, ri, ri);
+        return C_O1_I4(r, r, r, r, r);
 #endif
 
     case INDEX_op_qemu_ld_i32:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_r(TCGContext *s, TCGArg t0)
     tcg_out8(s, t0);
 }
 
-/* Write register or constant (native size). */
-static void tcg_out_ri(TCGContext *s, int const_arg, TCGArg arg)
-{
-    if (const_arg) {
-        tcg_debug_assert(const_arg == 1);
-        tcg_out8(s, TCG_CONST);
-        tcg_out_i(s, arg);
-    } else {
-        tcg_out_r(s, arg);
-    }
-}
-
-/* Write register or constant (32 bit). */
-static void tcg_out_ri32(TCGContext *s, int const_arg, TCGArg arg)
-{
-    if (const_arg) {
-        tcg_debug_assert(const_arg == 1);
-        tcg_out8(s, TCG_CONST);
-        tcg_out32(s, arg);
-    } else {
-        tcg_out_r(s, arg);
-    }
-}
-
-#if TCG_TARGET_REG_BITS == 64
-/* Write register or constant (64 bit). */
-static void tcg_out_ri64(TCGContext *s, int const_arg, TCGArg arg)
-{
-    if (const_arg) {
-        tcg_debug_assert(const_arg == 1);
-        tcg_out8(s, TCG_CONST);
-        tcg_out64(s, arg);
-    } else {
-        tcg_out_r(s, arg);
-    }
-}
-#endif
-
 /* Write label. */
 static void tci_out_label(TCGContext *s, TCGLabel *label)
 {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
 {
     uint8_t *old_code_ptr = s->code_ptr;
     tcg_out_op_t(s, INDEX_op_call);
-    tcg_out_ri(s, 1, (uintptr_t)arg);
+    tcg_out_i(s, (uintptr_t)arg);
     old_code_ptr[1] = s->code_ptr - old_code_ptr;
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_setcond_i32:
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
-        tcg_out_ri32(s, const_args[2], args[2]);
+        tcg_out_r(s, args[2]);
         tcg_out8(s, args[3]);   /* condition */
         break;
 #if TCG_TARGET_REG_BITS == 32
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
         tcg_out_r(s, args[2]);
-        tcg_out_ri32(s, const_args[3], args[3]);
-        tcg_out_ri32(s, const_args[4], args[4]);
+        tcg_out_r(s, args[3]);
+        tcg_out_r(s, args[4]);
         tcg_out8(s, args[5]);   /* condition */
         break;
 #elif TCG_TARGET_REG_BITS == 64
     case INDEX_op_setcond_i64:
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
-        tcg_out_ri64(s, const_args[2], args[2]);
+        tcg_out_r(s, args[2]);
         tcg_out8(s, args[3]);   /* condition */
         break;
 #endif
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_rotl_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
     case INDEX_op_rotr_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
         tcg_out_r(s, args[0]);
-        tcg_out_ri32(s, const_args[1], args[1]);
-        tcg_out_ri32(s, const_args[2], args[2]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
         break;
     case INDEX_op_deposit_i32:  /* Optional (TCG_TARGET_HAS_deposit_i32). */
         tcg_out_r(s, args[0]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
     case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
         tcg_out_r(s, args[0]);
-        tcg_out_ri64(s, const_args[1], args[1]);
-        tcg_out_ri64(s, const_args[2], args[2]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
         break;
     case INDEX_op_deposit_i64:  /* Optional (TCG_TARGET_HAS_deposit_i64). */
         tcg_out_r(s, args[0]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
     case INDEX_op_brcond_i64:
         tcg_out_r(s, args[0]);
-        tcg_out_ri64(s, const_args[1], args[1]);
+        tcg_out_r(s, args[1]);
         tcg_out8(s, args[2]);           /* condition */
         tci_out_label(s, arg_label(args[3]));
         break;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_rem_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
     case INDEX_op_remu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
         tcg_out_r(s, args[0]);
-        tcg_out_ri32(s, const_args[1], args[1]);
-        tcg_out_ri32(s, const_args[2], args[2]);
+        tcg_out_r(s, args[1]);
+        tcg_out_r(s, args[2]);
         break;
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_add2_i32:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     case INDEX_op_brcond2_i32:
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
-        tcg_out_ri32(s, const_args[2], args[2]);
-        tcg_out_ri32(s, const_args[3], args[3]);
+        tcg_out_r(s, args[2]);
+        tcg_out_r(s, args[3]);
         tcg_out8(s, args[4]);           /* condition */
         tci_out_label(s, arg_label(args[5]));
         break;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 #endif
     case INDEX_op_brcond_i32:
         tcg_out_r(s, args[0]);
-        tcg_out_ri32(s, const_args[1], args[1]);
+        tcg_out_r(s, args[1]);
         tcg_out8(s, args[2]);           /* condition */
         tci_out_label(s, arg_label(args[3]));
         break;
-- 
2.25.1

From: Eduardo Habkost <ehabkost@redhat.com>

The TCG-specific CPU methods will be moved to a separate struct,
to make it easier to move accel-specific code outside generic CPU
code in the future.  Start by moving tcg_initialize().

The new CPUClass.tcg_ops field may eventually become a pointer,
but keep it an embedded struct for now, to make code conversion
easier.

From: Claudio Fontana <cfontana@suse.de>

For now, only TCG is allowed as an accelerator for riscv,
so remove the #ifdef CONFIG_TCG guard.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20210204163931.7358-3-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/cpu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
 #endif
     cc->gdb_arch_name = riscv_gdb_arch_name;
     cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
-#ifdef CONFIG_TCG
     cc->tcg_ops.initialize = riscv_translate_init;
     cc->tlb_fill = riscv_cpu_tlb_fill;
-#endif
+
     device_class_set_props(dc, riscv_cpu_properties);
 }
 
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Move the TCG-only code out of cpu_exec_realizefn() and
cpu_exec_unrealizefn() into accel/tcg/cpu-exec.c, so that it is
compiled only when TCG is built.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
[claudio: moved the prototypes from hw/core/cpu.h to exec/cpu-all.h]
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20210204163931.7358-4-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h | 11 +++++--
 include/hw/core/cpu.h  |  2 ++
 accel/tcg/cpu-exec.c   | 28 +++++++++++++++++
 cpu.c                  | 70 ++++++++++++++++++++----------------------
 hw/core/cpu.c          |  6 +++-
 5 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -XXX,XX +XXX,XX @@ static inline bool tlb_hit(target_ulong tlb_addr, target_ulong addr)
 }
 
 #ifdef CONFIG_TCG
+/* accel/tcg/cpu-exec.c */
 void dump_drift_info(void);
+/* accel/tcg/translate-all.c */
 void dump_exec_info(void);
 void dump_opcount_info(void);
 #endif /* CONFIG_TCG */
 
 #endif /* !CONFIG_USER_ONLY */
 
+#ifdef CONFIG_TCG
+/* accel/tcg/cpu-exec.c */
+int cpu_exec(CPUState *cpu);
+void tcg_exec_realizefn(CPUState *cpu, Error **errp);
+void tcg_exec_unrealizefn(CPUState *cpu);
+#endif /* CONFIG_TCG */
+
 /* Returns: 0 on success, -1 on error */
 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                         void *ptr, target_ulong len, bool is_write);
 
-int cpu_exec(CPUState *cpu);
-
 /**
  * cpu_set_cpustate_pointers(cpu)
  * @cpu: The cpu object
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx);
 
 void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
     GCC_FMT_ATTR(2, 3);
+
+/* $(top_srcdir)/cpu.c */
 void cpu_exec_initfn(CPUState *cpu);
 void cpu_exec_realizefn(CPUState *cpu, Error **errp);
 void cpu_exec_unrealizefn(CPUState *cpu);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
     return ret;
 }
 
+void tcg_exec_realizefn(CPUState *cpu, Error **errp)
+{
+    static bool tcg_target_initialized;
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+
+    if (!tcg_target_initialized) {
+        cc->tcg_ops.initialize();
+        tcg_target_initialized = true;
+    }
+    tlb_init(cpu);
+    qemu_plugin_vcpu_init_hook(cpu);
+
+#ifndef CONFIG_USER_ONLY
+    tcg_iommu_init_notifier_list(cpu);
+#endif /* !CONFIG_USER_ONLY */
+}
+
+/* undo the initializations in reverse order */
+void tcg_exec_unrealizefn(CPUState *cpu)
+{
+#ifndef CONFIG_USER_ONLY
+    tcg_iommu_free_notifier_list(cpu);
+#endif /* !CONFIG_USER_ONLY */
+
+    qemu_plugin_vcpu_exit_hook(cpu);
+    tlb_destroy(cpu);
+}
+
 #ifndef CONFIG_USER_ONLY
 
 void dump_drift_info(void)
diff --git a/cpu.c b/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/cpu.c
+++ b/cpu.c
@@ -XXX,XX +XXX,XX @@ const VMStateDescription vmstate_cpu_common = {
 };
 #endif
 
-void cpu_exec_unrealizefn(CPUState *cpu)
+void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    tlb_destroy(cpu);
-    cpu_list_remove(cpu);
+    cpu_list_add(cpu);
+
+#ifdef CONFIG_TCG
+    /* NB: errp parameter is unused currently */
+    if (tcg_enabled()) {
+        tcg_exec_realizefn(cpu, errp);
+    }
+#endif /* CONFIG_TCG */
+
+#ifdef CONFIG_USER_ONLY
+    assert(cc->vmsd == NULL);
+#else
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
+    }
+    if (cc->vmsd != NULL) {
+        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
+    }
+#endif /* CONFIG_USER_ONLY */
+}
+
+void cpu_exec_unrealizefn(CPUState *cpu)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
 
 #ifdef CONFIG_USER_ONLY
     assert(cc->vmsd == NULL);
@@ -XXX,XX +XXX,XX @@ void cpu_exec_unrealizefn(CPUState *cpu)
     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
         vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
     }
-    tcg_iommu_free_notifier_list(cpu);
 #endif
+#ifdef CONFIG_TCG
+    /* NB: errp parameter is unused currently */
+    if (tcg_enabled()) {
+        tcg_exec_unrealizefn(cpu);
+    }
+#endif /* CONFIG_TCG */
+
+    cpu_list_remove(cpu);
 }
 
 void cpu_exec_initfn(CPUState *cpu)
@@ -XXX,XX +XXX,XX @@ void cpu_exec_initfn(CPUState *cpu)
 #endif
 }
 
-void cpu_exec_realizefn(CPUState *cpu, Error **errp)
-{
-    CPUClass *cc = CPU_GET_CLASS(cpu);
-#ifdef CONFIG_TCG
-    static bool tcg_target_initialized;
-#endif /* CONFIG_TCG */
-
-    cpu_list_add(cpu);
-
-#ifdef CONFIG_TCG
-    if (tcg_enabled() && !tcg_target_initialized) {
-        tcg_target_initialized = true;
-        cc->tcg_ops.initialize();
-    }
-#endif /* CONFIG_TCG */
-    tlb_init(cpu);
-
-    qemu_plugin_vcpu_init_hook(cpu);
-
-#ifdef CONFIG_USER_ONLY
-    assert(cc->vmsd == NULL);
-#else /* !CONFIG_USER_ONLY */
-    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
-        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
-    }
-    if (cc->vmsd != NULL) {
-        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
-    }
-
-    tcg_iommu_init_notifier_list(cpu);
-#endif
-}
-
 const char *parse_cpu_option(const char *cpu_option)
 {
     ObjectClass *oc;
diff --git a/hw/core/cpu.c b/hw/core/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu.c
+++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static bool cpu_common_virtio_is_big_endian(CPUState *cpu)
     return target_words_bigendian();
 }
 
+/*
+ * XXX the following #if is always true because this is a common_ss
+ * module, so target CONFIG_* is never defined.
+ */
 #if !defined(CONFIG_USER_ONLY)
 GuestPanicInformation *cpu_get_crash_info(CPUState *cpu)
 {
@@ -XXX,XX +XXX,XX @@ static void cpu_common_realizefn(DeviceState *dev, Error **errp)
 static void cpu_common_unrealizefn(DeviceState *dev)
 {
     CPUState *cpu = CPU(dev);
+
     /* NOTE: latest generic point before the cpu is fully unrealized */
     trace_fini_vcpu(cpu);
-    qemu_plugin_vcpu_exit_hook(cpu);
     cpu_exec_unrealizefn(cpu);
 }
 
-- 
2.25.1

From: Eduardo Habkost <ehabkost@redhat.com>

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
[claudio: wrapped target code in CONFIG_TCG, reworded comments]
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20210204163931.7358-5-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h     | 22 +++++++++++++---------
 accel/tcg/cpu-exec.c      |  4 ++--
 target/arm/cpu.c          |  4 +++-
 target/avr/cpu.c          |  2 +-
 target/hppa/cpu.c         |  2 +-
 target/i386/tcg/tcg-cpu.c |  2 +-
 target/microblaze/cpu.c   |  2 +-
 target/mips/cpu.c         |  4 +++-
 target/riscv/cpu.c        |  2 +-
 target/rx/cpu.c           |  2 +-
 target/sh4/cpu.c          |  2 +-
 target/sparc/cpu.c        |  2 +-
 target/tricore/cpu.c      |  2 +-
 13 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
      * Called when the first CPU is realized.
      */
     void (*initialize)(void);
+    /**
+     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
+     *
+     * This is called when we abandon execution of a TB before starting it,
+     * and must set all parts of the CPU state which the previous TB in the
+     * chain may not have updated.
+     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
+     *
+     * If more state needs to be restored, the target must implement a
+     * function to restore all the state, and register it here.
+     */
+    void (*synchronize_from_tb)(CPUState *cpu,
+                                const struct TranslationBlock *tb);
 
 } TcgCpuOperations;
 
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  *       If the target behaviour here is anything other than "set
  *       the PC register to the value passed in" then the target must
  *       also implement the synchronize_from_tb hook.
- * @synchronize_from_tb: Callback for synchronizing state from a TCG
- *       #TranslationBlock. This is called when we abandon execution
- *       of a TB before starting it, and must set all parts of the CPU
- *       state which the previous TB in the chain may not have updated.
- *       This always includes at least the program counter; some targets
- *       will need to do more. If this hook is not implemented then the
- *       default is to call @set_pc(tb->pc).
  * @tlb_fill: Callback for handling a softmmu tlb miss or user-only
  *       address fault.  For system mode, if the access is valid, call
  *       tlb_set_page and return true; if the access is invalid, and
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     void (*get_memory_mapping)(CPUState *cpu, MemoryMappingList *list,
                                Error **errp);
     void (*set_pc)(CPUState *cpu, vaddr value);
-    void (*synchronize_from_tb)(CPUState *cpu,
-                                const struct TranslationBlock *tb);
     bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
                      MMUAccessType access_type, int mmu_idx,
                      bool probe, uintptr_t retaddr);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
                                TARGET_FMT_lx "] %s\n",
                                last_tb->tc.ptr, last_tb->pc,
                                lookup_symbol(last_tb->pc));
-        if (cc->synchronize_from_tb) {
-            cc->synchronize_from_tb(cpu, last_tb);
+        if (cc->tcg_ops.synchronize_from_tb) {
+            cc->tcg_ops.synchronize_from_tb(cpu, last_tb);
         } else {
             assert(cc->set_pc);
             cc->set_pc(cpu, last_tb->pc);
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
     }
 }
 
+#ifdef CONFIG_TCG
 static void arm_cpu_synchronize_from_tb(CPUState *cs,
                                         const TranslationBlock *tb)
 {
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_synchronize_from_tb(CPUState *cs,
         env->regs[15] = tb->pc;
     }
 }
+#endif /* CONFIG_TCG */
 
 static bool arm_cpu_has_work(CPUState *cs)
 {
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
     cc->dump_state = arm_cpu_dump_state;
     cc->set_pc = arm_cpu_set_pc;
-    cc->synchronize_from_tb = arm_cpu_synchronize_from_tb;
     cc->gdb_read_register = arm_cpu_gdb_read_register;
     cc->gdb_write_register = arm_cpu_gdb_write_register;
 #ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->disas_set_info = arm_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = arm_translate_init;
+    cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
     cc->tlb_fill = arm_cpu_tlb_fill;
     cc->debug_excp_handler = arm_debug_excp_handler;
     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
     cc->vmsd = &vms_avr_cpu;
     cc->disas_set_info = avr_cpu_disas_set_info;
     cc->tcg_ops.initialize = avr_cpu_tcg_init;
-    cc->synchronize_from_tb = avr_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = avr_cpu_synchronize_from_tb;
     cc->gdb_read_register = avr_cpu_gdb_read_register;
     cc->gdb_write_register = avr_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 35;
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
     cc->cpu_exec_interrupt = hppa_cpu_exec_interrupt;
     cc->dump_state = hppa_cpu_dump_state;
     cc->set_pc = hppa_cpu_set_pc;
-    cc->synchronize_from_tb = hppa_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
     cc->gdb_read_register = hppa_cpu_gdb_read_register;
     cc->gdb_write_register = hppa_cpu_gdb_write_register;
     cc->tlb_fill = hppa_cpu_tlb_fill;
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
 {
     cc->do_interrupt = x86_cpu_do_interrupt;
     cc->cpu_exec_interrupt = x86_cpu_exec_interrupt;
-    cc->synchronize_from_tb = x86_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
     cc->cpu_exec_enter = x86_cpu_exec_enter;
     cc->cpu_exec_exit = x86_cpu_exec_exit;
     cc->tcg_ops.initialize = tcg_x86_init;
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->cpu_exec_interrupt = mb_cpu_exec_interrupt;
     cc->dump_state = mb_cpu_dump_state;
     cc->set_pc = mb_cpu_set_pc;
-    cc->synchronize_from_tb = mb_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
     cc->gdb_read_register = mb_cpu_gdb_read_register;
     cc->gdb_write_register = mb_cpu_gdb_write_register;
     cc->tlb_fill = mb_cpu_tlb_fill;
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_set_pc(CPUState *cs, vaddr value)
     }
 }
 
+#ifdef CONFIG_TCG
 static void mips_cpu_synchronize_from_tb(CPUState *cs,
                                          const TranslationBlock *tb)
 {
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_synchronize_from_tb(CPUState *cs,
     env->hflags &= ~MIPS_HFLAG_BMASK;
     env->hflags |= tb->flags & MIPS_HFLAG_BMASK;
 }
+#endif /* CONFIG_TCG */
 
 static bool mips_cpu_has_work(CPUState *cs)
 {
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->dump_state = mips_cpu_dump_state;
     cc->set_pc = mips_cpu_set_pc;
-    cc->synchronize_from_tb = mips_cpu_synchronize_from_tb;
     cc->gdb_read_register = mips_cpu_gdb_read_register;
     cc->gdb_write_register = mips_cpu_gdb_write_register;
 #ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->disas_set_info = mips_cpu_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = mips_tcg_init;
+    cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
     cc->tlb_fill = mips_cpu_tlb_fill;
 #endif
 
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->cpu_exec_interrupt = riscv_cpu_exec_interrupt;
     cc->dump_state = riscv_cpu_dump_state;
     cc->set_pc = riscv_cpu_set_pc;
-    cc->synchronize_from_tb = riscv_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
     cc->gdb_read_register = riscv_cpu_gdb_read_register;
     cc->gdb_write_register = riscv_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 33;
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
     cc->cpu_exec_interrupt = rx_cpu_exec_interrupt;
     cc->dump_state = rx_cpu_dump_state;
     cc->set_pc = rx_cpu_set_pc;
-    cc->synchronize_from_tb = rx_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
     cc->gdb_read_register = rx_cpu_gdb_read_register;
     cc->gdb_write_register = rx_cpu_gdb_write_register;
     cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
     cc->cpu_exec_interrupt = superh_cpu_exec_interrupt;
     cc->dump_state = superh_cpu_dump_state;
     cc->set_pc = superh_cpu_set_pc;
-    cc->synchronize_from_tb = superh_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
     cc->gdb_read_register = superh_cpu_gdb_read_register;
     cc->gdb_write_register = superh_cpu_gdb_write_register;
     cc->tlb_fill = superh_cpu_tlb_fill;
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
 #endif
     cc->set_pc = sparc_cpu_set_pc;
-    cc->synchronize_from_tb = sparc_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
     cc->gdb_read_register = sparc_cpu_gdb_read_register;
     cc->gdb_write_register = sparc_cpu_gdb_write_register;
     cc->tlb_fill = sparc_cpu_tlb_fill;
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
 
     cc->dump_state = tricore_cpu_dump_state;
     cc->set_pc = tricore_cpu_set_pc;
-    cc->synchronize_from_tb = tricore_cpu_synchronize_from_tb;
+    cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
     cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
     cc->tcg_ops.initialize = tricore_tcg_init;
     cc->tlb_fill = tricore_cpu_tlb_fill;
-- 
2.25.1

From: Eduardo Habkost <ehabkost@redhat.com>

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
[claudio: wrapped target code in CONFIG_TCG]
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20210204163931.7358-6-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h           | 12 ++++++------
 accel/tcg/cpu-exec.c            | 12 ++++++------
 target/alpha/cpu.c              |  2 +-
 target/arm/cpu.c                |  2 +-
 target/arm/cpu64.c              |  5 ++++-
 target/arm/cpu_tcg.c            |  7 ++++++-
 target/avr/cpu.c                |  2 +-
 target/cris/cpu.c               |  2 +-
 target/hppa/cpu.c               |  2 +-
 target/i386/tcg/tcg-cpu.c       |  6 +++---
 target/lm32/cpu.c               |  2 +-
 target/m68k/cpu.c               |  2 +-
 target/microblaze/cpu.c         |  2 +-
 target/mips/cpu.c               |  2 +-
 target/nios2/cpu.c              |  2 +-
 target/openrisc/cpu.c           |  2 +-
 target/riscv/cpu.c              |  2 +-
 target/rx/cpu.c                 |  2 +-
 target/s390x/cpu.c              |  2 +-
 target/sh4/cpu.c                |  2 +-
 target/sparc/cpu.c              |  2 +-
 target/tilegx/cpu.c             |  2 +-
 target/unicore32/cpu.c          |  2 +-
 target/xtensa/cpu.c             |  2 +-
 target/ppc/translate_init.c.inc | 16 ++++++++++------
 25 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
      */
     void (*synchronize_from_tb)(CPUState *cpu,
                                 const struct TranslationBlock *tb);
+    /** @cpu_exec_enter: Callback for cpu_exec preparation */
+    void (*cpu_exec_enter)(CPUState *cpu);
+    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
+    void (*cpu_exec_exit)(CPUState *cpu);
+    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
+    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
 
 } TcgCpuOperations;
 
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  * @gdb_get_dynamic_xml: Callback to return dynamically generated XML for the
  *   gdb stub. Returns a pointer to the XML contents for the specified XML file
  *   or NULL if the CPU doesn't have a dynamically generated content for it.
- * @cpu_exec_enter: Callback for cpu_exec preparation.
- * @cpu_exec_exit: Callback for cpu_exec cleanup.
- * @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec.
  * @disas_set_info: Setup architecture specific components of disassembly info
  * @adjust_watchpoint_address: Perform a target-specific adjustment to an
  * address before attempting to match it against watchpoints.
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     const char *gdb_core_xml_file;
     gchar * (*gdb_arch_name)(CPUState *cpu);
     const char * (*gdb_get_dynamic_xml)(CPUState *cpu, const char *xmlname);
-    void (*cpu_exec_enter)(CPUState *cpu);
-    void (*cpu_exec_exit)(CPUState *cpu);
-    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
 
     void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
     vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_enter(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    if (cc->cpu_exec_enter) {
-        cc->cpu_exec_enter(cpu);
+    if (cc->tcg_ops.cpu_exec_enter) {
+        cc->tcg_ops.cpu_exec_enter(cpu);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    if (cc->cpu_exec_exit) {
-        cc->cpu_exec_exit(cpu);
+    if (cc->tcg_ops.cpu_exec_exit) {
+        cc->tcg_ops.cpu_exec_exit(cpu);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
            True when it is, and we should restart on a new TB,
            and via longjmp via cpu_loop_exit.  */
         else {
-            if (cc->cpu_exec_interrupt &&
-                cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+            if (cc->tcg_ops.cpu_exec_interrupt &&
+                cc->tcg_ops.cpu_exec_interrupt(cpu, interrupt_request)) {
                 if (need_replay_interrupt(interrupt_request)) {
                     replay_interrupt();
                 }
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = alpha_cpu_class_by_name;
     cc->has_work = alpha_cpu_has_work;
     cc->do_interrupt = alpha_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = alpha_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
     cc->dump_state = alpha_cpu_dump_state;
     cc->set_pc = alpha_cpu_set_pc;
     cc->gdb_read_register = alpha_cpu_gdb_read_register;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = arm_cpu_class_by_name;
     cc->has_work = arm_cpu_has_work;
-    cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
     cc->dump_state = arm_cpu_dump_state;
     cc->set_pc = arm_cpu_set_pc;
     cc->gdb_read_register = arm_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->disas_set_info = arm_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = arm_translate_init;
+    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
     cc->tlb_fill = arm_cpu_tlb_fill;
     cc->debug_excp_handler = arm_debug_excp_handler;
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -XXX,XX +XXX,XX @@ static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
 {
     CPUClass *cc = CPU_CLASS(oc);
 
-    cc->cpu_exec_interrupt = arm_cpu_exec_interrupt;
+#ifdef CONFIG_TCG
+    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
+#endif /* CONFIG_TCG */
+
     cc->gdb_read_register = aarch64_cpu_gdb_read_register;
     cc->gdb_write_register = aarch64_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 34;
diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu_tcg.c
+++ b/target/arm/cpu_tcg.c
@@ -XXX,XX +XXX,XX @@
 /* CPU models. These are not needed for the AArch64 linux-user build. */
 #if !defined(CONFIG_USER_ONLY) || !defined(TARGET_AARCH64)
 
+#ifdef CONFIG_TCG
 static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
 {
     CPUClass *cc = CPU_GET_CLASS(cs);
@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     }
     return ret;
 }
+#endif /* CONFIG_TCG */
 
 static void arm926_initfn(Object *obj)
 {
@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
     cc->do_interrupt = arm_v7m_cpu_do_interrupt;
 #endif
 
-    cc->cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
+#ifdef CONFIG_TCG
+    cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
+#endif /* CONFIG_TCG */
+
     cc->gdb_core_xml_file = "arm-m-profile.xml";
 }
 
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->has_work = avr_cpu_has_work;
     cc->do_interrupt = avr_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = avr_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
     cc->dump_state = avr_cpu_dump_state;
     cc->set_pc = avr_cpu_set_pc;
     cc->memory_rw_debug = avr_cpu_memory_rw_debug;
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/cpu.c
+++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = cris_cpu_class_by_name;
     cc->has_work = cris_cpu_has_work;
     cc->do_interrupt = cris_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = cris_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
     cc->dump_state = cris_cpu_dump_state;
     cc->set_pc = cris_cpu_set_pc;
     cc->gdb_read_register = cris_cpu_gdb_read_register;
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = hppa_cpu_class_by_name;
     cc->has_work = hppa_cpu_has_work;
     cc->do_interrupt = hppa_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = hppa_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
     cc->dump_state = hppa_cpu_dump_state;
     cc->set_pc = hppa_cpu_set_pc;
     cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
 void tcg_cpu_common_class_init(CPUClass *cc)
 {
     cc->do_interrupt = x86_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = x86_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
-    cc->cpu_exec_enter = x86_cpu_exec_enter;
-    cc->cpu_exec_exit = x86_cpu_exec_exit;
+    cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
+    cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
     cc->tcg_ops.initialize = tcg_x86_init;
     cc->tlb_fill = x86_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/cpu.c
+++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = lm32_cpu_class_by_name;
     cc->has_work = lm32_cpu_has_work;
     cc->do_interrupt = lm32_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = lm32_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
     cc->dump_state = lm32_cpu_dump_state;
     cc->set_pc = lm32_cpu_set_pc;
     cc->gdb_read_register = lm32_cpu_gdb_read_register;
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
     cc->class_by_name = m68k_cpu_class_by_name;
     cc->has_work = m68k_cpu_has_work;
     cc->do_interrupt = m68k_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = m68k_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
     cc->dump_state = m68k_cpu_dump_state;
     cc->set_pc = m68k_cpu_set_pc;
     cc->gdb_read_register = m68k_cpu_gdb_read_register;
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->has_work = mb_cpu_has_work;
     cc->do_interrupt = mb_cpu_do_interrupt;
     cc->do_unaligned_access = mb_cpu_do_unaligned_access;
-    cc->cpu_exec_interrupt = mb_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
     cc->dump_state = mb_cpu_dump_state;
     cc->set_pc = mb_cpu_set_pc;
     cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->class_by_name = mips_cpu_class_by_name;
     cc->has_work = mips_cpu_has_work;
     cc->do_interrupt = mips_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->dump_state = mips_cpu_dump_state;
     cc->set_pc = mips_cpu_set_pc;
     cc->gdb_read_register = mips_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->disas_set_info = mips_cpu_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = mips_tcg_init;
+    cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
     cc->tlb_fill = mips_cpu_tlb_fill;
 #endif
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/cpu.c
+++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = nios2_cpu_class_by_name;
     cc->has_work = nios2_cpu_has_work;
     cc->do_interrupt = nios2_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = nios2_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
     cc->dump_state = nios2_cpu_dump_state;
     cc->set_pc = nios2_cpu_set_pc;
     cc->disas_set_info = nios2_cpu_disas_set_info;
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = openrisc_cpu_class_by_name;
     cc->has_work = openrisc_cpu_has_work;
     cc->do_interrupt = openrisc_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
     cc->dump_state = openrisc_cpu_dump_state;
     cc->set_pc = openrisc_cpu_set_pc;
     cc->gdb_read_register = openrisc_cpu_gdb_read_register;
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->class_by_name = riscv_cpu_class_by_name;
     cc->has_work = riscv_cpu_has_work;
     cc->do_interrupt = riscv_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = riscv_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
     cc->dump_state = riscv_cpu_dump_state;
     cc->set_pc = riscv_cpu_set_pc;
     cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
     cc->class_by_name = rx_cpu_class_by_name;
     cc->has_work = rx_cpu_has_work;
     cc->do_interrupt = rx_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = rx_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
     cc->dump_state = rx_cpu_dump_state;
     cc->set_pc = rx_cpu_set_pc;
     cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     cc->get_crash_info = s390_cpu_get_crash_info;
     cc->write_elf64_note = s390_cpu_write_elf64_note;
 #ifdef CONFIG_TCG
-    cc->cpu_exec_interrupt = s390_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
     cc->debug_excp_handler = s390x_cpu_debug_excp_handler;
     cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
 #endif
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = superh_cpu_class_by_name;
     cc->has_work = superh_cpu_has_work;
     cc->do_interrupt = superh_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = superh_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
     cc->dump_state = superh_cpu_dump_state;
     cc->set_pc = superh_cpu_set_pc;
     cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->parse_features = sparc_cpu_parse_features;
     cc->has_work = sparc_cpu_has_work;
     cc->do_interrupt = sparc_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = sparc_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
     cc->dump_state = sparc_cpu_dump_state;
 #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
     cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tilegx/cpu.c
+++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = tilegx_cpu_class_by_name;
     cc->has_work = tilegx_cpu_has_work;
     cc->do_interrupt = tilegx_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
     cc->dump_state = tilegx_cpu_dump_state;
     cc->set_pc = tilegx_cpu_set_pc;
     cc->tlb_fill = tilegx_cpu_tlb_fill;
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/unicore32/cpu.c
+++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = uc32_cpu_class_by_name;
     cc->has_work = uc32_cpu_has_work;
     cc->do_interrupt = uc32_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = uc32_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
     cc->dump_state = uc32_cpu_dump_state;
     cc->set_pc = uc32_cpu_set_pc;
     cc->tlb_fill = uc32_cpu_tlb_fill;
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = xtensa_cpu_class_by_name;
     cc->has_work = xtensa_cpu_has_work;
     cc->do_interrupt = xtensa_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
+    cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
     cc->dump_state = xtensa_cpu_dump_state;
     cc->set_pc = xtensa_cpu_set_pc;
     cc->gdb_read_register = xtensa_cpu_gdb_read_register;
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_reset(DeviceState *dev)
 }
 
 #ifndef CONFIG_USER_ONLY
+
 static bool ppc_cpu_is_big_endian(CPUState *cs)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -XXX,XX +XXX,XX @@ static bool ppc_cpu_is_big_endian(CPUState *cs)
     return !msr_le;
 }
 
+#ifdef CONFIG_TCG
 static void ppc_cpu_exec_enter(CPUState *cs)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_exec_exit(CPUState *cs)
         vhc->cpu_exec_exit(cpu->vhyp, cpu);
     }
 }
-#endif
+#endif /* CONFIG_TCG */
+
+#endif /* !CONFIG_USER_ONLY */
 
 static void ppc_cpu_instance_init(Object *obj)
 {
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = ppc_cpu_class_by_name;
     cc->has_work = ppc_cpu_has_work;
     cc->do_interrupt = ppc_cpu_do_interrupt;
-    cc->cpu_exec_interrupt = ppc_cpu_exec_interrupt;
     cc->dump_state = ppc_cpu_dump_state;
     cc->dump_statistics = ppc_cpu_dump_statistics;
     cc->set_pc = ppc_cpu_set_pc;
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 #endif
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = ppc_translate_init;
+    cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
     cc->tlb_fill = ppc_cpu_tlb_fill;
-#endif
 #ifndef CONFIG_USER_ONLY
-    cc->cpu_exec_enter = ppc_cpu_exec_enter;
-    cc->cpu_exec_exit = ppc_cpu_exec_exit;
-#endif
+    cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
+    cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
+#endif /* !CONFIG_USER_ONLY */
+#endif /* CONFIG_TCG */
 
     cc->disas_set_info = ppc_disas_set_info;
 
-- 
2.25.1

From: Eduardo Habkost <ehabkost@redhat.com>

[claudio: wrapped target code in CONFIG_TCG]

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20210204163931.7358-7-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h           | 21 ++++++++++++---------
 accel/tcg/cputlb.c              |  7 ++++---
 accel/tcg/user-exec.c           |  6 +++---
 target/alpha/cpu.c              |  2 +-
 target/arm/cpu.c                |  2 +-
 target/avr/cpu.c                |  2 +-
 target/cris/cpu.c               |  2 +-
 target/hppa/cpu.c               |  2 +-
 target/i386/tcg/tcg-cpu.c       |  2 +-
 target/lm32/cpu.c               |  2 +-
 target/m68k/cpu.c               |  2 +-
 target/microblaze/cpu.c         |  2 +-
 target/mips/cpu.c               |  2 +-
 target/moxie/cpu.c              |  2 +-
 target/nios2/cpu.c              |  2 +-
 target/openrisc/cpu.c           |  2 +-
 target/riscv/cpu.c              |  2 +-
 target/rx/cpu.c                 |  2 +-
 target/s390x/cpu.c              |  2 +-
 target/sh4/cpu.c                |  2 +-
 target/sparc/cpu.c              |  2 +-
 target/tilegx/cpu.c             |  2 +-
 target/tricore/cpu.c            |  2 +-
 target/unicore32/cpu.c          |  2 +-
 target/xtensa/cpu.c             |  2 +-
 target/ppc/translate_init.c.inc |  2 +-
 26 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
     void (*cpu_exec_exit)(CPUState *cpu);
     /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
     bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
+    /**
+     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
+     *
+     * For system mode, if the access is valid, call tlb_set_page
+     * and return true; if the access is invalid, and probe is
+     * true, return false; otherwise raise an exception and do
+     * not return.  For user-only mode, always raise an exception
+     * and do not return.
+     */
+    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
+                     MMUAccessType access_type, int mmu_idx,
+                     bool probe, uintptr_t retaddr);
 
 } TcgCpuOperations;
 
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  *       If the target behaviour here is anything other than "set
  *       the PC register to the value passed in" then the target must
  *       also implement the synchronize_from_tb hook.
- * @tlb_fill: Callback for handling a softmmu tlb miss or user-only
- *       address fault.  For system mode, if the access is valid, call
- *       tlb_set_page and return true; if the access is invalid, and
- *       probe is true, return false; otherwise raise an exception and
- *       do not return.  For user-only mode, always raise an exception
- *       and do not return.
  * @get_phys_page_debug: Callback for obtaining a physical address.
  * @get_phys_page_attrs_debug: Callback for obtaining a physical address and the
  *       associated memory transaction attributes to use for the access.
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     void (*get_memory_mapping)(CPUState *cpu, MemoryMappingList *list,
                                Error **errp);
     void (*set_pc)(CPUState *cpu, vaddr value);
-    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
-                     MMUAccessType access_type, int mmu_idx,
-                     bool probe, uintptr_t retaddr);
     hwaddr (*get_phys_page_debug)(CPUState *cpu, vaddr addr);
     hwaddr (*get_phys_page_attrs_debug)(CPUState *cpu, vaddr addr,
                                         MemTxAttrs *attrs);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
      * This is not a probe, so only valid return is success; failure
      * should result in exception + longjmp to the cpu loop.
      */
-    ok = cc->tlb_fill(cpu, addr, size, access_type, mmu_idx, false, retaddr);
+    ok = cc->tcg_ops.tlb_fill(cpu, addr, size,
+                              access_type, mmu_idx, false, retaddr);
     assert(ok);
 }
 
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
             CPUState *cs = env_cpu(env);
             CPUClass *cc = CPU_GET_CLASS(cs);
 
-            if (!cc->tlb_fill(cs, addr, fault_size, access_type,
-                              mmu_idx, nonfault, retaddr)) {
+            if (!cc->tcg_ops.tlb_fill(cs, addr, fault_size, access_type,
+                                      mmu_idx, nonfault, retaddr)) {
                 /* Non-faulting page table read failed.  */
                 *phost = NULL;
                 return TLB_INVALID_MASK;
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
     clear_helper_retaddr();
 
     cc = CPU_GET_CLASS(cpu);
-    cc->tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
+    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
     g_assert_not_reached();
 }
 
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
         } else {
             CPUState *cpu = env_cpu(env);
             CPUClass *cc = CPU_GET_CLASS(cpu);
-            cc->tlb_fill(cpu, addr, fault_size, access_type,
-                         MMU_USER_IDX, false, ra);
+            cc->tcg_ops.tlb_fill(cpu, addr, fault_size, access_type,
+                                 MMU_USER_IDX, false, ra);
             g_assert_not_reached();
         }
     }
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = alpha_cpu_set_pc;
     cc->gdb_read_register = alpha_cpu_gdb_read_register;
     cc->gdb_write_register = alpha_cpu_gdb_write_register;
-    cc->tlb_fill = alpha_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_transaction_failed = alpha_cpu_do_transaction_failed;
     cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.initialize = arm_translate_init;
     cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
-    cc->tlb_fill = arm_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
     cc->debug_excp_handler = arm_debug_excp_handler;
     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
     cc->do_unaligned_access = arm_cpu_do_unaligned_access;
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = avr_cpu_set_pc;
     cc->memory_rw_debug = avr_cpu_memory_rw_debug;
     cc->get_phys_page_debug = avr_cpu_get_phys_page_debug;
-    cc->tlb_fill = avr_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = avr_cpu_tlb_fill;
     cc->vmsd = &vms_avr_cpu;
     cc->disas_set_info = avr_cpu_disas_set_info;
     cc->tcg_ops.initialize = avr_cpu_tcg_init;
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/cpu.c
+++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = cris_cpu_set_pc;
     cc->gdb_read_register = cris_cpu_gdb_read_register;
     cc->gdb_write_register = cris_cpu_gdb_write_register;
-    cc->tlb_fill = cris_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = cris_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = cris_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_cris_cpu;
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
     cc->gdb_read_register = hppa_cpu_gdb_read_register;
     cc->gdb_write_register = hppa_cpu_gdb_write_register;
-    cc->tlb_fill = hppa_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_hppa_cpu;
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
     cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
     cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
     cc->tcg_ops.initialize = tcg_x86_init;
-    cc->tlb_fill = x86_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->debug_excp_handler = breakpoint_handler;
 #endif
diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/cpu.c
+++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = lm32_cpu_set_pc;
     cc->gdb_read_register = lm32_cpu_gdb_read_register;
     cc->gdb_write_register = lm32_cpu_gdb_write_register;
-    cc->tlb_fill = lm32_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = lm32_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = lm32_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_lm32_cpu;
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
     cc->set_pc = m68k_cpu_set_pc;
     cc->gdb_read_register = m68k_cpu_gdb_read_register;
     cc->gdb_write_register = m68k_cpu_gdb_write_register;
-    cc->tlb_fill = m68k_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
 #if defined(CONFIG_SOFTMMU)
     cc->do_transaction_failed = m68k_cpu_transaction_failed;
     cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
     cc->gdb_read_register = mb_cpu_gdb_read_register;
     cc->gdb_write_register = mb_cpu_gdb_write_register;
-    cc->tlb_fill = mb_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_transaction_failed = mb_cpu_transaction_failed;
     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->tcg_ops.initialize = mips_tcg_init;
     cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
-    cc->tlb_fill = mips_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
 #endif
 
     cc->gdb_num_core_regs = 73;
diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/moxie/cpu.c
+++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
     cc->do_interrupt = moxie_cpu_do_interrupt;
     cc->dump_state = moxie_cpu_dump_state;
     cc->set_pc = moxie_cpu_set_pc;
-    cc->tlb_fill = moxie_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = moxie_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_moxie_cpu;
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/cpu.c
+++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
     cc->dump_state = nios2_cpu_dump_state;
     cc->set_pc = nios2_cpu_set_pc;
     cc->disas_set_info = nios2_cpu_disas_set_info;
-    cc->tlb_fill = nios2_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_unaligned_access = nios2_cpu_do_unaligned_access;
     cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = openrisc_cpu_set_pc;
     cc->gdb_read_register = openrisc_cpu_gdb_read_register;
     cc->gdb_write_register = openrisc_cpu_gdb_write_register;
-    cc->tlb_fill = openrisc_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = openrisc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = openrisc_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_openrisc_cpu;
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_arch_name = riscv_gdb_arch_name;
     cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
     cc->tcg_ops.initialize = riscv_translate_init;
-    cc->tlb_fill = riscv_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = riscv_cpu_tlb_fill;
 
     device_class_set_props(dc, riscv_cpu_properties);
 }
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
     cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
     cc->disas_set_info = rx_cpu_disas_set_info;
     cc->tcg_ops.initialize = rx_translate_init;
-    cc->tlb_fill = rx_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = rx_cpu_tlb_fill;
 
     cc->gdb_num_core_regs = 26;
     cc->gdb_core_xml_file = "rx-core.xml";
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     cc->disas_set_info = s390_cpu_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = s390x_translate_init;
-    cc->tlb_fill = s390_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = s390_cpu_tlb_fill;
 #endif
 
     cc->gdb_num_core_regs = S390_NUM_CORE_REGS;
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
     cc->gdb_read_register = superh_cpu_gdb_read_register;
     cc->gdb_write_register = superh_cpu_gdb_write_register;
-    cc->tlb_fill = superh_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_unaligned_access = superh_cpu_do_unaligned_access;
     cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
     cc->gdb_read_register = sparc_cpu_gdb_read_register;
     cc->gdb_write_register = sparc_cpu_gdb_write_register;
-    cc->tlb_fill = sparc_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_transaction_failed = sparc_cpu_do_transaction_failed;
     cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tilegx/cpu.c
+++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
     cc->dump_state = tilegx_cpu_dump_state;
     cc->set_pc = tilegx_cpu_set_pc;
-    cc->tlb_fill = tilegx_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = tilegx_cpu_tlb_fill;
     cc->gdb_num_core_regs = 0;
     cc->tcg_ops.initialize = tilegx_tcg_init;
 }
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
     cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
     cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
     cc->tcg_ops.initialize = tricore_tcg_init;
-    cc->tlb_fill = tricore_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = tricore_cpu_tlb_fill;
 }
 
 #define DEFINE_TRICORE_CPU_TYPE(cpu_model, initfn) \
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/unicore32/cpu.c
+++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
     cc->dump_state = uc32_cpu_dump_state;
     cc->set_pc = uc32_cpu_set_pc;
-    cc->tlb_fill = uc32_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = uc32_cpu_tlb_fill;
     cc->get_phys_page_debug = uc32_cpu_get_phys_page_debug;
     cc->tcg_ops.initialize = uc32_translate_init;
     dc->vmsd = &vmstate_uc32_cpu;
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_read_register = xtensa_cpu_gdb_read_register;
     cc->gdb_write_register = xtensa_cpu_gdb_write_register;
     cc->gdb_stop_before_watchpoint = true;
-    cc->tlb_fill = xtensa_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = ppc_translate_init;
     cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
-    cc->tlb_fill = ppc_cpu_tlb_fill;
+    cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
     cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
-- 
2.25.1

From: Eduardo Habkost <ehabkost@redhat.com>

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20210204163931.7358-8-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h     | 4 ++--
 accel/tcg/cpu-exec.c      | 4 ++--
 target/arm/cpu.c          | 2 +-
 target/i386/tcg/tcg-cpu.c | 2 +-
 target/lm32/cpu.c         | 2 +-
 target/s390x/cpu.c        | 2 +-
 target/xtensa/cpu.c       | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
     bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
                      MMUAccessType access_type, int mmu_idx,
                      bool probe, uintptr_t retaddr);
+    /** @debug_excp_handler: Callback for handling debug exceptions */
+    void (*debug_excp_handler)(CPUState *cpu);
 
 } TcgCpuOperations;
 
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  * @gdb_write_register: Callback for letting GDB write a register.
  * @debug_check_watchpoint: Callback: return true if the architectural
  *       watchpoint whose address has matched should really fire.
- * @debug_excp_handler: Callback for handling debug exceptions.
  * @write_elf64_note: Callback for writing a CPU-specific ELF note to a
  * 64-bit VM coredump.
  * @write_elf32_qemunote: Callback for writing a CPU- and QEMU-specific ELF
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
     int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
     bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
-    void (*debug_excp_handler)(CPUState *cpu);
 
     int (*write_elf64_note)(WriteCoreDumpFunction f, CPUState *cpu,
                             int cpuid, void *opaque);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static inline void cpu_handle_debug_exception(CPUState *cpu)
         }
     }
 
-    if (cc->debug_excp_handler) {
-        cc->debug_excp_handler(cpu);
+    if (cc->tcg_ops.debug_excp_handler) {
+        cc->tcg_ops.debug_excp_handler(cpu);
     }
 }
 
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
     cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
-    cc->debug_excp_handler = arm_debug_excp_handler;
+    cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
     cc->do_unaligned_access = arm_cpu_do_unaligned_access;
 #if !defined(CONFIG_USER_ONLY)
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ void tcg_cpu_common_class_init(CPUClass *cc)
     cc->tcg_ops.initialize = tcg_x86_init;
     cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->debug_excp_handler = breakpoint_handler;
+    cc->tcg_ops.debug_excp_handler = breakpoint_handler;
 #endif
 }
diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/cpu.c
+++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
 #endif
     cc->gdb_num_core_regs = 32 + 7;
     cc->gdb_stop_before_watchpoint = true;
-    cc->debug_excp_handler = lm32_debug_excp_handler;
+    cc->tcg_ops.debug_excp_handler = lm32_debug_excp_handler;
     cc->disas_set_info = lm32_cpu_disas_set_info;
     cc->tcg_ops.initialize = lm32_translate_init;
 }
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     cc->write_elf64_note = s390_cpu_write_elf64_note;
 #ifdef CONFIG_TCG
     cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
-    cc->debug_excp_handler = s390x_cpu_debug_excp_handler;
+    cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
     cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
 #endif
 #endif
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
     cc->do_transaction_failed = xtensa_cpu_do_transaction_failed;
 #endif
-    cc->debug_excp_handler = xtensa_breakpoint_handler;
+    cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
     cc->disas_set_info = xtensa_cpu_disas_set_info;
     cc->tcg_ops.initialize = xtensa_translate_init;
     dc->vmsd = &vmstate_xtensa_cpu;
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

cc->do_interrupt is in theory a TCG callback used in accel/tcg only,
to prepare the emulated architecture to take an interrupt as defined
in the hardware specifications.

In reality, however, the _do_interrupt style functions in targets are
also occasionally reused by KVM to prepare the architecture state in a
similar way when userspace code has identified that it needs to
deliver an exception to the guest.

In the case of ARM, that includes:

1) the vcpu thread got a SIGBUS indicating a memory error,
   and we need to deliver a Synchronous External Abort to the guest to
   let it know about the error.
2) the kernel told us about a debug exception (breakpoint, watchpoint)
   but it is not for one of QEMU's own gdbstub breakpoints/watchpoints
   so it must be a breakpoint the guest itself has set up, therefore
   we need to deliver it to the guest.

So in order to reuse code, the same arm_cpu_do_interrupt function is
used. This is all fine, but we need to avoid calling it using the
callback registered in CPUClass, since that one is now TCG-only.

Fortunately this is easily solved by replacing calls to
CPUClass::do_interrupt() with explicit calls to arm_cpu_do_interrupt().

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Cc: Peter Maydell <peter.maydell@linaro.org>
Message-Id: <20210204163931.7358-9-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.c | 4 ++++
 target/arm/kvm64.c  | 6 ++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -XXX,XX +XXX,XX @@ static void handle_semihosting(CPUState *cs)
  * Do any appropriate logging, handle PSCI calls, and then hand off
  * to the AArch64-entry or AArch32-entry function depending on the
  * target exception level's register width.
+ *
+ * Note: this is used for both TCG (as the do_interrupt tcg op),
+ *       and KVM to re-inject guest debug exceptions, and to
+ *       inject a Synchronous-External-Abort.
  */
 void arm_cpu_do_interrupt(CPUState *cs)
 {
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -XXX,XX +XXX,XX @@ static void kvm_inject_arm_sea(CPUState *c)
 {
     ARMCPU *cpu = ARM_CPU(c);
     CPUARMState *env = &cpu->env;
-    CPUClass *cc = CPU_GET_CLASS(c);
     uint32_t esr;
     bool same_el;
 
@@ -XXX,XX +XXX,XX @@ static void kvm_inject_arm_sea(CPUState *c)
 
     env->exception.syndrome = esr;
 
-    cc->do_interrupt(c);
+    arm_cpu_do_interrupt(c);
 }
 
 #define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
@@ -XXX,XX +XXX,XX @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
 {
     int hsr_ec = syn_get_ec(debug_exit->hsr);
     ARMCPU *cpu = ARM_CPU(cs);
-    CPUClass *cc = CPU_GET_CLASS(cs);
     CPUARMState *env = &cpu->env;
 
     /* Ensure PC is synchronised */
@@ -XXX,XX +XXX,XX @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
     env->exception.vaddress = debug_exit->far;
     env->exception.target_el = 1;
     qemu_mutex_lock_iothread();
-    cc->do_interrupt(cs);
+    arm_cpu_do_interrupt(cs);
     qemu_mutex_unlock_iothread();
 
     return false;
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
     void (*cpu_exec_exit)(CPUState *cpu);
     /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
     bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
+    /** @do_interrupt: Callback for interrupt handling. */
+    void (*do_interrupt)(CPUState *cpu);
     /**
      * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
      *
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  * @parse_features: Callback to parse command line arguments.
  * @reset_dump_flags: #CPUDumpFlags to use for reset logging.
  * @has_work: Callback for checking if there is work to do.
- * @do_interrupt: Callback for interrupt handling.
  * @do_unaligned_access: Callback for unaligned access handling, if
  * the target defines #TARGET_ALIGNED_ONLY.
  * @do_transaction_failed: Callback for handling failed memory transactions
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
 
     int reset_dump_flags;
     bool (*has_work)(CPUState *cpu);
-    void (*do_interrupt)(CPUState *cpu);
     void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
                                 MMUAccessType access_type,
                                 int mmu_idx, uintptr_t retaddr);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
            loop */
 #if defined(TARGET_I386)
         CPUClass *cc = CPU_GET_CLASS(cpu);
-        cc->do_interrupt(cpu);
+        cc->tcg_ops.do_interrupt(cpu);
 #endif
         *ret = cpu->exception_index;
         cpu->exception_index = -1;
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
         if (replay_exception()) {
             CPUClass *cc = CPU_GET_CLASS(cpu);
             qemu_mutex_lock_iothread();
-            cc->do_interrupt(cpu);
+            cc->tcg_ops.do_interrupt(cpu);
             qemu_mutex_unlock_iothread();
             cpu->exception_index = -1;
 
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = alpha_cpu_class_by_name;
     cc->has_work = alpha_cpu_has_work;
-    cc->do_interrupt = alpha_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = alpha_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
     cc->dump_state = alpha_cpu_dump_state;
     cc->set_pc = alpha_cpu_set_pc;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
  found:
     cs->exception_index = excp_idx;
     env->exception.target_el = target_el;
-    cc->do_interrupt(cs);
+    cc->tcg_ops.do_interrupt(cs);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_read_register = arm_cpu_gdb_read_register;
     cc->gdb_write_register = arm_cpu_gdb_write_register;
 #ifndef CONFIG_USER_ONLY
-    cc->do_interrupt = arm_cpu_do_interrupt;
     cc->get_phys_page_attrs_debug = arm_cpu_get_phys_page_attrs_debug;
     cc->asidx_from_attrs = arm_asidx_from_attrs;
     cc->vmsd = &vmstate_arm_cpu;
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
 #if !defined(CONFIG_USER_ONLY)
     cc->do_transaction_failed = arm_cpu_do_transaction_failed;
     cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
+    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 #endif
 }
diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu_tcg.c
+++ b/target/arm/cpu_tcg.c
@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     if (interrupt_request & CPU_INTERRUPT_HARD
         && (armv7m_nvic_can_take_pending_exception(env->nvic))) {
         cs->exception_index = EXCP_IRQ;
-        cc->do_interrupt(cs);
+        cc->tcg_ops.do_interrupt(cs);
         ret = true;
     }
     return ret;
@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
     CPUClass *cc = CPU_CLASS(oc);
 
     acc->info = data;
-#ifndef CONFIG_USER_ONLY
-    cc->do_interrupt = arm_v7m_cpu_do_interrupt;
-#endif
-
 #ifdef CONFIG_TCG
     cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
+#ifndef CONFIG_USER_ONLY
+    cc->tcg_ops.do_interrupt = arm_v7m_cpu_do_interrupt;
+#endif
 #endif /* CONFIG_TCG */
 
     cc->gdb_core_xml_file = "arm-m-profile.xml";
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = avr_cpu_class_by_name;
 
     cc->has_work = avr_cpu_has_work;
-    cc->do_interrupt = avr_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = avr_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
     cc->dump_state = avr_cpu_dump_state;
     cc->set_pc = avr_cpu_set_pc;
diff --git a/target/avr/helper.c b/target/avr/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/helper.c
+++ b/target/avr/helper.c
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     if (interrupt_request & CPU_INTERRUPT_RESET) {
         if (cpu_interrupts_enabled(env)) {
             cs->exception_index = EXCP_RESET;
-            cc->do_interrupt(cs);
+            cc->tcg_ops.do_interrupt(cs);
 
             cs->interrupt_request &= ~CPU_INTERRUPT_RESET;
 
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         if (cpu_interrupts_enabled(env) && env->intsrc != 0) {
             int index = ctz32(env->intsrc);
             cs->exception_index = EXCP_INT(index);
-            cc->do_interrupt(cs);
+            cc->tcg_ops.do_interrupt(cs);
 
             env->intsrc &= env->intsrc - 1; /* clear the interrupt */
             cs->interrupt_request &= ~CPU_INTERRUPT_HARD;
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/cpu.c
+++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void crisv8_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 8;
-    cc->do_interrupt = crisv10_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
     cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 }
@@ -XXX,XX +XXX,XX @@ static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 9;
-    cc->do_interrupt = crisv10_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
     cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 }
@@ -XXX,XX +XXX,XX @@ static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 10;
-    cc->do_interrupt = crisv10_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
     cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 }
@@ -XXX,XX +XXX,XX @@ static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 11;
-    cc->do_interrupt = crisv10_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
     cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 }
@@ -XXX,XX +XXX,XX @@ static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 17;
-    cc->do_interrupt = crisv10_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
     cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
 }
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = cris_cpu_class_by_name;
     cc->has_work = cris_cpu_has_work;
-    cc->do_interrupt = cris_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = cris_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
     cc->dump_state = cris_cpu_dump_state;
     cc->set_pc = cris_cpu_set_pc;
diff --git a/target/cris/helper.c b/target/cris/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/helper.c
+++ b/target/cris/helper.c
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         && (env->pregs[PR_CCS] & I_FLAG)
         && !env->locked_irq) {
         cs->exception_index = EXCP_IRQ;
-        cc->do_interrupt(cs);
+        cc->tcg_ops.do_interrupt(cs);
         ret = true;
     }
     if (interrupt_request & CPU_INTERRUPT_NMI) {
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         }
         if ((env->pregs[PR_CCS] & m_flag_archval)) {
             cs->exception_index = EXCP_NMI;
-            cc->do_interrupt(cs);
+            cc->tcg_ops.do_interrupt(cs);
             ret = true;
         }
     }
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = hppa_cpu_class_by_name;
     cc->has_work = hppa_cpu_has_work;
-    cc->do_interrupt = hppa_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = hppa_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
     cc->dump_state = hppa_cpu_dump_state;
     cc->set_pc = hppa_cpu_set_pc;
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
 
 void tcg_cpu_common_class_init(CPUClass *cc)
 {
-    cc->do_interrupt = x86_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = x86_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
     cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/cpu.c
+++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = lm32_cpu_class_by_name;
     cc->has_work = lm32_cpu_has_work;
-    cc->do_interrupt = lm32_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = lm32_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
     cc->dump_state = lm32_cpu_dump_state;
     cc->set_pc = lm32_cpu_set_pc;
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
 
     cc->class_by_name = m68k_cpu_class_by_name;
     cc->has_work = m68k_cpu_has_work;
-    cc->do_interrupt = m68k_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = m68k_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
     cc->dump_state = m68k_cpu_dump_state;
     cc->set_pc = m68k_cpu_set_pc;
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = mb_cpu_class_by_name;
     cc->has_work = mb_cpu_has_work;
-    cc->do_interrupt = mb_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
     cc->do_unaligned_access = mb_cpu_do_unaligned_access;
     cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
     cc->dump_state = mb_cpu_dump_state;
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
 
     cc->class_by_name = mips_cpu_class_by_name;
     cc->has_work = mips_cpu_has_work;
-    cc->do_interrupt = mips_cpu_do_interrupt;
     cc->dump_state = mips_cpu_dump_state;
     cc->set_pc = mips_cpu_set_pc;
     cc->gdb_read_register = mips_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->disas_set_info = mips_cpu_disas_set_info;
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = mips_tcg_init;
+    cc->tcg_ops.do_interrupt = mips_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
     cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
-#endif
+#endif /* CONFIG_TCG */
 
     cc->gdb_num_core_regs = 73;
     cc->gdb_stop_before_watchpoint = true;
diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/moxie/cpu.c
+++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = moxie_cpu_class_by_name;
 
     cc->has_work = moxie_cpu_has_work;
-    cc->do_interrupt = moxie_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = moxie_cpu_do_interrupt;
     cc->dump_state = moxie_cpu_dump_state;
     cc->set_pc = moxie_cpu_set_pc;
     cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/cpu.c
+++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = nios2_cpu_class_by_name;
     cc->has_work = nios2_cpu_has_work;
-    cc->do_interrupt = nios2_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = nios2_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
     cc->dump_state = nios2_cpu_dump_state;
     cc->set_pc = nios2_cpu_set_pc;
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = openrisc_cpu_class_by_name;
     cc->has_work = openrisc_cpu_has_work;
-    cc->do_interrupt = openrisc_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = openrisc_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
     cc->dump_state = openrisc_cpu_dump_state;
     cc->set_pc = openrisc_cpu_set_pc;
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
 
     cc->class_by_name = riscv_cpu_class_by_name;
     cc->has_work = riscv_cpu_has_work;
-    cc->do_interrupt = riscv_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = riscv_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
     cc->dump_state = riscv_cpu_dump_state;
     cc->set_pc = riscv_cpu_set_pc;
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
 
     cc->class_by_name = rx_cpu_class_by_name;
     cc->has_work = rx_cpu_has_work;
-    cc->do_interrupt = rx_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = rx_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
     cc->dump_state = rx_cpu_dump_state;
     cc->set_pc = rx_cpu_set_pc;
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = s390_cpu_class_by_name,
     cc->has_work = s390_cpu_has_work;
 #ifdef CONFIG_TCG
-    cc->do_interrupt = s390_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = s390_cpu_do_interrupt;
 #endif
     cc->dump_state = s390_cpu_dump_state;
     cc->set_pc = s390_cpu_set_pc;
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = superh_cpu_class_by_name;
     cc->has_work = superh_cpu_has_work;
-    cc->do_interrupt = superh_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = superh_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
     cc->dump_state = superh_cpu_dump_state;
     cc->set_pc = superh_cpu_set_pc;
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = sparc_cpu_class_by_name;
     cc->parse_features = sparc_cpu_parse_features;
     cc->has_work = sparc_cpu_has_work;
-    cc->do_interrupt = sparc_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = sparc_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
     cc->dump_state = sparc_cpu_dump_state;
 #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tilegx/cpu.c
+++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = tilegx_cpu_class_by_name;
     cc->has_work = tilegx_cpu_has_work;
-    cc->do_interrupt = tilegx_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = tilegx_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
     cc->dump_state = tilegx_cpu_dump_state;
     cc->set_pc = tilegx_cpu_set_pc;
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/unicore32/cpu.c
+++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = uc32_cpu_class_by_name;
     cc->has_work = uc32_cpu_has_work;
-    cc->do_interrupt = uc32_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = uc32_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
     cc->dump_state = uc32_cpu_dump_state;
     cc->set_pc = uc32_cpu_set_pc;
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = xtensa_cpu_class_by_name;
     cc->has_work = xtensa_cpu_has_work;
-    cc->do_interrupt = xtensa_cpu_do_interrupt;
+    cc->tcg_ops.do_interrupt = xtensa_cpu_do_interrupt;
     cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
     cc->dump_state = xtensa_cpu_dump_state;
     cc->set_pc = xtensa_cpu_set_pc;
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = ppc_cpu_class_by_name;
     cc->has_work = ppc_cpu_has_work;
-    cc->do_interrupt = ppc_cpu_do_interrupt;
     cc->dump_state = ppc_cpu_dump_state;
     cc->dump_statistics = ppc_cpu_dump_statistics;
     cc->set_pc = ppc_cpu_set_pc;
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 #ifdef CONFIG_TCG
     cc->tcg_ops.initialize = ppc_translate_init;
     cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
+    cc->tcg_ops.do_interrupt = ppc_cpu_do_interrupt;
     cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

[claudio: wrap target code around CONFIG_TCG and !CONFIG_USER_ONLY]

Note: need to be careful with the use of CONFIG_USER_ONLY,
avoiding its use in headers used by common_ss code (should be poisoned).

Message-Id: <20210204163931.7358-11-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h     | 28 +++++++++++++---------------
 hw/mips/jazz.c            |  9 +++++++--
 target/alpha/cpu.c        |  2 +-
 target/arm/cpu.c          |  4 ++--
 target/m68k/cpu.c         |  2 +-
 target/microblaze/cpu.c   |  2 +-
 target/mips/cpu.c         |  4 +++-
 target/riscv/cpu.c        |  2 +-
 target/riscv/cpu_helper.c |  2 +-
 target/sparc/cpu.c        |  2 +-
 target/xtensa/cpu.c       |  2 +-
 target/xtensa/helper.c    |  4 ++--
 12 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
     /** @debug_excp_handler: Callback for handling debug exceptions */
     void (*debug_excp_handler)(CPUState *cpu);
 
+    /**
+     * @do_transaction_failed: Callback for handling failed memory transactions
+     * (ie bus faults or external aborts; not MMU faults)
+     */
+    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
+                                  unsigned size, MMUAccessType access_type,
+                                  int mmu_idx, MemTxAttrs attrs,
+                                  MemTxResult response, uintptr_t retaddr);
 } TcgCpuOperations;
 
 /**
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  * @has_work: Callback for checking if there is work to do.
  * @do_unaligned_access: Callback for unaligned access handling, if
  * the target defines #TARGET_ALIGNED_ONLY.
- * @do_transaction_failed: Callback for handling failed memory transactions
- * (ie bus faults or external aborts; not MMU faults)
  * @virtio_is_big_endian: Callback to return %true if a CPU which supports
  * runtime configurable endianness is currently big-endian. Non-configurable
  * CPUs can use the default implementation of this method. This method should
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
                                 MMUAccessType access_type,
                                 int mmu_idx, uintptr_t retaddr);
-    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
-                                  unsigned size, MMUAccessType access_type,
-                                  int mmu_idx, MemTxAttrs attrs,
-                                  MemTxResult response, uintptr_t retaddr);
     bool (*virtio_is_big_endian)(CPUState *cpu);
     int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
                            uint8_t *buf, int len, bool is_write);
@@ -XXX,XX +XXX,XX @@ CPUState *cpu_by_arch_id(int64_t id);
 
 void cpu_interrupt(CPUState *cpu, int mask);
 
-#ifdef NEED_CPU_H
-
-#ifdef CONFIG_SOFTMMU
 static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
                                         MMUAccessType access_type,
                                         int mmu_idx, uintptr_t retaddr)
@@ -XXX,XX +XXX,XX @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    if (!cpu->ignore_memory_transaction_failures && cc->do_transaction_failed) {
-        cc->do_transaction_failed(cpu, physaddr, addr, size, access_type,
-                                  mmu_idx, attrs, response, retaddr);
+    if (!cpu->ignore_memory_transaction_failures &&
+        cc->tcg_ops.do_transaction_failed) {
+        cc->tcg_ops.do_transaction_failed(cpu, physaddr, addr, size,
+                                          access_type, mmu_idx, attrs,
+                                          response, retaddr);
     }
 }
-#endif
-
-#endif /* NEED_CPU_H */
 
 /**
  * cpu_set_pc:
diff --git a/hw/mips/jazz.c b/hw/mips/jazz.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/mips/jazz.c
+++ b/hw/mips/jazz.c
@@ -XXX,XX +XXX,XX @@ static const MemoryRegionOps dma_dummy_ops = {
 #define MAGNUM_BIOS_SIZE_MAX 0x7e000
 #define MAGNUM_BIOS_SIZE                                                       \
         (BIOS_SIZE < MAGNUM_BIOS_SIZE_MAX ? BIOS_SIZE : MAGNUM_BIOS_SIZE_MAX)
+
+#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
 static void (*real_do_transaction_failed)(CPUState *cpu, hwaddr physaddr,
                                           vaddr addr, unsigned size,
                                           MMUAccessType access_type,
@@ -XXX,XX +XXX,XX @@ static void mips_jazz_do_transaction_failed(CPUState *cs, hwaddr physaddr,
     (*real_do_transaction_failed)(cs, physaddr, addr, size, access_type,
                                   mmu_idx, attrs, response, retaddr);
 }
+#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 
 static void mips_jazz_init(MachineState *machine,
                            enum jazz_model_e jazz_model)
@@ -XXX,XX +XXX,XX @@ static void mips_jazz_init(MachineState *machine,
      * memory region that catches all memory accesses, as we do on Malta.
      */
     cc = CPU_GET_CLASS(cpu);
-    real_do_transaction_failed = cc->do_transaction_failed;
-    cc->do_transaction_failed = mips_jazz_do_transaction_failed;
+#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
+    real_do_transaction_failed = cc->tcg_ops.do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = mips_jazz_do_transaction_failed;
+#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 
     /* allocate RAM */
     memory_region_add_subregion(address_space, 0, machine->ram);
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_write_register = alpha_cpu_gdb_write_register;
     cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_transaction_failed = alpha_cpu_do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
     cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
     cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_alpha_cpu;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
     cc->do_unaligned_access = arm_cpu_do_unaligned_access;
 #if !defined(CONFIG_USER_ONLY)
-    cc->do_transaction_failed = arm_cpu_do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
     cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
     cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
-#endif
+#endif /* CONFIG_TCG */
 }
 
 #ifdef CONFIG_KVM
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_write_register = m68k_cpu_gdb_write_register;
     cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
 #if defined(CONFIG_SOFTMMU)
-    cc->do_transaction_failed = m68k_cpu_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = m68k_cpu_transaction_failed;
     cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_m68k_cpu;
 #endif
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_write_register = mb_cpu_gdb_write_register;
     cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_transaction_failed = mb_cpu_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
     dc->vmsd = &vmstate_mb_cpu;
 #endif
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_read_register = mips_cpu_gdb_read_register;
     cc->gdb_write_register = mips_cpu_gdb_write_register;
 #ifndef CONFIG_USER_ONLY
-    cc->do_transaction_failed = mips_cpu_do_transaction_failed;
     cc->do_unaligned_access = mips_cpu_do_unaligned_access;
     cc->get_phys_page_debug = mips_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_mips_cpu;
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
     cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
     cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
+#ifndef CONFIG_USER_ONLY
+    cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
+#endif /* CONFIG_USER_ONLY */
 #endif /* CONFIG_TCG */
 
     cc->gdb_num_core_regs = 73;
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_stop_before_watchpoint = true;
     cc->disas_set_info = riscv_cpu_disas_set_info;
 #ifndef CONFIG_USER_ONLY
-    cc->do_transaction_failed = riscv_cpu_do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
     cc->do_unaligned_access = riscv_cpu_do_unaligned_access;
     cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
     /* For now, mark unmigratable: */
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -XXX,XX +XXX,XX @@ void riscv_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
     env->badaddr = addr;
     riscv_raise_exception(env, cs->exception_index, retaddr);
 }
-#endif
+#endif /* !CONFIG_USER_ONLY */
 
 bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
                         MMUAccessType access_type, int mmu_idx,
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_write_register = sparc_cpu_gdb_write_register;
     cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_transaction_failed = sparc_cpu_do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
     cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
     cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_sparc_cpu;
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
 #ifndef CONFIG_USER_ONLY
     cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
-    cc->do_transaction_failed = xtensa_cpu_do_transaction_failed;
+    cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
 #endif
     cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
     cc->disas_set_info = xtensa_cpu_disas_set_info;
diff --git a/target/xtensa/helper.c b/target/xtensa/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/helper.c
+++ b/target/xtensa/helper.c
@@ -XXX,XX +XXX,XX @@ bool xtensa_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
     cpu_loop_exit_restore(cs, retaddr);
 }
 
-#else
+#else /* !CONFIG_USER_ONLY */
 
 void xtensa_cpu_do_unaligned_access(CPUState *cs,
                                     vaddr addr, MMUAccessType access_type,
@@ -XXX,XX +XXX,XX @@ void xtensa_runstall(CPUXtensaState *env, bool runstall)
         qemu_cpu_kick(cpu);
     }
 }
-#endif
+#endif /* !CONFIG_USER_ONLY */
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Move do_unaligned_access to tcg_ops and make it consistently SOFTMMU-only.

[claudio: make the field presence in cpu.h unconditional, removing the ifdefs]
Message-Id: <20210204163931.7358-12-cfontana@suse.de>

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
                                   unsigned size, MMUAccessType access_type,
                                   int mmu_idx, MemTxAttrs attrs,
                                   MemTxResult response, uintptr_t retaddr);
+    /**
+     * @do_unaligned_access: Callback for unaligned access handling
+     */
+    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
+                                MMUAccessType access_type,
+                                int mmu_idx, uintptr_t retaddr);
 } TcgCpuOperations;
 
 /**
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  * @parse_features: Callback to parse command line arguments.
  * @reset_dump_flags: #CPUDumpFlags to use for reset logging.
  * @has_work: Callback for checking if there is work to do.
- * @do_unaligned_access: Callback for unaligned access handling, if
- * the target defines #TARGET_ALIGNED_ONLY.
  * @virtio_is_big_endian: Callback to return %true if a CPU which supports
  * runtime configurable endianness is currently big-endian. Non-configurable
  * CPUs can use the default implementation of this method. This method should
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
 
     int reset_dump_flags;
     bool (*has_work)(CPUState *cpu);
-    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
-                                MMUAccessType access_type,
-                                int mmu_idx, uintptr_t retaddr);
     bool (*virtio_is_big_endian)(CPUState *cpu);
     int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
                            uint8_t *buf, int len, bool is_write);
@@ -XXX,XX +XXX,XX @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    cc->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
+    cc->tcg_ops.do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
 }
 
 static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
-    cc->do_unaligned_access = alpha_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = alpha_cpu_do_unaligned_access;
     cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_alpha_cpu;
 #endif
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
     cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
     cc->debug_check_watchpoint = arm_debug_check_watchpoint;
-    cc->do_unaligned_access = arm_cpu_do_unaligned_access;
 #if !defined(CONFIG_USER_ONLY)
     cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
+    cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
     cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
     cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_disas_set_info(CPUState *cs, disassemble_info *info)
     info->print_insn = print_insn_hppa;
 }
 
+#ifndef CONFIG_USER_ONLY
 static void hppa_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                          MMUAccessType access_type,
                                          int mmu_idx, uintptr_t retaddr)
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
 
     cpu_loop_exit_restore(cs, retaddr);
 }
+#endif /* CONFIG_USER_ONLY */
 
 static void hppa_cpu_realizefn(DeviceState *dev, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
+    cc->tcg_ops.do_unaligned_access = hppa_cpu_do_unaligned_access;
     dc->vmsd = &vmstate_hppa_cpu;
 #endif
-    cc->do_unaligned_access = hppa_cpu_do_unaligned_access;
     cc->disas_set_info = hppa_cpu_disas_set_info;
     cc->tcg_ops.initialize = hppa_translate_init;
 
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = mb_cpu_class_by_name;
     cc->has_work = mb_cpu_has_work;
     cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
-    cc->do_unaligned_access = mb_cpu_do_unaligned_access;
     cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
     cc->dump_state = mb_cpu_dump_state;
     cc->set_pc = mb_cpu_set_pc;
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
+    cc->tcg_ops.do_unaligned_access = mb_cpu_do_unaligned_access;
     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
     dc->vmsd = &vmstate_mb_cpu;
 #endif
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_read_register = mips_cpu_gdb_read_register;
     cc->gdb_write_register = mips_cpu_gdb_write_register;
 #ifndef CONFIG_USER_ONLY
-    cc->do_unaligned_access = mips_cpu_do_unaligned_access;
     cc->get_phys_page_debug = mips_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_mips_cpu;
 #endif
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
+    cc->tcg_ops.do_unaligned_access = mips_cpu_do_unaligned_access;
+
 #endif /* CONFIG_USER_ONLY */
 #endif /* CONFIG_TCG */
 
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/cpu.c
+++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
     cc->disas_set_info = nios2_cpu_disas_set_info;
     cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_unaligned_access = nios2_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = nios2_cpu_do_unaligned_access;
     cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
 #endif
     cc->gdb_read_register = nios2_cpu_gdb_read_register;
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->disas_set_info = riscv_cpu_disas_set_info;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
-    cc->do_unaligned_access = riscv_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = riscv_cpu_do_unaligned_access;
     cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
     /* For now, mark unmigratable: */
     cc->vmsd = &vmstate_riscv_cpu;
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
 #ifdef CONFIG_TCG
     cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
     cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
-    cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = s390x_cpu_do_unaligned_access;
 #endif
 #endif
     cc->disas_set_info = s390_cpu_disas_set_info;
diff --git a/target/s390x/excp_helper.c b/target/s390x/excp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/excp_helper.c
+++ b/target/s390x/excp_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(monitor_call)(CPUS390XState *env, uint64_t monitor_code,
     }
 }
 
-#endif /* CONFIG_USER_ONLY */
+#endif /* !CONFIG_USER_ONLY */
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_write_register = superh_cpu_gdb_write_register;
     cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_unaligned_access = superh_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = superh_cpu_do_unaligned_access;
     cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
 #endif
     cc->disas_set_info = superh_cpu_disas_set_info;
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
-    cc->do_unaligned_access = sparc_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = sparc_cpu_do_unaligned_access;
     cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_sparc_cpu;
 #endif
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_stop_before_watchpoint = true;
     cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->do_unaligned_access = xtensa_cpu_do_unaligned_access;
+    cc->tcg_ops.do_unaligned_access = xtensa_cpu_do_unaligned_access;
     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
     cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
 #endif
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
     cc->set_pc = ppc_cpu_set_pc;
     cc->gdb_read_register = ppc_cpu_gdb_read_register;
     cc->gdb_write_register = ppc_cpu_gdb_write_register;
-    cc->do_unaligned_access = ppc_cpu_do_unaligned_access;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = ppc_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_ppc_cpu;
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 #ifndef CONFIG_USER_ONLY
     cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
     cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
+    cc->tcg_ops.do_unaligned_access = ppc_cpu_do_unaligned_access;
 #endif /* !CONFIG_USER_ONLY */
 #endif /* CONFIG_TCG */
 
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

cpu_check_watchpoint and watchpoint_address_matches are TCG-only.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20210204163931.7358-13-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 softmmu/physmem.c | 141 +++++++++++++++++++++++-----------------------
 1 file changed, 72 insertions(+), 69 deletions(-)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@ void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
     }
 }
 
+#ifdef CONFIG_TCG
 /* Return true if this watchpoint address matches the specified
  * access (ie the address range covered by the watchpoint overlaps
  * partially or completely with the address range covered by the
@@ -XXX,XX +XXX,XX @@ int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
     return ret;
 }
 
+/* Generate a debug exception if a watchpoint has been hit.  */
+void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
+                          MemTxAttrs attrs, int flags, uintptr_t ra)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    CPUWatchpoint *wp;
+
+    assert(tcg_enabled());
+    if (cpu->watchpoint_hit) {
+        /*
+         * We re-entered the check after replacing the TB.
+         * Now raise the debug interrupt so that it will
+         * trigger after the current instruction.
+         */
+        qemu_mutex_lock_iothread();
+        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
+        qemu_mutex_unlock_iothread();
+        return;
+    }
+
+    addr = cc->adjust_watchpoint_address(cpu, addr, len);
+    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+        if (watchpoint_address_matches(wp, addr, len)
+            && (wp->flags & flags)) {
+            if (replay_running_debug()) {
+                /*
+                 * Don't process the watchpoints when we are
+                 * in a reverse debugging operation.
+                 */
+                replay_breakpoint();
+                return;
+            }
+            if (flags == BP_MEM_READ) {
+                wp->flags |= BP_WATCHPOINT_HIT_READ;
+            } else {
+                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
+            }
+            wp->hitaddr = MAX(addr, wp->vaddr);
+            wp->hitattrs = attrs;
+            if (!cpu->watchpoint_hit) {
+                if (wp->flags & BP_CPU &&
+                    !cc->debug_check_watchpoint(cpu, wp)) {
+                    wp->flags &= ~BP_WATCHPOINT_HIT;
+                    continue;
+                }
+                cpu->watchpoint_hit = wp;
+
+                mmap_lock();
+                tb_check_watchpoint(cpu, ra);
+                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
+                    cpu->exception_index = EXCP_DEBUG;
+                    mmap_unlock();
+                    cpu_loop_exit_restore(cpu, ra);
+                } else {
+                    /* Force execution of one insn next time.  */
+                    cpu->cflags_next_tb = 1 | curr_cflags();
+                    mmap_unlock();
+                    if (ra) {
+                        cpu_restore_state(cpu, ra, true);
+                    }
+                    cpu_loop_exit_noexc(cpu);
+                }
+            }
+        } else {
+            wp->flags &= ~BP_WATCHPOINT_HIT;
+        }
+    }
+}
+
+#endif /* CONFIG_TCG */
+
 /* Called from RCU critical section */
 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 {
@@ -XXX,XX +XXX,XX @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
     return block->offset + offset;
 }
 
-/* Generate a debug exception if a watchpoint has been hit.  */
-void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
-                          MemTxAttrs attrs, int flags, uintptr_t ra)
-{
-    CPUClass *cc = CPU_GET_CLASS(cpu);
-    CPUWatchpoint *wp;
-
-    assert(tcg_enabled());
-    if (cpu->watchpoint_hit) {
-        /*
-         * We re-entered the check after replacing the TB.
-         * Now raise the debug interrupt so that it will
-         * trigger after the current instruction.
-         */
-        qemu_mutex_lock_iothread();
-        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
-        qemu_mutex_unlock_iothread();
-        return;
-    }
-
-    addr = cc->adjust_watchpoint_address(cpu, addr, len);
-    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
-        if (watchpoint_address_matches(wp, addr, len)
-            && (wp->flags & flags)) {
-            if (replay_running_debug()) {
-                /*
-                 * Don't process the watchpoints when we are
-                 * in a reverse debugging operation.
-                 */
-                replay_breakpoint();
-                return;
-            }
-            if (flags == BP_MEM_READ) {
-                wp->flags |= BP_WATCHPOINT_HIT_READ;
-            } else {
-                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
-            }
-            wp->hitaddr = MAX(addr, wp->vaddr);
-            wp->hitattrs = attrs;
-            if (!cpu->watchpoint_hit) {
-                if (wp->flags & BP_CPU &&
-                    !cc->debug_check_watchpoint(cpu, wp)) {
-                    wp->flags &= ~BP_WATCHPOINT_HIT;
-                    continue;
-                }
-                cpu->watchpoint_hit = wp;
-
-                mmap_lock();
-                tb_check_watchpoint(cpu, ra);
-                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
-                    cpu->exception_index = EXCP_DEBUG;
-                    mmap_unlock();
-                    cpu_loop_exit_restore(cpu, ra);
-                } else {
-                    /* Force execution of one insn next time.  */
-                    cpu->cflags_next_tb = 1 | curr_cflags();
-                    mmap_unlock();
-                    if (ra) {
-                        cpu_restore_state(cpu, ra, true);
-                    }
-                    cpu_loop_exit_noexc(cpu);
-                }
-            }
-        } else {
-            wp->flags &= ~BP_WATCHPOINT_HIT;
-        }
-    }
-}
-
 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
                                  MemTxAttrs attrs, void *buf, hwaddr len);
 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

commit 40612000599e ("arm: Correctly handle watchpoints for BE32 CPUs")
introduced this ARM-specific, TCG-specific hack to adjust the address
before checking it with cpu_check_watchpoint.

Make adjust_watchpoint_address optional and move it to tcg_ops.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20210204163931.7358-14-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h | 6 +++++-
 hw/core/cpu.c         | 6 ------
 softmmu/physmem.c     | 5 ++++-
 target/arm/cpu.c      | 2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
     void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
                                 MMUAccessType access_type,
                                 int mmu_idx, uintptr_t retaddr);
+    /**
+     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
+     */
+    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
+
 } TcgCpuOperations;
 
 /**
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     const char * (*gdb_get_dynamic_xml)(CPUState *cpu, const char *xmlname);
 
     void (*disas_set_info)(CPUState *cpu, disassemble_info *info);
-    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 
     const char *deprecation_note;
     /* Keep non-pointer data at the end to minimize holes.  */
diff --git a/hw/core/cpu.c b/hw/core/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu.c
+++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static int64_t cpu_common_get_arch_id(CPUState *cpu)
     return cpu->cpu_index;
 }
 
-static vaddr cpu_adjust_watchpoint_address(CPUState *cpu, vaddr addr, int len)
-{
-    return addr;
-}
-
 static Property cpu_common_props[] = {
 #ifndef CONFIG_USER_ONLY
     /* Create a memory property for softmmu CPU object,
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
     k->gdb_write_register = cpu_common_gdb_write_register;
     k->virtio_is_big_endian = cpu_common_virtio_is_big_endian;
     k->debug_check_watchpoint = cpu_common_debug_check_watchpoint;
-    k->adjust_watchpoint_address = cpu_adjust_watchpoint_address;
     set_bit(DEVICE_CATEGORY_CPU, dc->categories);
     dc->realize = cpu_common_realizefn;
     dc->unrealize = cpu_common_unrealizefn;
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
         return;
     }
 
-    addr = cc->adjust_watchpoint_address(cpu, addr, len);
+    if (cc->tcg_ops.adjust_watchpoint_address) {
+        /* this is currently used only by ARM BE32 */
+        addr = cc->tcg_ops.adjust_watchpoint_address(cpu, addr, len);
+    }
     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
         if (watchpoint_address_matches(wp, addr, len)
             && (wp->flags & flags)) {
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
 #if !defined(CONFIG_USER_ONLY)
     cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
     cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
-    cc->adjust_watchpoint_address = arm_adjust_watchpoint_address;
+    cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
     cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 #endif /* CONFIG_TCG */
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

commit 568496c0c0f1 ("cpu: Add callback to check architectural") and
commit 3826121d9298 ("target-arm: Implement checking of fired")
introduced an ARM-specific hack for cpu_check_watchpoint.

Make debug_check_watchpoint optional, and move it to tcg_ops.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Message-Id: <20210204163931.7358-15-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h | 9 ++++++---
 accel/tcg/user-exec.c | 3 ++-
 hw/core/cpu.c         | 9 ---------
 softmmu/physmem.c     | 4 ++--
 target/arm/cpu.c      | 4 ++--
 5 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
      */
     vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
 
+    /**
+     * @debug_check_watchpoint: return true if the architectural
+     * watchpoint whose address has matched should really fire, used by ARM
+     */
+    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
+
 } TcgCpuOperations;
 
 /**
@@ -XXX,XX +XXX,XX @@ typedef struct TcgCpuOperations {
  *       a memory access with the specified memory transaction attributes.
  * @gdb_read_register: Callback for letting GDB read a register.
  * @gdb_write_register: Callback for letting GDB write a register.
- * @debug_check_watchpoint: Callback: return true if the architectural
- *       watchpoint whose address has matched should really fire.
  * @write_elf64_note: Callback for writing a CPU-specific ELF note to a
  * 64-bit VM coredump.
  * @write_elf32_qemunote: Callback for writing a CPU- and QEMU-specific ELF
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     int (*asidx_from_attrs)(CPUState *cpu, MemTxAttrs attrs);
     int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
     int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
-    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
 
     int (*write_elf64_note)(WriteCoreDumpFunction f, CPUState *cpu,
                             int cpuid, void *opaque);
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
     clear_helper_retaddr();
 
     cc = CPU_GET_CLASS(cpu);
-    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type, MMU_USER_IDX, false, pc);
+    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type,
+                         MMU_USER_IDX, false, pc);
     g_assert_not_reached();
 }
 
diff --git a/hw/core/cpu.c b/hw/core/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu.c
+++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static int cpu_common_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg)
     return 0;
 }
 
-static bool cpu_common_debug_check_watchpoint(CPUState *cpu, CPUWatchpoint *wp)
-{
-    /* If no extra check is required, QEMU watchpoint match can be considered
-     * as an architectural match.
-     */
-    return true;
-}
-
 static bool cpu_common_virtio_is_big_endian(CPUState *cpu)
 {
     return target_words_bigendian();
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
     k->gdb_read_register = cpu_common_gdb_read_register;
     k->gdb_write_register = cpu_common_gdb_write_register;
     k->virtio_is_big_endian = cpu_common_virtio_is_big_endian;
-    k->debug_check_watchpoint = cpu_common_debug_check_watchpoint;
     set_bit(DEVICE_CATEGORY_CPU, dc->categories);
     dc->realize = cpu_common_realizefn;
     dc->unrealize = cpu_common_unrealizefn;
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
             wp->hitaddr = MAX(addr, wp->vaddr);
             wp->hitattrs = attrs;
             if (!cpu->watchpoint_hit) {
-                if (wp->flags & BP_CPU &&
-                    !cc->debug_check_watchpoint(cpu, wp)) {
+                if (wp->flags & BP_CPU && cc->tcg_ops.debug_check_watchpoint &&
+                    !cc->tcg_ops.debug_check_watchpoint(cpu, wp)) {
                     wp->flags &= ~BP_WATCHPOINT_HIT;
                     continue;
                 }
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
     cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
     cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
-    cc->debug_check_watchpoint = arm_debug_check_watchpoint;
 #if !defined(CONFIG_USER_ONLY)
+    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
     cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
     cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
     cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
-    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
+    cc->tcg_ops.debug_check_watchpoint = arm_debug_check_watchpoint;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 #endif /* CONFIG_TCG */
 }
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

We cannot, in principle, make the TCG Operations field definitions
conditional on CONFIG_TCG in code that is included by both common_ss
and specific_ss modules.

Therefore, what we can safely do to restrict the TCG fields to TCG-only
builds is to move all TCG CPU operations into a separate header file,
which is included only by TCG, target-specific code.

This leaves just a NULL pointer in the cpu.h for the non-TCG builds.

This also tidies up the code in all targets a bit, having all TCG cpu
operations neatly contained by a dedicated data struct.

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ typedef struct CPUWatchpoint CPUWatchpoint;
 
 struct TranslationBlock;
 
-/**
- * struct TcgCpuOperations: TCG operations specific to a CPU class
- */
-typedef struct TcgCpuOperations {
-    /**
-     * @initialize: Initalize TCG state
-     *
-     * Called when the first CPU is realized.
-     */
-    void (*initialize)(void);
-    /**
-     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
-     *
-     * This is called when we abandon execution of a TB before starting it,
-     * and must set all parts of the CPU state which the previous TB in the
-     * chain may not have updated.
-     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
-     *
-     * If more state needs to be restored, the target must implement a
-     * function to restore all the state, and register it here.
-     */
-    void (*synchronize_from_tb)(CPUState *cpu,
-                                const struct TranslationBlock *tb);
-    /** @cpu_exec_enter: Callback for cpu_exec preparation */
-    void (*cpu_exec_enter)(CPUState *cpu);
-    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
-    void (*cpu_exec_exit)(CPUState *cpu);
-    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
-    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
-    /** @do_interrupt: Callback for interrupt handling. */
-    void (*do_interrupt)(CPUState *cpu);
-    /**
-     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
-     *
-     * For system mode, if the access is valid, call tlb_set_page
-     * and return true; if the access is invalid, and probe is
-     * true, return false; otherwise raise an exception and do
-     * not return.  For user-only mode, always raise an exception
-     * and do not return.
-     */
-    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
-                     MMUAccessType access_type, int mmu_idx,
-                     bool probe, uintptr_t retaddr);
-    /** @debug_excp_handler: Callback for handling debug exceptions */
-    void (*debug_excp_handler)(CPUState *cpu);
-
-    /**
-     * @do_transaction_failed: Callback for handling failed memory transactions
-     * (ie bus faults or external aborts; not MMU faults)
-     */
-    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
-                                  unsigned size, MMUAccessType access_type,
-                                  int mmu_idx, MemTxAttrs attrs,
-                                  MemTxResult response, uintptr_t retaddr);
-    /**
-     * @do_unaligned_access: Callback for unaligned access handling
-     */
-    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
-                                MMUAccessType access_type,
-                                int mmu_idx, uintptr_t retaddr);
-    /**
-     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
-     */
-    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
-
-    /**
-     * @debug_check_watchpoint: return true if the architectural
-     * watchpoint whose address has matched should really fire, used by ARM
-     */
-    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
-
-} TcgCpuOperations;
+/* see tcg-cpu-ops.h */
+struct TCGCPUOps;
 
 /**
  * CPUClass:
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     int gdb_num_core_regs;
     bool gdb_stop_before_watchpoint;
 
-    TcgCpuOperations tcg_ops;
+    /* when TCG is not available, this pointer is NULL */
+    struct TCGCPUOps *tcg_ops;
 };
 
 /*
@@ -XXX,XX +XXX,XX @@ CPUState *cpu_by_arch_id(int64_t id);
 
 void cpu_interrupt(CPUState *cpu, int mask);
 
-static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
-                                        MMUAccessType access_type,
-                                        int mmu_idx, uintptr_t retaddr)
-{
-    CPUClass *cc = CPU_GET_CLASS(cpu);
-
-    cc->tcg_ops.do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
-}
-
-static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
-                                          vaddr addr, unsigned size,
-                                          MMUAccessType access_type,
-                                          int mmu_idx, MemTxAttrs attrs,
-                                          MemTxResult response,
-                                          uintptr_t retaddr)
-{
-    CPUClass *cc = CPU_GET_CLASS(cpu);
-
-    if (!cpu->ignore_memory_transaction_failures &&
-        cc->tcg_ops.do_transaction_failed) {
-        cc->tcg_ops.do_transaction_failed(cpu, physaddr, addr, size,
-                                          access_type, mmu_idx, attrs,
-                                          response, retaddr);
-    }
-}
-
 /**
  * cpu_set_pc:
  * @cpu: The CPU to set the program counter for.
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * TCG CPU-specific operations
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef TCG_CPU_OPS_H
+#define TCG_CPU_OPS_H
+
+#include "hw/core/cpu.h"
+
+struct TCGCPUOps {
+    /**
+     * @initialize: Initialize TCG state
+     *
+     * Called when the first CPU is realized.
+     */
+    void (*initialize)(void);
+    /**
+     * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
+     *
+     * This is called when we abandon execution of a TB before starting it,
+     * and must set all parts of the CPU state which the previous TB in the
+     * chain may not have updated.
+     * By default, when this is NULL, a call is made to @set_pc(tb->pc).
+     *
+     * If more state needs to be restored, the target must implement a
+     * function to restore all the state, and register it here.
+     */
+    void (*synchronize_from_tb)(CPUState *cpu,
+                                const struct TranslationBlock *tb);
+    /** @cpu_exec_enter: Callback for cpu_exec preparation */
+    void (*cpu_exec_enter)(CPUState *cpu);
+    /** @cpu_exec_exit: Callback for cpu_exec cleanup */
+    void (*cpu_exec_exit)(CPUState *cpu);
+    /** @cpu_exec_interrupt: Callback for processing interrupts in cpu_exec */
+    bool (*cpu_exec_interrupt)(CPUState *cpu, int interrupt_request);
+    /**
+     * @do_interrupt: Callback for interrupt handling.
+     *
+     * Note that this is conceptually SOFTMMU-only, but it is also used
+     * outside of SOFTMMU because of an x86 hack (accel/tcg/cpu-exec.c),
+     * so it cannot be placed in the SOFTMMU section.
+     */
+    void (*do_interrupt)(CPUState *cpu);
+    /**
+     * @tlb_fill: Handle a softmmu tlb miss or user-only address fault
+     *
+     * For system mode, if the access is valid, call tlb_set_page
+     * and return true; if the access is invalid, and probe is
+     * true, return false; otherwise raise an exception and do
+     * not return.  For user-only mode, always raise an exception
+     * and do not return.
+     */
+    bool (*tlb_fill)(CPUState *cpu, vaddr address, int size,
+                     MMUAccessType access_type, int mmu_idx,
+                     bool probe, uintptr_t retaddr);
+    /** @debug_excp_handler: Callback for handling debug exceptions */
+    void (*debug_excp_handler)(CPUState *cpu);
+
+#ifdef NEED_CPU_H
+#ifdef CONFIG_SOFTMMU
+    /**
+     * @do_transaction_failed: Callback for handling failed memory transactions
+     * (ie bus faults or external aborts; not MMU faults)
+     */
+    void (*do_transaction_failed)(CPUState *cpu, hwaddr physaddr, vaddr addr,
+                                  unsigned size, MMUAccessType access_type,
+                                  int mmu_idx, MemTxAttrs attrs,
+                                  MemTxResult response, uintptr_t retaddr);
+    /**
+     * @do_unaligned_access: Callback for unaligned access handling
+     */
+    void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
+                                MMUAccessType access_type,
+                                int mmu_idx, uintptr_t retaddr);
+
+    /**
+     * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
+     */
+    vaddr (*adjust_watchpoint_address)(CPUState *cpu, vaddr addr, int len);
+
+    /**
+     * @debug_check_watchpoint: return true if the architectural
+     * watchpoint whose address has matched should really fire, used by ARM
+     */
+    bool (*debug_check_watchpoint)(CPUState *cpu, CPUWatchpoint *wp);
+
+#endif /* CONFIG_SOFTMMU */
+#endif /* NEED_CPU_H */
+
+};
+
+#endif /* TCG_CPU_OPS_H */
diff --git a/target/arm/internals.h b/target/arm/internals.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -XXX,XX +XXX,XX @@ static inline int r14_bank_number(int mode)
 void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu);
 void arm_translate_init(void);
 
+#ifdef CONFIG_TCG
+void arm_cpu_synchronize_from_tb(CPUState *cs,
+                                 const struct TranslationBlock *tb);
+#endif /* CONFIG_TCG */
+
+
 enum arm_fprounding {
     FPROUNDING_TIEEVEN,
     FPROUNDING_POSINF,
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/qemu-print.h"
 #include "cpu.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
                                TARGET_FMT_lx "] %s\n",
                                last_tb->tc.ptr, last_tb->pc,
                                lookup_symbol(last_tb->pc));
-        if (cc->tcg_ops.synchronize_from_tb) {
-            cc->tcg_ops.synchronize_from_tb(cpu, last_tb);
+        if (cc->tcg_ops->synchronize_from_tb) {
+            cc->tcg_ops->synchronize_from_tb(cpu, last_tb);
         } else {
             assert(cc->set_pc);
             cc->set_pc(cpu, last_tb->pc);
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_enter(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    if (cc->tcg_ops.cpu_exec_enter) {
-        cc->tcg_ops.cpu_exec_enter(cpu);
+    if (cc->tcg_ops->cpu_exec_enter) {
+        cc->tcg_ops->cpu_exec_enter(cpu);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    if (cc->tcg_ops.cpu_exec_exit) {
-        cc->tcg_ops.cpu_exec_exit(cpu);
+    if (cc->tcg_ops->cpu_exec_exit) {
+        cc->tcg_ops->cpu_exec_exit(cpu);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static inline void cpu_handle_debug_exception(CPUState *cpu)
         }
     }
 
-    if (cc->tcg_ops.debug_excp_handler) {
-        cc->tcg_ops.debug_excp_handler(cpu);
+    if (cc->tcg_ops->debug_excp_handler) {
+        cc->tcg_ops->debug_excp_handler(cpu);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
            loop */
 #if defined(TARGET_I386)
         CPUClass *cc = CPU_GET_CLASS(cpu);
-        cc->tcg_ops.do_interrupt(cpu);
+        cc->tcg_ops->do_interrupt(cpu);
 #endif
         *ret = cpu->exception_index;
         cpu->exception_index = -1;
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
         if (replay_exception()) {
             CPUClass *cc = CPU_GET_CLASS(cpu);
             qemu_mutex_lock_iothread();
-            cc->tcg_ops.do_interrupt(cpu);
+            cc->tcg_ops->do_interrupt(cpu);
             qemu_mutex_unlock_iothread();
             cpu->exception_index = -1;
 
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
            True when it is, and we should restart on a new TB,
            and via longjmp via cpu_loop_exit.  */
         else {
-            if (cc->tcg_ops.cpu_exec_interrupt &&
-                cc->tcg_ops.cpu_exec_interrupt(cpu, interrupt_request)) {
+            if (cc->tcg_ops->cpu_exec_interrupt &&
+                cc->tcg_ops->cpu_exec_interrupt(cpu, interrupt_request)) {
                 if (need_replay_interrupt(interrupt_request)) {
                     replay_interrupt();
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_exec_realizefn(CPUState *cpu, Error **errp)
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
     if (!tcg_target_initialized) {
-        cc->tcg_ops.initialize();
+        cc->tcg_ops->initialize();
         tcg_target_initialized = true;
     }
     tlb_init(cpu);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "exec/exec-all.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
@@ -XXX,XX +XXX,XX @@ static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
      * This is not a probe, so only valid return is success; failure
      * should result in exception + longjmp to the cpu loop.
      */
-    ok = cc->tcg_ops.tlb_fill(cpu, addr, size,
-                              access_type, mmu_idx, false, retaddr);
+    ok = cc->tcg_ops->tlb_fill(cpu, addr, size,
+                               access_type, mmu_idx, false, retaddr);
     assert(ok);
 }
 
+static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
+                                        MMUAccessType access_type,
+                                        int mmu_idx, uintptr_t retaddr)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    /*
+     * Hand the unaligned access to the per-class TCG hook, passing every
+     * argument through unchanged.
+     * NOTE(review): the hook pointer is called without a NULL check.
+     */
+    cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
+}
+
+static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
+                                          vaddr addr, unsigned size,
+                                          MMUAccessType access_type,
+                                          int mmu_idx, MemTxAttrs attrs,
+                                          MemTxResult response,
+                                          uintptr_t retaddr)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    /*
+     * Report a failed memory transaction to the per-class TCG hook.
+     * This is a no-op when the CPU has ignore_memory_transaction_failures
+     * set, or when the class provides no do_transaction_failed hook.
+     */
+    if (!cpu->ignore_memory_transaction_failures &&
+        cc->tcg_ops->do_transaction_failed) {
+        cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
+                                           access_type, mmu_idx, attrs,
+                                           response, retaddr);
+    }
+}
+
 static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                          int mmu_idx, target_ulong addr, uintptr_t retaddr,
                          MMUAccessType access_type, MemOp op)
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
             CPUState *cs = env_cpu(env);
             CPUClass *cc = CPU_GET_CLASS(cs);
 
-            if (!cc->tcg_ops.tlb_fill(cs, addr, fault_size, access_type,
-                                      mmu_idx, nonfault, retaddr)) {
+            if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
+                                       mmu_idx, nonfault, retaddr)) {
                 /* Non-faulting page table read failed.  */
                 *phost = NULL;
                 return TLB_INVALID_MASK;
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -XXX,XX +XXX,XX @@
  */
 #include "qemu/osdep.h"
 #include "cpu.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
@@ -XXX,XX +XXX,XX @@ static inline int handle_cpu_signal(uintptr_t pc, siginfo_t *info,
     clear_helper_retaddr();
 
     cc = CPU_GET_CLASS(cpu);
-    cc->tcg_ops.tlb_fill(cpu, address, 0, access_type,
-                         MMU_USER_IDX, false, pc);
+    cc->tcg_ops->tlb_fill(cpu, address, 0, access_type,
+                          MMU_USER_IDX, false, pc);
     g_assert_not_reached();
 }
 
@@ -XXX,XX +XXX,XX @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
         } else {
             CPUState *cpu = env_cpu(env);
             CPUClass *cc = CPU_GET_CLASS(cpu);
-            cc->tcg_ops.tlb_fill(cpu, addr, fault_size, access_type,
-                                 MMU_USER_IDX, false, ra);
+            cc->tcg_ops->tlb_fill(cpu, addr, fault_size, access_type,
+                                  MMU_USER_IDX, false, ra);
             g_assert_not_reached();
         }
     }
diff --git a/hw/mips/jazz.c b/hw/mips/jazz.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/mips/jazz.c
+++ b/hw/mips/jazz.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/help_option.h"
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
 
 enum jazz_model_e {
     JAZZ_MAGNUM,
@@ -XXX,XX +XXX,XX @@ static void mips_jazz_init(MachineState *machine,
      */
     cc = CPU_GET_CLASS(cpu);
 #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
-    real_do_transaction_failed = cc->tcg_ops.do_transaction_failed;
-    cc->tcg_ops.do_transaction_failed = mips_jazz_do_transaction_failed;
+    real_do_transaction_failed = cc->tcg_ops->do_transaction_failed;
+    cc->tcg_ops->do_transaction_failed = mips_jazz_do_transaction_failed;
 #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 
     /* allocate RAM */
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/cutils.h"
 #include "qemu/cacheflush.h"
 #include "cpu.h"
+
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
+
 #include "exec/exec-all.h"
 #include "exec/target_page.h"
 #include "hw/qdev-core.h"
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
         return;
     }
 
-    if (cc->tcg_ops.adjust_watchpoint_address) {
+    if (cc->tcg_ops->adjust_watchpoint_address) {
         /* this is currently used only by ARM BE32 */
-        addr = cc->tcg_ops.adjust_watchpoint_address(cpu, addr, len);
+        addr = cc->tcg_ops->adjust_watchpoint_address(cpu, addr, len);
     }
     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
         if (watchpoint_address_matches(wp, addr, len)
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
             wp->hitaddr = MAX(addr, wp->vaddr);
             wp->hitattrs = attrs;
             if (!cpu->watchpoint_hit) {
-                if (wp->flags & BP_CPU && cc->tcg_ops.debug_check_watchpoint &&
-                    !cc->tcg_ops.debug_check_watchpoint(cpu, wp)) {
+                if (wp->flags & BP_CPU && cc->tcg_ops->debug_check_watchpoint &&
+                    !cc->tcg_ops->debug_check_watchpoint(cpu, wp)) {
                     wp->flags &= ~BP_WATCHPOINT_HIT;
                     continue;
                 }
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_initfn(Object *obj)
 #endif
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the Alpha CPU class; alpha_cpu_class_init()
+ * installs it as cc->tcg_ops.  The last three hooks are left unset
+ * when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps alpha_tcg_ops = {
+    .initialize = alpha_translate_init,
+    .cpu_exec_interrupt = alpha_cpu_exec_interrupt,
+    .tlb_fill = alpha_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = alpha_cpu_do_interrupt,
+    .do_transaction_failed = alpha_cpu_do_transaction_failed,
+    .do_unaligned_access = alpha_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void alpha_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = alpha_cpu_class_by_name;
     cc->has_work = alpha_cpu_has_work;
-    cc->tcg_ops.do_interrupt = alpha_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = alpha_cpu_exec_interrupt;
     cc->dump_state = alpha_cpu_dump_state;
     cc->set_pc = alpha_cpu_set_pc;
     cc->gdb_read_register = alpha_cpu_gdb_read_register;
     cc->gdb_write_register = alpha_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = alpha_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_transaction_failed = alpha_cpu_do_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = alpha_cpu_do_unaligned_access;
     cc->get_phys_page_debug = alpha_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_alpha_cpu;
 #endif
     cc->disas_set_info = alpha_cpu_disas_set_info;
-    cc->tcg_ops.initialize = alpha_translate_init;
 
+    cc->tcg_ops = &alpha_tcg_ops;
     cc->gdb_num_core_regs = 67;
 }
 
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "cpu.h"
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
 #include "internals.h"
 #include "exec/exec-all.h"
 #include "hw/qdev-properties.h"
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
 }
 
 #ifdef CONFIG_TCG
-static void arm_cpu_synchronize_from_tb(CPUState *cs,
-                                        const TranslationBlock *tb)
+void arm_cpu_synchronize_from_tb(CPUState *cs,
+                                 const TranslationBlock *tb)
 {
     ARMCPU *cpu = ARM_CPU(cs);
     CPUARMState *env = &cpu->env;
@@ -XXX,XX +XXX,XX @@ bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
  found:
     cs->exception_index = excp_idx;
     env->exception.target_el = target_el;
-    cc->tcg_ops.do_interrupt(cs);
+    cc->tcg_ops->do_interrupt(cs);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static gchar *arm_gdb_arch_name(CPUState *cs)
     return g_strdup("arm");
 }
 
+#ifdef CONFIG_TCG
+static struct TCGCPUOps arm_tcg_ops = { /* installed by arm_cpu_class_init() */
+    .initialize = arm_translate_init,
+    .synchronize_from_tb = arm_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = arm_cpu_exec_interrupt,
+    .tlb_fill = arm_cpu_tlb_fill,
+    .debug_excp_handler = arm_debug_excp_handler,
+    /* The hooks below are left unset when CONFIG_USER_ONLY is defined. */
+#if !defined(CONFIG_USER_ONLY)
+    .do_interrupt = arm_cpu_do_interrupt,
+    .do_transaction_failed = arm_cpu_do_transaction_failed,
+    .do_unaligned_access = arm_cpu_do_unaligned_access,
+    .adjust_watchpoint_address = arm_adjust_watchpoint_address,
+    .debug_check_watchpoint = arm_debug_check_watchpoint,
+#endif /* !CONFIG_USER_ONLY */
+};
+#endif /* CONFIG_TCG */
+
 static void arm_cpu_class_init(ObjectClass *oc, void *data)
 {
     ARMCPUClass *acc = ARM_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_get_dynamic_xml = arm_gdb_get_dynamic_xml;
     cc->gdb_stop_before_watchpoint = true;
     cc->disas_set_info = arm_disas_set_info;
+
 #ifdef CONFIG_TCG
-    cc->tcg_ops.initialize = arm_translate_init;
-    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
-    cc->tcg_ops.synchronize_from_tb = arm_cpu_synchronize_from_tb;
-    cc->tcg_ops.tlb_fill = arm_cpu_tlb_fill;
-    cc->tcg_ops.debug_excp_handler = arm_debug_excp_handler;
-#if !defined(CONFIG_USER_ONLY)
-    cc->tcg_ops.do_interrupt = arm_cpu_do_interrupt;
-    cc->tcg_ops.do_transaction_failed = arm_cpu_do_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = arm_cpu_do_unaligned_access;
-    cc->tcg_ops.adjust_watchpoint_address = arm_adjust_watchpoint_address;
-    cc->tcg_ops.debug_check_watchpoint = arm_debug_check_watchpoint;
-#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
+    cc->tcg_ops = &arm_tcg_ops;
 #endif /* CONFIG_TCG */
 }
 
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "cpu.h"
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
 #include "qemu/module.h"
 #if !defined(CONFIG_USER_ONLY)
 #include "hw/loader.h"
@@ -XXX,XX +XXX,XX @@ static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
 {
     CPUClass *cc = CPU_CLASS(oc);
 
-#ifdef CONFIG_TCG
-    cc->tcg_ops.cpu_exec_interrupt = arm_cpu_exec_interrupt;
-#endif /* CONFIG_TCG */
-
     cc->gdb_read_register = aarch64_cpu_gdb_read_register;
     cc->gdb_write_register = aarch64_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 34;
diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu_tcg.c
+++ b/target/arm/cpu_tcg.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+#endif /* CONFIG_TCG */
 #include "internals.h"
 
 /* CPU models. These are not needed for the AArch64 linux-user build. */
@@ -XXX,XX +XXX,XX @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     if (interrupt_request & CPU_INTERRUPT_HARD
         && (armv7m_nvic_can_take_pending_exception(env->nvic))) {
         cs->exception_index = EXCP_IRQ;
-        cc->tcg_ops.do_interrupt(cs);
+        cc->tcg_ops->do_interrupt(cs);
         ret = true;
     }
     return ret;
@@ -XXX,XX +XXX,XX @@ static void pxa270c5_initfn(Object *obj)
     cpu->reset_sctlr = 0x00000078;
 }
 
+#ifdef CONFIG_TCG
+static struct TCGCPUOps arm_v7m_tcg_ops = { /* installed by arm_v7m_class_init() */
+    .initialize = arm_translate_init,
+    .synchronize_from_tb = arm_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt, /* v7m-specific hook */
+    .tlb_fill = arm_cpu_tlb_fill,
+    .debug_excp_handler = arm_debug_excp_handler,
+    /* The hooks below are left unset when CONFIG_USER_ONLY is defined. */
+#if !defined(CONFIG_USER_ONLY)
+    .do_interrupt = arm_v7m_cpu_do_interrupt, /* v7m-specific hook */
+    .do_transaction_failed = arm_cpu_do_transaction_failed,
+    .do_unaligned_access = arm_cpu_do_unaligned_access,
+    .adjust_watchpoint_address = arm_adjust_watchpoint_address,
+    .debug_check_watchpoint = arm_debug_check_watchpoint,
+#endif /* !CONFIG_USER_ONLY */
+};
+#endif /* CONFIG_TCG */
+
 static void arm_v7m_class_init(ObjectClass *oc, void *data)
 {
     ARMCPUClass *acc = ARM_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void arm_v7m_class_init(ObjectClass *oc, void *data)
 
     acc->info = data;
 #ifdef CONFIG_TCG
-    cc->tcg_ops.cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt;
-#ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_interrupt = arm_v7m_cpu_do_interrupt;
-#endif
+    cc->tcg_ops = &arm_v7m_tcg_ops;
 #endif /* CONFIG_TCG */
 
     cc->gdb_core_xml_file = "arm-m-profile.xml";
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_dump_state(CPUState *cs, FILE *f, int flags)
     qemu_fprintf(f, "\n");
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the AVR CPU class; avr_cpu_class_init() installs
+ * it as cc->tcg_ops.  do_interrupt is left unset when CONFIG_USER_ONLY
+ * is defined.
+ */
+static struct TCGCPUOps avr_tcg_ops = {
+    .initialize = avr_cpu_tcg_init,
+    .synchronize_from_tb = avr_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = avr_cpu_exec_interrupt,
+    .tlb_fill = avr_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = avr_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void avr_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = avr_cpu_class_by_name;
 
     cc->has_work = avr_cpu_has_work;
-    cc->tcg_ops.do_interrupt = avr_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = avr_cpu_exec_interrupt;
     cc->dump_state = avr_cpu_dump_state;
     cc->set_pc = avr_cpu_set_pc;
     cc->memory_rw_debug = avr_cpu_memory_rw_debug;
     cc->get_phys_page_debug = avr_cpu_get_phys_page_debug;
-    cc->tcg_ops.tlb_fill = avr_cpu_tlb_fill;
     cc->vmsd = &vms_avr_cpu;
     cc->disas_set_info = avr_cpu_disas_set_info;
-    cc->tcg_ops.initialize = avr_cpu_tcg_init;
-    cc->tcg_ops.synchronize_from_tb = avr_cpu_synchronize_from_tb;
     cc->gdb_read_register = avr_cpu_gdb_read_register;
     cc->gdb_write_register = avr_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 35;
     cc->gdb_core_xml_file = "avr-cpu.xml";
+    cc->tcg_ops = &avr_tcg_ops;
 }
 
 /*
diff --git a/target/avr/helper.c b/target/avr/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/helper.c
+++ b/target/avr/helper.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "exec/exec-all.h"
 #include "exec/address-spaces.h"
 #include "exec/helper-proto.h"
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     if (interrupt_request & CPU_INTERRUPT_RESET) {
         if (cpu_interrupts_enabled(env)) {
             cs->exception_index = EXCP_RESET;
-            cc->tcg_ops.do_interrupt(cs);
+            cc->tcg_ops->do_interrupt(cs);
 
             cs->interrupt_request &= ~CPU_INTERRUPT_RESET;
 
@@ -XXX,XX +XXX,XX @@ bool avr_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         if (cpu_interrupts_enabled(env) && env->intsrc != 0) {
             int index = ctz32(env->intsrc);
             cs->exception_index = EXCP_INT(index);
-            cc->tcg_ops.do_interrupt(cs);
+            cc->tcg_ops->do_interrupt(cs);
 
             env->intsrc &= env->intsrc - 1; /* clear the interrupt */
             cs->interrupt_request &= ~CPU_INTERRUPT_HARD;
diff --git a/target/cris/cpu.c b/target/cris/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/cpu.c
+++ b/target/cris/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_initfn(Object *obj)
 #endif
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table shared by the crisv8, crisv9, crisv10, crisv11 and
+ * crisv17 class_init functions, each of which installs it as
+ * cc->tcg_ops.  do_interrupt is left unset when CONFIG_USER_ONLY is
+ * defined.
+ */
+static struct TCGCPUOps crisv10_tcg_ops = {
+    .initialize = cris_initialize_crisv10_tcg,
+    .cpu_exec_interrupt = cris_cpu_exec_interrupt,
+    .tlb_fill = cris_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = crisv10_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+/*
+ * TCG hook table installed as cc->tcg_ops by crisv32_cpu_class_init().
+ * It differs from crisv10_tcg_ops in the initialize and do_interrupt
+ * hooks; do_interrupt is left unset when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps crisv32_tcg_ops = {
+    .initialize = cris_initialize_tcg,
+    .cpu_exec_interrupt = cris_cpu_exec_interrupt,
+    .tlb_fill = cris_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = cris_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void crisv8_cpu_class_init(ObjectClass *oc, void *data)
 {
     CPUClass *cc = CPU_CLASS(oc);
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 8;
-    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
-    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+    cc->tcg_ops = &crisv10_tcg_ops; /* v8 reuses the crisv10 hook table */
 }
 
 static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv9_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 9;
-    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
-    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+    cc->tcg_ops = &crisv10_tcg_ops;
 }
 
 static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv10_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 10;
-    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
-    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+    cc->tcg_ops = &crisv10_tcg_ops;
 }
 
 static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv11_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 11;
-    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
-    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+    cc->tcg_ops = &crisv10_tcg_ops;
 }
 
 static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void crisv17_cpu_class_init(ObjectClass *oc, void *data)
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 17;
-    cc->tcg_ops.do_interrupt = crisv10_cpu_do_interrupt;
     cc->gdb_read_register = crisv10_cpu_gdb_read_register;
-    cc->tcg_ops.initialize = cris_initialize_crisv10_tcg;
+    cc->tcg_ops = &crisv10_tcg_ops;
 }
 
 static void crisv32_cpu_class_init(ObjectClass *oc, void *data)
 {
+    CPUClass *cc = CPU_CLASS(oc);
     CRISCPUClass *ccc = CRIS_CPU_CLASS(oc);
 
     ccc->vr = 32;
+    cc->tcg_ops = &crisv32_tcg_ops; /* v32 uses its own hook table */
 }
 
 static void cris_cpu_class_init(ObjectClass *oc, void *data)
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = cris_cpu_class_by_name;
     cc->has_work = cris_cpu_has_work;
-    cc->tcg_ops.do_interrupt = cris_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = cris_cpu_exec_interrupt;
     cc->dump_state = cris_cpu_dump_state;
     cc->set_pc = cris_cpu_set_pc;
     cc->gdb_read_register = cris_cpu_gdb_read_register;
     cc->gdb_write_register = cris_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = cris_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = cris_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_cris_cpu;
@@ -XXX,XX +XXX,XX @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_stop_before_watchpoint = true;
 
     cc->disas_set_info = cris_disas_set_info;
-    cc->tcg_ops.initialize = cris_initialize_tcg;
 }
 
 #define DEFINE_CRIS_CPU_TYPE(cpu_model, initfn) \
diff --git a/target/cris/helper.c b/target/cris/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/helper.c
+++ b/target/cris/helper.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "mmu.h"
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         && (env->pregs[PR_CCS] & I_FLAG)
         && !env->locked_irq) {
         cs->exception_index = EXCP_IRQ;
-        cc->tcg_ops.do_interrupt(cs);
+        cc->tcg_ops->do_interrupt(cs);
         ret = true;
     }
     if (interrupt_request & CPU_INTERRUPT_NMI) {
@@ -XXX,XX +XXX,XX @@ bool cris_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
         }
         if ((env->pregs[PR_CCS] & m_flag_archval)) {
             cs->exception_index = EXCP_NMI;
-            cc->tcg_ops.do_interrupt(cs);
+            cc->tcg_ops->do_interrupt(cs);
             ret = true;
         }
     }
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *hppa_cpu_class_by_name(const char *cpu_model)
     return object_class_by_name(TYPE_HPPA_CPU);
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the HPPA CPU class; hppa_cpu_class_init()
+ * installs it as cc->tcg_ops.  The last two hooks are left unset
+ * when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps hppa_tcg_ops = {
+    .initialize = hppa_translate_init,
+    .synchronize_from_tb = hppa_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = hppa_cpu_exec_interrupt,
+    .tlb_fill = hppa_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = hppa_cpu_do_interrupt,
+    .do_unaligned_access = hppa_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void hppa_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = hppa_cpu_class_by_name;
     cc->has_work = hppa_cpu_has_work;
-    cc->tcg_ops.do_interrupt = hppa_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = hppa_cpu_exec_interrupt;
     cc->dump_state = hppa_cpu_dump_state;
     cc->set_pc = hppa_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = hppa_cpu_synchronize_from_tb;
     cc->gdb_read_register = hppa_cpu_gdb_read_register;
     cc->gdb_write_register = hppa_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = hppa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = hppa_cpu_get_phys_page_debug;
-    cc->tcg_ops.do_unaligned_access = hppa_cpu_do_unaligned_access;
     dc->vmsd = &vmstate_hppa_cpu;
 #endif
     cc->disas_set_info = hppa_cpu_disas_set_info;
-    cc->tcg_ops.initialize = hppa_translate_init;
-
     cc->gdb_num_core_regs = 128;
+    cc->tcg_ops = &hppa_tcg_ops;
 }
 
 static const TypeInfo hppa_cpu_type_info = {
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
     cpu->env.eip = tb->pc - tb->cs_base;
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for x86; tcg_cpu_common_class_init() installs it as
+ * cc->tcg_ops.  do_interrupt is set unconditionally here; only
+ * debug_excp_handler is left unset when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps x86_tcg_ops = {
+    .initialize = tcg_x86_init,
+    .synchronize_from_tb = x86_cpu_synchronize_from_tb,
+    .cpu_exec_enter = x86_cpu_exec_enter,
+    .cpu_exec_exit = x86_cpu_exec_exit,
+    .cpu_exec_interrupt = x86_cpu_exec_interrupt,
+    .do_interrupt = x86_cpu_do_interrupt,
+    .tlb_fill = x86_cpu_tlb_fill,
+#ifndef CONFIG_USER_ONLY
+    .debug_excp_handler = breakpoint_handler,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 void tcg_cpu_common_class_init(CPUClass *cc)
 {
-    cc->tcg_ops.do_interrupt = x86_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = x86_cpu_exec_interrupt;
-    cc->tcg_ops.synchronize_from_tb = x86_cpu_synchronize_from_tb;
-    cc->tcg_ops.cpu_exec_enter = x86_cpu_exec_enter;
-    cc->tcg_ops.cpu_exec_exit = x86_cpu_exec_exit;
-    cc->tcg_ops.initialize = tcg_x86_init;
-    cc->tcg_ops.tlb_fill = x86_cpu_tlb_fill;
-#ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.debug_excp_handler = breakpoint_handler;
-#endif
+    cc->tcg_ops = &x86_tcg_ops; /* all TCG hooks come from the static table */
 }
diff --git a/target/lm32/cpu.c b/target/lm32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/lm32/cpu.c
+++ b/target/lm32/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *lm32_cpu_class_by_name(const char *cpu_model)
     return oc;
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the LM32 CPU class; lm32_cpu_class_init()
+ * installs it as cc->tcg_ops.  do_interrupt is left unset when
+ * CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps lm32_tcg_ops = {
+    .initialize = lm32_translate_init,
+    .cpu_exec_interrupt = lm32_cpu_exec_interrupt,
+    .tlb_fill = lm32_cpu_tlb_fill,
+    .debug_excp_handler = lm32_debug_excp_handler,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = lm32_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void lm32_cpu_class_init(ObjectClass *oc, void *data)
 {
     LM32CPUClass *lcc = LM32_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void lm32_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = lm32_cpu_class_by_name;
     cc->has_work = lm32_cpu_has_work;
-    cc->tcg_ops.do_interrupt = lm32_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = lm32_cpu_exec_interrupt;
     cc->dump_state = lm32_cpu_dump_state;
     cc->set_pc = lm32_cpu_set_pc;
     cc->gdb_read_register = lm32_cpu_gdb_read_register;
     cc->gdb_write_register = lm32_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = lm32_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = lm32_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_lm32_cpu;
 #endif
     cc->gdb_num_core_regs = 32 + 7;
     cc->gdb_stop_before_watchpoint = true;
-    cc->tcg_ops.debug_excp_handler = lm32_debug_excp_handler;
     cc->disas_set_info = lm32_cpu_disas_set_info;
-    cc->tcg_ops.initialize = lm32_translate_init;
+    cc->tcg_ops = &lm32_tcg_ops;
 }
 
 #define DEFINE_LM32_CPU_TYPE(cpu_model, initfn) \
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_m68k_cpu = {
 };
 #endif
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the m68k CPU class; m68k_cpu_class_init()
+ * installs it as cc->tcg_ops.  The last two hooks are left unset
+ * when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps m68k_tcg_ops = {
+    .initialize = m68k_tcg_init,
+    .cpu_exec_interrupt = m68k_cpu_exec_interrupt,
+    .tlb_fill = m68k_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = m68k_cpu_do_interrupt,
+    .do_transaction_failed = m68k_cpu_transaction_failed,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void m68k_cpu_class_init(ObjectClass *c, void *data)
 {
     M68kCPUClass *mcc = M68K_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
 
     cc->class_by_name = m68k_cpu_class_by_name;
     cc->has_work = m68k_cpu_has_work;
-    cc->tcg_ops.do_interrupt = m68k_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = m68k_cpu_exec_interrupt;
     cc->dump_state = m68k_cpu_dump_state;
     cc->set_pc = m68k_cpu_set_pc;
     cc->gdb_read_register = m68k_cpu_gdb_read_register;
     cc->gdb_write_register = m68k_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = m68k_cpu_tlb_fill;
 #if defined(CONFIG_SOFTMMU)
-    cc->tcg_ops.do_transaction_failed = m68k_cpu_transaction_failed;
     cc->get_phys_page_debug = m68k_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_m68k_cpu;
 #endif
     cc->disas_set_info = m68k_cpu_disas_set_info;
-    cc->tcg_ops.initialize = m68k_tcg_init;
 
     cc->gdb_num_core_regs = 18;
+    cc->tcg_ops = &m68k_tcg_ops;
 }
 
 static void m68k_cpu_class_init_cf_core(ObjectClass *c, void *data)
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *mb_cpu_class_by_name(const char *cpu_model)
     return object_class_by_name(TYPE_MICROBLAZE_CPU);
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the MicroBlaze CPU class; mb_cpu_class_init()
+ * installs it as cc->tcg_ops.  The last three hooks are left unset
+ * when CONFIG_USER_ONLY is defined.
+ */
+static struct TCGCPUOps mb_tcg_ops = {
+    .initialize = mb_tcg_init,
+    .synchronize_from_tb = mb_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = mb_cpu_exec_interrupt,
+    .tlb_fill = mb_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = mb_cpu_do_interrupt,
+    .do_transaction_failed = mb_cpu_transaction_failed,
+    .do_unaligned_access = mb_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void mb_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = mb_cpu_class_by_name;
     cc->has_work = mb_cpu_has_work;
-    cc->tcg_ops.do_interrupt = mb_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = mb_cpu_exec_interrupt;
+
     cc->dump_state = mb_cpu_dump_state;
     cc->set_pc = mb_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = mb_cpu_synchronize_from_tb;
     cc->gdb_read_register = mb_cpu_gdb_read_register;
     cc->gdb_write_register = mb_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = mb_cpu_tlb_fill;
+
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_transaction_failed = mb_cpu_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = mb_cpu_do_unaligned_access;
     cc->get_phys_page_attrs_debug = mb_cpu_get_phys_page_attrs_debug;
     dc->vmsd = &vmstate_mb_cpu;
 #endif
@@ -XXX,XX +XXX,XX @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
     cc->gdb_num_core_regs = 32 + 27;
 
     cc->disas_set_info = mb_disas_set_info;
-    cc->tcg_ops.initialize = mb_tcg_init;
+    cc->tcg_ops = &mb_tcg_ops;
 }
 
 static const TypeInfo mb_cpu_type_info = {
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property mips_cpu_properties[] = {
     DEFINE_PROP_END_OF_LIST()
 };
 
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+/*
+ * TCG hook table for the MIPS CPU class; mips_cpu_class_init()
+ * installs it as cc->tcg_ops.
+ *
+ * NB: cannot be const, as some elements are changed for specific
+ * mips hardware (see hw/mips/jazz.c, where mips_jazz_init() replaces
+ * the do_transaction_failed hook through cc->tcg_ops).
+ */
+static struct TCGCPUOps mips_tcg_ops = {
+    .initialize = mips_tcg_init,
+    .synchronize_from_tb = mips_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = mips_cpu_exec_interrupt,
+    .tlb_fill = mips_cpu_tlb_fill,
+
+#if !defined(CONFIG_USER_ONLY)
+    .do_interrupt = mips_cpu_do_interrupt,
+    .do_transaction_failed = mips_cpu_do_transaction_failed,
+    .do_unaligned_access = mips_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+#endif /* CONFIG_TCG */
+
 static void mips_cpu_class_init(ObjectClass *c, void *data)
 {
     MIPSCPUClass *mcc = MIPS_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
     cc->vmsd = &vmstate_mips_cpu;
 #endif
     cc->disas_set_info = mips_cpu_disas_set_info;
-#ifdef CONFIG_TCG
-    cc->tcg_ops.initialize = mips_tcg_init;
-    cc->tcg_ops.do_interrupt = mips_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = mips_cpu_exec_interrupt;
-    cc->tcg_ops.synchronize_from_tb = mips_cpu_synchronize_from_tb;
-    cc->tcg_ops.tlb_fill = mips_cpu_tlb_fill;
-#ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_transaction_failed = mips_cpu_do_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = mips_cpu_do_unaligned_access;
-
-#endif /* CONFIG_USER_ONLY */
-#endif /* CONFIG_TCG */
-
     cc->gdb_num_core_regs = 73;
     cc->gdb_stop_before_watchpoint = true;
+#ifdef CONFIG_TCG
+    cc->tcg_ops = &mips_tcg_ops;
+#endif /* CONFIG_TCG */
 }
 
 static const TypeInfo mips_cpu_type_info = {
diff --git a/target/moxie/cpu.c b/target/moxie/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/moxie/cpu.c
+++ b/target/moxie/cpu.c
@@ -XXX,XX +XXX,XX @@ static ObjectClass *moxie_cpu_class_by_name(const char *cpu_model)
     return oc;
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for moxie; moxie_cpu_class_init() points cc->tcg_ops at
+ * this table.  do_interrupt is only set when CONFIG_USER_ONLY is not
+ * defined.
+ */
+static struct TCGCPUOps moxie_tcg_ops = {
+    .initialize = moxie_translate_init,
+    .tlb_fill = moxie_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = moxie_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void moxie_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void moxie_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = moxie_cpu_class_by_name;
 
     cc->has_work = moxie_cpu_has_work;
-    cc->tcg_ops.do_interrupt = moxie_cpu_do_interrupt;
     cc->dump_state = moxie_cpu_dump_state;
     cc->set_pc = moxie_cpu_set_pc;
-    cc->tcg_ops.tlb_fill = moxie_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = moxie_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_moxie_cpu;
 #endif
     cc->disas_set_info = moxie_cpu_disas_set_info;
-    cc->tcg_ops.initialize = moxie_translate_init;
+    cc->tcg_ops = &moxie_tcg_ops;
 }
 
 static void moxielite_initfn(Object *obj)
diff --git a/target/nios2/cpu.c b/target/nios2/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/cpu.c
+++ b/target/nios2/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property nios2_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for Nios II; nios2_cpu_class_init() points cc->tcg_ops
+ * at this table.  do_interrupt and do_unaligned_access are only set
+ * when CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps nios2_tcg_ops = {
+    .initialize = nios2_tcg_init,
+    .cpu_exec_interrupt = nios2_cpu_exec_interrupt,
+    .tlb_fill = nios2_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = nios2_cpu_do_interrupt,
+    .do_unaligned_access = nios2_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
 
 static void nios2_cpu_class_init(ObjectClass *oc, void *data)
 {
@@ -XXX,XX +XXX,XX @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = nios2_cpu_class_by_name;
     cc->has_work = nios2_cpu_has_work;
-    cc->tcg_ops.do_interrupt = nios2_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = nios2_cpu_exec_interrupt;
     cc->dump_state = nios2_cpu_dump_state;
     cc->set_pc = nios2_cpu_set_pc;
     cc->disas_set_info = nios2_cpu_disas_set_info;
-    cc->tcg_ops.tlb_fill = nios2_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_unaligned_access = nios2_cpu_do_unaligned_access;
     cc->get_phys_page_debug = nios2_cpu_get_phys_page_debug;
 #endif
     cc->gdb_read_register = nios2_cpu_gdb_read_register;
     cc->gdb_write_register = nios2_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 49;
-    cc->tcg_ops.initialize = nios2_tcg_init;
+    cc->tcg_ops = &nios2_tcg_ops;
 }
 
 static const TypeInfo nios2_cpu_type_info = {
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_any_initfn(Object *obj)
                       | (IMMUCFGR_NTS & (ctz32(TLB_SIZE) << 2));
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for OpenRISC; openrisc_cpu_class_init() points
+ * cc->tcg_ops at this table.  do_interrupt is only set when
+ * CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps openrisc_tcg_ops = {
+    .initialize = openrisc_translate_init,
+    .cpu_exec_interrupt = openrisc_cpu_exec_interrupt,
+    .tlb_fill = openrisc_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = openrisc_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
 {
     OpenRISCCPUClass *occ = OPENRISC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = openrisc_cpu_class_by_name;
     cc->has_work = openrisc_cpu_has_work;
-    cc->tcg_ops.do_interrupt = openrisc_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = openrisc_cpu_exec_interrupt;
     cc->dump_state = openrisc_cpu_dump_state;
     cc->set_pc = openrisc_cpu_set_pc;
     cc->gdb_read_register = openrisc_cpu_gdb_read_register;
     cc->gdb_write_register = openrisc_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = openrisc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
     cc->get_phys_page_debug = openrisc_cpu_get_phys_page_debug;
     dc->vmsd = &vmstate_openrisc_cpu;
 #endif
     cc->gdb_num_core_regs = 32 + 3;
-    cc->tcg_ops.initialize = openrisc_translate_init;
     cc->disas_set_info = openrisc_disas_set_info;
+    cc->tcg_ops = &openrisc_tcg_ops;
 }
 
 /* Sort alphabetically by type name, except for "any". */
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -XXX,XX +XXX,XX @@ static const char *riscv_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)
     return NULL;
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for RISC-V; riscv_cpu_class_init() points cc->tcg_ops
+ * at this table.  do_interrupt, do_transaction_failed and
+ * do_unaligned_access are only set when CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps riscv_tcg_ops = {
+    .initialize = riscv_translate_init,
+    .synchronize_from_tb = riscv_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = riscv_cpu_exec_interrupt,
+    .tlb_fill = riscv_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = riscv_cpu_do_interrupt,
+    .do_transaction_failed = riscv_cpu_do_transaction_failed,
+    .do_unaligned_access = riscv_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void riscv_cpu_class_init(ObjectClass *c, void *data)
 {
     RISCVCPUClass *mcc = RISCV_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
 
     cc->class_by_name = riscv_cpu_class_by_name;
     cc->has_work = riscv_cpu_has_work;
-    cc->tcg_ops.do_interrupt = riscv_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = riscv_cpu_exec_interrupt;
     cc->dump_state = riscv_cpu_dump_state;
     cc->set_pc = riscv_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = riscv_cpu_synchronize_from_tb;
     cc->gdb_read_register = riscv_cpu_gdb_read_register;
     cc->gdb_write_register = riscv_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 33;
@@ -XXX,XX +XXX,XX @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
     cc->gdb_stop_before_watchpoint = true;
     cc->disas_set_info = riscv_cpu_disas_set_info;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_transaction_failed = riscv_cpu_do_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = riscv_cpu_do_unaligned_access;
     cc->get_phys_page_debug = riscv_cpu_get_phys_page_debug;
     /* For now, mark unmigratable: */
     cc->vmsd = &vmstate_riscv_cpu;
 #endif
     cc->gdb_arch_name = riscv_gdb_arch_name;
     cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
-    cc->tcg_ops.initialize = riscv_translate_init;
-    cc->tcg_ops.tlb_fill = riscv_cpu_tlb_fill;
+    cc->tcg_ops = &riscv_tcg_ops;
 
     device_class_set_props(dc, riscv_cpu_properties);
 }
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_init(Object *obj)
     qdev_init_gpio_in(DEVICE(cpu), rx_cpu_set_irq, 2);
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for RX; rx_cpu_class_init() points cc->tcg_ops at this
+ * table.  do_interrupt is only set when CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps rx_tcg_ops = {
+    .initialize = rx_translate_init,
+    .synchronize_from_tb = rx_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = rx_cpu_exec_interrupt,
+    .tlb_fill = rx_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = rx_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void rx_cpu_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -XXX,XX +XXX,XX @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
 
     cc->class_by_name = rx_cpu_class_by_name;
     cc->has_work = rx_cpu_has_work;
-    cc->tcg_ops.do_interrupt = rx_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = rx_cpu_exec_interrupt;
     cc->dump_state = rx_cpu_dump_state;
     cc->set_pc = rx_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = rx_cpu_synchronize_from_tb;
+
     cc->gdb_read_register = rx_cpu_gdb_read_register;
     cc->gdb_write_register = rx_cpu_gdb_write_register;
     cc->get_phys_page_debug = rx_cpu_get_phys_page_debug;
     cc->disas_set_info = rx_cpu_disas_set_info;
-    cc->tcg_ops.initialize = rx_translate_init;
-    cc->tcg_ops.tlb_fill = rx_cpu_tlb_fill;
 
     cc->gdb_num_core_regs = 26;
     cc->gdb_core_xml_file = "rx-core.xml";
+    cc->tcg_ops = &rx_tcg_ops;
 }
 
 static const TypeInfo rx_cpu_info = {
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_reset_full(DeviceState *dev)
     return s390_cpu_reset(s, S390_CPU_RESET_CLEAR);
 }
 
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for s390x; s390_cpu_class_init() points cc->tcg_ops at
+ * this table when CONFIG_TCG is defined.  Only initialize and tlb_fill
+ * are set in every TCG build; cpu_exec_interrupt, do_interrupt,
+ * debug_excp_handler and do_unaligned_access are only set when
+ * CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps s390_tcg_ops = {
+    .initialize = s390x_translate_init,
+    .tlb_fill = s390_cpu_tlb_fill,
+
+#if !defined(CONFIG_USER_ONLY)
+    .cpu_exec_interrupt = s390_cpu_exec_interrupt,
+    .do_interrupt = s390_cpu_do_interrupt,
+    .debug_excp_handler = s390x_cpu_debug_excp_handler,
+    .do_unaligned_access = s390x_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+#endif /* CONFIG_TCG */
+
 static void s390_cpu_class_init(ObjectClass *oc, void *data)
 {
     S390CPUClass *scc = S390_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     scc->reset = s390_cpu_reset;
     cc->class_by_name = s390_cpu_class_by_name,
     cc->has_work = s390_cpu_has_work;
-#ifdef CONFIG_TCG
-    cc->tcg_ops.do_interrupt = s390_cpu_do_interrupt;
-#endif
     cc->dump_state = s390_cpu_dump_state;
     cc->set_pc = s390_cpu_set_pc;
     cc->gdb_read_register = s390_cpu_gdb_read_register;
@@ -XXX,XX +XXX,XX @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
     cc->vmsd = &vmstate_s390_cpu;
     cc->get_crash_info = s390_cpu_get_crash_info;
     cc->write_elf64_note = s390_cpu_write_elf64_note;
-#ifdef CONFIG_TCG
-    cc->tcg_ops.cpu_exec_interrupt = s390_cpu_exec_interrupt;
-    cc->tcg_ops.debug_excp_handler = s390x_cpu_debug_excp_handler;
-    cc->tcg_ops.do_unaligned_access = s390x_cpu_do_unaligned_access;
-#endif
 #endif
     cc->disas_set_info = s390_cpu_disas_set_info;
-#ifdef CONFIG_TCG
-    cc->tcg_ops.initialize = s390x_translate_init;
-    cc->tcg_ops.tlb_fill = s390_cpu_tlb_fill;
-#endif
-
     cc->gdb_num_core_regs = S390_NUM_CORE_REGS;
     cc->gdb_core_xml_file = "s390x-core64.xml";
     cc->gdb_arch_name = s390_gdb_arch_name;
 
     s390_cpu_model_class_register_props(oc);
+
+#ifdef CONFIG_TCG
+    cc->tcg_ops = &s390_tcg_ops;
+#endif /* CONFIG_TCG */
 }
 
 static const TypeInfo s390_cpu_type_info = {
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_sh_cpu = {
     .unmigratable = 1,
 };
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for SuperH; superh_cpu_class_init() points cc->tcg_ops
+ * at this table.  do_interrupt and do_unaligned_access are only set
+ * when CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps superh_tcg_ops = {
+    .initialize = sh4_translate_init,
+    .synchronize_from_tb = superh_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = superh_cpu_exec_interrupt,
+    .tlb_fill = superh_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = superh_cpu_do_interrupt,
+    .do_unaligned_access = superh_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void superh_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = superh_cpu_class_by_name;
     cc->has_work = superh_cpu_has_work;
-    cc->tcg_ops.do_interrupt = superh_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = superh_cpu_exec_interrupt;
     cc->dump_state = superh_cpu_dump_state;
     cc->set_pc = superh_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = superh_cpu_synchronize_from_tb;
     cc->gdb_read_register = superh_cpu_gdb_read_register;
     cc->gdb_write_register = superh_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = superh_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_unaligned_access = superh_cpu_do_unaligned_access;
     cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
 #endif
     cc->disas_set_info = superh_cpu_disas_set_info;
-    cc->tcg_ops.initialize = sh4_translate_init;
 
     cc->gdb_num_core_regs = 59;
 
     dc->vmsd = &vmstate_sh_cpu;
+    cc->tcg_ops = &superh_tcg_ops;
 }
 
 #define DEFINE_SUPERH_CPU_TYPE(type_name, cinit, initfn) \
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static Property sparc_cpu_properties[] = {
     DEFINE_PROP_END_OF_LIST()
 };
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for SPARC; sparc_cpu_class_init() points cc->tcg_ops at
+ * this table.  do_interrupt, do_transaction_failed and
+ * do_unaligned_access are only set when CONFIG_USER_ONLY is not defined.
+ *
+ * The table is deliberately not wrapped in CONFIG_TCG: class_init takes
+ * its address without such a guard, so guarding only the definition
+ * would leave the two out of step.
+ */
+static struct TCGCPUOps sparc_tcg_ops = {
+    .initialize = sparc_tcg_init,
+    .synchronize_from_tb = sparc_cpu_synchronize_from_tb,
+    .cpu_exec_interrupt = sparc_cpu_exec_interrupt,
+    .tlb_fill = sparc_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = sparc_cpu_do_interrupt,
+    .do_transaction_failed = sparc_cpu_do_transaction_failed,
+    .do_unaligned_access = sparc_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void sparc_cpu_class_init(ObjectClass *oc, void *data)
 {
     SPARCCPUClass *scc = SPARC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
     cc->class_by_name = sparc_cpu_class_by_name;
     cc->parse_features = sparc_cpu_parse_features;
     cc->has_work = sparc_cpu_has_work;
-    cc->tcg_ops.do_interrupt = sparc_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = sparc_cpu_exec_interrupt;
     cc->dump_state = sparc_cpu_dump_state;
 #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
     cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
 #endif
     cc->set_pc = sparc_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = sparc_cpu_synchronize_from_tb;
     cc->gdb_read_register = sparc_cpu_gdb_read_register;
     cc->gdb_write_register = sparc_cpu_gdb_write_register;
-    cc->tcg_ops.tlb_fill = sparc_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_transaction_failed = sparc_cpu_do_transaction_failed;
-    cc->tcg_ops.do_unaligned_access = sparc_cpu_do_unaligned_access;
     cc->get_phys_page_debug = sparc_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_sparc_cpu;
 #endif
     cc->disas_set_info = cpu_sparc_disas_set_info;
-    cc->tcg_ops.initialize = sparc_tcg_init;
 
 #if defined(TARGET_SPARC64) && !defined(TARGET_ABI32)
     cc->gdb_num_core_regs = 86;
 #else
     cc->gdb_num_core_regs = 72;
 #endif
+    cc->tcg_ops = &sparc_tcg_ops;
 }
 
 static const TypeInfo sparc_cpu_type_info = {
diff --git a/target/tilegx/cpu.c b/target/tilegx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tilegx/cpu.c
+++ b/target/tilegx/cpu.c
@@ -XXX,XX +XXX,XX @@ static bool tilegx_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
     return false;
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for TILE-Gx; tilegx_cpu_class_init() points cc->tcg_ops
+ * at this table.  do_interrupt is only set when CONFIG_USER_ONLY is not
+ * defined.
+ */
+static struct TCGCPUOps tilegx_tcg_ops = {
+    .initialize = tilegx_tcg_init,
+    .cpu_exec_interrupt = tilegx_cpu_exec_interrupt,
+    .tlb_fill = tilegx_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = tilegx_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void tilegx_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = tilegx_cpu_class_by_name;
     cc->has_work = tilegx_cpu_has_work;
-    cc->tcg_ops.do_interrupt = tilegx_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = tilegx_cpu_exec_interrupt;
     cc->dump_state = tilegx_cpu_dump_state;
     cc->set_pc = tilegx_cpu_set_pc;
-    cc->tcg_ops.tlb_fill = tilegx_cpu_tlb_fill;
     cc->gdb_num_core_regs = 0;
-    cc->tcg_ops.initialize = tilegx_tcg_init;
+    cc->tcg_ops = &tilegx_tcg_ops;
 }
 
 static const TypeInfo tilegx_cpu_type_info = {
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static void tc27x_initfn(Object *obj)
     set_feature(&cpu->env, TRICORE_FEATURE_161);
 }
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for TriCore; tricore_cpu_class_init() points
+ * cc->tcg_ops at this table.  Only initialize, synchronize_from_tb and
+ * tlb_fill are provided.
+ */
+static struct TCGCPUOps tricore_tcg_ops = {
+    .initialize = tricore_tcg_init,
+    .synchronize_from_tb = tricore_cpu_synchronize_from_tb,
+    .tlb_fill = tricore_cpu_tlb_fill,
+};
+
 static void tricore_cpu_class_init(ObjectClass *c, void *data)
 {
     TriCoreCPUClass *mcc = TRICORE_CPU_CLASS(c);
@@ -XXX,XX +XXX,XX @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
 
     cc->dump_state = tricore_cpu_dump_state;
     cc->set_pc = tricore_cpu_set_pc;
-    cc->tcg_ops.synchronize_from_tb = tricore_cpu_synchronize_from_tb;
     cc->get_phys_page_debug = tricore_cpu_get_phys_page_debug;
-    cc->tcg_ops.initialize = tricore_tcg_init;
-    cc->tcg_ops.tlb_fill = tricore_cpu_tlb_fill;
+    cc->tcg_ops = &tricore_tcg_ops;
 }
 
 #define DEFINE_TRICORE_CPU_TYPE(cpu_model, initfn) \
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/unicore32/cpu.c
+++ b/target/unicore32/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_uc32_cpu = {
     .unmigratable = 1,
 };
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for UniCore32; uc32_cpu_class_init() points cc->tcg_ops
+ * at this table.  do_interrupt is only set when CONFIG_USER_ONLY is not
+ * defined.
+ */
+static struct TCGCPUOps uc32_tcg_ops = {
+    .initialize = uc32_translate_init,
+    .cpu_exec_interrupt = uc32_cpu_exec_interrupt,
+    .tlb_fill = uc32_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = uc32_cpu_do_interrupt,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void uc32_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = uc32_cpu_class_by_name;
     cc->has_work = uc32_cpu_has_work;
-    cc->tcg_ops.do_interrupt = uc32_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = uc32_cpu_exec_interrupt;
     cc->dump_state = uc32_cpu_dump_state;
     cc->set_pc = uc32_cpu_set_pc;
-    cc->tcg_ops.tlb_fill = uc32_cpu_tlb_fill;
     cc->get_phys_page_debug = uc32_cpu_get_phys_page_debug;
-    cc->tcg_ops.initialize = uc32_translate_init;
     dc->vmsd = &vmstate_uc32_cpu;
+    cc->tcg_ops = &uc32_tcg_ops;
 }
 
 #define DEFINE_UNICORE32_CPU_TYPE(cpu_model, initfn) \
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription vmstate_xtensa_cpu = {
     .unmigratable = 1,
 };
 
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for Xtensa; xtensa_cpu_class_init() points cc->tcg_ops
+ * at this table.  debug_excp_handler is set in every build, while
+ * do_interrupt, do_transaction_failed and do_unaligned_access are only
+ * set when CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps xtensa_tcg_ops = {
+    .initialize = xtensa_translate_init,
+    .cpu_exec_interrupt = xtensa_cpu_exec_interrupt,
+    .tlb_fill = xtensa_cpu_tlb_fill,
+    .debug_excp_handler = xtensa_breakpoint_handler,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = xtensa_cpu_do_interrupt,
+    .do_transaction_failed = xtensa_cpu_do_transaction_failed,
+    .do_unaligned_access = xtensa_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+
 static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
 
     cc->class_by_name = xtensa_cpu_class_by_name;
     cc->has_work = xtensa_cpu_has_work;
-    cc->tcg_ops.do_interrupt = xtensa_cpu_do_interrupt;
-    cc->tcg_ops.cpu_exec_interrupt = xtensa_cpu_exec_interrupt;
     cc->dump_state = xtensa_cpu_dump_state;
     cc->set_pc = xtensa_cpu_set_pc;
     cc->gdb_read_register = xtensa_cpu_gdb_read_register;
     cc->gdb_write_register = xtensa_cpu_gdb_write_register;
     cc->gdb_stop_before_watchpoint = true;
-    cc->tcg_ops.tlb_fill = xtensa_cpu_tlb_fill;
 #ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.do_unaligned_access = xtensa_cpu_do_unaligned_access;
     cc->get_phys_page_debug = xtensa_cpu_get_phys_page_debug;
-    cc->tcg_ops.do_transaction_failed = xtensa_cpu_do_transaction_failed;
 #endif
-    cc->tcg_ops.debug_excp_handler = xtensa_breakpoint_handler;
     cc->disas_set_info = xtensa_cpu_disas_set_info;
-    cc->tcg_ops.initialize = xtensa_translate_init;
     dc->vmsd = &vmstate_xtensa_cpu;
+    cc->tcg_ops = &xtensa_tcg_ops;
 }
 
 static const TypeInfo xtensa_cpu_type_info = {
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -XXX,XX +XXX,XX @@ static Property ppc_cpu_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+#ifdef CONFIG_TCG
+#include "hw/core/tcg-cpu-ops.h"
+
+/*
+ * TCG callbacks for PowerPC; ppc_cpu_class_init() points cc->tcg_ops at
+ * this table when CONFIG_TCG is defined.  do_interrupt, cpu_exec_enter,
+ * cpu_exec_exit and do_unaligned_access are only set when
+ * CONFIG_USER_ONLY is not defined.
+ */
+static struct TCGCPUOps ppc_tcg_ops = {
+    .initialize = ppc_translate_init,
+    .cpu_exec_interrupt = ppc_cpu_exec_interrupt,
+    .tlb_fill = ppc_cpu_tlb_fill,
+
+#ifndef CONFIG_USER_ONLY
+    .do_interrupt = ppc_cpu_do_interrupt,
+    .cpu_exec_enter = ppc_cpu_exec_enter,
+    .cpu_exec_exit = ppc_cpu_exec_exit,
+    .do_unaligned_access = ppc_cpu_do_unaligned_access,
+#endif /* !CONFIG_USER_ONLY */
+};
+#endif /* CONFIG_TCG */
+
 static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 {
     PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
@@ -XXX,XX +XXX,XX @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
 #ifndef CONFIG_USER_ONLY
     cc->virtio_is_big_endian = ppc_cpu_is_big_endian;
 #endif
-#ifdef CONFIG_TCG
-    cc->tcg_ops.initialize = ppc_translate_init;
-    cc->tcg_ops.cpu_exec_interrupt = ppc_cpu_exec_interrupt;
-    cc->tcg_ops.do_interrupt = ppc_cpu_do_interrupt;
-    cc->tcg_ops.tlb_fill = ppc_cpu_tlb_fill;
-#ifndef CONFIG_USER_ONLY
-    cc->tcg_ops.cpu_exec_enter = ppc_cpu_exec_enter;
-    cc->tcg_ops.cpu_exec_exit = ppc_cpu_exec_exit;
-    cc->tcg_ops.do_unaligned_access = ppc_cpu_do_unaligned_access;
-#endif /* !CONFIG_USER_ONLY */
-#endif /* CONFIG_TCG */
-
     cc->disas_set_info = ppc_disas_set_info;
 
     dc->fw_name = "PowerPC,UNKNOWN";
+
+#ifdef CONFIG_TCG
+    cc->tcg_ops = &ppc_tcg_ops;
+#endif /* CONFIG_TCG */
 }
 
 static const TypeInfo ppc_cpu_type_info = {
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: include/exec/helper*.h
 F: include/exec/tb-hash.h
 F: include/sysemu/cpus.h
 F: include/sysemu/tcg.h
+F: include/hw/core/tcg-cpu-ops.h
 
 FPU emulation
 M: Aurelien Jarno <aurelien@aurel32.net>
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

[claudio: rebased on Richard's splitwx work]

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20210204163931.7358-17-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/boards.h                |  2 +-
 include/{sysemu => qemu}/accel.h   | 14 +++++----
 include/sysemu/hvf.h               |  2 +-
 include/sysemu/kvm.h               |  2 +-
 include/sysemu/kvm_int.h           |  2 +-
 target/i386/hvf/hvf-i386.h         |  2 +-
 accel/accel-common.c               | 50 ++++++++++++++++++++++++++++++
 accel/{accel.c => accel-softmmu.c} | 27 ++--------------
 accel/accel-user.c                 | 24 ++++++++++++++
 accel/qtest/qtest.c                |  2 +-
 accel/tcg/tcg-all.c                | 15 +++++++--
 accel/xen/xen-all.c                |  2 +-
 bsd-user/main.c                    |  6 +++-
 linux-user/main.c                  |  6 +++-
 softmmu/memory.c                   |  2 +-
 softmmu/qtest.c                    |  2 +-
 softmmu/vl.c                       |  2 +-
 target/i386/hax/hax-all.c          |  2 +-
 target/i386/hvf/hvf.c              |  2 +-
 target/i386/hvf/x86_task.c         |  2 +-
 target/i386/whpx/whpx-all.c        |  2 +-
 MAINTAINERS                        |  2 +-
 accel/meson.build                  |  4 ++-
 accel/tcg/meson.build              |  2 +-
 24 files changed, 125 insertions(+), 53 deletions(-)
 rename include/{sysemu => qemu}/accel.h (95%)
 create mode 100644 accel/accel-common.c
 rename accel/{accel.c => accel-softmmu.c} (75%)
 create mode 100644 accel/accel-user.c

diff --git a/include/hw/boards.h b/include/hw/boards.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -XXX,XX +XXX,XX @@
 #include "exec/memory.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/blockdev.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "qapi/qapi-types-machine.h"
 #include "qemu/module.h"
 #include "qom/object.h"
diff --git a/include/sysemu/accel.h b/include/qemu/accel.h
similarity index 95%
rename from include/sysemu/accel.h
rename to include/qemu/accel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/accel.h
+++ b/include/qemu/accel.h
@@ -XXX,XX +XXX,XX @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef HW_ACCEL_H
-#define HW_ACCEL_H
+#ifndef QEMU_ACCEL_H
+#define QEMU_ACCEL_H
 
 #include "qom/object.h"
 #include "exec/hwaddr.h"
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
     /*< public >*/
 
     const char *name;
-#ifndef CONFIG_USER_ONLY
     int (*init_machine)(MachineState *ms);
+#ifndef CONFIG_USER_ONLY
     void (*setup_post)(MachineState *ms, AccelState *accel);
     bool (*has_memory)(MachineState *ms, AddressSpace *as,
                        hwaddr start_addr, hwaddr size);
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
     OBJECT_GET_CLASS(AccelClass, (obj), TYPE_ACCEL)
 
 AccelClass *accel_find(const char *opt_name);
+AccelState *current_accel(void);
+
+#ifndef CONFIG_USER_ONLY
 int accel_init_machine(AccelState *accel, MachineState *ms);
 
 /* Called just before os_setup_post (ie just before drop OS privs) */
 void accel_setup_post(MachineState *ms);
+#endif /* !CONFIG_USER_ONLY */
 
-AccelState *current_accel(void);
-
-#endif
+#endif /* QEMU_ACCEL_H */
diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/hvf.h
+++ b/include/sysemu/hvf.h
@@ -XXX,XX +XXX,XX @@
 #ifndef HVF_H
 #define HVF_H
 
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "qom/object.h"
 
 #ifdef CONFIG_HVF
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "hw/core/cpu.h"
 #include "exec/memattrs.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "qom/object.h"
 
 #ifdef NEED_CPU_H
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_KVM_INT_H
 
 #include "exec/memory.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/kvm.h"
 
 typedef struct KVMSlot
diff --git a/target/i386/hvf/hvf-i386.h b/target/i386/hvf/hvf-i386.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/hvf-i386.h
+++ b/target/i386/hvf/hvf-i386.h
@@ -XXX,XX +XXX,XX @@
 #ifndef HVF_I386_H
 #define HVF_I386_H
 
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/hvf.h"
 #include "cpu.h"
 #include "x86.h"
diff --git a/accel/accel-common.c b/accel/accel-common.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/accel-common.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU accel class, components common to system emulation and user mode
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/accel.h"
+
+/*
+ * Type description for TYPE_ACCEL: a child of TYPE_OBJECT whose class
+ * and instance structures are AccelClass and AccelState.
+ */
+static const TypeInfo accel_type = {
+    .name = TYPE_ACCEL,
+    .parent = TYPE_OBJECT,
+    .class_size = sizeof(AccelClass),
+    .instance_size = sizeof(AccelState),
+};
+
+/*
+ * Lookup AccelClass from opt_name. Returns NULL if not found.
+ *
+ * The class name is built by formatting opt_name through
+ * ACCEL_CLASS_NAME("%s"); the temporary string is freed before returning.
+ */
+AccelClass *accel_find(const char *opt_name)
+{
+    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
+    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
+    g_free(class_name);
+    return ac;
+}
+
+/* Register accel_type; passed to type_init() just below. */
+static void register_accel_types(void)
+{
+    type_register_static(&accel_type);
+}
+
+type_init(register_accel_types);
diff --git a/accel/accel.c b/accel/accel-softmmu.c
similarity index 75%
rename from accel/accel.c
rename to accel/accel-softmmu.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/accel.c
+++ b/accel/accel-softmmu.c
@@ -XXX,XX +XXX,XX @@
 /*
- * QEMU System Emulator, accelerator interfaces
+ * QEMU accel class, system emulation components
  *
  * Copyright (c) 2003-2008 Fabrice Bellard
  * Copyright (c) 2014 Red Hat Inc.
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "hw/boards.h"
 #include "sysemu/arch_init.h"
 #include "sysemu/sysemu.h"
 #include "qom/object.h"
 
-static const TypeInfo accel_type = {
-    .name = TYPE_ACCEL,
-    .parent = TYPE_OBJECT,
-    .class_size = sizeof(AccelClass),
-    .instance_size = sizeof(AccelState),
-};
-
-/* Lookup AccelClass from opt_name. Returns NULL if not found */
-AccelClass *accel_find(const char *opt_name)
-{
-    char *class_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
-    AccelClass *ac = ACCEL_CLASS(object_class_by_name(class_name));
-    g_free(class_name);
-    return ac;
-}
-
 int accel_init_machine(AccelState *accel, MachineState *ms)
 {
     AccelClass *acc = ACCEL_GET_CLASS(accel);
@@ -XXX,XX +XXX,XX @@ void accel_setup_post(MachineState *ms)
         acc->setup_post(ms, accel);
     }
 }
-
-static void register_accel_types(void)
-{
-    type_register_static(&accel_type);
-}
-
-type_init(register_accel_types);
diff --git a/accel/accel-user.c b/accel/accel-user.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/accel-user.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU accel class, user-mode components
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/accel.h"
+
+/*
+ * Return the accelerator object used in user mode.
+ *
+ * The object is created on the first call from the class returned by
+ * accel_find("tcg") and cached in a function-local static, so later
+ * calls return the same pointer.  Asserts if no "tcg" accel class is
+ * found.
+ */
+AccelState *current_accel(void)
+{
+    static AccelState *accel;
+
+    if (!accel) {
+        AccelClass *ac = accel_find("tcg");
+
+        g_assert(ac != NULL);
+        accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));
+    }
+    return accel;
+}
diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/qtest/qtest.c
+++ b/accel/qtest/qtest.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/module.h"
 #include "qemu/option.h"
 #include "qemu/config-file.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/qtest.h"
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-timers.h"
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
-#include "hw/boards.h"
+#include "qemu/accel.h"
 #include "qapi/qapi-builtin-visit.h"
+
+#ifndef CONFIG_USER_ONLY
 #include "tcg-cpus.h"
+#endif /* CONFIG_USER_ONLY */
 
 struct TCGState {
     AccelState parent_obj;
@@ -XXX,XX +XXX,XX @@ static void tcg_accel_instance_init(Object *obj)
     s->mttcg_enabled = default_mttcg_enabled();
 
     /* If debugging enabled, default "auto on", otherwise off. */
-#ifdef CONFIG_DEBUG_TCG
+#if defined(CONFIG_DEBUG_TCG) && !defined(CONFIG_USER_ONLY)
     s->splitwx_enabled = -1;
 #else
     s->splitwx_enabled = 0;
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
     mttcg_enabled = s->mttcg_enabled;
 
     /*
-     * Initialize TCG regions
+     * Initialize TCG regions only for softmmu.
+     *
+     * This needs to be done later for user mode, because the prologue
+     * generation needs to be delayed so that GUEST_BASE is already set.
      */
+#ifndef CONFIG_USER_ONLY
     tcg_region_init();
 
     if (mttcg_enabled) {
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
     } else {
         cpus_register_accel(&tcg_cpus_rr);
     }
+#endif /* !CONFIG_USER_ONLY */
+
     return 0;
 }
 
diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen_pt.h"
 #include "chardev/char.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/cpus.h"
 #include "sysemu/xen.h"
 #include "sysemu/runstate.h"
diff --git a/bsd-user/main.c b/bsd-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/units.h"
+#include "qemu/accel.h"
 #include "sysemu/tcg.h"
 #include "qemu-version.h"
 #include <machine/trap.h>
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     }
 
     /* init tcg before creating CPUs and to get qemu_host_page_size */
-    tcg_exec_init(0, false);
+    {
+        AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 
+        ac->init_machine(NULL);
+    }
     cpu_type = parse_cpu_option(cpu_model);
     cpu = cpu_create(cpu_type);
     env = cpu->env_ptr;
diff --git a/linux-user/main.c b/linux-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/units.h"
+#include "qemu/accel.h"
 #include "sysemu/tcg.h"
 #include "qemu-version.h"
 #include <sys/syscall.h>
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     cpu_type = parse_cpu_option(cpu_model);
 
     /* init tcg before creating CPUs and to get qemu_host_page_size */
-    tcg_exec_init(0, false);
+    {
+        AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 
+        ac->init_machine(NULL);
+    }
     cpu = cpu_create(cpu_type);
     env = cpu->env_ptr;
     cpu_reset(cpu);
diff --git a/softmmu/memory.c b/softmmu/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/kvm.h"
 #include "sysemu/runstate.h"
 #include "sysemu/tcg.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "hw/boards.h"
 #include "migration/vmstate.h"
 
diff --git a/softmmu/qtest.c b/softmmu/qtest.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/qtest.c
+++ b/softmmu/qtest.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/ioport.h"
 #include "exec/memory.h"
 #include "hw/irq.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/cpu-timers.h"
 #include "qemu/config-file.h"
 #include "qemu/option.h"
diff --git a/softmmu/vl.c b/softmmu/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "hw/usb.h"
 #include "hw/isa/isa.h"
 #include "hw/scsi/scsi.h"
diff --git a/target/i386/hax/hax-all.c b/target/i386/hax/hax-all.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-all.c
+++ b/target/i386/hax/hax-all.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/address-spaces.h"
 
 #include "qemu-common.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/reset.h"
 #include "sysemu/runstate.h"
 #include "hw/boards.h"
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/address-spaces.h"
 #include "hw/i386/apic_internal.h"
 #include "qemu/main-loop.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "target/i386/cpu.h"
 
 #include "hvf-cpus.h"
diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/x86_task.c
+++ b/target/i386/hvf/x86_task.c
@@ -XXX,XX +XXX,XX @@
 
 #include "hw/i386/apic_internal.h"
 #include "qemu/main-loop.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "target/i386/cpu.h"
 
 // TODO: taskswitch handling
diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/whpx/whpx-all.c
+++ b/target/i386/whpx/whpx-all.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/address-spaces.h"
 #include "exec/ioport.h"
 #include "qemu-common.h"
-#include "sysemu/accel.h"
+#include "qemu/accel.h"
 #include "sysemu/whpx.h"
 #include "sysemu/cpus.h"
 #include "sysemu/runstate.h"
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ Overall
 M: Richard Henderson <richard.henderson@linaro.org>
 R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
-F: include/sysemu/accel.h
+F: include/qemu/accel.h
 F: accel/accel.c
 F: accel/Makefile.objs
 F: accel/stubs/Makefile.objs
diff --git a/accel/meson.build b/accel/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/accel/meson.build
+++ b/accel/meson.build
@@ -XXX,XX +XXX,XX @@
-softmmu_ss.add(files('accel.c'))
+specific_ss.add(files('accel-common.c'))
+softmmu_ss.add(files('accel-softmmu.c'))
+user_ss.add(files('accel-user.c'))
 
 subdir('qtest')
 subdir('kvm')
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@
 tcg_ss = ss.source_set()
 tcg_ss.add(files(
+  'tcg-all.c',
   'cpu-exec-common.c',
   'cpu-exec.c',
   'tcg-runtime-gvec.c',
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c'), libdl])
 specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 
 specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
-  'tcg-all.c',
   'cputlb.c',
   'tcg-cpus.c',
   'tcg-cpus-mttcg.c',
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Replace the CpusAccel struct with a QOM class, AccelOpsClass.

This will allow us to centralize the registration of
the cpus.c module accelerator operations (in accel/accel-softmmu.c),
and to trigger it automatically, using object hierarchy lookup, from the
new accel_init_interfaces() initialization step, depending only on
which accelerators are available in the code.

Rename all tcg-cpus.c, kvm-cpus.c, etc. to tcg-accel-ops.c,
kvm-accel-ops.c, etc., matching the object type names.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20210204163931.7358-18-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/accel-softmmu.h                         | 15 ++++++
 accel/kvm/kvm-cpus.h                          |  2 -
 ...g-cpus-icount.h => tcg-accel-ops-icount.h} |  2 +
 accel/tcg/tcg-accel-ops-mttcg.h               | 19 ++++++++
 .../tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} |  0
 accel/tcg/{tcg-cpus.h => tcg-accel-ops.h}     |  6 +--
 include/qemu/accel.h                          |  2 +
 include/sysemu/accel-ops.h                    | 45 ++++++++++++++++++
 include/sysemu/cpus.h                         | 26 ++--------
 .../i386/hax/{hax-cpus.h => hax-accel-ops.h}  |  2 -
 target/i386/hax/hax-windows.h                 |  2 +-
 .../i386/hvf/{hvf-cpus.h => hvf-accel-ops.h}  |  2 -
 .../whpx/{whpx-cpus.h => whpx-accel-ops.h}    |  2 -
 accel/accel-common.c                          | 11 +++++
 accel/accel-softmmu.c                         | 44 +++++++++++++++--
 accel/kvm/{kvm-cpus.c => kvm-accel-ops.c}     | 28 ++++++++---
 accel/kvm/kvm-all.c                           |  2 -
 accel/qtest/qtest.c                           | 23 ++++++---
 ...g-cpus-icount.c => tcg-accel-ops-icount.c} | 21 +++------
 ...tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} | 14 ++----
 .../tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} | 13 ++---
 accel/tcg/{tcg-cpus.c => tcg-accel-ops.c}     | 47 ++++++++++++++++++-
 accel/tcg/tcg-all.c                           | 12 -----
 accel/xen/xen-all.c                           | 24 ++++++----
 bsd-user/main.c                               |  3 +-
 linux-user/main.c                             |  1 +
 softmmu/cpus.c                                | 12 ++---
 softmmu/vl.c                                  |  7 ++-
 .../i386/hax/{hax-cpus.c => hax-accel-ops.c}  | 33 +++++++++----
 target/i386/hax/hax-all.c                     |  5 +-
 target/i386/hax/hax-mem.c                     |  2 +-
 target/i386/hax/hax-posix.c                   |  2 +-
 target/i386/hax/hax-windows.c                 |  2 +-
 .../i386/hvf/{hvf-cpus.c => hvf-accel-ops.c}  | 29 +++++++++---
 target/i386/hvf/hvf.c                         |  3 +-
 target/i386/hvf/x86hvf.c                      |  2 +-
 .../whpx/{whpx-cpus.c => whpx-accel-ops.c}    | 33 +++++++++----
 target/i386/whpx/whpx-all.c                   |  7 +--
 MAINTAINERS                                   |  3 +-
 accel/kvm/meson.build                         |  2 +-
 accel/tcg/meson.build                         |  8 ++--
 target/i386/hax/meson.build                   |  2 +-
 target/i386/hvf/meson.build                   |  2 +-
 target/i386/whpx/meson.build                  |  2 +-
 44 files changed, 361 insertions(+), 163 deletions(-)
 create mode 100644 accel/accel-softmmu.h
 rename accel/tcg/{tcg-cpus-icount.h => tcg-accel-ops-icount.h} (88%)
 create mode 100644 accel/tcg/tcg-accel-ops-mttcg.h
 rename accel/tcg/{tcg-cpus-rr.h => tcg-accel-ops-rr.h} (100%)
 rename accel/tcg/{tcg-cpus.h => tcg-accel-ops.h} (72%)
 create mode 100644 include/sysemu/accel-ops.h
 rename target/i386/hax/{hax-cpus.h => hax-accel-ops.h} (95%)
 rename target/i386/hvf/{hvf-cpus.h => hvf-accel-ops.h} (94%)
 rename target/i386/whpx/{whpx-cpus.h => whpx-accel-ops.h} (96%)
 rename accel/kvm/{kvm-cpus.c => kvm-accel-ops.c} (72%)
 rename accel/tcg/{tcg-cpus-icount.c => tcg-accel-ops-icount.c} (89%)
 rename accel/tcg/{tcg-cpus-mttcg.c => tcg-accel-ops-mttcg.c} (92%)
 rename accel/tcg/{tcg-cpus-rr.c => tcg-accel-ops-rr.c} (97%)
 rename accel/tcg/{tcg-cpus.c => tcg-accel-ops.c} (63%)
 rename target/i386/hax/{hax-cpus.c => hax-accel-ops.c} (69%)
 rename target/i386/hvf/{hvf-cpus.c => hvf-accel-ops.c} (84%)
 rename target/i386/whpx/{whpx-cpus.c => whpx-accel-ops.c} (71%)

diff --git a/accel/accel-softmmu.h b/accel/accel-softmmu.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/accel-softmmu.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU System Emulation accel internal functions
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef ACCEL_SOFTMMU_H
+#define ACCEL_SOFTMMU_H
+
+void accel_init_ops_interfaces(AccelClass *ac);
+
+#endif /* ACCEL_SOFTMMU_H */
diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/kvm-cpus.h
+++ b/accel/kvm/kvm-cpus.h
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel kvm_cpus;
-
 int kvm_init_vcpu(CPUState *cpu, Error **errp);
 int kvm_cpu_exec(CPUState *cpu);
 void kvm_destroy_vcpu(CPUState *cpu);
diff --git a/accel/tcg/tcg-cpus-icount.h b/accel/tcg/tcg-accel-ops-icount.h
similarity index 88%
rename from accel/tcg/tcg-cpus-icount.h
rename to accel/tcg/tcg-accel-ops-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.h
+++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -XXX,XX +XXX,XX @@ void icount_handle_deadline(void);
 void icount_prepare_for_run(CPUState *cpu);
 void icount_process_data(CPUState *cpu);
 
+void icount_handle_interrupt(CPUState *cpu, int mask);
+
 #endif /* TCG_CPUS_ICOUNT_H */
diff --git a/accel/tcg/tcg-accel-ops-mttcg.h b/accel/tcg/tcg-accel-ops-mttcg.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/accel/tcg/tcg-accel-ops-mttcg.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * QEMU TCG Multi Threaded vCPUs implementation
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef TCG_CPUS_MTTCG_H
+#define TCG_CPUS_MTTCG_H
+
+/* kick MTTCG vCPU thread */
+void mttcg_kick_vcpu_thread(CPUState *cpu);
+
+/* start an mttcg vCPU thread */
+void mttcg_start_vcpu_thread(CPUState *cpu);
+
+#endif /* TCG_CPUS_MTTCG_H */
diff --git a/accel/tcg/tcg-cpus-rr.h b/accel/tcg/tcg-accel-ops-rr.h
similarity index 100%
rename from accel/tcg/tcg-cpus-rr.h
rename to accel/tcg/tcg-accel-ops-rr.h
diff --git a/accel/tcg/tcg-cpus.h b/accel/tcg/tcg-accel-ops.h
similarity index 72%
rename from accel/tcg/tcg-cpus.h
rename to accel/tcg/tcg-accel-ops.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.h
+++ b/accel/tcg/tcg-accel-ops.h
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel tcg_cpus_mttcg;
-extern const CpusAccel tcg_cpus_icount;
-extern const CpusAccel tcg_cpus_rr;
-
 void tcg_cpus_destroy(CPUState *cpu);
 int tcg_cpus_exec(CPUState *cpu);
-void tcg_cpus_handle_interrupt(CPUState *cpu, int mask);
+void tcg_handle_interrupt(CPUState *cpu, int mask);
 
 #endif /* TCG_CPUS_H */
diff --git a/include/qemu/accel.h b/include/qemu/accel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/accel.h
+++ b/include/qemu/accel.h
@@ -XXX,XX +XXX,XX @@ typedef struct AccelClass {
 AccelClass *accel_find(const char *opt_name);
 AccelState *current_accel(void);
 
+void accel_init_interfaces(AccelClass *ac);
+
 #ifndef CONFIG_USER_ONLY
 int accel_init_machine(AccelState *accel, MachineState *ms);
 
diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/sysemu/accel-ops.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Accelerator OPS, used for cpus.c module
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef ACCEL_OPS_H
+#define ACCEL_OPS_H
+
+#include "qom/object.h"
+
+#define ACCEL_OPS_SUFFIX "-ops"
+#define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX
+#define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS)
+
+typedef struct AccelOpsClass AccelOpsClass;
+DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS)
+
+/*
+ * cpus.c operations interface.
+ *
+ * Accelerators register a subclass of TYPE_ACCEL_OPS (named with
+ * ACCEL_OPS_NAME()) and fill in these callbacks from their class_init
+ * and/or from the ops_init hook below.
+ */
+struct AccelOpsClass {
+    /*< private >*/
+    ObjectClass parent_class;
+    /*< public >*/
+
+    /* initialization function called when accel is chosen */
+    void (*ops_init)(AccelOpsClass *ops);
+
+    void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */
+    void (*kick_vcpu_thread)(CPUState *cpu);
+
+    void (*synchronize_post_reset)(CPUState *cpu);
+    void (*synchronize_post_init)(CPUState *cpu);
+    void (*synchronize_state)(CPUState *cpu);
+    void (*synchronize_pre_loadvm)(CPUState *cpu);
+
+    void (*handle_interrupt)(CPUState *cpu, int mask);
+
+    int64_t (*get_virtual_clock)(void);
+    int64_t (*get_elapsed_ticks)(void);
+};
+
+#endif /* ACCEL_OPS_H */
diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_CPUS_H
 
 #include "qemu/timer.h"
+#include "sysemu/accel-ops.h"
 
-/* cpus.c */
+/* register accel-specific operations */
+void cpus_register_accel(const AccelOpsClass *i);
 
-/* CPU execution threads */
+/* accel/dummy-cpus.c */
 
-typedef struct CpusAccel {
-    void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY */
-    void (*kick_vcpu_thread)(CPUState *cpu);
-
-    void (*synchronize_post_reset)(CPUState *cpu);
-    void (*synchronize_post_init)(CPUState *cpu);
-    void (*synchronize_state)(CPUState *cpu);
-    void (*synchronize_pre_loadvm)(CPUState *cpu);
-
-    void (*handle_interrupt)(CPUState *cpu, int mask);
-
-    int64_t (*get_virtual_clock)(void);
-    int64_t (*get_elapsed_ticks)(void);
-} CpusAccel;
-
-/* register accel-specific cpus interface implementation */
-void cpus_register_accel(const CpusAccel *i);
-
-/* Create a dummy vcpu for CpusAccel->create_vcpu_thread */
+/* Create a dummy vcpu for AccelOpsClass->create_vcpu_thread */
 void dummy_start_vcpu_thread(CPUState *);
 
 /* interface available for cpus accelerator threads */
diff --git a/target/i386/hax/hax-cpus.h b/target/i386/hax/hax-accel-ops.h
similarity index 95%
rename from target/i386/hax/hax-cpus.h
rename to target/i386/hax/hax-accel-ops.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-cpus.h
+++ b/target/i386/hax/hax-accel-ops.h
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel hax_cpus;
-
 #include "hax-interface.h"
 #include "hax-i386.h"
 
diff --git a/target/i386/hax/hax-windows.h b/target/i386/hax/hax-windows.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-windows.h
+++ b/target/i386/hax/hax-windows.h
@@ -XXX,XX +XXX,XX @@
 #include <winioctl.h>
 #include <windef.h>
 
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 
 #define HAX_INVALID_FD INVALID_HANDLE_VALUE
 
diff --git a/target/i386/hvf/hvf-cpus.h b/target/i386/hvf/hvf-accel-ops.h
similarity index 94%
rename from target/i386/hvf/hvf-cpus.h
rename to target/i386/hvf/hvf-accel-ops.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/hvf-cpus.h
+++ b/target/i386/hvf/hvf-accel-ops.h
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel hvf_cpus;
-
 int hvf_init_vcpu(CPUState *);
 int hvf_vcpu_exec(CPUState *);
 void hvf_cpu_synchronize_state(CPUState *);
diff --git a/target/i386/whpx/whpx-cpus.h b/target/i386/whpx/whpx-accel-ops.h
similarity index 96%
rename from target/i386/whpx/whpx-cpus.h
rename to target/i386/whpx/whpx-accel-ops.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/whpx/whpx-cpus.h
+++ b/target/i386/whpx/whpx-accel-ops.h
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/cpus.h"
 
-extern const CpusAccel whpx_cpus;
-
 int whpx_init_vcpu(CPUState *cpu);
 int whpx_vcpu_exec(CPUState *cpu);
 void whpx_destroy_vcpu(CPUState *cpu);
diff --git a/accel/accel-common.c b/accel/accel-common.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/accel.h"
 
+#ifndef CONFIG_USER_ONLY
+#include "accel-softmmu.h"
+#endif /* !CONFIG_USER_ONLY */
+
 static const TypeInfo accel_type = {
     .name = TYPE_ACCEL,
     .parent = TYPE_OBJECT,
@@ -XXX,XX +XXX,XX @@ AccelClass *accel_find(const char *opt_name)
     return ac;
 }
 
+/*
+ * Initialize the interfaces of the accelerator class @ac.
+ *
+ * Unless CONFIG_USER_ONLY is defined, this forwards to
+ * accel_init_ops_interfaces(); with CONFIG_USER_ONLY defined the
+ * function body is empty.
+ */
+void accel_init_interfaces(AccelClass *ac)
+{
+#ifndef CONFIG_USER_ONLY
+    accel_init_ops_interfaces(ac);
+#endif /* !CONFIG_USER_ONLY */
+}
+
 static void register_accel_types(void)
 {
     type_register_static(&accel_type);
diff --git a/accel/accel-softmmu.c b/accel/accel-softmmu.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/accel-softmmu.c
+++ b/accel/accel-softmmu.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/accel.h"
 #include "hw/boards.h"
-#include "sysemu/arch_init.h"
-#include "sysemu/sysemu.h"
-#include "qom/object.h"
+#include "sysemu/cpus.h"
+
+#include "accel-softmmu.h"
 
 int accel_init_machine(AccelState *accel, MachineState *ms)
 {
@@ -XXX,XX +XXX,XX @@ void accel_setup_post(MachineState *ms)
         acc->setup_post(ms, accel);
     }
 }
+
+/*
+ * Initialize the arch-independent accel operation interfaces.
+ *
+ * The ops class is found by name: the class name of @ac with
+ * ACCEL_OPS_SUFFIX appended. Its optional ops_init() hook is run first,
+ * then the ops class is handed to cpus_register_accel().
+ */
+void accel_init_ops_interfaces(AccelClass *ac)
+{
+    const char *ac_name;
+    char *ops_name;
+    AccelOpsClass *ops;
+
+    ac_name = object_class_get_name(OBJECT_CLASS(ac));
+    g_assert(ac_name != NULL);
+
+    /* build "<accel class name>" ACCEL_OPS_SUFFIX; freed right after lookup */
+    ops_name = g_strdup_printf("%s" ACCEL_OPS_SUFFIX, ac_name);
+    ops = ACCEL_OPS_CLASS(object_class_by_name(ops_name));
+    g_free(ops_name);
+
+    /*
+     * all accelerators need to define ops, providing at least a mandatory
+     * non-NULL create_vcpu_thread operation.
+     *
+     * Only the existence of the ops class is asserted here.
+     */
+    g_assert(ops != NULL);
+    /* ops_init runs before registration so it can still set callbacks */
+    if (ops->ops_init) {
+        ops->ops_init(ops);
+    }
+    cpus_register_accel(ops);
+}
+
+static const TypeInfo accel_ops_type_info = {
+    .name = TYPE_ACCEL_OPS,
+    .parent = TYPE_OBJECT,
+    .abstract = true,
+    .class_size = sizeof(AccelOpsClass),
+};
+
+static void accel_softmmu_register_types(void)
+{
+    type_register_static(&accel_ops_type_info);
+}
+type_init(accel_softmmu_register_types);
diff --git a/accel/kvm/kvm-cpus.c b/accel/kvm/kvm-accel-ops.c
similarity index 72%
rename from accel/kvm/kvm-cpus.c
rename to accel/kvm/kvm-accel-ops.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/kvm-cpus.c
+++ b/accel/kvm/kvm-accel-ops.c
@@ -XXX,XX +XXX,XX @@ static void kvm_start_vcpu_thread(CPUState *cpu)
                        cpu, QEMU_THREAD_JOINABLE);
 }
 
-const CpusAccel kvm_cpus = {
-    .create_vcpu_thread = kvm_start_vcpu_thread,
+static void kvm_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
-    .synchronize_post_reset = kvm_cpu_synchronize_post_reset,
-    .synchronize_post_init = kvm_cpu_synchronize_post_init,
-    .synchronize_state = kvm_cpu_synchronize_state,
-    .synchronize_pre_loadvm = kvm_cpu_synchronize_pre_loadvm,
+    ops->create_vcpu_thread = kvm_start_vcpu_thread;
+    ops->synchronize_post_reset = kvm_cpu_synchronize_post_reset;
+    ops->synchronize_post_init = kvm_cpu_synchronize_post_init;
+    ops->synchronize_state = kvm_cpu_synchronize_state;
+    ops->synchronize_pre_loadvm = kvm_cpu_synchronize_pre_loadvm;
+}
+
+static const TypeInfo kvm_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("kvm"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = kvm_accel_ops_class_init,
+    .abstract = true,
 };
+
+static void kvm_accel_ops_register_types(void)
+{
+    type_register_static(&kvm_accel_ops_type);
+}
+type_init(kvm_accel_ops_register_types);
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -XXX,XX +XXX,XX @@ static int kvm_init(MachineState *ms)
         ret = ram_block_discard_disable(true);
         assert(!ret);
     }
-
-    cpus_register_accel(&kvm_cpus);
     return 0;
 
 err:
diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/qtest/qtest.c
+++ b/accel/qtest/qtest.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "hw/core/cpu.h"
 
-const CpusAccel qtest_cpus = {
-    .create_vcpu_thread = dummy_start_vcpu_thread,
-    .get_virtual_clock = qtest_get_virtual_clock,
-};
-
 static int qtest_init_accel(MachineState *ms)
 {
-    cpus_register_accel(&qtest_cpus);
     return 0;
 }
 
@@ -XXX,XX +XXX,XX @@ static const TypeInfo qtest_accel_type = {
     .class_init = qtest_accel_class_init,
 };
 
+/*
+ * Class initializer for the qtest accel ops type: fills in the
+ * AccelOpsClass callbacks used when the qtest accelerator is selected.
+ */
+static void qtest_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
+
+    /* qtest uses the dummy vCPU thread and supplies its own virtual clock */
+    ops->create_vcpu_thread = dummy_start_vcpu_thread;
+    ops->get_virtual_clock = qtest_get_virtual_clock;
+}
+
+static const TypeInfo qtest_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("qtest"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = qtest_accel_ops_class_init,
+    .abstract = true,
+};
+
 static void qtest_type_init(void)
 {
     type_register_static(&qtest_accel_type);
+    type_register_static(&qtest_accel_ops_type);
 }
 
 type_init(qtest_type_init);
diff --git a/accel/tcg/tcg-cpus-icount.c b/accel/tcg/tcg-accel-ops-icount.c
similarity index 89%
rename from accel/tcg/tcg-cpus-icount.c
rename to accel/tcg/tcg-accel-ops-icount.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "hw/boards.h"
 
-#include "tcg-cpus.h"
-#include "tcg-cpus-icount.h"
-#include "tcg-cpus-rr.h"
+#include "tcg-accel-ops.h"
+#include "tcg-accel-ops-icount.h"
+#include "tcg-accel-ops-rr.h"
 
 static int64_t icount_get_limit(void)
 {
@@ -XXX,XX +XXX,XX @@ void icount_prepare_for_run(CPUState *cpu)
     /*
      * These should always be cleared by icount_process_data after
      * each vCPU execution. However u16.high can be raised
-     * asynchronously by cpu_exit/cpu_interrupt/tcg_cpus_handle_interrupt
+     * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
      */
     g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
     g_assert(cpu->icount_extra == 0);
@@ -XXX,XX +XXX,XX @@ void icount_process_data(CPUState *cpu)
     replay_mutex_unlock();
 }
 
-static void icount_handle_interrupt(CPUState *cpu, int mask)
+void icount_handle_interrupt(CPUState *cpu, int mask)
 {
     int old_mask = cpu->interrupt_request;
 
-    tcg_cpus_handle_interrupt(cpu, mask);
+    tcg_handle_interrupt(cpu, mask);
     if (qemu_cpu_is_self(cpu) &&
         !cpu->can_do_io
         && (mask & ~old_mask) != 0) {
         cpu_abort(cpu, "Raised interrupt while not in I/O function");
     }
 }
-
-const CpusAccel tcg_cpus_icount = {
-    .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = rr_kick_vcpu_thread,
-
-    .handle_interrupt = icount_handle_interrupt,
-    .get_virtual_clock = icount_get,
-    .get_elapsed_ticks = icount_get,
-};
diff --git a/accel/tcg/tcg-cpus-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
similarity index 92%
rename from accel/tcg/tcg-cpus-mttcg.c
rename to accel/tcg/tcg-accel-ops-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "hw/boards.h"
 
-#include "tcg-cpus.h"
+#include "tcg-accel-ops.h"
+#include "tcg-accel-ops-mttcg.h"
 
 /*
  * In the multi-threaded case each vCPU has its own thread. The TLS
@@ -XXX,XX +XXX,XX @@ static void *mttcg_cpu_thread_fn(void *arg)
     return NULL;
 }
 
-static void mttcg_kick_vcpu_thread(CPUState *cpu)
+void mttcg_kick_vcpu_thread(CPUState *cpu)
 {
     cpu_exit(cpu);
 }
 
-static void mttcg_start_vcpu_thread(CPUState *cpu)
+void mttcg_start_vcpu_thread(CPUState *cpu)
 {
     char thread_name[VCPU_THREAD_NAME_SIZE];
 
@@ -XXX,XX +XXX,XX @@ static void mttcg_start_vcpu_thread(CPUState *cpu)
     cpu->hThread = qemu_thread_get_handle(cpu->thread);
 #endif
 }
-
-const CpusAccel tcg_cpus_mttcg = {
-    .create_vcpu_thread = mttcg_start_vcpu_thread,
-    .kick_vcpu_thread = mttcg_kick_vcpu_thread,
-
-    .handle_interrupt = tcg_cpus_handle_interrupt,
-};
diff --git a/accel/tcg/tcg-cpus-rr.c b/accel/tcg/tcg-accel-ops-rr.c
similarity index 97%
rename from accel/tcg/tcg-cpus-rr.c
rename to accel/tcg/tcg-accel-ops-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "hw/boards.h"
 
-#include "tcg-cpus.h"
-#include "tcg-cpus-rr.h"
-#include "tcg-cpus-icount.h"
+#include "tcg-accel-ops.h"
+#include "tcg-accel-ops-rr.h"
+#include "tcg-accel-ops-icount.h"
 
 /* Kick all RR vCPUs */
 void rr_kick_vcpu_thread(CPUState *unused)
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
         cpu->created = true;
     }
 }
-
-const CpusAccel tcg_cpus_rr = {
-    .create_vcpu_thread = rr_start_vcpu_thread,
-    .kick_vcpu_thread = rr_kick_vcpu_thread,
-
-    .handle_interrupt = tcg_cpus_handle_interrupt,
-};
diff --git a/accel/tcg/tcg-cpus.c b/accel/tcg/tcg-accel-ops.c
similarity index 63%
rename from accel/tcg/tcg-cpus.c
rename to accel/tcg/tcg-accel-ops.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-cpus.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "hw/boards.h"
 
-#include "tcg-cpus.h"
+#include "tcg-accel-ops.h"
+#include "tcg-accel-ops-mttcg.h"
+#include "tcg-accel-ops-rr.h"
+#include "tcg-accel-ops-icount.h"
 
 /* common functionality among all TCG variants */
 
@@ -XXX,XX +XXX,XX @@ int tcg_cpus_exec(CPUState *cpu)
 }
 
 /* mask must never be zero, except for A20 change call */
-void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
+void tcg_handle_interrupt(CPUState *cpu, int mask)
 {
     g_assert(qemu_mutex_iothread_locked());
 
@@ -XXX,XX +XXX,XX @@ void tcg_cpus_handle_interrupt(CPUState *cpu, int mask)
         qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
     }
 }
+
+/*
+ * ops_init hook for TCG: choose the callbacks to install in @ops.
+ *
+ * MTTCG is checked first. Otherwise the rr_* thread functions are used,
+ * with the icount interrupt handler and icount_get() clock callbacks when
+ * icount_enabled() is true, and the plain tcg_handle_interrupt() when not.
+ */
+static void tcg_accel_ops_init(AccelOpsClass *ops)
+{
+    if (qemu_tcg_mttcg_enabled()) {
+        ops->create_vcpu_thread = mttcg_start_vcpu_thread;
+        ops->kick_vcpu_thread = mttcg_kick_vcpu_thread;
+        ops->handle_interrupt = tcg_handle_interrupt;
+    } else if (icount_enabled()) {
+        ops->create_vcpu_thread = rr_start_vcpu_thread;
+        ops->kick_vcpu_thread = rr_kick_vcpu_thread;
+        ops->handle_interrupt = icount_handle_interrupt;
+        /* both clock callbacks are served by icount_get() in this mode */
+        ops->get_virtual_clock = icount_get;
+        ops->get_elapsed_ticks = icount_get;
+    } else {
+        ops->create_vcpu_thread = rr_start_vcpu_thread;
+        ops->kick_vcpu_thread = rr_kick_vcpu_thread;
+        ops->handle_interrupt = tcg_handle_interrupt;
+    }
+}
+
+static void tcg_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
+
+    ops->ops_init = tcg_accel_ops_init;
+}
+
+static const TypeInfo tcg_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("tcg"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = tcg_accel_ops_class_init,
+    .abstract = true,
+};
+
+static void tcg_accel_ops_register_types(void)
+{
+    type_register_static(&tcg_accel_ops_type);
+}
+type_init(tcg_accel_ops_register_types);
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/accel.h"
 #include "qapi/qapi-builtin-visit.h"
 
-#ifndef CONFIG_USER_ONLY
-#include "tcg-cpus.h"
-#endif /* CONFIG_USER_ONLY */
-
 struct TCGState {
     AccelState parent_obj;
 
@@ -XXX,XX +XXX,XX @@ static int tcg_init(MachineState *ms)
      */
 #ifndef CONFIG_USER_ONLY
     tcg_region_init();
-
-    if (mttcg_enabled) {
-        cpus_register_accel(&tcg_cpus_mttcg);
-    } else if (icount_enabled()) {
-        cpus_register_accel(&tcg_cpus_icount);
-    } else {
-        cpus_register_accel(&tcg_cpus_rr);
-    }
 #endif /* !CONFIG_USER_ONLY */
 
     return 0;
diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -XXX,XX +XXX,XX @@ static void xen_setup_post(MachineState *ms, AccelState *accel)
     }
 }
 
-const CpusAccel xen_cpus = {
-    .create_vcpu_thread = dummy_start_vcpu_thread,
-};
-
 static int xen_init(MachineState *ms)
 {
     MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -XXX,XX +XXX,XX @@ static int xen_init(MachineState *ms)
      * opt out of system RAM being allocated by generic code
      */
     mc->default_ram_id = NULL;
-
-    cpus_register_accel(&xen_cpus);
-
     return 0;
 }
 
@@ -XXX,XX +XXX,XX @@ static const TypeInfo xen_accel_type = {
     .class_init = xen_accel_class_init,
 };
 
+static void xen_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
+
+    ops->create_vcpu_thread = dummy_start_vcpu_thread;
+}
+
+static const TypeInfo xen_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("xen"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = xen_accel_ops_class_init,
+    .abstract = true,
+};
+
 static void xen_type_init(void)
 {
     type_register_static(&xen_accel_type);
+    type_register_static(&xen_accel_ops_type);
 }
-
 type_init(xen_type_init);
diff --git a/bsd-user/main.c b/bsd-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 #endif
     }
 
+    cpu_type = parse_cpu_option(cpu_model);
     /* init tcg before creating CPUs and to get qemu_host_page_size */
     {
         AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 
         ac->init_machine(NULL);
+        accel_init_interfaces(ac);
     }
-    cpu_type = parse_cpu_option(cpu_model);
     cpu = cpu_create(cpu_type);
     env = cpu->env_ptr;
 #if defined(TARGET_SPARC) || defined(TARGET_PPC)
diff --git a/linux-user/main.c b/linux-user/main.c
index XXXXXXX..XXXXXXX 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
         AccelClass *ac = ACCEL_GET_CLASS(current_accel());
 
         ac->init_machine(NULL);
+        accel_init_interfaces(ac);
     }
     cpu = cpu_create(cpu_type);
     env = cpu->env_ptr;
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -XXX,XX +XXX,XX @@ void hw_error(const char *fmt, ...)
 /*
  * The chosen accelerator is supposed to register this.
  */
-static const CpusAccel *cpus_accel;
+static const AccelOpsClass *cpus_accel;
 
 void cpu_synchronize_all_states(void)
 {
@@ -XXX,XX +XXX,XX @@ void cpu_remove_sync(CPUState *cpu)
     qemu_mutex_lock_iothread();
 }
 
-void cpus_register_accel(const CpusAccel *ca)
+void cpus_register_accel(const AccelOpsClass *ops)
 {
-    assert(ca != NULL);
-    assert(ca->create_vcpu_thread != NULL); /* mandatory */
-    cpus_accel = ca;
+    assert(ops != NULL);
+    assert(ops->create_vcpu_thread != NULL); /* mandatory */
+    cpus_accel = ops;
 }
 
 void qemu_init_vcpu(CPUState *cpu)
@@ -XXX,XX +XXX,XX @@ void qemu_init_vcpu(CPUState *cpu)
         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
     }
 
-    /* accelerators all implement the CpusAccel interface */
+    /* accelerators all implement the AccelOpsClass */
     g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
     cpus_accel->create_vcpu_thread(cpu);
 
diff --git a/softmmu/vl.c b/softmmu/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -XXX,XX +XXX,XX @@ static bool object_create_early(const char *type, QemuOpts *opts)
         return false;
     }
 
-    /* Allocation of large amounts of memory may delay
+    /*
+     * Allocation of large amounts of memory may delay
      * chardev initialization for too long, and trigger timeouts
      * on software that waits for a monitor socket to be created
      * (e.g. libvirt).
@@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv, char **envp)
      *
      * Machine compat properties: object_set_machine_compat_props().
      * Accelerator compat props: object_set_accelerator_compat_props(),
-     * called from configure_accelerator().
+     * called from do_configure_accelerator().
      */
 
     machine_class = MACHINE_GET_CLASS(current_machine);
@@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv, char **envp)
     if (cpu_option) {
         current_machine->cpu_type = parse_cpu_option(cpu_option);
     }
+    /* NB: for machine none cpu_type could STILL be NULL here! */
+    accel_init_interfaces(ACCEL_GET_CLASS(current_machine->accelerator));
 
     qemu_resolve_machine_memdev();
     parse_numa_opts(current_machine);
diff --git a/target/i386/hax/hax-cpus.c b/target/i386/hax/hax-accel-ops.c
similarity index 69%
rename from target/i386/hax/hax-cpus.c
rename to target/i386/hax/hax-accel-ops.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-cpus.c
+++ b/target/i386/hax/hax-accel-ops.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/cpus.h"
 #include "qemu/guest-random.h"
 
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 
 static void *hax_cpu_thread_fn(void *arg)
 {
@@ -XXX,XX +XXX,XX @@ static void hax_start_vcpu_thread(CPUState *cpu)
 #endif
 }
 
-const CpusAccel hax_cpus = {
-    .create_vcpu_thread = hax_start_vcpu_thread,
-    .kick_vcpu_thread = hax_kick_vcpu_thread,
+static void hax_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
-    .synchronize_post_reset = hax_cpu_synchronize_post_reset,
-    .synchronize_post_init = hax_cpu_synchronize_post_init,
-    .synchronize_state = hax_cpu_synchronize_state,
-    .synchronize_pre_loadvm = hax_cpu_synchronize_pre_loadvm,
+    ops->create_vcpu_thread = hax_start_vcpu_thread;
+    ops->kick_vcpu_thread = hax_kick_vcpu_thread;
+
+    ops->synchronize_post_reset = hax_cpu_synchronize_post_reset;
+    ops->synchronize_post_init = hax_cpu_synchronize_post_init;
+    ops->synchronize_state = hax_cpu_synchronize_state;
+    ops->synchronize_pre_loadvm = hax_cpu_synchronize_pre_loadvm;
+}
+
+static const TypeInfo hax_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("hax"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = hax_accel_ops_class_init,
+    .abstract = true,
 };
+
+static void hax_accel_ops_register_types(void)
+{
+    type_register_static(&hax_accel_ops_type);
+}
+type_init(hax_accel_ops_register_types);
diff --git a/target/i386/hax/hax-all.c b/target/i386/hax/hax-all.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-all.c
+++ b/target/i386/hax/hax-all.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/runstate.h"
 #include "hw/boards.h"
 
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 
 #define DEBUG_HAX 0
 
@@ -XXX,XX +XXX,XX @@ static int hax_accel_init(MachineState *ms)
                 !ret ? "working" : "not working",
                 !ret ? "fast virt" : "emulation");
     }
-    if (ret == 0) {
-        cpus_register_accel(&hax_cpus);
-    }
     return ret;
 }
 
diff --git a/target/i386/hax/hax-mem.c b/target/i386/hax/hax-mem.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-mem.c
+++ b/target/i386/hax/hax-mem.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/address-spaces.h"
 #include "qemu/error-report.h"
 
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 #include "qemu/queue.h"
 
 #define DEBUG_HAX_MEM 0
diff --git a/target/i386/hax/hax-posix.c b/target/i386/hax/hax-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-posix.c
+++ b/target/i386/hax/hax-posix.c
@@ -XXX,XX +XXX,XX @@
 #include <sys/ioctl.h>
 
 #include "sysemu/cpus.h"
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 
 hax_fd hax_mod_open(void)
 {
diff --git a/target/i386/hax/hax-windows.c b/target/i386/hax/hax-windows.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/hax-windows.c
+++ b/target/i386/hax/hax-windows.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "hax-cpus.h"
+#include "hax-accel-ops.h"
 
 /*
  * return 0 when success, -1 when driver not loaded,
diff --git a/target/i386/hvf/hvf-cpus.c b/target/i386/hvf/hvf-accel-ops.c
similarity index 84%
rename from target/i386/hvf/hvf-cpus.c
rename to target/i386/hvf/hvf-accel-ops.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/hvf-cpus.c
+++ b/target/i386/hvf/hvf-accel-ops.c
@@ -XXX,XX +XXX,XX @@
 #include "target/i386/cpu.h"
 #include "qemu/guest-random.h"
 
-#include "hvf-cpus.h"
+#include "hvf-accel-ops.h"
 
 /*
  * The HVF-specific vCPU thread function. This one should only run when the host
@@ -XXX,XX +XXX,XX @@ static void hvf_start_vcpu_thread(CPUState *cpu)
                        cpu, QEMU_THREAD_JOINABLE);
 }
 
-const CpusAccel hvf_cpus = {
-    .create_vcpu_thread = hvf_start_vcpu_thread,
+static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
-    .synchronize_post_reset = hvf_cpu_synchronize_post_reset,
-    .synchronize_post_init = hvf_cpu_synchronize_post_init,
-    .synchronize_state = hvf_cpu_synchronize_state,
-    .synchronize_pre_loadvm = hvf_cpu_synchronize_pre_loadvm,
+    ops->create_vcpu_thread = hvf_start_vcpu_thread;
+
+    ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
+    ops->synchronize_post_init = hvf_cpu_synchronize_post_init;
+    ops->synchronize_state = hvf_cpu_synchronize_state;
+    ops->synchronize_pre_loadvm = hvf_cpu_synchronize_pre_loadvm;
 };
+static const TypeInfo hvf_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("hvf"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = hvf_accel_ops_class_init,
+    .abstract = true,
+};
+static void hvf_accel_ops_register_types(void)
+{
+    type_register_static(&hvf_accel_ops_type);
+}
+type_init(hvf_accel_ops_register_types);
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/accel.h"
 #include "target/i386/cpu.h"
 
-#include "hvf-cpus.h"
+#include "hvf-accel-ops.h"
 
 HVFState *hvf_state;
 
@@ -XXX,XX +XXX,XX @@ static int hvf_accel_init(MachineState *ms)
   
     hvf_state = s;
     memory_listener_register(&hvf_memory_listener, &address_space_memory);
-    cpus_register_accel(&hvf_cpus);
     return 0;
 }
 
diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/x86hvf.c
+++ b/target/i386/hvf/x86hvf.c
@@ -XXX,XX +XXX,XX @@
 #include <Hypervisor/hv.h>
 #include <Hypervisor/hv_vmx.h>
 
-#include "hvf-cpus.h"
+#include "hvf-accel-ops.h"
 
 void hvf_set_segment(struct CPUState *cpu, struct vmx_segment *vmx_seg,
                      SegmentCache *qseg, bool is_tr)
diff --git a/target/i386/whpx/whpx-cpus.c b/target/i386/whpx/whpx-accel-ops.c
similarity index 71%
rename from target/i386/whpx/whpx-cpus.c
rename to target/i386/whpx/whpx-accel-ops.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/whpx/whpx-cpus.c
+++ b/target/i386/whpx/whpx-accel-ops.c
@@ -XXX,XX +XXX,XX @@
 
 #include "sysemu/whpx.h"
 #include "whpx-internal.h"
-#include "whpx-cpus.h"
+#include "whpx-accel-ops.h"
 
 static void *whpx_cpu_thread_fn(void *arg)
 {
@@ -XXX,XX +XXX,XX @@ static void whpx_kick_vcpu_thread(CPUState *cpu)
     }
 }
 
-const CpusAccel whpx_cpus = {
-    .create_vcpu_thread = whpx_start_vcpu_thread,
-    .kick_vcpu_thread = whpx_kick_vcpu_thread,
+static void whpx_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
 
-    .synchronize_post_reset = whpx_cpu_synchronize_post_reset,
-    .synchronize_post_init = whpx_cpu_synchronize_post_init,
-    .synchronize_state = whpx_cpu_synchronize_state,
-    .synchronize_pre_loadvm = whpx_cpu_synchronize_pre_loadvm,
+    ops->create_vcpu_thread = whpx_start_vcpu_thread;
+    ops->kick_vcpu_thread = whpx_kick_vcpu_thread;
+
+    ops->synchronize_post_reset = whpx_cpu_synchronize_post_reset;
+    ops->synchronize_post_init = whpx_cpu_synchronize_post_init;
+    ops->synchronize_state = whpx_cpu_synchronize_state;
+    ops->synchronize_pre_loadvm = whpx_cpu_synchronize_pre_loadvm;
+}
+
+static const TypeInfo whpx_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("whpx"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = whpx_accel_ops_class_init,
+    .abstract = true,
 };
+
+static void whpx_accel_ops_register_types(void)
+{
+    type_register_static(&whpx_accel_ops_type);
+}
+type_init(whpx_accel_ops_register_types);
diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/whpx/whpx-all.c
+++ b/target/i386/whpx/whpx-all.c
@@ -XXX,XX +XXX,XX @@
 #include "migration/blocker.h"
 #include <winerror.h>
 
-#include "whpx-cpus.h"
 #include "whpx-internal.h"
+#include "whpx-accel-ops.h"
+
+#include <WinHvPlatform.h>
+#include <WinHvEmulation.h>
 
 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
 
@@ -XXX,XX +XXX,XX @@ static int whpx_accel_init(MachineState *ms)
 
     whpx_memory_init();
 
-    cpus_register_accel(&whpx_cpus);
-
     printf("Windows Hypervisor Platform accelerator is operational\n");
     return 0;
 
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ M: Richard Henderson <richard.henderson@linaro.org>
 R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: include/qemu/accel.h
-F: accel/accel.c
+F: include/sysemu/accel-ops.h
+F: accel/accel-*.c
 F: accel/Makefile.objs
 F: accel/stubs/Makefile.objs
 
diff --git a/accel/kvm/meson.build b/accel/kvm/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/accel/kvm/meson.build
+++ b/accel/kvm/meson.build
@@ -XXX,XX +XXX,XX @@
 kvm_ss = ss.source_set()
 kvm_ss.add(files(
   'kvm-all.c',
-  'kvm-cpus.c',
+  'kvm-accel-ops.c',
 ))
 kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c'))
 
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -XXX,XX +XXX,XX @@ specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 
 specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
   'cputlb.c',
-  'tcg-cpus.c',
-  'tcg-cpus-mttcg.c',
-  'tcg-cpus-icount.c',
-  'tcg-cpus-rr.c'
+  'tcg-accel-ops.c',
+  'tcg-accel-ops-mttcg.c',
+  'tcg-accel-ops-icount.c',
+  'tcg-accel-ops-rr.c'
 ))
diff --git a/target/i386/hax/meson.build b/target/i386/hax/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hax/meson.build
+++ b/target/i386/hax/meson.build
@@ -XXX,XX +XXX,XX @@
 i386_softmmu_ss.add(when: 'CONFIG_HAX', if_true: files(
   'hax-all.c',
   'hax-mem.c',
-  'hax-cpus.c',
+  'hax-accel-ops.c',
 ))
 i386_softmmu_ss.add(when: ['CONFIG_HAX', 'CONFIG_POSIX'], if_true: files('hax-posix.c'))
 i386_softmmu_ss.add(when: ['CONFIG_HAX', 'CONFIG_WIN32'], if_true: files('hax-windows.c'))
diff --git a/target/i386/hvf/meson.build b/target/i386/hvf/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/hvf/meson.build
+++ b/target/i386/hvf/meson.build
@@ -XXX,XX +XXX,XX @@
 i386_softmmu_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files(
   'hvf.c',
-  'hvf-cpus.c',
+  'hvf-accel-ops.c',
   'x86.c',
   'x86_cpuid.c',
   'x86_decode.c',
diff --git a/target/i386/whpx/meson.build b/target/i386/whpx/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/whpx/meson.build
+++ b/target/i386/whpx/meson.build
@@ -XXX,XX +XXX,XX @@
 i386_softmmu_ss.add(when: 'CONFIG_WHPX', if_true: files(
   'whpx-all.c',
   'whpx-apic.c',
-  'whpx-cpus.c',
+  'whpx-accel-ops.c',
 ))
-- 
2.25.1

From: Claudio Fontana <cfontana@suse.de>

Add a new optional interface to CPUClass, which allows accelerators
to extend the CPUClass with additional accelerator-specific
initializations.

This will allow us to separate the target cpu code that is specific
to each accelerator, and to register it automatically via object
hierarchy lookup, depending on accelerator code availability,
as part of the accel_init_interfaces() initialization step.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
Message-Id: <20210204163931.7358-19-cfontana@suse.de>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/accel-cpu.h | 38 ++++++++++++++++++++++++++++++++
 include/hw/core/cpu.h       |  4 ++++
 accel/accel-common.c        | 44 +++++++++++++++++++++++++++++++++++++
 MAINTAINERS                 |  1 +
 4 files changed, 87 insertions(+)
 create mode 100644 include/hw/core/accel-cpu.h

diff --git a/include/hw/core/accel-cpu.h b/include/hw/core/accel-cpu.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/core/accel-cpu.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Accelerator interface, specializes CPUClass
+ * This header is used only by target-specific code.
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef ACCEL_CPU_H
+#define ACCEL_CPU_H
+
+/*
+ * This header is used to define new accelerator-specific target-specific
+ * accelerator cpu subclasses.
+ * It uses CPU_RESOLVING_TYPE, so this is clearly target-specific.
+ *
+ * Do not try to use for any other purpose than the implementation of new
+ * subclasses in target/, or the accel implementation itself in accel/
+ */
+
+#define TYPE_ACCEL_CPU "accel-" CPU_RESOLVING_TYPE
+#define ACCEL_CPU_NAME(name) (name "-" TYPE_ACCEL_CPU)
+typedef struct AccelCPUClass AccelCPUClass;
+DECLARE_CLASS_CHECKERS(AccelCPUClass, ACCEL_CPU, TYPE_ACCEL_CPU)
+
+typedef struct AccelCPUClass {
+    /*< private >*/
+    ObjectClass parent_class;
+    /*< public >*/
+
+    void (*cpu_class_init)(CPUClass *cc);
+    void (*cpu_instance_init)(CPUState *cpu);
+    void (*cpu_realizefn)(CPUState *cpu, Error **errp);
+} AccelCPUClass;
+
+#endif /* ACCEL_CPU_H */
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock;
 /* see tcg-cpu-ops.h */
 struct TCGCPUOps;
 
+/* see accel-cpu.h */
+struct AccelCPUClass;
+
 /**
  * CPUClass:
  * @class_by_name: Callback to map -cpu command line model name to an
@@ -XXX,XX +XXX,XX @@ struct CPUClass {
     /* Keep non-pointer data at the end to minimize holes.  */
     int gdb_num_core_regs;
     bool gdb_stop_before_watchpoint;
+    struct AccelCPUClass *accel_cpu;
 
     /* when TCG is not available, this pointer is NULL */
     struct TCGCPUOps *tcg_ops;
diff --git a/accel/accel-common.c b/accel/accel-common.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/accel.h"
 
+#include "cpu.h"
+#include "hw/core/accel-cpu.h"
+
 #ifndef CONFIG_USER_ONLY
 #include "accel-softmmu.h"
 #endif /* !CONFIG_USER_ONLY */
@@ -XXX,XX +XXX,XX @@ AccelClass *accel_find(const char *opt_name)
     return ac;
 }
 
+static void accel_init_cpu_int_aux(ObjectClass *klass, void *opaque)
+{
+    CPUClass *cc = CPU_CLASS(klass);
+    AccelCPUClass *accel_cpu = opaque;
+
+    cc->accel_cpu = accel_cpu;
+    if (accel_cpu->cpu_class_init) {
+        accel_cpu->cpu_class_init(cc);
+    }
+}
+
+/* initialize the arch-specific accel CPUClass interfaces */
+static void accel_init_cpu_interfaces(AccelClass *ac)
+{
+    const char *ac_name; /* AccelClass name */
+    char *acc_name;      /* AccelCPUClass name */
+    ObjectClass *acc;    /* AccelCPUClass */
+
+    ac_name = object_class_get_name(OBJECT_CLASS(ac));
+    g_assert(ac_name != NULL);
+
+    acc_name = g_strdup_printf("%s-%s", ac_name, CPU_RESOLVING_TYPE);
+    acc = object_class_by_name(acc_name);
+    g_free(acc_name);
+
+    if (acc) {
+        object_class_foreach(accel_init_cpu_int_aux,
+                             CPU_RESOLVING_TYPE, false, acc);
+    }
+}
+
 void accel_init_interfaces(AccelClass *ac)
 {
 #ifndef CONFIG_USER_ONLY
     accel_init_ops_interfaces(ac);
 #endif /* !CONFIG_USER_ONLY */
+
+    accel_init_cpu_interfaces(ac);
 }
 
+static const TypeInfo accel_cpu_type = {
+    .name = TYPE_ACCEL_CPU,
+    .parent = TYPE_OBJECT,
+    .abstract = true,
+    .class_size = sizeof(AccelCPUClass),
+};
+
 static void register_accel_types(void)
 {
     type_register_static(&accel_type);
+    type_register_static(&accel_cpu_type);
 }
 
 type_init(register_accel_types);
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: include/qemu/accel.h
 F: include/sysemu/accel-ops.h
+F: include/hw/core/accel-cpu.h
 F: accel/accel-*.c
 F: accel/Makefile.objs
 F: accel/stubs/Makefile.objs
-- 
2.25.1

The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Add not and xor operations on 128-bit integers (int128_not, int128_xor).

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd  ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *  the SoftFloat-2a license
  *  the BSD license
- *  GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu.  */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR.  */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu.  */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

These will be used to implement new decimal floating point
instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-         } else {
-            (*plow)++;
-         }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd  ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
 tests/unit/meson.build   |   1 +
 2 files changed, 198 insertions(+)
 create mode 100644 tests/unit/test-div128.c

diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Test 128-bit division functions
+ *
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+typedef struct {
+    uint64_t high;
+    uint64_t low;
+    uint64_t rhigh;
+    uint64_t rlow;
+    uint64_t divisor;
+    uint64_t remainder;
+} test_data_unsigned;
+
+typedef struct {
+    int64_t high;
+    uint64_t low;
+    int64_t rhigh;
+    uint64_t rlow;
+    int64_t divisor;
+    int64_t remainder;
+} test_data_signed;
+
+static const test_data_unsigned test_table_unsigned[] = {
+    /* Dividend fits in 64 bits */
+    { 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000003ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000002ULL, 0x0000000000000001ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0xa000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000002ULL,
+      0x4000000000000000ULL, 0x2000000000000000ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x8000000000000000ULL, 0x0000000000000000ULL},
+
+    /* Dividend > 64 bits, with MSB 0 */
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x000000000000000dULL,
+      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
+      0x0000000000000010ULL, 0x0000000000000001ULL},
+
+    /* Dividend > 64 bits, with MSB 1 */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
+      0x0000000000000010ULL, 0x000000000000000fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
+      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
+
+    /**
+     * Divisor == 64 bits, with MSB 1
+     * and high 64 bits of dividend >= divisor
+     * (for testing normalization)
+     */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0xfddbb9977553310aULL,
+      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
+
+    /* Dividend > 64 bits, divisor almost as big */
+    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
+      0x0000000000000000ULL, 0x000000000000000fULL,
+      0x123456789abcdefeULL, 0x123456789abcde1fULL},
+};
+
+static const test_data_signed test_table_signed[] = {
+    /* Positive dividend, positive/negative divisors */
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0x0000000000000008LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0xfffffffffffffff8LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0x0000000000000237LL, 0x0000000000000183LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0xfffffffffffffdc9LL, 0x0000000000000183LL},
+
+    /* Negative dividend, positive/negative divisors */
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0x0000000000000008LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0x0000000000000237LL, 0xfffffffffffffe7dLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
+};
+
+static void test_divu128(void)
+{
+    int i;
+    uint64_t rem;
+    test_data_unsigned tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
+        tmp = test_table_unsigned[i];
+
+        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+static void test_divs128(void)
+{
+    int i;
+    int64_t rem;
+    test_data_signed tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
+        tmp = test_table_signed[i];
+
+        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/host-utils/test_divu128", test_divu128);
+    g_test_add_func("/host-utils/test_divs128", test_divs128);
+    return g_test_run();
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
   # all code tested by test-x86-cpuid is inside topology.h
   'test-x86-cpuid': [],
   'test-cutils': [],
+  'test-div128': [],
   'test-shift128': [],
   'test-mul64': [],
   # all code tested by test-int128 is inside int128.h
-- 
2.25.1

Prepare for tracking different masks by renaming this one.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     uint64_t val;
-    uint64_t mask;
+    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->next_copy = ts;
     ti->prev_copy = ts;
     ti->is_const = false;
-    ti->mask = -1;
+    ti->z_mask = -1;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     if (ts->kind == TEMP_CONST) {
         ti->is_const = true;
         ti->val = ts->val;
-        ti->mask = ts->val;
+        ti->z_mask = ts->val;
         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
             /* High bits of a 32-bit quantity are garbage.  */
-            ti->mask |= ~0xffffffffull;
+            ti->z_mask |= ~0xffffffffull;
         }
     } else {
         ti->is_const = false;
-        ti->mask = -1;
+        ti->z_mask = -1;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t mask;
+    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    mask = si->mask;
+    z_mask = si->z_mask;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
         /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
+        z_mask |= ~0xffffffffull;
     }
-    di->mask = mask;
+    di->z_mask = z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t mask, partmask, affected, tmp;
+        uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
-        mask = -1;
+        z_mask = -1;
         affected = -1;
         switch (opc) {
         CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext8u):
-            mask = 0xff;
+            z_mask = 0xff;
             goto and_const;
         CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             goto and_const;
         case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_ext32u_i64:
-            mask = 0xffffffffU;
+            z_mask = 0xffffffffU;
             goto and_const;
 
         CASE_OP_32_64(and):
-            mask = arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[2])->z_mask;
             if (arg_is_const(op->args[2])) {
         and_const:
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
-            mask = arg_info(op->args[1])->mask & mask;
+            z_mask = arg_info(op->args[1])->z_mask & z_mask;
             break;
 
         case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_extu_i32_i64:
             /* We do not compute affected as it is a size changing op.  */
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
 
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                op->args[2] is constant, we can't infer anything from it.  */
             if (arg_is_const(op->args[2])) {
-                mask = ~arg_info(op->args[2])->mask;
+                z_mask = ~arg_info(op->args[2])->z_mask;
                 goto and_const;
             }
             /* But we certainly know nothing outside args[1] may be set. */
-            mask = arg_info(op->args[1])->mask;
+            z_mask = arg_info(op->args[1])->z_mask;
             break;
 
         case INDEX_op_sar_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_extrl_i64_i32:
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
         case INDEX_op_extrh_i64_i32:
-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
+            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                mask = arg_info(op->args[1])->mask << tmp;
+                z_mask = arg_info(op->args[1])->z_mask << tmp;
             }
             break;
 
         CASE_OP_32_64(neg):
             /* Set to 1 all bits to the left of the rightmost.  */
-            mask = -(arg_info(op->args[1])->mask
-                     & -arg_info(op->args[1])->mask);
+            z_mask = -(arg_info(op->args[1])->z_mask
+                       & -arg_info(op->args[1])->z_mask);
             break;
 
         CASE_OP_32_64(deposit):
-            mask = deposit64(arg_info(op->args[1])->mask,
-                             op->args[3], op->args[4],
-                             arg_info(op->args[2])->mask);
+            z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                               op->args[3], op->args[4],
+                               arg_info(op->args[2])->z_mask);
             break;
 
         CASE_OP_32_64(extract):
-            mask = extract64(arg_info(op->args[1])->mask,
-                             op->args[2], op->args[3]);
+            z_mask = extract64(arg_info(op->args[1])->z_mask,
+                               op->args[2], op->args[3]);
             if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
         CASE_OP_32_64(sextract):
-            mask = sextract64(arg_info(op->args[1])->mask,
-                              op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+            z_mask = sextract64(arg_info(op->args[1])->z_mask,
+                                op->args[2], op->args[3]);
+            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
 
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[1])->z_mask
+                   | arg_info(op->args[2])->z_mask;
             break;
 
         case INDEX_op_clz_i32:
         case INDEX_op_ctz_i32:
-            mask = arg_info(op->args[2])->mask | 31;
+            z_mask = arg_info(op->args[2])->z_mask | 31;
             break;
 
         case INDEX_op_clz_i64:
         case INDEX_op_ctz_i64:
-            mask = arg_info(op->args[2])->mask | 63;
+            z_mask = arg_info(op->args[2])->z_mask | 63;
             break;
 
         case INDEX_op_ctpop_i32:
-            mask = 32 | 31;
+            z_mask = 32 | 31;
             break;
         case INDEX_op_ctpop_i64:
-            mask = 64 | 63;
+            z_mask = 64 | 63;
             break;
 
         CASE_OP_32_64(setcond):
         case INDEX_op_setcond2_i32:
-            mask = 1;
+            z_mask = 1;
             break;
 
         CASE_OP_32_64(movcond):
-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
+            z_mask = arg_info(op->args[3])->z_mask
+                   | arg_info(op->args[4])->z_mask;
             break;
 
         CASE_OP_32_64(ld8u):
-            mask = 0xff;
+            z_mask = 0xff;
             break;
         CASE_OP_32_64(ld16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             break;
         case INDEX_op_ld32u_i64:
-            mask = 0xffffffffu;
+            z_mask = 0xffffffffu;
             break;
 
         CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
             }
             break;
 
         CASE_OP_32_64(bswap16):
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffff) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffff) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap16(mask);
+            z_mask = bswap16(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int16_t)mask;
+                z_mask = (int16_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(16, 48);
+                z_mask |= MAKE_64BIT_MASK(16, 48);
                 break;
             }
             break;
 
         case INDEX_op_bswap32_i64:
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffffffffu) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffffffffu) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap32(mask);
+            z_mask = bswap32(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int32_t)mask;
+                z_mask = (int32_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(32, 32);
+                z_mask |= MAKE_64BIT_MASK(32, 32);
                 break;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         /* 32-bit ops generate 32-bit results.  For the result is zero test
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
-        partmask = mask;
+        partmask = z_mask;
         if (!(def->flags & TCG_OPF_64BIT)) {
-            mask |= ~(tcg_target_ulong)0xffffffffu;
+            z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                    vs the high word of the input.  */
             do_setcond_high:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             /* Default case: we know nothing about operation (or were unable
                to compute the operation result) so no propagation is done.
                We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "mask" is
+               block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
                 memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
                     if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
+                        arg_info(op->args[i])->z_mask = z_mask;
                     }
                 }
             }
-- 
2.25.1

Provide what will become a larger context for splitting
the very large tcg_optimize function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
+typedef struct OptContext {
+    TCGTempSet temps_used;
+} OptContext;
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     TempOptInfo *ti;
 
-    if (test_bit(idx, temps_used->l)) {
+    if (test_bit(idx, ctx->temps_used.l)) {
         return;
     }
-    set_bit(idx, temps_used->l);
+    set_bit(idx, ctx->temps_used.l);
 
     ti = ts->state_ptr;
     if (ti == NULL) {
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(OptContext *ctx, TCGArg arg)
 {
-    init_ts_info(temps_used, arg_temp(arg));
+    init_ts_info(ctx, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                              TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
-    init_ts_info(temps_used, tv);
+    init_ts_info(ctx, tv);
     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TCGTempSet temps_used;
+    OptContext ctx = {};
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(&temps_used, ts);
+                    init_ts_info(&ctx, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&temps_used, op->args[i]);
+                init_arg_info(&ctx, op->args[i]);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, temps_used.l)) {
+                    if (test_bit(i, ctx.temps_used.l)) {
                         reset_ts(&s->temps[i]);
                     }
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Break the final cleanup clause out of the main switch
statement.  When fully folding an opcode to mov/movi,
use "continue" to process the next opcode, else break
to fall into the final cleanup.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 96 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-            break;
+            continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
-                break;
+                continue;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
                 nb_iargs = 1;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(deposit):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract):
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(sextract):
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                     ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(brcond):
             tmp = do_constant_folding_cond(opc, op->args[0],
                                            op->args[1], op->args[2]);
-            if (tmp != 2) {
-                if (tmp) {
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[3];
-                } else {
-                    tcg_op_remove(s, op);
-                }
+            switch (tmp) {
+            case 0:
+                tcg_op_remove(s, op);
+                continue;
+            case 1:
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[3];
                 break;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
-                break;
+                continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                 uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-                    goto do_default;
+                    break;
                 }
                 op->args[3] = cond;
                 op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  : INDEX_op_setcond_i64);
                 nb_iargs = 2;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_mulu2_i32:
             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                             op->args[4]);
-            if (tmp != 2) {
-                if (tmp) {
-            do_brcond_true:
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[5];
-                } else {
+            if (tmp == 0) {
             do_brcond_false:
-                    tcg_op_remove(s, op);
-                }
-            } else if ((op->args[4] == TCG_COND_LT
-                        || op->args[4] == TCG_COND_GE)
-                       && arg_is_const(op->args[2])
-                       && arg_info(op->args[2])->val == 0
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0) {
+                tcg_op_remove(s, op);
+                continue;
+            }
+            if (tmp == 1) {
+            do_brcond_true:
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[5];
+                break;
+            }
+            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+                 && arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
+                op->opc = opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_brcond_false;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_brcond_low:
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_brcond_true;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
             do_setcond_const:
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-            } else if ((op->args[5] == TCG_COND_LT
-                        || op->args[5] == TCG_COND_GE)
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0
-                       && arg_is_const(op->args[4])
-                       && arg_info(op->args[4])->val == 0) {
+                continue;
+            }
+            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0
+                 && arg_is_const(op->args[4])
+                 && arg_info(op->args[4])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_setcond_high;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_setcond_const;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
-        case INDEX_op_call:
-            if (!(tcg_call_flags(op)
+        default:
+            break;
+        }
+
+        /* Some of the folding above can change opc. */
+        opc = op->opc;
+        def = &tcg_op_defs[opc];
+        if (def->flags & TCG_OPF_BB_END) {
+            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+        } else {
+            if (opc == INDEX_op_call &&
+                !(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
 
-        default:
-        do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "z_mask" is
-               the non-zero bits mask for the first output arg.  */
-            if (def->flags & TCG_OPF_BB_END) {
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->z_mask = z_mask;
-                    }
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    arg_info(op->args[i])->z_mask = z_mask;
                 }
             }
-            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-- 
2.25.1

Adjust the interface of tcg_opt_gen_mov and tcg_opt_gen_movi to take
only the OptContext parameter, instead of TCGContext or both.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 } TempOptInfo;
 
 typedef struct OptContext {
+    TCGContext *tcg;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
-        tcg_op_remove(s, op);
+        tcg_op_remove(ctx->tcg, op);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-                             TCGOp *op, TCGArg dst, uint64_t val)
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                             TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    OptContext ctx = {};
+    OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(or):
         CASE_OP_32_64_VEC(and):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 } else {
-                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                 }
                 continue;
             }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-- 
2.25.1

Move prev_mb into OptContext.  This will expose the variable to
subroutines that will be broken out of tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 
 typedef struct OptContext {
     TCGContext *tcg;
+    TCGOp *prev_mb;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
-    TCGOp *op, *op_next, *prev_mb = NULL;
+    TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-        if (prev_mb) {
+        if (ctx.prev_mb) {
             switch (opc) {
             case INDEX_op_mb:
                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  * barrier.  This is stricter than specified but for
                  * the purposes of TCG is better than not optimizing.
                  */
-                prev_mb->args[0] |= op->args[0];
+                ctx.prev_mb->args[0] |= op->args[0];
                 tcg_op_remove(s, op);
                 break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
-                prev_mb = NULL;
+                ctx.prev_mb = NULL;
                 break;
             }
         } else if (opc == INDEX_op_mb) {
-            prev_mb = op;
+            ctx.prev_mb = op;
         }
     }
 }
-- 
2.25.1

There was no real reason for calls to have separate code here.
Unify init for calls vs non-calls using the call path, which
handles TCG_CALL_DUMMY_ARG.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(OptContext *ctx, TCGArg arg)
-{
-    init_ts_info(ctx, arg_temp(arg));
-}
-
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
     TCGTemp *i, *g, *l;
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+{
+    for (int i = 0; i < nb_args; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts) {
+            init_ts_info(ctx, ts);
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (opc == INDEX_op_call) {
             nb_oargs = TCGOP_CALLO(op);
             nb_iargs = TCGOP_CALLI(op);
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                TCGTemp *ts = arg_temp(op->args[i]);
-                if (ts) {
-                    init_ts_info(&ctx, ts);
-                }
-            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&ctx, op->args[i]);
-            }
         }
+        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-- 
2.25.1

Continue splitting tcg_optimize: move the copy propagation loop
into a new copy_propagate function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
     }
 }
 
+static void copy_propagate(OptContext *ctx, TCGOp *op,
+                           int nb_oargs, int nb_iargs)
+{
+    TCGContext *s = ctx->tcg;
+
+    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts && ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(s, ts));
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             nb_iargs = def->nb_iargs;
         }
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-
-        /* Do copy propagation */
-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            TCGTemp *ts = arg_temp(op->args[i]);
-            if (ts && ts_is_copy(ts)) {
-                op->args[i] = temp_arg(find_better_copy(s, ts));
-            }
-        }
+        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
-- 
2.25.1

Calls are special in that they have a variable number
of arguments, and need to be able to clobber globals.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static bool fold_call(OptContext *ctx, TCGOp *op)
+{
+    TCGContext *s = ctx->tcg;
+    int nb_oargs = TCGOP_CALLO(op);
+    int nb_iargs = TCGOP_CALLI(op);
+    int flags, i;
+
+    init_arguments(ctx, op, nb_oargs + nb_iargs);
+    copy_propagate(ctx, op, nb_oargs, nb_iargs);
+
+    /* If the function reads or writes globals, reset temp data. */
+    flags = tcg_call_flags(op);
+    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+        int nb_globals = s->nb_globals;
+
+        for (i = 0; i < nb_globals; i++) {
+            if (test_bit(i, ctx->temps_used.l)) {
+                reset_ts(&ctx->tcg->temps[i]);
+            }
+        }
+    }
+
+    /* Reset temp data for outputs. */
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+    }
+
+    /* Stop optimizing MB across calls. */
+    ctx->prev_mb = NULL;
+    return true;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals, i;
+    int nb_temps, i;
     TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
        available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
-    nb_globals = s->nb_globals;
-
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
 
-        /* Count the arguments, and initialize the temps that are
-           going to be used */
+        /* Calls are special. */
         if (opc == INDEX_op_call) {
-            nb_oargs = TCGOP_CALLO(op);
-            nb_iargs = TCGOP_CALLI(op);
-        } else {
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
+            fold_call(&ctx, op);
+            continue;
         }
+
+        def = &tcg_op_defs[opc];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
-            if (opc == INDEX_op_call &&
-                !(tcg_call_flags(op)
-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-                for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, ctx.temps_used.l)) {
-                        reset_ts(&s->temps[i]);
-                    }
-                }
-            }
-
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
-            case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
                 ctx.prev_mb = NULL;
                 break;
-- 
2.25.1

Rather than try to keep the nb_oargs and nb_iargs locals up-to-date
across folding, re-read nb_oargs at the end, after re-reading the opcode.

A couple of asserts need dropping, but that will take care
of itself as we split the function further.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t z_mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         def = &tcg_op_defs[opc];
-        nb_oargs = def->nb_oargs;
-        nb_iargs = def->nb_iargs;
-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
+        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(qemu_ld):
             {
-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         if (partmask == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
-                nb_iargs = 1;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = opc = (opc == INDEX_op_movcond_i32
                                  ? INDEX_op_setcond_i32
                                  : INDEX_op_setcond_i64);
-                nb_iargs = 2;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
+            int nb_oargs = def->nb_oargs;
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
-- 
2.25.1

Return -1 instead of 2 for failure from do_constant_folding_cond and
do_constant_folding_cond2, so that we can use comparisons against 0
for all cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 71 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
     }
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                       TCGArg y, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                    TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         case TCG_COND_GEU:
             return 1;
         default:
-            return 2;
+            return -1;
         }
     }
-    return 2;
+    return -1;
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 {
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
         return do_constant_folding_cond_eq(c);
     }
-    return 2;
+    return -1;
 }
 
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[3]);
-            if (tmp != 2) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[3]);
+            if (i >= 0) {
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             break;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(opc, op->args[0],
-                                           op->args[1], op->args[2]);
-            switch (tmp) {
-            case 0:
+            i = do_constant_folding_cond(opc, op->args[0],
+                                         op->args[1], op->args[2]);
+            if (i == 0) {
                 tcg_op_remove(s, op);
                 continue;
-            case 1:
+            } else if (i > 0) {
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[5]);
-            if (tmp != 2) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[5]);
+            if (i >= 0) {
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_brcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                            op->args[4]);
-            if (tmp == 0) {
+            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                          op->args[4]);
+            if (i == 0) {
             do_brcond_false:
                 tcg_op_remove(s, op);
                 continue;
             }
-            if (tmp == 1) {
+            if (i > 0) {
             do_brcond_true:
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
             }
             break;
 
         case INDEX_op_setcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                            op->args[5]);
-            if (tmp != 2) {
+            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+                                          op->args[5]);
+            if (i >= 0) {
             do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_const;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
             }
-- 
2.25.1

Have tcg_opt_gen_mov and tcg_opt_gen_movi return bool.  This allows
callers to tail-call these functions and return true, indicating
that processing is complete.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 
     if (ts_are_copies(dst_ts, src_ts)) {
         tcg_op_remove(ctx->tcg, op);
-        return;
+        return true;
     }
 
     reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
         di->is_const = si->is_const;
         di->val = si->val;
     }
+    return true;
 }
 
-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
-- 
2.25.1

Copy z_mask into OptContext, for writeback to the first output
within the new function, finish_folding.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGContext *tcg;
     TCGOp *prev_mb;
     TCGTempSet temps_used;
+
+    /* In flight values from optimization. */
+    uint64_t z_mask;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_folding(OptContext *ctx, TCGOp *op)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    int i, nb_oargs;
+
+    /*
+     * For an opcode that ends a BB, reset all temp data.
+     * We do no cross-BB optimization.
+     */
+    if (def->flags & TCG_OPF_BB_END) {
+        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+        ctx->prev_mb = NULL;
+        return;
+    }
+
+    nb_oargs = def->nb_oargs;
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+        /*
+         * Save the corresponding known-zero bits mask for the
+         * first output argument (only one supported so far).
+         */
+        if (i == 0) {
+            arg_info(op->args[i])->z_mask = ctx->z_mask;
+        }
+    }
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
+        ctx.z_mask = z_mask;
 
         if (partmask == 0) {
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Some of the folding above can change opc. */
-        opc = op->opc;
-        def = &tcg_op_defs[opc];
-        if (def->flags & TCG_OPF_BB_END) {
-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-        } else {
-            int nb_oargs = def->nb_oargs;
-            for (i = 0; i < nb_oargs; i++) {
-                reset_temp(op->args[i]);
-                /* Save the corresponding known-zero bits mask for the
-                   first output argument (only one supported so far). */
-                if (i == 0) {
-                    arg_info(op->args[i])->z_mask = z_mask;
-                }
-            }
-        }
+        finish_folding(&ctx, op);
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
+        bool done = false;
 
         /* Calls are special. */
         if (opc == INDEX_op_call) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
+            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        finish_folding(&ctx, op);
+        if (!done) {
+            finish_folding(&ctx, op);
+        }
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

This puts the separate mb optimization into the same framework
as the others.  While fold_qemu_{ld,st} are currently identical,
that won't last as more code gets moved.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mb(OptContext *ctx, TCGOp *op)
+{
+    /* Eliminate duplicate and redundant fence instructions.  */
+    if (ctx->prev_mb) {
+        /*
+         * Merge two barriers of the same type into one,
+         * or a weaker barrier into a stronger one,
+         * or two weaker barriers into a stronger one.
+         *   mb X; mb Y => mb X|Y
+         *   mb; strl => mb; st
+         *   ldaq; mb => ld; mb
+         *   ldaq; strl => ld; mb; st
+         * Other combinations are also merged into a strong
+         * barrier.  This is stricter than specified but for
+         * the purposes of TCG is better than not optimizing.
+         */
+        ctx->prev_mb->args[0] |= op->args[0];
+        tcg_op_remove(ctx->tcg, op);
+    } else {
+        ctx->prev_mb = op;
+    }
+    return true;
+}
+
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        case INDEX_op_mb:
+            done = fold_mb(&ctx, op);
+            break;
+        case INDEX_op_qemu_ld_i32:
+        case INDEX_op_qemu_ld_i64:
+            done = fold_qemu_ld(&ctx, op);
+            break;
+        case INDEX_op_qemu_st_i32:
+        case INDEX_op_qemu_st8_i32:
+        case INDEX_op_qemu_st_i64:
+            done = fold_qemu_st(&ctx, op);
+            break;
+
         default:
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (!done) {
             finish_folding(&ctx, op);
         }
-
-        /* Eliminate duplicate and redundant fence instructions.  */
-        if (ctx.prev_mb) {
-            switch (opc) {
-            case INDEX_op_mb:
-                /* Merge two barriers of the same type into one,
-                 * or a weaker barrier into a stronger one,
-                 * or two weaker barriers into a stronger one.
-                 *   mb X; mb Y => mb X|Y
-                 *   mb; strl => mb; st
-                 *   ldaq; mb => ld; mb
-                 *   ldaq; strl => ld; mb; st
-                 * Other combinations are also merged into a strong
-                 * barrier.  This is stricter than specified but for
-                 * the purposes of TCG is better than not optimizing.
-                 */
-                ctx.prev_mb->args[0] |= op->args[0];
-                tcg_op_remove(s, op);
-                break;
-
-            default:
-                /* Opcodes that end the block stop the optimization.  */
-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-                    break;
-                }
-                /* fallthru */
-            case INDEX_op_qemu_ld_i32:
-            case INDEX_op_qemu_ld_i64:
-            case INDEX_op_qemu_st_i32:
-            case INDEX_op_qemu_st8_i32:
-            case INDEX_op_qemu_st_i64:
-                /* Opcodes that touch guest memory stop the optimization.  */
-                ctx.prev_mb = NULL;
-                break;
-            }
-        } else if (opc == INDEX_op_mb) {
-            ctx.prev_mb = op;
-        }
     }
 }
-- 
2.25.1

Split out a whole bunch of placeholder functions, which are
currently identical.  That won't last as more code gets moved.

Use CASE_OP_32_64_VEC for some logical operators that previously
missed the addition of vectors.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 219 insertions(+), 52 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
     }
 }
 
+/*
+ * The fold_* functions return true when processing is complete,
+ * usually by folding the operation to a constant or to a copy,
+ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
+ * like collect information about the value produced, for use in
+ * optimizing a subsequent operation.
+ *
+ * These first fold_* functions are all helpers, used by other
+ * folders for more specific operations.
+ */
+
+static bool fold_const1(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = do_constant_folding(op->opc, t, 0);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_const2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = do_constant_folding(op->opc, t1, t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
+/*
+ * These outermost fold_<op> functions are sorted alphabetically.
+ */
+
+static bool fold_add(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_and(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_andc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_divide(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_exts(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_extu(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
 static bool fold_mb(OptContext *ctx, TCGOp *op)
 {
     /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mul(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_nand(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_neg(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_nor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_not(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_or(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_orc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_shift(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_xor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(not):
-        CASE_OP_32_64(neg):
-        CASE_OP_32_64(ext8s):
-        CASE_OP_32_64(ext8u):
-        CASE_OP_32_64(ext16s):
-        CASE_OP_32_64(ext16u):
-        CASE_OP_32_64(ctpop):
-        case INDEX_op_ext32s_i64:
-        case INDEX_op_ext32u_i64:
-        case INDEX_op_ext_i32_i64:
-        case INDEX_op_extu_i32_i64:
-        case INDEX_op_extrl_i64_i32:
-        case INDEX_op_extrh_i64_i32:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
         case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(add):
-        CASE_OP_32_64(sub):
-        CASE_OP_32_64(mul):
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(and):
-        CASE_OP_32_64(xor):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-        CASE_OP_32_64(andc):
-        CASE_OP_32_64(orc):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-        CASE_OP_32_64(div):
-        CASE_OP_32_64(divu):
-        CASE_OP_32_64(rem):
-        CASE_OP_32_64(remu):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        default:
+            break;
+
+        /* ---------------------------------------------------------- */
+        /* Sorted alphabetically by opcode as much as possible. */
+
+        CASE_OP_32_64_VEC(add):
+            done = fold_add(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(and):
+            done = fold_and(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(andc):
+            done = fold_andc(&ctx, op);
+            break;
+        CASE_OP_32_64(ctpop):
+            done = fold_ctpop(&ctx, op);
+            break;
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+            done = fold_divide(&ctx, op);
+            break;
+        CASE_OP_32_64(eqv):
+            done = fold_eqv(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext16s):
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext_i32_i64:
+            done = fold_exts(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16u):
+        case INDEX_op_ext32u_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            done = fold_extu(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(mul):
+            done = fold_mul(&ctx, op);
+            break;
+        CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(muluh):
+            done = fold_mul_highpart(&ctx, op);
+            break;
+        CASE_OP_32_64(nand):
+            done = fold_nand(&ctx, op);
+            break;
+        CASE_OP_32_64(neg):
+            done = fold_neg(&ctx, op);
+            break;
+        CASE_OP_32_64(nor):
+            done = fold_nor(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(not):
+            done = fold_not(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(or):
+            done = fold_or(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(orc):
+            done = fold_orc(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_i32:
         case INDEX_op_qemu_ld_i64:
             done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_qemu_st_i64:
             done = fold_qemu_st(&ctx, op);
             break;
-
-        default:
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
+            done = fold_remainder(&ctx, op);
+            break;
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+            done = fold_shift(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(sub):
+            done = fold_sub(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(xor):
+            done = fold_xor(&ctx, op);
             break;
         }
 
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_setcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
+            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
+            goto do_setcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            goto do_setcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+                                     op->args[4], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            op->args[2] = op->args[3];
+            op->args[3] = cond;
+            op->opc = INDEX_op_setcond_i32;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_setcond_high:
+        op->args[1] = op->args[2];
+        op->args[2] = op->args[4];
+        op->args[3] = cond;
+        op->opc = INDEX_op_setcond_i32;
+        break;
+    }
+    return false;
+
+ do_setcond_const:
+    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_setcond2_i32:
-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                          op->args[5]);
-            if (i >= 0) {
-            do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-                continue;
-            }
-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0
-                 && arg_is_const(op->args[4])
-                 && arg_info(op->args[4])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_setcond_high:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_const;
-                } else if (i > 0) {
-                    goto do_setcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i < 0) {
-                    break;
-                }
-            do_setcond_low:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[2] = op->args[3];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_low;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(shr):
             done = fold_shift(&ctx, op);
             break;
+        case INDEX_op_setcond2_i32:
+            done = fold_setcond2(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 78 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[4];
+    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
+    TCGArg label = op->args[5];
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_brcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
+            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
+            goto do_brcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+                                     op->args[2], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            goto do_brcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            op->opc = INDEX_op_brcond_i32;
+            op->args[1] = op->args[2];
+            op->args[2] = cond;
+            op->args[3] = label;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_brcond_high:
+        op->opc = INDEX_op_brcond_i32;
+        op->args[0] = op->args[1];
+        op->args[1] = op->args[3];
+        op->args[2] = cond;
+        op->args[3] = label;
+        break;
+
+    do_brcond_const:
+        if (i == 0) {
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+        op->opc = INDEX_op_br;
+        op->args[0] = label;
+        break;
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_brcond2_i32:
-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                          op->args[4]);
-            if (i == 0) {
-            do_brcond_false:
-                tcg_op_remove(s, op);
-                continue;
-            }
-            if (i > 0) {
-            do_brcond_true:
-                op->opc = opc = INDEX_op_br;
-                op->args[0] = op->args[5];
-                break;
-            }
-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
-                 && arg_is_const(op->args[2])
-                 && arg_info(op->args[2])->val == 0
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_brcond_high:
-                op->opc = opc = INDEX_op_brcond_i32;
-                op->args[0] = op->args[1];
-                op->args[1] = op->args[3];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i > 0) {
-                    goto do_brcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i < 0) {
-                    break;
-                }
-            do_brcond_low:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_high;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_low;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(andc):
             done = fold_andc(&ctx, op);
             break;
+        case INDEX_op_brcond2_i32:
+            done = fold_brcond2(&ctx, op);
+            break;
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint32_t a = arg_info(op->args[2])->val;
+        uint32_t b = arg_info(op->args[3])->val;
+        uint64_t r = (uint64_t)a * b;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+        return true;
+    }
+    return false;
+}
+
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_mulu2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-                uint32_t a = arg_info(op->args[2])->val;
-                uint32_t b = arg_info(op->args[3])->val;
-                uint64_t r = (uint64_t)a * b;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
+        case INDEX_op_mulu2_i32:
+            done = fold_mulu2_i32(&ctx, op);
+            break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
             break;
-- 
2.25.1

Add two additional helpers, fold_add2_i32 and fold_sub2_i32,
which will not be simple wrappers forever.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
+        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+        uint32_t al = arg_info(op->args[2])->val;
+        uint32_t ah = arg_info(op->args[3])->val;
+        uint32_t bl = arg_info(op->args[4])->val;
+        uint32_t bh = arg_info(op->args[5])->val;
+        uint64_t a = ((uint64_t)ah << 32) | al;
+        uint64_t b = ((uint64_t)bh << 32) | bl;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        if (add) {
+            a += b;
+        } else {
+            a -= b;
+        }
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+        return true;
+    }
+    return false;
+}
+
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, true);
+}
+
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, false);
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_add2_i32:
-        case INDEX_op_sub2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-                uint32_t al = arg_info(op->args[2])->val;
-                uint32_t ah = arg_info(op->args[3])->val;
-                uint32_t bl = arg_info(op->args[4])->val;
-                uint32_t bh = arg_info(op->args[5])->val;
-                uint64_t a = ((uint64_t)ah << 32) | al;
-                uint64_t b = ((uint64_t)bh << 32) | bl;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                if (opc == INDEX_op_add2_i32) {
-                    a += b;
-                } else {
-                    a -= b;
-                }
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-                continue;
-            }
-            break;
 
         default:
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add2_i32:
+            done = fold_add2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub2_i32:
+            done = fold_sub2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode opc = op->opc;
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+
+    if (i >= 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
+    }
+
+    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+        uint64_t tv = arg_info(op->args[3])->val;
+        uint64_t fv = arg_info(op->args[4])->val;
+
+        opc = (opc == INDEX_op_movcond_i32
+               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+
+        if (tv == 1 && fv == 0) {
+            op->opc = opc;
+            op->args[3] = cond;
+        } else if (fv == 1 && tv == 0) {
+            op->opc = opc;
+            op->args[3] = tcg_invert_cond(cond);
+        }
+    }
+    return false;
+}
+
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(movcond):
-            i = do_constant_folding_cond(opc, op->args[1],
-                                         op->args[2], op->args[5]);
-            if (i >= 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-                continue;
-            }
-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                uint64_t tv = arg_info(op->args[3])->val;
-                uint64_t fv = arg_info(op->args[4])->val;
-                TCGCond cond = op->args[5];
-
-                if (fv == 1 && tv == 0) {
-                    cond = tcg_invert_cond(cond);
-                } else if (!(tv == 1 && fv == 0)) {
-                    break;
-                }
-                op->args[3] = cond;
-                op->opc = opc = (opc == INDEX_op_movcond_i32
-                                 ? INDEX_op_setcond_i32
-                                 : INDEX_op_setcond_i64);
-            }
-            break;
-
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(movcond):
+            done = fold_movcond(&ctx, op);
+            break;
         CASE_OP_32_64(mul):
             done = fold_mul(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t v1 = arg_info(op->args[1])->val;
+        uint64_t v2 = arg_info(op->args[2])->val;
+        int shr = op->args[3];
+
+        if (op->opc == INDEX_op_extract2_i64) {
+            v1 >>= shr;
+            v2 <<= 64 - shr;
+        } else {
+            v1 = (uint32_t)v1 >> shr;
+            v2 = (int32_t)v2 << (32 - shr);
+        }
+        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+    }
+    return false;
+}
+
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
     return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract2):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                uint64_t v1 = arg_info(op->args[1])->val;
-                uint64_t v2 = arg_info(op->args[2])->val;
-                int shr = op->args[3];
-
-                if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-                } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-                                    ((uint32_t)v2 << (32 - shr)));
-                }
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract2):
+            done = fold_extract2(&ctx, op);
+            break;
         CASE_OP_32_64(ext8s):
         CASE_OP_32_64(ext16s):
         case INDEX_op_ext32s_i64:
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = extract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = sextract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract):
-            if (arg_is_const(op->args[1])) {
-                tmp = extract64(arg_info(op->args[1])->val,
-                                op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        CASE_OP_32_64(sextract):
-            if (arg_is_const(op->args[1])) {
-                tmp = sextract64(arg_info(op->args[1])->val,
-                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract):
+            done = fold_extract(&ctx, op);
+            break;
         CASE_OP_32_64(extract2):
             done = fold_extract2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_setcond2_i32:
             done = fold_setcond2(&ctx, op);
             break;
+        CASE_OP_32_64(sextract):
+            done = fold_sextract(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     return fold_const1(ctx, op);
 }
 
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = deposit64(t1, op->args[3], op->args[4], t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
 static bool fold_divide(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(deposit):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = deposit64(arg_info(op->args[1])->val,
-                                op->args[3], op->args[4],
-                                arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
+        CASE_OP_32_64(deposit):
+            done = fold_deposit(&ctx, op);
+            break;
         CASE_OP_32_64(div):
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+
+        t = do_constant_folding(op->opc, t, op->args[2]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(bswap16):
-        CASE_OP_32_64(bswap32):
-        case INDEX_op_bswap64_i64:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          op->args[2]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_brcond2_i32:
             done = fold_brcond2(&ctx, op);
             break;
+        CASE_OP_32_64(bswap16):
+        CASE_OP_32_64(bswap32):
+        case INDEX_op_bswap64_i64:
+            done = fold_bswap(&ctx, op);
+            break;
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             done = fold_count_zeros(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_dup(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+        t = dup_const(TCGOP_VECE(op), t);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
+                               arg_info(op->args[2])->val);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+
+    if (args_are_copies(op->args[1], op->args[2])) {
+        op->opc = INDEX_op_dup_vec;
+        TCGOP_VECE(op) = MO_32;
+    }
+    return false;
+}
+
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             break;
 
-        case INDEX_op_dup_vec:
-            if (arg_is_const(op->args[1])) {
-                tmp = arg_info(op->args[1])->val;
-                tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        case INDEX_op_dup2_vec:
-            assert(TCG_TARGET_REG_BITS == 32);
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0],
-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
-                                           arg_info(op->args[2])->val));
-                continue;
-            } else if (args_are_copies(op->args[1], op->args[2])) {
-                op->opc = INDEX_op_dup_vec;
-                TCGOP_VECE(op) = MO_32;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
             break;
+        case INDEX_op_dup_vec:
+            done = fold_dup(&ctx, op);
+            break;
+        case INDEX_op_dup2_vec:
+            done = fold_dup2(&ctx, op);
+            break;
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
-- 
2.25.1

This is the final entry in the main switch that was in a
different form.  After this, we have the option to convert
the switch into a function dispatch table.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mov(OptContext *ctx, TCGOp *op)
+{
+    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+}
+
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /*
+         * Process each opcode.
+         * Sorted alphabetically by opcode as much as possible.
+         */
         switch (opc) {
-        CASE_OP_32_64_VEC(mov):
-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            break;
-
-        default:
-            break;
-
-        /* ---------------------------------------------------------- */
-        /* Sorted alphabetically by opcode as much as possible. */
-
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64_VEC(mov):
+            done = fold_mov(&ctx, op);
+            break;
         CASE_OP_32_64(movcond):
             done = fold_movcond(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        default:
+            break;
         }
 
         if (!done) {
-- 
2.25.1

Pull the "op r, a, a => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to @i. */
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(xor):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, a => mov r, a" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to identity. */
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
+ *
+ * The ordering of the transformations should be:
+ *   1) those that produce a constant
+ *   2) those that produce a copy
+ *   3) those that produce information about the result value.
  */
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(and):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, 0 => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to @i. */
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             continue;
         }
 
-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            if (arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Compute the type of the operation early.

There are at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;
+    TCGType type;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
-    const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
     uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     reset_ts(dst_ts);
     di = ts_info(dst_ts);
     si = ts_info(src_ts);
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_mov_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_mov_i64;
-    } else {
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
         new_op = INDEX_op_mov_i32;
+        break;
+    case TCG_TYPE_I64:
+        new_op = INDEX_op_mov_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
+        new_op = INDEX_op_mov_vec;
+        break;
+    default:
+        g_assert_not_reached();
     }
     op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
     op->args[0] = dst;
     op->args[1] = src;
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    const TCGOpDef *def = &tcg_op_defs[op->opc];
-    TCGType type;
-    TCGTemp *tv;
-
-    if (def->flags & TCG_OPF_VECTOR) {
-        type = TCGOP_VECL(op) + TCG_TYPE_V64;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        type = TCG_TYPE_I64;
-    } else {
-        type = TCG_TYPE_I32;
-    }
-
     /* Convert movi to mov with constant temp. */
-    tv = tcg_constant_internal(type, val);
+    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     }
 }
 
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
+                                    uint64_t x, uint64_t y)
 {
-    const TCGOpDef *def = &tcg_op_defs[op];
     uint64_t res = do_constant_folding_2(op, x, y);
-    if (!(def->flags & TCG_OPF_64BIT)) {
+    if (type == TCG_TYPE_I32) {
         res = (int32_t)res;
     }
     return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
  * Return -1 if the condition can't be simplified,
  * and the result of the condition (0 or 1) if it can.
  */
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+static int do_constant_folding_cond(TCGType type, TCGArg x,
                                     TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
 
     if (arg_is_const(x) && arg_is_const(y)) {
-        const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-        if (def->flags & TCG_OPF_64BIT) {
-            return do_constant_folding_cond_64(xv, yv, c);
-        } else {
+        switch (type) {
+        case TCG_TYPE_I32:
             return do_constant_folding_cond_32(xv, yv, c);
+        case TCG_TYPE_I64:
+            return do_constant_folding_cond_64(xv, yv, c);
+        default:
+            /* Only scalar comparisons are optimizable */
+            return -1;
         }
     } else if (args_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = do_constant_folding(op->opc, t, 0);
+        t = do_constant_folding(op->opc, ctx->type, t, 0);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
         uint64_t t1 = arg_info(op->args[1])->val;
         uint64_t t2 = arg_info(op->args[2])->val;
 
-        t1 = do_constant_folding(op->opc, t1, t2);
+        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                      op->args[2], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
             goto do_brcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
-        t = do_constant_folding(op->opc, t, op->args[2]);
+        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         uint64_t t = arg_info(op->args[1])->val;
 
         if (t != 0) {
-            t = do_constant_folding(op->opc, t, 0);
+            t = do_constant_folding(op->opc, ctx->type, t, 0);
             return tcg_opt_gen_movi(ctx, op, op->args[0], t);
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
-    TCGOpcode opc = op->opc;
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
+        TCGOpcode opc;
 
-        opc = (opc == INDEX_op_movcond_i32
-               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+        switch (ctx->type) {
+        case TCG_TYPE_I32:
+            opc = INDEX_op_setcond_i32;
+            break;
+        case TCG_TYPE_I64:
+            opc = INDEX_op_setcond_i64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         if (tv == 1 && fv == 0) {
             op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
             goto do_setcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                      op->args[4], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
         copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
+        /* Pre-compute the type of the operation. */
+        if (def->flags & TCG_OPF_VECTOR) {
+            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
+        } else if (def->flags & TCG_OPF_64BIT) {
+            ctx.type = TCG_TYPE_I64;
+        } else {
+            ctx.type = TCG_TYPE_I32;
+        }
+
         /* For commutative operations make constant second argument */
         switch (opc) {
         CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (opc == INDEX_op_sub_i32) {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
-                } else if (opc == INDEX_op_sub_i64) {
+                    break;
+                case TCG_TYPE_I64:
                     neg_op = INDEX_op_neg_i64;
                     have_neg = TCG_TARGET_HAS_neg_i64;
-                } else if (TCG_TARGET_HAS_neg_vec) {
-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
-                    unsigned vece = TCGOP_VECE(op);
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-                } else {
                     break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    neg_op = INDEX_op_neg_vec;
+                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+                                                   TCGOP_VECE(op)) > 0;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_neg) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode not_op;
                 bool have_not;
 
-                if (def->flags & TCG_OPF_VECTOR) {
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                } else if (def->flags & TCG_OPF_64BIT) {
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                } else {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     not_op = INDEX_op_not_i32;
                     have_not = TCG_TARGET_HAS_not_i32;
+                    break;
+                case TCG_TYPE_I64:
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                    break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    not_op = INDEX_op_not_vec;
+                    have_not = TCG_TARGET_HAS_not_vec;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_not) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
         partmask = z_mask;
-        if (!(def->flags & TCG_OPF_64BIT)) {
+        if (ctx.type == TCG_TYPE_I32) {
             z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
-- 
2.25.1

Split out the conditional conversion from a more complex logical
operation to a simple NOT.  Create a couple more helpers to make
this easy for the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Convert @op to NOT, if NOT is supported by the host.
+ * Return true if the conversion is successful, which will still
+ * indicate that the processing is complete.
+ */
+static bool fold_not(OptContext *ctx, TCGOp *op);
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+{
+    TCGOpcode not_op;
+    bool have_not;
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        not_op = INDEX_op_not_i32;
+        have_not = TCG_TARGET_HAS_not_i32;
+        break;
+    case TCG_TYPE_I64:
+        not_op = INDEX_op_not_i64;
+        have_not = TCG_TARGET_HAS_not_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        not_op = INDEX_op_not_vec;
+        have_not = TCG_TARGET_HAS_not_vec;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_not) {
+        op->opc = not_op;
+        op->args[1] = op->args[idx];
+        return fold_not(ctx, op);
+    }
+    return false;
+}
+
+/* If the binary operation has first argument @i, fold to NOT. */
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return fold_to_not(ctx, op, 2);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to @i. */
 static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to NOT. */
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return fold_to_not(ctx, op, 1);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, -1)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    /* Because of fold_to_not, we want to always return true, via finish. */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_ix_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             }
             break;
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(nand):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64(nor):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(andc):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == -1) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        try_not:
-            {
-                TCGOpcode not_op;
-                bool have_not;
-
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    not_op = INDEX_op_not_i32;
-                    have_not = TCG_TARGET_HAS_not_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_not) {
-                    break;
-                }
-                op->opc = not_op;
-                reset_temp(op->args[0]);
-                op->args[1] = op->args[i];
-                continue;
-            }
         default:
             break;
         }
-- 
2.25.1

Even though there is only one user, place this more complex
conversion into its own helper.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+    /*
+     * Because of fold_sub_to_neg, we want to always return true,
+     * via finish_folding.
+     */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode neg_op;
+    bool have_neg;
+
+    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
+        return false;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        neg_op = INDEX_op_neg_i32;
+        have_neg = TCG_TARGET_HAS_neg_i32;
+        break;
+    case TCG_TYPE_I64:
+        neg_op = INDEX_op_neg_i64;
+        have_neg = TCG_TARGET_HAS_neg_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        neg_op = INDEX_op_neg_vec;
+        have_neg = (TCG_TARGET_HAS_neg_vec &&
+                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_neg) {
+        op->opc = neg_op;
+        op->args[1] = op->args[2];
+        return fold_neg(ctx, op);
+    }
+    return false;
+}
+
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 continue;
             }
             break;
-        CASE_OP_32_64_VEC(sub):
-            {
-                TCGOpcode neg_op;
-                bool have_neg;
-
-                if (arg_is_const(op->args[2])) {
-                    /* Proceed with possible constant folding. */
-                    break;
-                }
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    neg_op = INDEX_op_neg_i32;
-                    have_neg = TCG_TARGET_HAS_neg_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    neg_op = INDEX_op_neg_i64;
-                    have_neg = TCG_TARGET_HAS_neg_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-                                                   TCGOP_VECE(op)) > 0;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_neg) {
-                    break;
-                }
-                if (arg_is_const(op->args[1])
-                    && arg_info(op->args[1])->val == 0) {
-                    op->opc = neg_op;
-                    reset_temp(op->args[0]);
-                    op->args[1] = op->args[2];
-                    continue;
-                }
-            }
-            break;
         default:
             break;
         }
-- 
2.25.1

Pull the "op r, a, i => mov r, a" optimization into a function,
and use it in the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to identity. */
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to NOT. */
 static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Pull the "op r, 0, b => movi r, 0" optimization into a function,
and use it in fold_shift.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
     return false;
 }
 
+/* If the binary operation has first argument @i, fold to @i. */
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has first argument @i, fold to NOT. */
 static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
-           and "sub r, 0, a => neg r, a" case.  */
-        switch (opc) {
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Move all of the known-zero optimizations into the per-opcode
functions.  Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
 1 file changed, 294 insertions(+), 251 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGTempSet temps_used;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;
+    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
+    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    uint64_t a_mask = ctx->a_mask;
+    uint64_t z_mask = ctx->z_mask;
+
+    /*
+     * 32-bit ops generate 32-bit results.  For the result is zero test
+     * below, we can ignore high bits, but for further optimizations we
+     * need to record that the high bits contain garbage.
+     */
+    if (ctx->type == TCG_TYPE_I32) {
+        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+        a_mask &= MAKE_64BIT_MASK(0, 32);
+        z_mask &= MAKE_64BIT_MASK(0, 32);
+    }
+
+    if (z_mask == 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
+    }
+    if (a_mask == 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * Convert @op to NOT, if NOT is supported by the host.
  * Return true f the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1, z2;
+
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+    z2 = arg_info(op->args[2])->z_mask;
+    ctx->z_mask = z1 & z2;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer affected bits from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        ctx->a_mask = z1 & ~z2;
+    }
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer anything from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+        ctx->a_mask = z1 & ~z2;
+        z1 &= z2;
+    }
+    ctx->z_mask = z1;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, sign;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask = arg_info(op->args[1])->z_mask;
+    switch (op->opc) {
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+        z_mask = bswap16(z_mask);
+        sign = INT16_MIN;
+        break;
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+        z_mask = bswap32(z_mask);
+        sign = INT32_MIN;
+        break;
+    case INDEX_op_bswap64_i64:
+        z_mask = bswap64(z_mask);
+        sign = INT64_MIN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+    case TCG_BSWAP_OZ:
+        break;
+    case TCG_BSWAP_OS:
+        /* If the sign bit may be 1, force all the bits above to 1. */
+        if (z_mask & sign) {
+            z_mask |= sign;
+        }
+        break;
+    default:
+        /* The high bits are undefined: force all bits above the sign to 1. */
+        z_mask |= sign << 1;
+        break;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
     }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        z_mask = 31;
+        break;
+    case TCG_TYPE_I64:
+        z_mask = 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+
     return false;
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        ctx->z_mask = 32 | 31;
+        break;
+    case TCG_TYPE_I64:
+        ctx->z_mask = 64 | 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         t1 = deposit64(t1, op->args[3], op->args[4], t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
+
+    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                            op->args[3], op->args[4],
+                            arg_info(op->args[2])->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         t = extract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask, sign;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8s):
+        sign = INT8_MIN;
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16s):
+        sign = INT16_MIN;
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_ext_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32s_i64:
+        sign = INT32_MIN;
+        z_mask = (uint32_t)z_mask;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (z_mask & sign) {
+        z_mask |= sign;
+    } else if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8u):
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16u):
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_extrl_i64_i32:
+    case INDEX_op_extu_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32u_i64:
+        z_mask = (uint32_t)z_mask;
+        break;
+    case INDEX_op_extrh_i64_i32:
+        type_change = true;
+        z_mask >>= 32;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ctx->z_mask = z_mask;
+    if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    return fold_masks(ctx, op);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
+    ctx->z_mask = arg_info(op->args[3])->z_mask
+                | arg_info(op->args[4])->z_mask;
+
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
+
+    /* Set to 1 all bits to the left of the rightmost.  */
+    z_mask = arg_info(op->args[1])->z_mask;
+    ctx->z_mask = -(z_mask & -z_mask);
+
     /*
      * Because of fold_sub_to_neg, we want to always return true,
      * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+    MemOp mop = get_memop(oi);
+    int width = 8 * memop_size(mop);
+
+    if (!(mop & MO_SIGN) && width < 64) {
+        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    }
+
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
+
+    ctx->z_mask = 1;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_setcond_i32;
         break;
     }
+
+    ctx->z_mask = 1;
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
+    int64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
         t = sextract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0 && z_mask >= 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
+
+    if (arg_is_const(op->args[2])) {
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
+                                          arg_info(op->args[1])->z_mask,
+                                          arg_info(op->args[2])->val);
+        return fold_masks(ctx, op);
+    }
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
     return fold_addsub2_i32(ctx, op, false);
 }
 
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+{
+    /* We can't do any folding with a load, but we can record bits. */
+    switch (op->opc) {
+    CASE_OP_32_64(ld8u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        break;
+    CASE_OP_32_64(ld16u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        break;
+    case INDEX_op_ld32u_i64:
+        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
         bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify using known-zero bits. Currently only ops with a single
-           output argument is supported. */
-        z_mask = -1;
-        affected = -1;
-        switch (opc) {
-        CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext8u):
-            z_mask = 0xff;
-            goto and_const;
-        CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext16u):
-            z_mask = 0xffff;
-            goto and_const;
-        case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_ext32u_i64:
-            z_mask = 0xffffffffU;
-            goto and_const;
-
-        CASE_OP_32_64(and):
-            z_mask = arg_info(op->args[2])->z_mask;
-            if (arg_is_const(op->args[2])) {
-        and_const:
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-            break;
-
-        case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_extu_i32_i64:
-            /* We do not compute affected as it is a size changing op.  */
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-
-        CASE_OP_32_64(andc):
-            /* Known-zeros does not imply known-ones.  Therefore unless
-               op->args[2] is constant, we can't infer anything from it.  */
-            if (arg_is_const(op->args[2])) {
-                z_mask = ~arg_info(op->args[2])->z_mask;
-                goto and_const;
-            }
-            /* But we certainly know nothing outside args[1] may be set. */
-            z_mask = arg_info(op->args[1])->z_mask;
-            break;
-
-        case INDEX_op_sar_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_sar_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_shr_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_shr_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_extrl_i64_i32:
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-        case INDEX_op_extrh_i64_i32:
-            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-            break;
-
-        CASE_OP_32_64(shl):
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                z_mask = arg_info(op->args[1])->z_mask << tmp;
-            }
-            break;
-
-        CASE_OP_32_64(neg):
-            /* Set to 1 all bits to the left of the rightmost.  */
-            z_mask = -(arg_info(op->args[1])->z_mask
-                       & -arg_info(op->args[1])->z_mask);
-            break;
-
-        CASE_OP_32_64(deposit):
-            z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                               op->args[3], op->args[4],
-                               arg_info(op->args[2])->z_mask);
-            break;
-
-        CASE_OP_32_64(extract):
-            z_mask = extract64(arg_info(op->args[1])->z_mask,
-                               op->args[2], op->args[3]);
-            if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-        CASE_OP_32_64(sextract):
-            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-                                op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(xor):
-            z_mask = arg_info(op->args[1])->z_mask
-                   | arg_info(op->args[2])->z_mask;
-            break;
-
-        case INDEX_op_clz_i32:
-        case INDEX_op_ctz_i32:
-            z_mask = arg_info(op->args[2])->z_mask | 31;
-            break;
-
-        case INDEX_op_clz_i64:
-        case INDEX_op_ctz_i64:
-            z_mask = arg_info(op->args[2])->z_mask | 63;
-            break;
-
-        case INDEX_op_ctpop_i32:
-            z_mask = 32 | 31;
-            break;
-        case INDEX_op_ctpop_i64:
-            z_mask = 64 | 63;
-            break;
-
-        CASE_OP_32_64(setcond):
-        case INDEX_op_setcond2_i32:
-            z_mask = 1;
-            break;
-
-        CASE_OP_32_64(movcond):
-            z_mask = arg_info(op->args[3])->z_mask
-                   | arg_info(op->args[4])->z_mask;
-            break;
-
-        CASE_OP_32_64(ld8u):
-            z_mask = 0xff;
-            break;
-        CASE_OP_32_64(ld16u):
-            z_mask = 0xffff;
-            break;
-        case INDEX_op_ld32u_i64:
-            z_mask = 0xffffffffu;
-            break;
-
-        CASE_OP_32_64(qemu_ld):
-            {
-                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-                MemOp mop = get_memop(oi);
-                if (!(mop & MO_SIGN)) {
-                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-                }
-            }
-            break;
-
-        CASE_OP_32_64(bswap16):
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffff) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap16(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int16_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(16, 48);
-                break;
-            }
-            break;
-
-        case INDEX_op_bswap32_i64:
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffffffffu) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap32(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int32_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(32, 32);
-                break;
-            }
-            break;
-
-        default:
-            break;
-        }
-
-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-           below, we can ignore high bits, but for further optimizations we
-           need to record that the high bits contain garbage.  */
-        partmask = z_mask;
-        if (ctx.type == TCG_TYPE_I32) {
-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-            partmask &= 0xffffffffu;
-            affected &= 0xffffffffu;
-        }
-        ctx.z_mask = z_mask;
-
-        if (partmask == 0) {
-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-            continue;
-        }
-        if (affected == 0) {
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
-        }
+        /* Assume all bits affected, and no bits known zero. */
+        ctx.a_mask = -1;
+        ctx.z_mask = -1;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32u_i64:
+            done = fold_tcg_ld(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
-- 
2.25.1

Rename fold_mulu2_i32 to fold_multiply2, and handle muls2_i32,
mulu2_i64, and muls2_i64 as well.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint32_t a = arg_info(op->args[2])->val;
-        uint32_t b = arg_info(op->args[3])->val;
-        uint64_t r = (uint64_t)a * b;
+        uint64_t a = arg_info(op->args[2])->val;
+        uint64_t b = arg_info(op->args[3])->val;
+        uint64_t h, l;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
+
+        switch (op->opc) {
+        case INDEX_op_mulu2_i32:
+            l = (uint64_t)(uint32_t)a * (uint32_t)b;
+            h = (int32_t)(l >> 32);
+            l = (int32_t)l;
+            break;
+        case INDEX_op_muls2_i32:
+            l = (int64_t)(int32_t)a * (int32_t)b;
+            h = l >> 32;
+            l = (int32_t)l;
+            break;
+        case INDEX_op_mulu2_i64:
+            mulu64(&l, &h, a, b);
+            break;
+        case INDEX_op_muls2_i64:
+            muls64(&l, &h, a, b);
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, l);
+        tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
-        case INDEX_op_mulu2_i32:
-            done = fold_mulu2_i32(&ctx, op);
+        CASE_OP_32_64(muls2):
+        CASE_OP_32_64(mulu2):
+            done = fold_multiply2(&ctx, op);
             break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
-- 
2.25.1

Rename fold_addsub2_i32 to fold_addsub2.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/int128.h"
 #include "tcg/tcg-op.h"
 #include "tcg-internal.h"
 
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-        uint32_t al = arg_info(op->args[2])->val;
-        uint32_t ah = arg_info(op->args[3])->val;
-        uint32_t bl = arg_info(op->args[4])->val;
-        uint32_t bh = arg_info(op->args[5])->val;
-        uint64_t a = ((uint64_t)ah << 32) | al;
-        uint64_t b = ((uint64_t)bh << 32) | bl;
+        uint64_t al = arg_info(op->args[2])->val;
+        uint64_t ah = arg_info(op->args[3])->val;
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
 
-        if (add) {
-            a += b;
+        if (ctx->type == TCG_TYPE_I32) {
+            uint64_t a = deposit64(al, 32, 32, ah);
+            uint64_t b = deposit64(bl, 32, 32, bh);
+
+            if (add) {
+                a += b;
+            } else {
+                a -= b;
+            }
+
+            al = sextract64(a, 0, 32);
+            ah = sextract64(a, 32, 32);
         } else {
-            a -= b;
+            Int128 a = int128_make128(al, ah);
+            Int128 b = int128_make128(bl, bh);
+
+            if (add) {
+                a = int128_add(a, b);
+            } else {
+                a = int128_sub(a, b);
+            }
+
+            al = int128_getlo(a);
+            ah = int128_gethi(a);
         }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, al);
+        tcg_opt_gen_movi(ctx, op2, rh, ah);
         return true;
     }
     return false;
 }
 
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, true);
+    return fold_addsub2(ctx, op, true);
 }
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, false);
+    return fold_addsub2(ctx, op, false);
 }
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
-        case INDEX_op_add2_i32:
-            done = fold_add2_i32(&ctx, op);
+        CASE_OP_32_64(add2):
+            done = fold_add2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-        case INDEX_op_sub2_i32:
-            done = fold_sub2_i32(&ctx, op);
+        CASE_OP_32_64(sub2):
+            done = fold_sub2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
-- 
2.25.1

Most of these are handled by creating a fold_const2_commutative
to handle all of the binary operators.  The rest were already
handled on a case-by-case basis in the switch, and have their
own fold function in which to place the call.

We now have only one major switch on TCGOpcode.

Introduce NO_DEST and a block comment for swap_commutative in
order to make the handling of brcond and movcond opcodes cleaner.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     return -1;
 }
 
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
     return fold_addsub2(ctx, op, true);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     uint64_t z1, z2;
 
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     if (i >= 0) {
         goto do_brcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
         uint64_t a = arg_info(op->args[2])->val;
         uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
     if (i >= 0) {
         goto do_setcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Assume all bits affected, and no bits known zero. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
-- 
2.25.1

This "garbage" setting pre-dates the addition of the type
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
and INDEX_op_extr{l,h}_i64_i32.

So now we have definitive points at which to adjust z_mask
to eliminate such bits from the 32-bit operands.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *src_ts = arg_temp(src);
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+    TCGTemp *tv;
 
+    if (ctx->type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(ctx->type, val);
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     uint64_t z_mask = ctx->z_mask;
 
     /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        ctx->z_mask = z_mask;
     }
 
     if (z_mask == 0) {
-- 
2.25.1

Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
-- 
2.25.1

Sign repetitions are perforce all identical, whether they are 1 or 0.
Bitwise operations preserve the relative quantity of the repetitions.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
     z2 = arg_info(op->args[2])->z_mask;
     ctx->z_mask = z1 & z2;
 
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     }
     ctx->z_mask = z1;
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[3])->z_mask
                 | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
 
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
         return true;
     }
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
     /* Because of fold_to_not, we want to always return true, via finish. */
     finish_folding(ctx, op);
     return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
-- 
2.25.1

For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

    ext32s_i64  t, in
    sar_i64     t, t, v
    ext32s_i64  out, t

allowing the final extend to be eliminated.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
     return ~(~0ull >> rep);
 }
 
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask, so pass its inverse. */
+    return smask_from_zmask(~smask);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask, z_mask, sign;
+
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
 
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
     if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
         return fold_masks(ctx, op);
     }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduce the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
     return false;
 }
 
-- 
2.25.1