The following changes since commit 222059a0fccf4af3be776fe35a5ea2d6a68f9a0b:

  Merge tag 'pull-ppc-20221221' of https://gitlab.com/danielhb/qemu into staging (2022-12-21 18:08:09 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20221229

for you to fetch changes up to b05e35533782a71a9fda472afd08442f50622a3e:

  tests/tcg/multiarch: add vma-pthread.c (2022-12-29 12:39:45 -0800)

----------------------------------------------------------------
Fix race conditions in new user-only vma tracking.
Add tcg backend paired register allocation.
Cleanup tcg backend function call abi.

----------------------------------------------------------------
Ilya Leoshkevich (1):
      tests/tcg/multiarch: add vma-pthread.c

Mark Cave-Ayland (1):
      tcg: convert tcg/README to rst

Philippe Mathieu-Daudé (5):
      tcg/s390x: Fix coding style
      tcg: Massage process_op_defs()
      tcg: Pass number of arguments to tcg_emit_op() / tcg_op_insert_*()
      tcg: Convert typecode_to_ffi from array to function
      tcg: Factor init_ffi_layouts() out of tcg_context_init()

Richard Henderson (40):
      meson: Move CONFIG_TCG_INTERPRETER to config_host
      tcg: Cleanup trailing whitespace
      qemu/main-loop: Introduce QEMU_IOTHREAD_LOCK_GUARD
      hw/mips: Use QEMU_IOTHREAD_LOCK_GUARD in cpu_mips_irq_request
      target/ppc: Use QEMU_IOTHREAD_LOCK_GUARD in ppc_maybe_interrupt
      target/ppc: Use QEMU_IOTHREAD_LOCK_GUARD in cpu_interrupt_exittb
      target/riscv: Use QEMU_IOTHREAD_LOCK_GUARD in riscv_cpu_update_mip
      hw/ppc: Use QEMU_IOTHREAD_LOCK_GUARD in ppc_set_irq
      accel/tcg: Use QEMU_IOTHREAD_LOCK_GUARD in io_readx/io_writex
      tcg: Tidy tcg_reg_alloc_op
      tcg: Remove TCG_TARGET_STACK_GROWSUP
      tci: MAX_OPC_PARAM_IARGS is no longer used
      tcg: Fix tcg_reg_alloc_dup*
      tcg: Centralize updates to reg_to_temp
      tcg: Remove check_regs
      tcg: Introduce paired register allocation
      accel/tcg: Set cflags_next_tb in cpu_common_initfn
      target/sparc: Avoid TCGV_{LOW,HIGH}
      tcg: Move TCG_{LOW,HIGH} to tcg-internal.h
      tcg: Add temp_subindex to TCGTemp
      tcg: Simplify calls to temp_sync vs mem_coherent
      tcg: Allocate TCGTemp pairs in host memory order
      tcg: Move TCG_TYPE_COUNT outside enum
      tcg: Introduce tcg_type_size
      tcg: Introduce TCGCallReturnKind and TCGCallArgumentKind
      tcg: Replace TCG_TARGET_CALL_ALIGN_ARGS with TCG_TARGET_CALL_ARG_I64
      tcg: Replace TCG_TARGET_EXTEND_ARGS with TCG_TARGET_CALL_ARG_I32
      tcg: Use TCG_CALL_ARG_EVEN for TCI special case
      accel/tcg/plugin: Don't search for the function pointer index
      accel/tcg/plugin: Avoid duplicate copy in copy_call
      accel/tcg/plugin: Use copy_op in append_{udata,mem}_cb
      tcg: Vary the allocation size for TCGOp
      tcg: Use output_pref wrapper function
      tcg: Reorg function calls
      tcg: Move ffi_cif pointer into TCGHelperInfo
      tcg/aarch64: Merge tcg_out_callr into tcg_out_call
      tcg: Add TCGHelperInfo argument to tcg_out_call
      accel/tcg: Fix tb_invalidate_phys_page_unwind
      accel/tcg: Use g_free_rcu for user-exec interval trees
      accel/tcg: Handle false negative lookup in page_check_range

 docs/devel/atomics.rst | 2 +
 docs/devel/index-tcg.rst | 1 +
 docs/devel/tcg-ops.rst | 941 +++++++++++++++++++
 docs/devel/tcg.rst | 2 +-
 meson.build | 4 +-
 include/exec/helper-head.h | 2 +-
 include/qemu/main-loop.h | 29 +
 include/tcg/tcg-op.h | 35 +-
 include/tcg/tcg.h | 96 +-
 tcg/aarch64/tcg-target.h | 4 +-
 tcg/arm/tcg-target.h | 4 +-
 tcg/i386/tcg-target.h | 2 +
 tcg/loongarch64/tcg-target.h | 3 +-
 tcg/mips/tcg-target.h | 4 +-
 tcg/riscv/tcg-target.h | 7 +-
 tcg/s390x/tcg-target.h | 3 +-
 tcg/sparc64/tcg-target.h | 3 +-
 tcg/tcg-internal.h | 58 +-
 tcg/tci/tcg-target.h | 7 +
 tests/tcg/multiarch/nop_func.h | 25 +
 accel/tcg/cputlb.c | 25 +-
 accel/tcg/plugin-gen.c | 54 +-
 accel/tcg/tb-maint.c | 78 +-
 accel/tcg/user-exec.c | 59 +-
 hw/core/cpu-common.c | 1 +
 hw/mips/mips_int.c | 11 +-
 hw/ppc/ppc.c | 10 +-
 target/ppc/excp_helper.c | 11 +-
 target/ppc/helper_regs.c | 14 +-
 target/riscv/cpu_helper.c | 10 +-
 target/sparc/translate.c | 21 +-
 tcg/optimize.c | 10 +-
 tcg/tcg-op-vec.c | 10 +-
 tcg/tcg-op.c | 49 +-
 tcg/tcg.c | 1658 +++++++++++++++++++++-------------
 tcg/tci.c | 1 -
 tests/tcg/multiarch/munmap-pthread.c | 16 +-
 tests/tcg/multiarch/vma-pthread.c | 207 +++++
 tcg/aarch64/tcg-target.c.inc | 19 +-
 tcg/arm/tcg-target.c.inc | 10 +-
 tcg/i386/tcg-target.c.inc | 5 +-
 tcg/loongarch64/tcg-target.c.inc | 7 +-
 tcg/mips/tcg-target.c.inc | 3 +-
 tcg/ppc/tcg-target.c.inc | 36 +-
 tcg/riscv/tcg-target.c.inc | 7 +-
 tcg/s390x/tcg-target.c.inc | 32 +-
 tcg/sparc64/tcg-target.c.inc | 3 +-
 tcg/tci/tcg-target.c.inc | 7 +-
 tcg/README | 784 ----------------
 tests/tcg/multiarch/Makefile.target | 3 +
 50 files changed, 2630 insertions(+), 1763 deletions(-)
 create mode 100644 docs/devel/tcg-ops.rst
 create mode 100644 tests/tcg/multiarch/nop_func.h
 create mode 100644 tests/tcg/multiarch/vma-pthread.c
 delete mode 100644 tcg/README
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Convert tcg/README to rst and move it to docs/devel as a new "TCG Intermediate
Representation" page. There are a few minor changes to improve the aesthetics
of the final output, which are as follows:

- Rename the title from "Tiny Code Generator - Fabrice Bellard" to "TCG
  Intermediate Representation"

- Remove the section numbering

- Add the missing parameters to the ssadd_vec operations in the "Host
  vector operations" section

- Change the path to the Atomic Operations document to use a proper
  reference

- Replace tcg/README in tcg.rst with a proper reference to the new document

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Message-Id: <20221130100434.64207-2-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 docs/devel/atomics.rst | 2 +
 docs/devel/index-tcg.rst | 1 +
 docs/devel/tcg-ops.rst | 941 +++++++++++++++++++++++++++++++++++++++
 docs/devel/tcg.rst | 2 +-
 tcg/README | 784 --------------------------------
 5 files changed, 945 insertions(+), 785 deletions(-)
 create mode 100644 docs/devel/tcg-ops.rst
 delete mode 100644 tcg/README

diff --git a/docs/devel/atomics.rst b/docs/devel/atomics.rst
35
index XXXXXXX..XXXXXXX 100644
36
--- a/docs/devel/atomics.rst
37
+++ b/docs/devel/atomics.rst
38
@@ -XXX,XX +XXX,XX @@
39
+.. _atomics-ref:
40
+
41
=========================
42
Atomic operations in QEMU
43
=========================
44
diff --git a/docs/devel/index-tcg.rst b/docs/devel/index-tcg.rst
45
index XXXXXXX..XXXXXXX 100644
46
--- a/docs/devel/index-tcg.rst
47
+++ b/docs/devel/index-tcg.rst
48
@@ -XXX,XX +XXX,XX @@ are only implementing things for HW accelerated hypervisors.
49
:maxdepth: 2
50
51
tcg
52
+ tcg-ops
53
decodetree
54
multi-thread-tcg
55
tcg-icount
56
diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst
57
new file mode 100644
58
index XXXXXXX..XXXXXXX
59
--- /dev/null
60
+++ b/docs/devel/tcg-ops.rst
61
@@ -XXX,XX +XXX,XX @@
62
+.. _tcg-ops-ref:
63
+
64
+*******************************
65
+TCG Intermediate Representation
66
+*******************************
67
+
68
+Introduction
69
+============
70
+
71
+TCG (Tiny Code Generator) began as a generic backend for a C
72
+compiler. It was simplified to be used in QEMU. It also has its roots
73
+in the QOP code generator written by Paul Brook.
74
+
75
+Definitions
76
+===========
77
+
78
+TCG receives RISC-like *TCG ops* and performs some optimizations on them,
79
+including liveness analysis and trivial constant expression
80
+evaluation. TCG ops are then implemented in the host CPU back end,
81
+also known as the TCG target.
82
+
83
+The TCG *target* is the architecture for which we generate the
84
+code. It is of course not the same as the "target" of QEMU which is
85
+the emulated architecture. As TCG started as a generic C backend used
86
+for cross compiling, it is assumed that the TCG target is different
87
+from the host, although it is never the case for QEMU.
88
+
89
+In this document, we use *guest* to specify what architecture we are
90
+emulating; *target* always means the TCG target, the machine on which
91
+we are running QEMU.
92
+
93
+A TCG *function* corresponds to a QEMU Translated Block (TB).
94
+
95
+A TCG *temporary* is a variable only live in a basic block. Temporaries are allocated explicitly in each function.
96
+
97
+A TCG *local temporary* is a variable only live in a function. Local temporaries are allocated explicitly in each function.
98
+
99
+A TCG *global* is a variable which is live in all the functions
100
+(equivalent of a C global variable). They are defined before the
101
+functions defined. A TCG global can be a memory location (e.g. a QEMU
102
+CPU register), a fixed host register (e.g. the QEMU CPU state pointer)
103
+or a memory location which is stored in a register outside QEMU TBs
104
+(not implemented yet).
105
+
106
+A TCG *basic block* corresponds to a list of instructions terminated
107
+by a branch instruction.
108
+
109
+An operation with *undefined behavior* may result in a crash.
110
+
111
+An operation with *unspecified behavior* shall not crash. However,
112
+the result may be one of several possibilities so may be considered
113
+an *undefined result*.
114
+
115
+Intermediate representation
116
+===========================
117
+
118
+Introduction
119
+------------
120
+
121
+TCG instructions operate on variables which are temporaries, local
122
+temporaries or globals. TCG instructions and variables are strongly
123
+typed. Two types are supported: 32 bit integers and 64 bit
124
+integers. Pointers are defined as an alias to 32 bit or 64 bit
125
+integers depending on the TCG target word size.
126
+
127
+Each instruction has a fixed number of output variable operands, input
128
+variable operands and always constant operands.
129
+
130
+The notable exception is the call instruction which has a variable
131
+number of outputs and inputs.
132
+
133
+In the textual form, output operands usually come first, followed by
134
+input operands, followed by constant operands. The output type is
135
+included in the instruction name. Constants are prefixed with a '$'.
136
+
137
+.. code-block:: none
138
+
139
+ add_i32 t0, t1, t2 /* (t0 <- t1 + t2) */
140
+
141
+
142
+Assumptions
143
+-----------
144
+
145
+Basic blocks
146
+^^^^^^^^^^^^
147
+
148
+* Basic blocks end after branches (e.g. brcond_i32 instruction),
149
+ goto_tb and exit_tb instructions.
150
+
151
+* Basic blocks start after the end of a previous basic block, or at a
152
+ set_label instruction.
153
+
154
+After the end of a basic block, the content of temporaries is
155
+destroyed, but local temporaries and globals are preserved.
156
+
157
+Floating point types
158
+^^^^^^^^^^^^^^^^^^^^
159
+
160
+* Floating point types are not supported yet
161
+
162
+Pointers
163
+^^^^^^^^
164
+
165
+* Depending on the TCG target, pointer size is 32 bit or 64
166
+ bit. The type ``TCG_TYPE_PTR`` is an alias to ``TCG_TYPE_I32`` or
167
+ ``TCG_TYPE_I64``.
168
+
169
+Helpers
170
+^^^^^^^
171
+
172
+* Using the tcg_gen_helper_x_y it is possible to call any function
173
+ taking i32, i64 or pointer types. By default, before calling a helper,
174
+ all globals are stored at their canonical location and it is assumed
175
+ that the function can modify them. By default, the helper is allowed to
176
+ modify the CPU state or raise an exception.
177
+
178
+ This can be overridden using the following function modifiers:
179
+
180
+ - ``TCG_CALL_NO_READ_GLOBALS`` means that the helper does not read globals,
181
+ either directly or via an exception. They will not be saved to their
182
+ canonical locations before calling the helper.
183
+
184
+ - ``TCG_CALL_NO_WRITE_GLOBALS`` means that the helper does not modify any globals.
185
+ They will only be saved to their canonical location before calling helpers,
186
+ but they won't be reloaded afterwards.
187
+
188
+ - ``TCG_CALL_NO_SIDE_EFFECTS`` means that the call to the function is removed if
189
+ the return value is not used.
190
+
191
+ Note that ``TCG_CALL_NO_READ_GLOBALS`` implies ``TCG_CALL_NO_WRITE_GLOBALS``.
192
+
193
+ On some TCG targets (e.g. x86), several calling conventions are
194
+ supported.
195
+
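For a concrete picture of how these modifiers are attached in practice, here is a
minimal sketch following the usual QEMU helper conventions (the helper name and body
are made up for illustration; TCG_CALL_NO_RWG_SE is the shorthand combining the
NO_READ_GLOBALS and NO_SIDE_EFFECTS flags described above):

    /* Declaration, e.g. in a target's helper.h */
    DEF_HELPER_FLAGS_2(my_op, TCG_CALL_NO_RWG_SE, i32, i32, i32)

    /* Implementation, e.g. in the target's op_helper.c */
    uint32_t HELPER(my_op)(uint32_t a, uint32_t b)
    {
        return a * 2 + b;               /* hypothetical computation */
    }

    /* Emission from a translator: the generated code calls the helper */
    gen_helper_my_op(dest, src1, src2); /* dest, src1, src2 are TCGv_i32 */
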
196
+Branches
197
+^^^^^^^^
198
+
199
+* Use the instruction 'br' to jump to a label.
200
+
201
+Code Optimizations
202
+------------------
203
+
204
+When generating instructions, you can count on at least the following
205
+optimizations:
206
+
207
+- Single instructions are simplified, e.g.
208
+
209
+ .. code-block:: none
210
+
211
+ and_i32 t0, t0, $0xffffffff
212
+
213
+ is suppressed.
214
+
215
+- A liveness analysis is done at the basic block level. The
216
+ information is used to suppress moves from a dead variable to
217
+ another one. It is also used to remove instructions which compute
218
+ dead results. The later is especially useful for condition code
219
+ dead results. The latter is especially useful for condition code
220
+
221
+ In the following example:
222
+
223
+ .. code-block:: none
224
+
225
+ add_i32 t0, t1, t2
226
+ add_i32 t0, t0, $1
227
+ mov_i32 t0, $1
228
+
229
+ only the last instruction is kept.
230
+
231
+
232
+Instruction Reference
233
+=====================
234
+
235
+Function call
236
+-------------
237
+
238
+.. list-table::
239
+
240
+ * - call *<ret>* *<params>* ptr
241
+
242
+ - | call function 'ptr' (pointer type)
243
+ |
244
+ | *<ret>* optional 32 bit or 64 bit return value
245
+ | *<params>* optional 32 bit or 64 bit parameters
246
+
247
+Jumps/Labels
248
+------------
249
+
250
+.. list-table::
251
+
252
+ * - set_label $label
253
+
254
+ - | Define label 'label' at the current program point.
255
+
256
+ * - br $label
257
+
258
+ - | Jump to label.
259
+
260
+ * - brcond_i32/i64 *t0*, *t1*, *cond*, *label*
261
+
262
+ - | Conditional jump if *t0* *cond* *t1* is true. *cond* can be:
263
+ |
264
+ | ``TCG_COND_EQ``
265
+ | ``TCG_COND_NE``
266
+ | ``TCG_COND_LT /* signed */``
267
+ | ``TCG_COND_GE /* signed */``
268
+ | ``TCG_COND_LE /* signed */``
269
+ | ``TCG_COND_GT /* signed */``
270
+ | ``TCG_COND_LTU /* unsigned */``
271
+ | ``TCG_COND_GEU /* unsigned */``
272
+ | ``TCG_COND_LEU /* unsigned */``
273
+ | ``TCG_COND_GTU /* unsigned */``
274
+
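As a usage sketch, a translator builds such a conditional jump through the tcg_gen_*
wrappers and a TCGLabel (variable names here are illustrative):

    TCGLabel *skip = gen_new_label();

    /* brcond_i32: if (a == b) goto skip */
    tcg_gen_brcond_i32(TCG_COND_EQ, a, b, skip);
    /* ... ops emitted here execute only when a != b ... */
    gen_set_label(skip);
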
275
+Arithmetic
276
+----------
277
+
278
+.. list-table::
279
+
280
+ * - add_i32/i64 *t0*, *t1*, *t2*
281
+
282
+ - | *t0* = *t1* + *t2*
283
+
284
+ * - sub_i32/i64 *t0*, *t1*, *t2*
285
+
286
+ - | *t0* = *t1* - *t2*
287
+
288
+ * - neg_i32/i64 *t0*, *t1*
289
+
290
+ - | *t0* = -*t1* (two's complement)
291
+
292
+ * - mul_i32/i64 *t0*, *t1*, *t2*
293
+
294
+ - | *t0* = *t1* * *t2*
295
+
296
+ * - div_i32/i64 *t0*, *t1*, *t2*
297
+
298
+ - | *t0* = *t1* / *t2* (signed)
299
+ | Undefined behavior if division by zero or overflow.
300
+
301
+ * - divu_i32/i64 *t0*, *t1*, *t2*
302
+
303
+ - | *t0* = *t1* / *t2* (unsigned)
304
+ | Undefined behavior if division by zero.
305
+
306
+ * - rem_i32/i64 *t0*, *t1*, *t2*
307
+
308
+ - | *t0* = *t1* % *t2* (signed)
309
+ | Undefined behavior if division by zero or overflow.
310
+
311
+ * - remu_i32/i64 *t0*, *t1*, *t2*
312
+
313
+ - | *t0* = *t1* % *t2* (unsigned)
314
+ | Undefined behavior if division by zero.
315
+
316
+
317
+Logical
318
+-------
319
+
320
+.. list-table::
321
+
322
+ * - and_i32/i64 *t0*, *t1*, *t2*
323
+
324
+ - | *t0* = *t1* & *t2*
325
+
326
+ * - or_i32/i64 *t0*, *t1*, *t2*
327
+
328
+ - | *t0* = *t1* | *t2*
329
+
330
+ * - xor_i32/i64 *t0*, *t1*, *t2*
331
+
332
+ - | *t0* = *t1* ^ *t2*
333
+
334
+ * - not_i32/i64 *t0*, *t1*
335
+
336
+ - | *t0* = ~\ *t1*
337
+
338
+ * - andc_i32/i64 *t0*, *t1*, *t2*
339
+
340
+ - | *t0* = *t1* & ~\ *t2*
341
+
342
+ * - eqv_i32/i64 *t0*, *t1*, *t2*
343
+
344
+ - | *t0* = ~(*t1* ^ *t2*), or equivalently, *t0* = *t1* ^ ~\ *t2*
345
+
346
+ * - nand_i32/i64 *t0*, *t1*, *t2*
347
+
348
+ - | *t0* = ~(*t1* & *t2*)
349
+
350
+ * - nor_i32/i64 *t0*, *t1*, *t2*
351
+
352
+ - | *t0* = ~(*t1* | *t2*)
353
+
354
+ * - orc_i32/i64 *t0*, *t1*, *t2*
355
+
356
+ - | *t0* = *t1* | ~\ *t2*
357
+
358
+ * - clz_i32/i64 *t0*, *t1*, *t2*
359
+
360
+ - | *t0* = *t1* ? clz(*t1*) : *t2*
361
+
362
+ * - ctz_i32/i64 *t0*, *t1*, *t2*
363
+
364
+ - | *t0* = *t1* ? ctz(*t1*) : *t2*
365
+
366
+ * - ctpop_i32/i64 *t0*, *t1*
367
+
368
+ - | *t0* = number of bits set in *t1*
369
+ |
370
+ | With *ctpop* short for "count population", matching
371
+ | the function name used in ``include/qemu/host-utils.h``.
372
+
373
+
374
+Shifts/Rotates
375
+--------------
376
+
377
+.. list-table::
378
+
379
+ * - shl_i32/i64 *t0*, *t1*, *t2*
380
+
381
+ - | *t0* = *t1* << *t2*
382
+ | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
383
+
384
+ * - shr_i32/i64 *t0*, *t1*, *t2*
385
+
386
+ - | *t0* = *t1* >> *t2* (unsigned)
387
+ | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
388
+
389
+ * - sar_i32/i64 *t0*, *t1*, *t2*
390
+
391
+ - | *t0* = *t1* >> *t2* (signed)
392
+ | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
393
+
394
+ * - rotl_i32/i64 *t0*, *t1*, *t2*
395
+
396
+ - | Rotation of *t2* bits to the left
397
+ | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
398
+
399
+ * - rotr_i32/i64 *t0*, *t1*, *t2*
400
+
401
+ - | Rotation of *t2* bits to the right.
402
+ | Unspecified behavior if *t2* < 0 or *t2* >= 32 (resp 64)
403
+
404
+
405
+Misc
406
+----
407
+
408
+.. list-table::
409
+
410
+ * - mov_i32/i64 *t0*, *t1*
411
+
412
+ - | *t0* = *t1*
413
+ | Move *t1* to *t0* (both operands must have the same type).
414
+
415
+ * - ext8s_i32/i64 *t0*, *t1*
416
+
417
+ ext8u_i32/i64 *t0*, *t1*
418
+
419
+ ext16s_i32/i64 *t0*, *t1*
420
+
421
+ ext16u_i32/i64 *t0*, *t1*
422
+
423
+ ext32s_i64 *t0*, *t1*
424
+
425
+ ext32u_i64 *t0*, *t1*
426
+
427
+ - | 8, 16 or 32 bit sign/zero extension (both operands must have the same type)
428
+
429
+ * - bswap16_i32/i64 *t0*, *t1*, *flags*
430
+
431
+ - | 16 bit byte swap on the low bits of a 32/64 bit input.
432
+ |
433
+ | If *flags* & ``TCG_BSWAP_IZ``, then *t1* is known to be zero-extended from bit 15.
434
+ | If *flags* & ``TCG_BSWAP_OZ``, then *t0* will be zero-extended from bit 15.
435
+ | If *flags* & ``TCG_BSWAP_OS``, then *t0* will be sign-extended from bit 15.
436
+ |
437
+ | If neither ``TCG_BSWAP_OZ`` nor ``TCG_BSWAP_OS`` are set, then the bits of *t0* above bit 15 may contain any value.
438
+
439
+ * - bswap32_i64 *t0*, *t1*, *flags*
440
+
441
+ - | 32 bit byte swap on a 64-bit value. The flags are the same as for bswap16,
442
+ except they apply from bit 31 instead of bit 15.
443
+
444
+ * - bswap32_i32 *t0*, *t1*, *flags*
445
+
446
+ bswap64_i64 *t0*, *t1*, *flags*
447
+
448
+ - | 32/64 bit byte swap. The flags are ignored, but still present
449
+ for consistency with the other bswap opcodes.
450
+
451
+ * - discard_i32/i64 *t0*
452
+
453
+ - | Indicate that the value of *t0* won't be used later. It is useful to
454
+ force dead code elimination.
455
+
456
+ * - deposit_i32/i64 *dest*, *t1*, *t2*, *pos*, *len*
457
+
458
+ - | Deposit *t2* as a bitfield into *t1*, placing the result in *dest*.
459
+ |
460
+ | The bitfield is described by *pos*/*len*, which are immediate values:
461
+ |
462
+ | *len* - the length of the bitfield
463
+ | *pos* - the position of the first bit, counting from the LSB
464
+ |
465
+ | For example, "deposit_i32 dest, t1, t2, 8, 4" indicates a 4-bit field
466
+ at bit 8. This operation would be equivalent to
467
+ |
468
+ | *dest* = (*t1* & ~0x0f00) | ((*t2* << 8) & 0x0f00)
469
+
470
+ * - extract_i32/i64 *dest*, *t1*, *pos*, *len*
471
+
472
+ sextract_i32/i64 *dest*, *t1*, *pos*, *len*
473
+
474
+ - | Extract a bitfield from *t1*, placing the result in *dest*.
475
+ |
476
+ | The bitfield is described by *pos*/*len*, which are immediate values,
477
+ as above for deposit. For extract_*, the result will be extended
478
+ to the left with zeros; for sextract_*, the result will be extended
479
+ to the left with copies of the bitfield sign bit at *pos* + *len* - 1.
480
+ |
481
+ | For example, "sextract_i32 dest, t1, 8, 4" indicates a 4-bit field
482
+ at bit 8. This operation would be equivalent to
483
+ |
484
+ | *dest* = (*t1* << 20) >> 28
485
+ |
486
+ | (using an arithmetic right shift).
487
+
488
+ * - extract2_i32/i64 *dest*, *t1*, *t2*, *pos*
489
+
490
+ - | For N = {32,64}, extract an N-bit quantity from the concatenation
491
+ of *t2*:*t1*, beginning at *pos*. The tcg_gen_extract2_{i32,i64} expander
492
+ accepts 0 <= *pos* <= N as inputs. The backend code generator will
493
+ not see either 0 or N as inputs for these opcodes.
494
+
495
+ * - extrl_i64_i32 *t0*, *t1*
496
+
497
+ - | For 64-bit hosts only, extract the low 32-bits of input *t1* and place it
498
+ into 32-bit output *t0*. Depending on the host, this may be a simple move,
499
+ or may require additional canonicalization.
500
+
501
+ * - extrh_i64_i32 *t0*, *t1*
502
+
503
+ - | For 64-bit hosts only, extract the high 32-bits of input *t1* and place it
504
+ into 32-bit output *t0*. Depending on the host, this may be a simple shift,
505
+ or may require additional canonicalization.
506
+
507
+
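To make the semantics of the bitfield ops above concrete, a rough C model of
extract2_i32 (purely illustrative; *pos* is a constant, and the backend never sees
0 or N as inputs):

    /* extract2_i32 dest, t1, t2, pos -- rough C model, needs <stdint.h> */
    static uint32_t extract2_i32_model(uint32_t t1, uint32_t t2, unsigned pos)
    {
        uint64_t concat = ((uint64_t)t2 << 32) | t1;   /* t2:t1 */
        return (uint32_t)(concat >> pos);              /* 32 bits starting at pos */
    }
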
508
+Conditional moves
509
+-----------------
510
+
511
+.. list-table::
512
+
513
+ * - setcond_i32/i64 *dest*, *t1*, *t2*, *cond*
514
+
515
+ - | *dest* = (*t1* *cond* *t2*)
516
+ |
517
+ | Set *dest* to 1 if (*t1* *cond* *t2*) is true, otherwise set to 0.
518
+
519
+ * - movcond_i32/i64 *dest*, *c1*, *c2*, *v1*, *v2*, *cond*
520
+
521
+ - | *dest* = (*c1* *cond* *c2* ? *v1* : *v2*)
522
+ |
523
+ | Set *dest* to *v1* if (*c1* *cond* *c2*) is true, otherwise set to *v2*.
524
+
525
+
526
+Type conversions
527
+----------------
528
+
529
+.. list-table::
530
+
531
+ * - ext_i32_i64 *t0*, *t1*
532
+
533
+ - | Convert *t1* (32 bit) to *t0* (64 bit) and does sign extension
534
+
535
+ * - extu_i32_i64 *t0*, *t1*
536
+
537
+ - | Convert *t1* (32 bit) to *t0* (64 bit) and does zero extension
538
+
539
+ * - trunc_i64_i32 *t0*, *t1*
540
+
541
+ - | Truncate *t1* (64 bit) to *t0* (32 bit)
542
+
543
+ * - concat_i32_i64 *t0*, *t1*, *t2*
544
+
545
+ - | Construct *t0* (64-bit) taking the low half from *t1* (32 bit) and the high half
546
+ from *t2* (32 bit).
547
+
548
+ * - concat32_i64 *t0*, *t1*, *t2*
549
+
550
+ - | Construct *t0* (64-bit) taking the low half from *t1* (64 bit) and the high half
551
+ from *t2* (64 bit).
552
+
553
+
554
+Load/Store
555
+----------
556
+
557
+.. list-table::
558
+
559
+ * - ld_i32/i64 *t0*, *t1*, *offset*
560
+
561
+ ld8s_i32/i64 *t0*, *t1*, *offset*
562
+
563
+ ld8u_i32/i64 *t0*, *t1*, *offset*
564
+
565
+ ld16s_i32/i64 *t0*, *t1*, *offset*
566
+
567
+ ld16u_i32/i64 *t0*, *t1*, *offset*
568
+
569
+ ld32s_i64 t0, *t1*, *offset*
570
+
571
+ ld32u_i64 t0, *t1*, *offset*
572
+
573
+ - | *t0* = read(*t1* + *offset*)
574
+ |
575
+ | Load 8, 16, 32 or 64 bits with or without sign extension from host memory.
576
+ *offset* must be a constant.
577
+
578
+ * - st_i32/i64 *t0*, *t1*, *offset*
579
+
580
+ st8_i32/i64 *t0*, *t1*, *offset*
581
+
582
+ st16_i32/i64 *t0*, *t1*, *offset*
583
+
584
+ st32_i64 *t0*, *t1*, *offset*
585
+
586
+ - | write(*t0*, *t1* + *offset*)
587
+ |
588
+ | Write 8, 16, 32 or 64 bits to host memory.
589
+
590
+All these opcodes assume that the pointed host memory doesn't correspond
591
+to a global. In the latter case the behaviour is unpredictable.
592
+
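As a usage sketch, these host loads/stores are what a translator emits through the
tcg_gen_ld*/st* wrappers to reach a field of the CPU state structure (the field name
here is hypothetical):

    TCGv_i32 t0 = tcg_temp_new_i32();
    /* t0 = 32-bit load from env + constant offset */
    tcg_gen_ld_i32(t0, cpu_env, offsetof(CPUArchState, some_field));
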
593
+
594
+Multiword arithmetic support
595
+----------------------------
596
+
597
+.. list-table::
598
+
599
+ * - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
600
+
601
+ sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
602
+
603
+ - | Similar to add/sub, except that the double-word inputs *t1* and *t2* are
604
+ formed from two single-word arguments, and the double-word output *t0*
605
+ is returned in two single-word outputs.
606
+
607
+ * - mulu2_i32/i64 *t0_low*, *t0_high*, *t1*, *t2*
608
+
609
+ - | Similar to mul, except two unsigned inputs *t1* and *t2* yielding the full
610
+ double-word product *t0*. The latter is returned in two single-word outputs.
611
+
612
+ * - muls2_i32/i64 *t0_low*, *t0_high*, *t1*, *t2*
613
+
614
+ - | Similar to mulu2, except the two inputs *t1* and *t2* are signed.
615
+
616
+ * - mulsh_i32/i64 *t0*, *t1*, *t2*
617
+
618
+ muluh_i32/i64 *t0*, *t1*, *t2*
619
+
620
+ - | Provide the high part of a signed or unsigned multiply, respectively.
621
+ |
622
+ | If mulu2/muls2 are not provided by the backend, the tcg-op generator
623
+ can obtain the same results by emitting a pair of opcodes, mul + muluh/mulsh.
624
+
625
+
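A rough C model of the 32-bit variants above, for illustration only (needs <stdint.h>):

    /* add2_i32 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high */
    static void add2_i32_model(uint32_t *t0_low, uint32_t *t0_high,
                               uint32_t t1_low, uint32_t t1_high,
                               uint32_t t2_low, uint32_t t2_high)
    {
        uint32_t lo = t1_low + t2_low;
        uint32_t carry = lo < t1_low;          /* carry out of the low word */
        *t0_low  = lo;
        *t0_high = t1_high + t2_high + carry;
    }

    /* mulu2_i32 t0_low, t0_high, t1, t2 -- full double-word product;
       muluh_i32 alone yields only the high word */
    static void mulu2_i32_model(uint32_t *t0_low, uint32_t *t0_high,
                                uint32_t t1, uint32_t t2)
    {
        uint64_t prod = (uint64_t)t1 * t2;
        *t0_low  = (uint32_t)prod;
        *t0_high = (uint32_t)(prod >> 32);
    }
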
626
+Memory Barrier support
627
+----------------------
628
+
629
+.. list-table::
630
+
631
+ * - mb *<$arg>*
632
+
633
+ - | Generate a target memory barrier instruction to ensure memory ordering
634
+ as being enforced by a corresponding guest memory barrier instruction.
635
+ |
636
+ | The ordering enforced by the backend may be stricter than the ordering
637
+ required by the guest. It cannot be weaker. This opcode takes a constant
638
+ argument which is required to generate the appropriate barrier
639
+ instruction. The backend should take care to emit the target barrier
640
+ instruction only when necessary i.e., for SMP guests and when MTTCG is
641
+ enabled.
642
+ |
643
+ | The guest translators should generate this opcode for all guest instructions
644
+ which have ordering side effects.
645
+ |
646
+ | Please see :ref:`atomics-ref` for more information on memory barriers.
647
+
648
+
649
+64-bit guest on 32-bit host support
650
+-----------------------------------
651
+
652
+The following opcodes are internal to TCG. Thus they are to be implemented by
653
+32-bit host code generators, but are not to be emitted by guest translators.
654
+They are emitted as needed by inline functions within ``tcg-op.h``.
655
+
656
+.. list-table::
657
+
658
+ * - brcond2_i32 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *cond*, *label*
659
+
660
+ - | Similar to brcond, except that the 64-bit values *t0* and *t1*
661
+ are formed from two 32-bit arguments.
662
+
663
+ * - setcond2_i32 *dest*, *t1_low*, *t1_high*, *t2_low*, *t2_high*, *cond*
664
+
665
+ - | Similar to setcond, except that the 64-bit values *t1* and *t2* are
666
+ formed from two 32-bit arguments. The result is a 32-bit value.
667
+
668
+
669
+QEMU specific operations
670
+------------------------
671
+
672
+.. list-table::
673
+
674
+ * - exit_tb *t0*
675
+
676
+ - | Exit the current TB and return the value *t0* (word type).
677
+
678
+ * - goto_tb *index*
679
+
680
+ - | Exit the current TB and jump to the TB index *index* (constant) if the
681
+ current TB was linked to this TB. Otherwise execute the next
682
+ instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
683
+ at most once with each slot index per TB.
684
+
685
+ * - lookup_and_goto_ptr *tb_addr*
686
+
687
+ - | Look up a TB address *tb_addr* and jump to it if valid. If not valid,
688
+ jump to the TCG epilogue to go back to the exec loop.
689
+ |
690
+ | This operation is optional. If the TCG backend does not implement the
691
+ goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).
692
+
693
+ * - qemu_ld_i32/i64 *t0*, *t1*, *flags*, *memidx*
694
+
695
+ qemu_st_i32/i64 *t0*, *t1*, *flags*, *memidx*
696
+
697
+ qemu_st8_i32 *t0*, *t1*, *flags*, *memidx*
698
+
699
+ - | Load data at the guest address *t1* into *t0*, or store data in *t0* at guest
700
+ address *t1*. The _i32/_i64 size applies to the size of the input/output
701
+ register *t0* only. The address *t1* is always sized according to the guest,
702
+ and the width of the memory operation is controlled by *flags*.
703
+ |
704
+ | Both *t0* and *t1* may be split into little-endian ordered pairs of registers
705
+ if dealing with 64-bit quantities on a 32-bit host.
706
+ |
707
+ | The *memidx* selects the qemu tlb index to use (e.g. user or kernel access).
708
+ The flags are the MemOp bits, selecting the sign, width, and endianness
709
+ of the memory access.
710
+ |
711
+ | For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
712
+ 64-bit memory access specified in *flags*.
713
+ |
714
+ | For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of
715
+ the memory operation is known to be 8-bit. This allows the backend to
716
+ provide a different set of register constraints.
717
+
718
+
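For context, a typical end-of-TB sequence in a translator pairs goto_tb with exit_tb
and falls back to lookup_and_goto_ptr for jumps that cannot be chained; a hedged
sketch (gen_update_pc is a hypothetical per-target helper, ctx a DisasContext that
embeds DisasContextBase):

    if (translator_use_goto_tb(&ctx->base, dest)) {
        tcg_gen_goto_tb(0);                 /* slot 0; may be patched to chain TBs */
        gen_update_pc(ctx, dest);
        tcg_gen_exit_tb(ctx->base.tb, 0);
    } else {
        gen_update_pc(ctx, dest);
        tcg_gen_lookup_and_goto_ptr();      /* acts like exit_tb(0) without goto_ptr */
    }
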
719
+Host vector operations
720
+----------------------
721
+
722
+All of the vector ops have two parameters, ``TCGOP_VECL`` & ``TCGOP_VECE``.
723
+The former specifies the length of the vector in log2 64-bit units; the
724
+latter specifies the length of the element (if applicable) in log2 8-bit units.
725
+E.g. VECL = 1 -> 64 << 1 -> v128, and VECE = 2 -> 1 << 2 -> i32.
726
+
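Spelled out as a small computation (illustrative only):

    /* TCGOP_VECL and TCGOP_VECE are log2 encodings */
    unsigned vector_bits  = 64 << VECL;    /* VECL = 1 -> 128-bit vector  */
    unsigned element_bits =  8 << VECE;    /* VECE = 2 -> 32-bit elements */
    unsigned n_elements   = vector_bits / element_bits;
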
727
+.. list-table::
728
+
729
+ * - mov_vec *v0*, *v1*
730
+ ld_vec *v0*, *t1*
731
+ st_vec *v0*, *t1*
732
+
733
+ - | Move, load and store.
734
+
735
+ * - dup_vec *v0*, *r1*
736
+
737
+ - | Duplicate the low N bits of *r1* into VECL/VECE copies across *v0*.
738
+
739
+ * - dupi_vec *v0*, *c*
740
+
741
+ - | Similarly, for a constant.
742
+ | Smaller values will be replicated to host register size by the expanders.
743
+
744
+ * - dup2_vec *v0*, *r1*, *r2*
745
+
746
+ - | Duplicate *r2*:*r1* into VECL/64 copies across *v0*. This opcode is
747
+ only present for 32-bit hosts.
748
+
749
+ * - add_vec *v0*, *v1*, *v2*
750
+
751
+ - | *v0* = *v1* + *v2*, in elements across the vector.
752
+
753
+ * - sub_vec *v0*, *v1*, *v2*
754
+
755
+ - | Similarly, *v0* = *v1* - *v2*.
756
+
757
+ * - mul_vec *v0*, *v1*, *v2*
758
+
759
+ - | Similarly, *v0* = *v1* * *v2*.
760
+
761
+ * - neg_vec *v0*, *v1*
762
+
763
+ - | Similarly, *v0* = -*v1*.
764
+
765
+ * - abs_vec *v0*, *v1*
766
+
767
+ - | Similarly, *v0* = *v1* < 0 ? -*v1* : *v1*, in elements across the vector.
768
+
769
+ * - smin_vec *v0*, *v1*, *v2*
770
+
771
+ umin_vec *v0*, *v1*, *v2*
772
+
773
+ - | Similarly, *v0* = MIN(*v1*, *v2*), for signed and unsigned element types.
774
+
775
+ * - smax_vec *v0*, *v1*, *v2*
776
+
777
+ umax_vec *v0*, *v1*, *v2*
778
+
779
+ - | Similarly, *v0* = MAX(*v1*, *v2*), for signed and unsigned element types.
780
+
781
+ * - ssadd_vec *v0*, *v1*, *v2*
782
+
783
+ sssub_vec *v0*, *v1*, *v2*
784
+
785
+ usadd_vec *v0*, *v1*, *v2*
786
+
787
+ ussub_vec *v0*, *v1*, *v2*
788
+
789
+ - | Signed and unsigned saturating addition and subtraction.
790
+ |
791
+ | If the true result is not representable within the element type, the
792
+ element is set to the minimum or maximum value for the type.
793
+
794
+ * - and_vec *v0*, *v1*, *v2*
795
+
796
+ or_vec *v0*, *v1*, *v2*
797
+
798
+ xor_vec *v0*, *v1*, *v2*
799
+
800
+ andc_vec *v0*, *v1*, *v2*
801
+
802
+ orc_vec *v0*, *v1*, *v2*
803
+
804
+ not_vec *v0*, *v1*
805
+
806
+ - | Similarly, logical operations with and without complement.
807
+ |
808
+ | Note that VECE is unused.
809
+
810
+ * - shli_vec *v0*, *v1*, *i2*
811
+
812
+ shls_vec *v0*, *v1*, *s2*
813
+
814
+ - | Shift all elements from v1 by a scalar *i2*/*s2*. I.e.
815
+
816
+ .. code-block:: c
817
+
818
+ for (i = 0; i < VECL/VECE; ++i) {
819
+ v0[i] = v1[i] << s2;
820
+ }
821
+
822
+ * - shri_vec *v0*, *v1*, *i2*
823
+
824
+ sari_vec *v0*, *v1*, *i2*
825
+
826
+ rotli_vec *v0*, *v1*, *i2*
827
+
828
+ shrs_vec *v0*, *v1*, *s2*
829
+
830
+ sars_vec *v0*, *v1*, *s2*
831
+
832
+ - | Similarly for logical and arithmetic right shift, and left rotate.
833
+
834
+ * - shlv_vec *v0*, *v1*, *v2*
835
+
836
+ - | Shift elements from *v1* by elements from *v2*. I.e.
837
+
838
+ .. code-block:: c
839
+
840
+ for (i = 0; i < VECL/VECE; ++i) {
841
+ v0[i] = v1[i] << v2[i];
842
+ }
843
+
844
+ * - shrv_vec *v0*, *v1*, *v2*
845
+
846
+ sarv_vec *v0*, *v1*, *v2*
847
+
848
+ rotlv_vec *v0*, *v1*, *v2*
849
+
850
+ rotrv_vec *v0*, *v1*, *v2*
851
+
852
+ - | Similarly for logical and arithmetic right shift, and rotates.
853
+
854
+ * - cmp_vec *v0*, *v1*, *v2*, *cond*
855
+
856
+ - | Compare vectors by element, storing -1 for true and 0 for false.
857
+
858
+ * - bitsel_vec *v0*, *v1*, *v2*, *v3*
859
+
860
+ - | Bitwise select, *v0* = (*v2* & *v1*) | (*v3* & ~\ *v1*), across the entire vector.
861
+
862
+ * - cmpsel_vec *v0*, *c1*, *c2*, *v3*, *v4*, *cond*
863
+
864
+ - | Select elements based on comparison results:
865
+
866
+ .. code-block:: c
867
+
868
+ for (i = 0; i < n; ++i) {
869
+ v0[i] = (c1[i] cond c2[i]) ? v3[i] : v4[i].
870
+ }
871
+
872
+**Note 1**: Some shortcuts are defined when the last operand is known to be
873
+a constant (e.g. addi for add, movi for mov).
874
+
875
+**Note 2**: When using TCG, the opcodes must never be generated directly
876
+as some of them may not be available as "real" opcodes. Always use the
877
+function tcg_gen_xxx(args).
878
+
879
+
880
+Backend
881
+=======
882
+
883
+``tcg-target.h`` contains the target specific definitions. ``tcg-target.c.inc``
884
+contains the target specific code; it is #included by ``tcg/tcg.c``, rather
885
+than being a standalone C file.
886
+
887
+Assumptions
888
+-----------
889
+
890
+The target word size (``TCG_TARGET_REG_BITS``) is expected to be 32 bit or
891
+64 bit. It is expected that the pointer has the same size as the word.
892
+
893
+On a 32 bit target, all 64 bit operations are converted to 32 bits. A
894
+few specific operations must be implemented to allow it (see add2_i32,
895
+sub2_i32, brcond2_i32).
896
+
897
+On a 64 bit target, the values are transferred between 32 and 64-bit
898
+registers using the following ops:
899
+
900
+- trunc_shr_i64_i32
901
+- ext_i32_i64
902
+- extu_i32_i64
903
+
904
+They ensure that the values are correctly truncated or extended when
905
+moved from a 32-bit to a 64-bit register or vice-versa. Note that the
906
+trunc_shr_i64_i32 is an optional op. It is not necessary to implement
907
+it if all the following conditions are met:
908
+
909
+- 64-bit registers can hold 32-bit values
910
+- 32-bit values in a 64-bit register do not need to stay zero or
911
+ sign extended
912
+- all 32-bit TCG ops ignore the high part of 64-bit registers
913
+
914
+Floating point operations are not supported in this version. A
915
+previous incarnation of the code generator had full support of them,
916
+but it is better to concentrate on integer operations first.
917
+
918
+Constraints
919
+----------------
920
+
921
+GCC like constraints are used to define the constraints of every
922
+instruction. Memory constraints are not supported in this
923
+version. Aliases are specified in the input operands as for GCC.
924
+
925
+The same register may be used for both an input and an output, even when
926
+they are not explicitly aliased. If an op expands to multiple target
927
+instructions then care must be taken to avoid clobbering input values.
928
+GCC style "early clobber" outputs are supported, with '``&``'.
929
+
930
+A target can define specific register or constant constraints. If an
931
+operation uses a constant input constraint which does not allow all
932
+constants, it must also accept registers in order to have a fallback.
933
+The constraint '``i``' is defined generically to accept any constant.
934
+The constraint '``r``' is not defined generically, but is consistently
935
+used by each backend to indicate all registers.
936
+
937
+The movi_i32 and movi_i64 operations must accept any constants.
938
+
939
+The mov_i32 and mov_i64 operations must accept any registers of the
940
+same type.
941
+
942
+The ld/st/sti instructions must accept signed 32 bit constant offsets.
943
+This can be implemented by reserving a specific register in which to
944
+compute the address if the offset is too big.
945
+
946
+The ld/st instructions must accept any destination (ld) or source (st)
947
+register.
948
+
949
+The sti instruction may fail if it cannot store the given constant.
950
+
951
+Function call assumptions
952
+-------------------------
953
+
954
+- The only supported types for parameters and return value are: 32 and
955
+ 64 bit integers and pointer.
956
+- The stack grows downwards.
957
+- The first N parameters are passed in registers.
958
+- The next parameters are passed on the stack by storing them as words.
959
+- Some registers are clobbered during the call.
960
+- The function can return 0 or 1 value in registers. On a 32 bit
961
+ target, functions must be able to return 2 values in registers for
962
+ 64 bit return type.
963
+
964
+
965
+Recommended coding rules for best performance
966
+=============================================
967
+
968
+- Use globals to represent the parts of the QEMU CPU state which are
969
+ often modified, e.g. the integer registers and the condition
970
+ codes. TCG will be able to use host registers to store them.
971
+
972
+- Avoid globals stored in fixed registers. They must be used only to
973
+ store the pointer to the CPU state and possibly to store a pointer
974
+ to a register window.
975
+
976
+- Use temporaries. Use local temporaries only when really needed,
977
+ e.g. when you need to use a value after a jump. Local temporaries
978
+ introduce a performance hit in the current TCG implementation: their
979
+ content is saved to memory at end of each basic block.
980
+
981
+- Free temporaries and local temporaries when they are no longer used
982
+ (tcg_temp_free). Since tcg_const_x() also creates a temporary, you
983
+ should free it after it is used. Freeing temporaries does not yield
984
+ a better generated code, but it reduces the memory usage of TCG and
985
+ the speed of the translation.
986
+
987
+- Don't hesitate to use helpers for complicated or seldom used guest
988
+ instructions. There is little performance advantage in using TCG to
989
+ implement guest instructions taking more than about twenty TCG
990
+ instructions. Note that this rule of thumb is more applicable to
991
+ helpers doing complex logic or arithmetic, where the C compiler has
992
+ scope to do a good job of optimisation; it is less relevant where
993
+ the instruction is mostly doing loads and stores, and in those cases
994
+ inline TCG may still be faster for longer sequences.
995
+
996
+- The hard limit on the number of TCG instructions you can generate
997
+ per guest instruction is set by ``MAX_OP_PER_INSTR`` in ``exec-all.h`` --
998
+ you cannot exceed this without risking a buffer overrun.
999
+
1000
+- Use the 'discard' instruction if you know that TCG won't be able to
1001
+ prove that a given global is "dead" at a given program point. The
1002
+ x86 guest uses it to improve the condition codes optimisation.
1003
diff --git a/docs/devel/tcg.rst b/docs/devel/tcg.rst
1004
index XXXXXXX..XXXXXXX 100644
1005
--- a/docs/devel/tcg.rst
1006
+++ b/docs/devel/tcg.rst
1007
@@ -XXX,XX +XXX,XX @@ which make it relatively easily portable and simple while achieving good
1008
performances.
1009
1010
QEMU's dynamic translation backend is called TCG, for "Tiny Code
1011
-Generator". For more information, please take a look at ``tcg/README``.
1012
+Generator". For more information, please take a look at :ref:`tcg-ops-ref`.
1013
1014
The following sections outline some notable features and implementation
1015
details of QEMU's dynamic translator.
1016
diff --git a/tcg/README b/tcg/README
1017
deleted file mode 100644
1018
index XXXXXXX..XXXXXXX
1019
--- a/tcg/README
1020
+++ /dev/null
1021
@@ -XXX,XX +XXX,XX @@
1022
-Tiny Code Generator - Fabrice Bellard.
1023
-
1024
-1) Introduction
1025
-
1026
-TCG (Tiny Code Generator) began as a generic backend for a C
1027
-compiler. It was simplified to be used in QEMU. It also has its roots
1028
-in the QOP code generator written by Paul Brook.
1029
-
1030
-2) Definitions
1031
-
1032
-TCG receives RISC-like "TCG ops" and performs some optimizations on them,
1033
-including liveness analysis and trivial constant expression
1034
-evaluation. TCG ops are then implemented in the host CPU back end,
1035
-also known as the TCG "target".
1036
-
1037
-The TCG "target" is the architecture for which we generate the
1038
-code. It is of course not the same as the "target" of QEMU which is
1039
-the emulated architecture. As TCG started as a generic C backend used
1040
-for cross compiling, it is assumed that the TCG target is different
1041
-from the host, although it is never the case for QEMU.
1042
-
1043
-In this document, we use "guest" to specify what architecture we are
1044
-emulating; "target" always means the TCG target, the machine on which
1045
-we are running QEMU.
1046
-
1047
-A TCG "function" corresponds to a QEMU Translated Block (TB).
1048
-
1049
-A TCG "temporary" is a variable only live in a basic
1050
-block. Temporaries are allocated explicitly in each function.
1051
-
1052
-A TCG "local temporary" is a variable only live in a function. Local
1053
-temporaries are allocated explicitly in each function.
1054
-
1055
-A TCG "global" is a variable which is live in all the functions
1056
-(equivalent of a C global variable). They are defined before the
1057
-functions defined. A TCG global can be a memory location (e.g. a QEMU
1058
-CPU register), a fixed host register (e.g. the QEMU CPU state pointer)
1059
-or a memory location which is stored in a register outside QEMU TBs
1060
-(not implemented yet).
1061
-
1062
-A TCG "basic block" corresponds to a list of instructions terminated
1063
-by a branch instruction.
1064
-
1065
-An operation with "undefined behavior" may result in a crash.
1066
-
1067
-An operation with "unspecified behavior" shall not crash. However,
1068
-the result may be one of several possibilities so may be considered
1069
-an "undefined result".
1070
-
1071
-3) Intermediate representation
1072
-
1073
-3.1) Introduction
1074
-
1075
-TCG instructions operate on variables which are temporaries, local
1076
-temporaries or globals. TCG instructions and variables are strongly
1077
-typed. Two types are supported: 32 bit integers and 64 bit
1078
-integers. Pointers are defined as an alias to 32 bit or 64 bit
1079
-integers depending on the TCG target word size.
1080
-
1081
-Each instruction has a fixed number of output variable operands, input
1082
-variable operands and always constant operands.
1083
-
1084
-The notable exception is the call instruction which has a variable
1085
-number of outputs and inputs.
1086
-
1087
-In the textual form, output operands usually come first, followed by
1088
-input operands, followed by constant operands. The output type is
1089
-included in the instruction name. Constants are prefixed with a '$'.
1090
-
1091
-add_i32 t0, t1, t2 (t0 <- t1 + t2)
1092
-
1093
-3.2) Assumptions
1094
-
1095
-* Basic blocks
1096
-
1097
-- Basic blocks end after branches (e.g. brcond_i32 instruction),
1098
- goto_tb and exit_tb instructions.
1099
-- Basic blocks start after the end of a previous basic block, or at a
1100
- set_label instruction.
1101
-
1102
-After the end of a basic block, the content of temporaries is
1103
-destroyed, but local temporaries and globals are preserved.
1104
-
1105
-* Floating point types are not supported yet
1106
-
1107
-* Pointers: depending on the TCG target, pointer size is 32 bit or 64
1108
- bit. The type TCG_TYPE_PTR is an alias to TCG_TYPE_I32 or
1109
- TCG_TYPE_I64.
1110
-
1111
-* Helpers:
1112
-
1113
-Using the tcg_gen_helper_x_y it is possible to call any function
1114
-taking i32, i64 or pointer types. By default, before calling a helper,
1115
-all globals are stored at their canonical location and it is assumed
1116
-that the function can modify them. By default, the helper is allowed to
1117
-modify the CPU state or raise an exception.
1118
-
1119
-This can be overridden using the following function modifiers:
1120
-- TCG_CALL_NO_READ_GLOBALS means that the helper does not read globals,
1121
- either directly or via an exception. They will not be saved to their
1122
- canonical locations before calling the helper.
1123
-- TCG_CALL_NO_WRITE_GLOBALS means that the helper does not modify any globals.
1124
- They will only be saved to their canonical location before calling helpers,
1125
- but they won't be reloaded afterwards.
1126
-- TCG_CALL_NO_SIDE_EFFECTS means that the call to the function is removed if
1127
- the return value is not used.
1128
-
1129
-Note that TCG_CALL_NO_READ_GLOBALS implies TCG_CALL_NO_WRITE_GLOBALS.
1130
-
1131
-On some TCG targets (e.g. x86), several calling conventions are
1132
-supported.
1133
-
1134
-* Branches:
1135
-
1136
-Use the instruction 'br' to jump to a label.
1137
-
1138
-3.3) Code Optimizations
1139
-
1140
-When generating instructions, you can count on at least the following
1141
-optimizations:
1142
-
1143
-- Single instructions are simplified, e.g.
1144
-
1145
- and_i32 t0, t0, $0xffffffff
1146
-
1147
- is suppressed.
1148
-
1149
-- A liveness analysis is done at the basic block level. The
1150
- information is used to suppress moves from a dead variable to
1151
- another one. It is also used to remove instructions which compute
1152
- dead results. The later is especially useful for condition code
1153
- optimization in QEMU.
1154
-
1155
- In the following example:
1156
-
1157
- add_i32 t0, t1, t2
1158
- add_i32 t0, t0, $1
1159
- mov_i32 t0, $1
1160
-
1161
- only the last instruction is kept.
1162
-
1163
-3.4) Instruction Reference
1164
-
1165
-********* Function call
1166
-
1167
-* call <ret> <params> ptr
1168
-
1169
-call function 'ptr' (pointer type)
1170
-
1171
-<ret> optional 32 bit or 64 bit return value
1172
-<params> optional 32 bit or 64 bit parameters
1173
-
1174
-********* Jumps/Labels
1175
-
1176
-* set_label $label
1177
-
1178
-Define label 'label' at the current program point.
1179
-
1180
-* br $label
1181
-
1182
-Jump to label.
1183
-
1184
-* brcond_i32/i64 t0, t1, cond, label
1185
-
1186
-Conditional jump if t0 cond t1 is true. cond can be:
1187
- TCG_COND_EQ
1188
- TCG_COND_NE
1189
- TCG_COND_LT /* signed */
1190
- TCG_COND_GE /* signed */
1191
- TCG_COND_LE /* signed */
1192
- TCG_COND_GT /* signed */
1193
- TCG_COND_LTU /* unsigned */
1194
- TCG_COND_GEU /* unsigned */
1195
- TCG_COND_LEU /* unsigned */
1196
- TCG_COND_GTU /* unsigned */
1197
-
1198
-********* Arithmetic
1199
-
1200
-* add_i32/i64 t0, t1, t2
1201
-
1202
-t0=t1+t2
1203
-
1204
-* sub_i32/i64 t0, t1, t2
1205
-
1206
-t0=t1-t2
1207
-
1208
-* neg_i32/i64 t0, t1
1209
-
1210
-t0=-t1 (two's complement)
1211
-
1212
-* mul_i32/i64 t0, t1, t2
1213
-
1214
-t0=t1*t2
1215
-
1216
-* div_i32/i64 t0, t1, t2
1217
-
1218
-t0=t1/t2 (signed). Undefined behavior if division by zero or overflow.
1219
-
1220
-* divu_i32/i64 t0, t1, t2
1221
-
1222
-t0=t1/t2 (unsigned). Undefined behavior if division by zero.
1223
-
1224
-* rem_i32/i64 t0, t1, t2
1225
-
1226
-t0=t1%t2 (signed). Undefined behavior if division by zero or overflow.
1227
-
1228
-* remu_i32/i64 t0, t1, t2
1229
-
1230
-t0=t1%t2 (unsigned). Undefined behavior if division by zero.
1231
-
1232
-********* Logical
1233
-
1234
-* and_i32/i64 t0, t1, t2
1235
-
1236
-t0=t1&t2
1237
-
1238
-* or_i32/i64 t0, t1, t2
1239
-
1240
-t0=t1|t2
1241
-
1242
-* xor_i32/i64 t0, t1, t2
1243
-
1244
-t0=t1^t2
1245
-
1246
-* not_i32/i64 t0, t1
1247
-
1248
-t0=~t1
1249
-
1250
-* andc_i32/i64 t0, t1, t2
1251
-
1252
-t0=t1&~t2
1253
-
1254
-* eqv_i32/i64 t0, t1, t2
1255
-
1256
-t0=~(t1^t2), or equivalently, t0=t1^~t2
1257
-
1258
-* nand_i32/i64 t0, t1, t2
1259
-
1260
-t0=~(t1&t2)
1261
-
1262
-* nor_i32/i64 t0, t1, t2
1263
-
1264
-t0=~(t1|t2)
1265
-
1266
-* orc_i32/i64 t0, t1, t2
1267
-
1268
-t0=t1|~t2
1269
-
1270
-* clz_i32/i64 t0, t1, t2
1271
-
1272
-t0 = t1 ? clz(t1) : t2
1273
-
1274
-* ctz_i32/i64 t0, t1, t2
1275
-
1276
-t0 = t1 ? ctz(t1) : t2
1277
-
1278
-* ctpop_i32/i64 t0, t1
1279
-
1280
-t0 = number of bits set in t1
1281
-With "ctpop" short for "count population", matching
1282
-the function name used in include/qemu/host-utils.h.
1283
-
1284
-********* Shifts/Rotates
1285
-
1286
-* shl_i32/i64 t0, t1, t2
1287
-
1288
-t0=t1 << t2. Unspecified behavior if t2 < 0 or t2 >= 32 (resp 64)
1289
-
1290
-* shr_i32/i64 t0, t1, t2
1291
-
1292
-t0=t1 >> t2 (unsigned). Unspecified behavior if t2 < 0 or t2 >= 32 (resp 64)
1293
-
1294
-* sar_i32/i64 t0, t1, t2
1295
-
1296
-t0=t1 >> t2 (signed). Unspecified behavior if t2 < 0 or t2 >= 32 (resp 64)
1297
-
1298
-* rotl_i32/i64 t0, t1, t2
1299
-
1300
-Rotation of t2 bits to the left.
1301
-Unspecified behavior if t2 < 0 or t2 >= 32 (resp 64)
1302
-
1303
-* rotr_i32/i64 t0, t1, t2
1304
-
1305
-Rotation of t2 bits to the right.
1306
-Unspecified behavior if t2 < 0 or t2 >= 32 (resp 64)
1307
-
1308
-********* Misc
1309
-
1310
-* mov_i32/i64 t0, t1
1311
-
1312
-t0 = t1
1313
-
1314
-Move t1 to t0 (both operands must have the same type).
1315
-
1316
-* ext8s_i32/i64 t0, t1
1317
-ext8u_i32/i64 t0, t1
1318
-ext16s_i32/i64 t0, t1
1319
-ext16u_i32/i64 t0, t1
1320
-ext32s_i64 t0, t1
1321
-ext32u_i64 t0, t1
1322
-
1323
-8, 16 or 32 bit sign/zero extension (both operands must have the same type)
1324
-
1325
-* bswap16_i32/i64 t0, t1, flags
1326
-
1327
-16 bit byte swap on the low bits of a 32/64 bit input.
1328
-If flags & TCG_BSWAP_IZ, then t1 is known to be zero-extended from bit 15.
1329
-If flags & TCG_BSWAP_OZ, then t0 will be zero-extended from bit 15.
1330
-If flags & TCG_BSWAP_OS, then t0 will be sign-extended from bit 15.
1331
-If neither TCG_BSWAP_OZ nor TCG_BSWAP_OS are set, then the bits of
1332
-t0 above bit 15 may contain any value.
1333
-
1334
-* bswap32_i64 t0, t1, flags
1335
-
1336
-32 bit byte swap on a 64-bit value. The flags are the same as for bswap16,
1337
-except they apply from bit 31 instead of bit 15.
1338
-
1339
-* bswap32_i32 t0, t1, flags
1340
-* bswap64_i64 t0, t1, flags
1341
-
1342
-32/64 bit byte swap. The flags are ignored, but still present
1343
-for consistency with the other bswap opcodes.
1344
-
1345
-* discard_i32/i64 t0
1346
-
1347
-Indicate that the value of t0 won't be used later. It is useful to
1348
-force dead code elimination.
1349
-
1350
-* deposit_i32/i64 dest, t1, t2, pos, len
1351
-
1352
-Deposit T2 as a bitfield into T1, placing the result in DEST.
1353
-The bitfield is described by POS/LEN, which are immediate values:
1354
-
1355
- LEN - the length of the bitfield
1356
- POS - the position of the first bit, counting from the LSB
1357
-
1358
-For example, "deposit_i32 dest, t1, t2, 8, 4" indicates a 4-bit field
1359
-at bit 8. This operation would be equivalent to
1360
-
1361
- dest = (t1 & ~0x0f00) | ((t2 << 8) & 0x0f00)
1362
-
1363
-* extract_i32/i64 dest, t1, pos, len
1364
-* sextract_i32/i64 dest, t1, pos, len
1365
-
1366
-Extract a bitfield from T1, placing the result in DEST.
1367
-The bitfield is described by POS/LEN, which are immediate values,
1368
-as above for deposit. For extract_*, the result will be extended
1369
-to the left with zeros; for sextract_*, the result will be extended
1370
-to the left with copies of the bitfield sign bit at pos + len - 1.
1371
-
1372
-For example, "sextract_i32 dest, t1, 8, 4" indicates a 4-bit field
1373
-at bit 8. This operation would be equivalent to
1374
-
1375
- dest = (t1 << 20) >> 28
1376
-
1377
-(using an arithmetic right shift).
1378
-
1379
-* extract2_i32/i64 dest, t1, t2, pos
1380
-
1381
-For N = {32,64}, extract an N-bit quantity from the concatenation
1382
-of t2:t1, beginning at pos. The tcg_gen_extract2_{i32,i64} expander
1383
-accepts 0 <= pos <= N as inputs. The backend code generator will
1384
-not see either 0 or N as inputs for these opcodes.
1385
-
1386
-* extrl_i64_i32 t0, t1
1387
-
1388
-For 64-bit hosts only, extract the low 32-bits of input T1 and place it
1389
-into 32-bit output T0. Depending on the host, this may be a simple move,
1390
-or may require additional canonicalization.
1391
-
1392
-* extrh_i64_i32 t0, t1
1393
-
1394
-For 64-bit hosts only, extract the high 32-bits of input T1 and place it
1395
-into 32-bit output T0. Depending on the host, this may be a simple shift,
1396
-or may require additional canonicalization.
1397
-
1398
-********* Conditional moves
1399
-
1400
-* setcond_i32/i64 dest, t1, t2, cond
1401
-
1402
-dest = (t1 cond t2)
1403
-
1404
-Set DEST to 1 if (T1 cond T2) is true, otherwise set to 0.
1405
-
1406
-* movcond_i32/i64 dest, c1, c2, v1, v2, cond
1407
-
1408
-dest = (c1 cond c2 ? v1 : v2)
1409
-
1410
-Set DEST to V1 if (C1 cond C2) is true, otherwise set to V2.
1411
-
1412
-********* Type conversions
1413
-
1414
-* ext_i32_i64 t0, t1
1415
-Convert t1 (32 bit) to t0 (64 bit) and does sign extension
1416
-
1417
-* extu_i32_i64 t0, t1
1418
-Convert t1 (32 bit) to t0 (64 bit) and does zero extension
1419
-
1420
-* trunc_i64_i32 t0, t1
1421
-Truncate t1 (64 bit) to t0 (32 bit)
1422
-
1423
-* concat_i32_i64 t0, t1, t2
1424
-Construct t0 (64-bit) taking the low half from t1 (32 bit) and the high half
1425
-from t2 (32 bit).
1426
-
1427
-* concat32_i64 t0, t1, t2
1428
-Construct t0 (64-bit) taking the low half from t1 (64 bit) and the high half
1429
-from t2 (64 bit).
1430
-
1431
-********* Load/Store
1432
-
1433
-* ld_i32/i64 t0, t1, offset
1434
-ld8s_i32/i64 t0, t1, offset
1435
-ld8u_i32/i64 t0, t1, offset
1436
-ld16s_i32/i64 t0, t1, offset
1437
-ld16u_i32/i64 t0, t1, offset
1438
-ld32s_i64 t0, t1, offset
1439
-ld32u_i64 t0, t1, offset
1440
-
1441
-t0 = read(t1 + offset)
1442
-Load 8, 16, 32 or 64 bits with or without sign extension from host memory.
1443
-offset must be a constant.
1444
-
1445
-* st_i32/i64 t0, t1, offset
1446
-st8_i32/i64 t0, t1, offset
1447
-st16_i32/i64 t0, t1, offset
1448
-st32_i64 t0, t1, offset
1449
-
1450
-write(t0, t1 + offset)
1451
-Write 8, 16, 32 or 64 bits to host memory.
1452
-
1453
-All these opcodes assume that the pointed host memory doesn't correspond
1454
-to a global. If it does, the behaviour is unpredictable.
1455
-
1456
-********* Multiword arithmetic support
1457
-
1458
-* add2_i32/i64 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
1459
-* sub2_i32/i64 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
1460
-
1461
-Similar to add/sub, except that the double-word inputs T1 and T2 are
1462
-formed from two single-word arguments, and the double-word output T0
1463
-is returned in two single-word outputs.
1464
-
1465
-* mulu2_i32/i64 t0_low, t0_high, t1, t2
1466
-
1467
-Similar to mul, except two unsigned inputs T1 and T2 yielding the full
1468
-double-word product T0. The latter is returned in two single-word outputs.
1469
-
1470
-* muls2_i32/i64 t0_low, t0_high, t1, t2
1471
-
1472
-Similar to mulu2, except the two inputs T1 and T2 are signed.
1473
-
1474
-* mulsh_i32/i64 t0, t1, t2
1475
-* muluh_i32/i64 t0, t1, t2
1476
-
1477
-Provide the high part of a signed or unsigned multiply, respectively.
1478
-If mulu2/muls2 are not provided by the backend, the tcg-op generator
1479
-can obtain the same results by emitting a pair of
1480
-opcodes, mul+muluh/mulsh.
1481
-
1482
-********* Memory Barrier support
1483
-
1484
-* mb <$arg>
1485
-
1486
-Generate a target memory barrier instruction to ensure the memory ordering
1487
-enforced by a corresponding guest memory barrier instruction. The ordering
1488
-enforced by the backend may be stricter than the ordering required by the guest.
1489
-It cannot be weaker. This opcode takes a constant argument which is required to
1490
-generate the appropriate barrier instruction. The backend should take care to
1491
-emit the target barrier instruction only when necessary i.e., for SMP guests and
1492
-when MTTCG is enabled.
1493
-
1494
-The guest translators should generate this opcode for all guest instructions
1495
-which have ordering side effects.
1496
-
1497
-Please see docs/devel/atomics.rst for more information on memory barriers.
1498
-
1499
-********* 64-bit guest on 32-bit host support
1500
-
1501
-The following opcodes are internal to TCG. Thus they are to be implemented by
1502
-32-bit host code generators, but are not to be emitted by guest translators.
1503
-They are emitted as needed by inline functions within "tcg-op.h".
1504
-
1505
-* brcond2_i32 t0_low, t0_high, t1_low, t1_high, cond, label
1506
-
1507
-Similar to brcond, except that the 64-bit values T0 and T1
1508
-are formed from two 32-bit arguments.
1509
-
1510
-* setcond2_i32 dest, t1_low, t1_high, t2_low, t2_high, cond
1511
-
1512
-Similar to setcond, except that the 64-bit values T1 and T2 are
1513
-formed from two 32-bit arguments. The result is a 32-bit value.
1514
-
1515
-********* QEMU specific operations
1516
-
1517
-* exit_tb t0
1518
-
1519
-Exit the current TB and return the value t0 (word type).
1520
-
1521
-* goto_tb index
1522
-
1523
-Exit the current TB and jump to the TB index 'index' (constant) if the
1524
-current TB was linked to this TB. Otherwise execute the next
1525
-instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
1526
-at most once with each slot index per TB.
1527
-
1528
-* lookup_and_goto_ptr tb_addr
1529
-
1530
-Look up a TB address ('tb_addr') and jump to it if valid. If not valid,
1531
-jump to the TCG epilogue to go back to the exec loop.
1532
-
1533
-This operation is optional. If the TCG backend does not implement the
1534
-goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).
1535
-
1536
-* qemu_ld_i32/i64 t0, t1, flags, memidx
1537
-* qemu_st_i32/i64 t0, t1, flags, memidx
1538
-* qemu_st8_i32 t0, t1, flags, memidx
1539
-
1540
-Load data at the guest address t1 into t0, or store data in t0 at guest
1541
-address t1. The _i32/_i64 size applies to the size of the input/output
1542
-register t0 only. The address t1 is always sized according to the guest,
1543
-and the width of the memory operation is controlled by flags.
1544
-
1545
-Both t0 and t1 may be split into little-endian ordered pairs of registers
1546
-if dealing with 64-bit quantities on a 32-bit host.
1547
-
1548
-The memidx selects the qemu tlb index to use (e.g. user or kernel access).
1549
-The flags are the MemOp bits, selecting the sign, width, and endianness
1550
-of the memory access.
1551
-
1552
-For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
1553
-64-bit memory access specified in flags.
1554
-
1555
-For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of
1556
-the memory operation is known to be 8-bit. This allows the backend to
1557
-provide a different set of register constraints.
1558
-
1559
-********* Host vector operations
1560
-
1561
-All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE.
1562
-The former specifies the length of the vector in log2 64-bit units; the
1563
-latter specifies the length of the element (if applicable) in log2 8-bit units.
1564
-E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.
1565
-
1566
-* mov_vec v0, v1
1567
-* ld_vec v0, t1
1568
-* st_vec v0, t1
1569
-
1570
- Move, load and store.
1571
-
1572
-* dup_vec v0, r1
1573
-
1574
- Duplicate the low N bits of R1 into VECL/VECE copies across V0.
1575
-
1576
-* dupi_vec v0, c
1577
-
1578
- Similarly, for a constant.
1579
- Smaller values will be replicated to host register size by the expanders.
1580
-
1581
-* dup2_vec v0, r1, r2
1582
-
1583
- Duplicate r2:r1 into VECL/64 copies across V0. This opcode is
1584
- only present for 32-bit hosts.
1585
-
1586
-* add_vec v0, v1, v2
1587
-
1588
- v0 = v1 + v2, in elements across the vector.
1589
-
1590
-* sub_vec v0, v1, v2
1591
-
1592
- Similarly, v0 = v1 - v2.
1593
-
1594
-* mul_vec v0, v1, v2
1595
-
1596
- Similarly, v0 = v1 * v2.
1597
-
1598
-* neg_vec v0, v1
1599
-
1600
- Similarly, v0 = -v1.
1601
-
1602
-* abs_vec v0, v1
1603
-
1604
- Similarly, v0 = v1 < 0 ? -v1 : v1, in elements across the vector.
1605
-
1606
-* smin_vec:
1607
-* umin_vec:
1608
-
1609
- Similarly, v0 = MIN(v1, v2), for signed and unsigned element types.
1610
-
1611
-* smax_vec:
1612
-* umax_vec:
1613
-
1614
- Similarly, v0 = MAX(v1, v2), for signed and unsigned element types.
1615
-
1616
-* ssadd_vec:
1617
-* sssub_vec:
1618
-* usadd_vec:
1619
-* ussub_vec:
1620
-
1621
- Signed and unsigned saturating addition and subtraction. If the true
1622
- result is not representable within the element type, the element is
1623
- set to the minimum or maximum value for the type.
1624
-
1625
-* and_vec v0, v1, v2
1626
-* or_vec v0, v1, v2
1627
-* xor_vec v0, v1, v2
1628
-* andc_vec v0, v1, v2
1629
-* orc_vec v0, v1, v2
1630
-* not_vec v0, v1
1631
-
1632
- Similarly, logical operations with and without complement.
1633
- Note that VECE is unused.
1634
-
1635
-* shli_vec v0, v1, i2
1636
-* shls_vec v0, v1, s2
1637
-
1638
- Shift all elements from v1 by a scalar i2/s2. I.e.
1639
-
1640
- for (i = 0; i < VECL/VECE; ++i) {
1641
- v0[i] = v1[i] << s2;
1642
- }
1643
-
1644
-* shri_vec v0, v1, i2
1645
-* sari_vec v0, v1, i2
1646
-* rotli_vec v0, v1, i2
1647
-* shrs_vec v0, v1, s2
1648
-* sars_vec v0, v1, s2
1649
-
1650
- Similarly for logical and arithmetic right shift, and left rotate.
1651
-
1652
-* shlv_vec v0, v1, v2
1653
-
1654
- Shift elements from v1 by elements from v2. I.e.
1655
-
1656
- for (i = 0; i < VECL/VECE; ++i) {
1657
- v0[i] = v1[i] << v2[i];
1658
- }
1659
-
1660
-* shrv_vec v0, v1, v2
1661
-* sarv_vec v0, v1, v2
1662
-* rotlv_vec v0, v1, v2
1663
-* rotrv_vec v0, v1, v2
1664
-
1665
- Similarly for logical and arithmetic right shift, and rotates.
1666
-
1667
-* cmp_vec v0, v1, v2, cond
1668
-
1669
- Compare vectors by element, storing -1 for true and 0 for false.
1670
-
1671
-* bitsel_vec v0, v1, v2, v3
1672
-
1673
- Bitwise select, v0 = (v2 & v1) | (v3 & ~v1), across the entire vector.
1674
-
1675
-* cmpsel_vec v0, c1, c2, v3, v4, cond
1676
-
1677
- Select elements based on comparison results:
1678
- for (i = 0; i < n; ++i) {
1679
- v0[i] = (c1[i] cond c2[i]) ? v3[i] : v4[i];
1680
- }
1681
-
1682
-*********
1683
-
1684
-Note 1: Some shortcuts are defined when the last operand is known to be
1685
-a constant (e.g. addi for add, movi for mov).
1686
-
1687
-Note 2: When using TCG, the opcodes must never be generated directly
1688
-as some of them may not be available as "real" opcodes. Always use the
1689
-function tcg_gen_xxx(args).
1690
-
1691
-4) Backend
1692
-
1693
-tcg-target.h contains the target specific definitions. tcg-target.c.inc
1694
-contains the target specific code; it is #included by tcg/tcg.c, rather
1695
-than being a standalone C file.
1696
-
1697
-4.1) Assumptions
1698
-
1699
-The target word size (TCG_TARGET_REG_BITS) is expected to be 32 bit or
1700
-64 bit. It is expected that the pointer has the same size as the word.
1701
-
1702
-On a 32 bit target, all 64 bit operations are converted to 32 bits. A
1703
-few specific operations must be implemented to allow it (see add2_i32,
1704
-sub2_i32, brcond2_i32).
1705
-
1706
-On a 64 bit target, the values are transferred between 32 and 64-bit
1707
-registers using the following ops:
1708
-- trunc_shr_i64_i32
1709
-- ext_i32_i64
1710
-- extu_i32_i64
1711
-
1712
-They ensure that the values are correctly truncated or extended when
1713
-moved from a 32-bit to a 64-bit register or vice-versa. Note that the
1714
-trunc_shr_i64_i32 is an optional op. It is not necessary to implement
1715
-it if all the following conditions are met:
1716
-- 64-bit registers can hold 32-bit values
1717
-- 32-bit values in a 64-bit register do not need to stay zero or
1718
- sign extended
1719
-- all 32-bit TCG ops ignore the high part of 64-bit registers
1720
-
1721
-Floating point operations are not supported in this version. A
1722
-previous incarnation of the code generator had full support of them,
1723
-but it is better to concentrate on integer operations first.
1724
-
1725
-4.2) Constraints
1726
-
1727
-GCC like constraints are used to define the constraints of every
1728
-instruction. Memory constraints are not supported in this
1729
-version. Aliases are specified in the input operands as for GCC.
1730
-
1731
-The same register may be used for both an input and an output, even when
1732
-they are not explicitly aliased. If an op expands to multiple target
1733
-instructions then care must be taken to avoid clobbering input values.
1734
-GCC style "early clobber" outputs are supported, with '&'.
1735
-
1736
-A target can define specific register or constant constraints. If an
1737
-operation uses a constant input constraint which does not allow all
1738
-constants, it must also accept registers in order to have a fallback.
1739
-The constraint 'i' is defined generically to accept any constant.
1740
-The constraint 'r' is not defined generically, but is consistently
1741
-used by each backend to indicate all registers.
1742
-
1743
-The movi_i32 and movi_i64 operations must accept any constants.
1744
-
1745
-The mov_i32 and mov_i64 operations must accept any registers of the
1746
-same type.
1747
-
1748
-The ld/st/sti instructions must accept signed 32 bit constant offsets.
1749
-This can be implemented by reserving a specific register in which to
1750
-compute the address if the offset is too big.
1751
-
1752
-The ld/st instructions must accept any destination (ld) or source (st)
1753
-register.
1754
-
1755
-The sti instruction may fail if it cannot store the given constant.
1756
-
1757
-4.3) Function call assumptions
1758
-
1759
-- The only supported types for parameters and return value are: 32 and
1760
- 64 bit integers and pointer.
1761
-- The stack grows downwards.
1762
-- The first N parameters are passed in registers.
1763
-- The next parameters are passed on the stack by storing them as words.
1764
-- Some registers are clobbered during the call.
1765
-- The function can return 0 or 1 value in registers. On a 32 bit
1766
- target, functions must be able to return 2 values in registers for
1767
- 64 bit return type.
1768
-
1769
-5) Recommended coding rules for best performance
1770
-
1771
-- Use globals to represent the parts of the QEMU CPU state which are
1772
- often modified, e.g. the integer registers and the condition
1773
- codes. TCG will be able to use host registers to store them.
1774
-
1775
-- Avoid globals stored in fixed registers. They must be used only to
1776
- store the pointer to the CPU state and possibly to store a pointer
1777
- to a register window.
1778
-
1779
-- Use temporaries. Use local temporaries only when really needed,
1780
- e.g. when you need to use a value after a jump. Local temporaries
1781
- introduce a performance hit in the current TCG implementation: their
1782
- content is saved to memory at the end of each basic block.
1783
-
1784
-- Free temporaries and local temporaries when they are no longer used
1785
- (tcg_temp_free). Since tcg_const_x() also creates a temporary, you
1786
- should free it after it is used. Freeing temporaries does not yield
1787
- better generated code, but it reduces the memory usage of TCG and
1788
- improves the speed of the translation.
1789
-
1790
-- Don't hesitate to use helpers for complicated or seldom used guest
1791
- instructions. There is little performance advantage in using TCG to
1792
- implement guest instructions taking more than about twenty TCG
1793
- instructions. Note that this rule of thumb is more applicable to
1794
- helpers doing complex logic or arithmetic, where the C compiler has
1795
- scope to do a good job of optimisation; it is less relevant where
1796
- the instruction is mostly doing loads and stores, and in those cases
1797
- inline TCG may still be faster for longer sequences.
1798
-
1799
-- The hard limit on the number of TCG instructions you can generate
1800
- per guest instruction is set by MAX_OP_PER_INSTR in exec-all.h --
1801
- you cannot exceed this without risking a buffer overrun.
1802
-
1803
-- Use the 'discard' instruction if you know that TCG won't be able to
1804
- prove that a given global is "dead" at a given program point. The
1805
- x86 guest uses it to improve the condition codes optimisation.
1806
--
1807
2.34.1
diff view generated by jsdifflib
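The bitfield semantics described above map directly onto portable C; a minimal sketch, assuming 0 < len < 32 and pos + len <= 32 (illustrative helpers only; QEMU's own equivalents are the deposit32/sextract32 family in include/qemu/bitops.h):

    #include <stdint.h>

    /* deposit_i32 dest, t1, t2, pos, len */
    static uint32_t deposit32_example(uint32_t t1, uint32_t t2, int pos, int len)
    {
        uint32_t mask = ((1u << len) - 1) << pos;
        return (t1 & ~mask) | ((t2 << pos) & mask);
    }

    /* sextract_i32 dest, t1, pos, len: left-align the field, then
       arithmetic-shift it back down, e.g. pos=8, len=4 -> (t1 << 20) >> 28 */
    static int32_t sextract32_example(uint32_t t1, int pos, int len)
    {
        return (int32_t)(t1 << (32 - pos - len)) >> (32 - len);
    }

With pos = 8 and len = 4, deposit32_example reduces to (t1 & ~0x0f00) | ((t2 << 8) & 0x0f00), matching the worked example in the text above.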
New patch
1
Like CONFIG_TCG, the enabled method of execution is a host property,
2
not a guest property. This exposes the define to compile-once files.
1
3
4
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
meson.build | 4 +---
9
1 file changed, 1 insertion(+), 3 deletions(-)
10
11
diff --git a/meson.build b/meson.build
12
index XXXXXXX..XXXXXXX 100644
13
--- a/meson.build
14
+++ b/meson.build
15
@@ -XXX,XX +XXX,XX @@ if get_option('tcg').allowed()
16
endif
17
if get_option('tcg_interpreter')
18
tcg_arch = 'tci'
19
+ config_host += { 'CONFIG_TCG_INTERPRETER': 'y' }
20
elif host_arch == 'x86_64'
21
tcg_arch = 'i386'
22
elif host_arch == 'ppc64'
23
@@ -XXX,XX +XXX,XX @@ foreach target : target_dirs
24
if sym == 'CONFIG_TCG' or target in accelerator_targets.get(sym, [])
25
config_target += { sym: 'y' }
26
config_all += { sym: 'y' }
27
- if sym == 'CONFIG_TCG' and tcg_arch == 'tci'
28
- config_target += { 'CONFIG_TCG_INTERPRETER': 'y' }
29
- endif
30
if target in modular_tcg
31
config_target += { 'CONFIG_TCG_MODULAR': 'y' }
32
else
33
--
34
2.34.1
35
36
diff view generated by jsdifflib
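With the define coming from config_host, any once-compiled source file can now test for the interpreter directly; a hypothetical example (the function name is illustrative, not from the tree):

    #include "qemu/osdep.h"

    bool built_with_tci_example(void)
    {
    #ifdef CONFIG_TCG_INTERPRETER
        return true;    /* code generation goes through the TCI interpreter */
    #else
        return false;   /* a native TCG backend is in use */
    #endif
    }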
New patch
1
From: Philippe Mathieu-Daudé <philmd@linaro.org>
1
2
3
We are going to modify this code, so fix its style first to avoid:
4
5
ERROR: spaces required around that '*' (ctx:VxV)
6
#281: FILE: tcg/s390x/tcg-target.c.inc:1224:
7
+ uintptr_t mask = ~(0xffffull << i*16);
8
^
9
10
Reviewed-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
11
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
12
Message-Id: <20221130132654.76369-2-philmd@linaro.org>
13
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
14
---
15
tcg/s390x/tcg-target.c.inc | 20 ++++++++++----------
16
1 file changed, 10 insertions(+), 10 deletions(-)
17
18
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
19
index XXXXXXX..XXXXXXX 100644
20
--- a/tcg/s390x/tcg-target.c.inc
21
+++ b/tcg/s390x/tcg-target.c.inc
22
@@ -XXX,XX +XXX,XX @@ static bool maybe_out_small_movi(TCGContext *s, TCGType type,
23
}
24
25
for (i = 0; i < 4; i++) {
26
- tcg_target_long mask = 0xffffull << i*16;
27
+ tcg_target_long mask = 0xffffull << i * 16;
28
if ((uval & mask) == uval) {
29
- tcg_out_insn_RI(s, lli_insns[i], ret, uval >> i*16);
30
+ tcg_out_insn_RI(s, lli_insns[i], ret, uval >> i * 16);
31
return true;
32
}
33
}
34
@@ -XXX,XX +XXX,XX @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
35
36
/* Try all 32-bit insns that can perform it in one go. */
37
for (i = 0; i < 4; i++) {
38
- tcg_target_ulong mask = ~(0xffffull << i*16);
39
+ tcg_target_ulong mask = ~(0xffffull << i * 16);
40
if (((val | ~valid) & mask) == mask) {
41
- tcg_out_insn_RI(s, ni_insns[i], dest, val >> i*16);
42
+ tcg_out_insn_RI(s, ni_insns[i], dest, val >> i * 16);
43
return;
44
}
45
}
46
@@ -XXX,XX +XXX,XX @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
47
/* Try all 48-bit insns that can perform it in one go. */
48
if (HAVE_FACILITY(EXT_IMM)) {
49
for (i = 0; i < 2; i++) {
50
- tcg_target_ulong mask = ~(0xffffffffull << i*32);
51
+ tcg_target_ulong mask = ~(0xffffffffull << i * 32);
52
if (((val | ~valid) & mask) == mask) {
53
- tcg_out_insn_RIL(s, nif_insns[i], dest, val >> i*32);
54
+ tcg_out_insn_RIL(s, nif_insns[i], dest, val >> i * 32);
55
return;
56
}
57
}
58
@@ -XXX,XX +XXX,XX @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
59
60
/* Try all 32-bit insns that can perform it in one go. */
61
for (i = 0; i < 4; i++) {
62
- tcg_target_ulong mask = (0xffffull << i*16);
63
+ tcg_target_ulong mask = (0xffffull << i * 16);
64
if ((val & mask) != 0 && (val & ~mask) == 0) {
65
- tcg_out_insn_RI(s, oi_insns[i], dest, val >> i*16);
66
+ tcg_out_insn_RI(s, oi_insns[i], dest, val >> i * 16);
67
return;
68
}
69
}
70
@@ -XXX,XX +XXX,XX @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
71
/* Try all 48-bit insns that can perform it in one go. */
72
if (HAVE_FACILITY(EXT_IMM)) {
73
for (i = 0; i < 2; i++) {
74
- tcg_target_ulong mask = (0xffffffffull << i*32);
75
+ tcg_target_ulong mask = (0xffffffffull << i * 32);
76
if ((val & mask) != 0 && (val & ~mask) == 0) {
77
- tcg_out_insn_RIL(s, oif_insns[i], dest, val >> i*32);
78
+ tcg_out_insn_RIL(s, oif_insns[i], dest, val >> i * 32);
79
return;
80
}
81
}
82
--
83
2.34.1
84
85
diff view generated by jsdifflib
New patch
1
Remove whitespace at end of line; in one place this also
2
highlights some missing braces.
1
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
tcg/tcg.c | 33 +++++++++++++++++----------------
8
tcg/ppc/tcg-target.c.inc | 2 +-
9
2 files changed, 18 insertions(+), 17 deletions(-)
10
11
diff --git a/tcg/tcg.c b/tcg/tcg.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/tcg.c
14
+++ b/tcg/tcg.c
15
@@ -XXX,XX +XXX,XX @@ void *tcg_malloc_internal(TCGContext *s, int size)
16
{
17
TCGPool *p;
18
int pool_size;
19
-
20
+
21
if (size > TCG_POOL_CHUNK_SIZE) {
22
/* big malloc: insert a new pool (XXX: could optimize) */
23
p = g_malloc(sizeof(TCGPool) + size);
24
@@ -XXX,XX +XXX,XX @@ void *tcg_malloc_internal(TCGContext *s, int size)
25
p = g_malloc(sizeof(TCGPool) + pool_size);
26
p->size = pool_size;
27
p->next = NULL;
28
- if (s->pool_current)
29
+ if (s->pool_current) {
30
s->pool_current->next = p;
31
- else
32
+ } else {
33
s->pool_first = p;
34
+ }
35
} else {
36
p = p->next;
37
}
38
@@ -XXX,XX +XXX,XX @@ static void dump_regs(TCGContext *s)
39
40
for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
41
if (s->reg_to_temp[i] != NULL) {
42
- printf("%s: %s\n",
43
- tcg_target_reg_names[i],
44
+ printf("%s: %s\n",
45
+ tcg_target_reg_names[i],
46
tcg_get_arg_str_ptr(s, buf, sizeof(buf), s->reg_to_temp[i]));
47
}
48
}
49
@@ -XXX,XX +XXX,XX @@ static void check_regs(TCGContext *s)
50
ts = s->reg_to_temp[reg];
51
if (ts != NULL) {
52
if (ts->val_type != TEMP_VAL_REG || ts->reg != reg) {
53
- printf("Inconsistency for register %s:\n",
54
+ printf("Inconsistency for register %s:\n",
55
tcg_target_reg_names[reg]);
56
goto fail;
57
}
58
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
59
nb_iargs = def->nb_iargs;
60
61
/* copy constants */
62
- memcpy(new_args + nb_oargs + nb_iargs,
63
+ memcpy(new_args + nb_oargs + nb_iargs,
64
op->args + nb_oargs + nb_iargs,
65
sizeof(TCGArg) * def->nb_cargs);
66
67
i_allocated_regs = s->reserved_regs;
68
o_allocated_regs = s->reserved_regs;
69
70
- /* satisfy input constraints */
71
+ /* satisfy input constraints */
72
for (k = 0; k < nb_iargs; k++) {
73
TCGRegSet i_preferred_regs, o_preferred_regs;
74
75
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
76
const_args[i] = 0;
77
tcg_regset_set_reg(i_allocated_regs, reg);
78
}
79
-
80
+
81
/* mark dead temporaries and free the associated registers */
82
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
83
if (IS_DEAD_ARG(i)) {
84
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
85
tcg_reg_alloc_bb_end(s, i_allocated_regs);
86
} else {
87
if (def->flags & TCG_OPF_CALL_CLOBBER) {
88
- /* XXX: permit generic clobber register list ? */
89
+ /* XXX: permit generic clobber register list ? */
90
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
91
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
92
tcg_reg_free(s, i, i_allocated_regs);
93
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
94
an exception. */
95
sync_globals(s, i_allocated_regs);
96
}
97
-
98
+
99
/* satisfy the output constraints */
100
for(k = 0; k < nb_oargs; k++) {
101
i = def->args_ct[k].sort_index;
102
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
103
104
/* assign stack slots first */
105
call_stack_size = (nb_iargs - nb_regs) * sizeof(tcg_target_long);
106
- call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) &
107
+ call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) &
108
~(TCG_TARGET_STACK_ALIGN - 1);
109
allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE);
110
if (allocate_args) {
111
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
112
stack_offset += sizeof(tcg_target_long);
113
#endif
114
}
115
-
116
+
117
/* assign input registers */
118
allocated_regs = s->reserved_regs;
119
for (i = 0; i < nb_regs; i++) {
120
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
121
tcg_regset_set_reg(allocated_regs, reg);
122
}
123
}
124
-
125
+
126
/* mark dead temporaries and free the associated registers */
127
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
128
if (IS_DEAD_ARG(i)) {
129
temp_dead(s, arg_temp(op->args[i]));
130
}
131
}
132
-
133
+
134
/* clobber call registers */
135
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
136
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
137
@@ -XXX,XX +XXX,XX @@ void tcg_dump_info(GString *buf)
138
(double)s->code_out_len / tb_div_count);
139
g_string_append_printf(buf, "avg search data/TB %0.1f\n",
140
(double)s->search_out_len / tb_div_count);
141
-
142
+
143
g_string_append_printf(buf, "cycles/op %0.1f\n",
144
s->op_count ? (double)tot / s->op_count : 0);
145
g_string_append_printf(buf, "cycles/in byte %0.1f\n",
146
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
147
index XXXXXXX..XXXXXXX 100644
148
--- a/tcg/ppc/tcg-target.c.inc
149
+++ b/tcg/ppc/tcg-target.c.inc
150
@@ -XXX,XX +XXX,XX @@
151
# else
152
# error "Unknown ABI"
153
# endif
154
-#endif
155
+#endif
156
157
#ifdef _CALL_SYSV
158
# define TCG_TARGET_CALL_ALIGN_ARGS 1
159
--
160
2.34.1
161
162
diff view generated by jsdifflib
New patch
1
Create a wrapper for locking/unlocking the iothread lock.
1
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
include/qemu/main-loop.h | 29 +++++++++++++++++++++++++++++
7
1 file changed, 29 insertions(+)
8
9
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
10
index XXXXXXX..XXXXXXX 100644
11
--- a/include/qemu/main-loop.h
12
+++ b/include/qemu/main-loop.h
13
@@ -XXX,XX +XXX,XX @@ void qemu_mutex_lock_iothread_impl(const char *file, int line);
14
*/
15
void qemu_mutex_unlock_iothread(void);
16
17
+/**
18
+ * QEMU_IOTHREAD_LOCK_GUARD
19
+ *
20
+ * Wrap a block of code in a conditional qemu_mutex_{lock,unlock}_iothread.
21
+ */
22
+typedef struct IOThreadLockAuto IOThreadLockAuto;
23
+
24
+static inline IOThreadLockAuto *qemu_iothread_auto_lock(const char *file,
25
+ int line)
26
+{
27
+ if (qemu_mutex_iothread_locked()) {
28
+ return NULL;
29
+ }
30
+ qemu_mutex_lock_iothread_impl(file, line);
31
+ /* Anything non-NULL causes the cleanup function to be called */
32
+ return (IOThreadLockAuto *)(uintptr_t)1;
33
+}
34
+
35
+static inline void qemu_iothread_auto_unlock(IOThreadLockAuto *l)
36
+{
37
+ qemu_mutex_unlock_iothread();
38
+}
39
+
40
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(IOThreadLockAuto, qemu_iothread_auto_unlock)
41
+
42
+#define QEMU_IOTHREAD_LOCK_GUARD() \
43
+ g_autoptr(IOThreadLockAuto) _iothread_lock_auto __attribute__((unused)) \
44
+ = qemu_iothread_auto_lock(__FILE__, __LINE__)
45
+
46
/*
47
* qemu_cond_wait_iothread: Wait on condition for the main loop mutex
48
*
49
--
50
2.34.1
51
52
diff view generated by jsdifflib
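The usage pattern this enables — and which the following patches apply to the various interrupt paths — looks roughly like this (hypothetical caller, shown only to illustrate the guard):

    #include "qemu/osdep.h"
    #include "qemu/main-loop.h"
    #include "hw/core/cpu.h"

    static void raise_irq_example(CPUState *cs, int level)
    {
        /* Lock the iothread only if this thread does not already hold it;
         * the matching unlock runs automatically at end of scope, and only
         * if the guard actually took the lock. */
        QEMU_IOTHREAD_LOCK_GUARD();

        if (level) {
            cpu_interrupt(cs, CPU_INTERRUPT_HARD);
        } else {
            cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
        }
    }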
New patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
hw/mips/mips_int.c | 11 +----------
5
1 file changed, 1 insertion(+), 10 deletions(-)
1
6
7
diff --git a/hw/mips/mips_int.c b/hw/mips/mips_int.c
8
index XXXXXXX..XXXXXXX 100644
9
--- a/hw/mips/mips_int.c
10
+++ b/hw/mips/mips_int.c
11
@@ -XXX,XX +XXX,XX @@ static void cpu_mips_irq_request(void *opaque, int irq, int level)
12
MIPSCPU *cpu = opaque;
13
CPUMIPSState *env = &cpu->env;
14
CPUState *cs = CPU(cpu);
15
- bool locked = false;
16
17
if (irq < 0 || irq > 7) {
18
return;
19
}
20
21
- /* Make sure locking works even if BQL is already held by the caller */
22
- if (!qemu_mutex_iothread_locked()) {
23
- locked = true;
24
- qemu_mutex_lock_iothread();
25
- }
26
+ QEMU_IOTHREAD_LOCK_GUARD();
27
28
if (level) {
29
env->CP0_Cause |= 1 << (irq + CP0Ca_IP);
30
@@ -XXX,XX +XXX,XX @@ static void cpu_mips_irq_request(void *opaque, int irq, int level)
31
} else {
32
cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
33
}
34
-
35
- if (locked) {
36
- qemu_mutex_unlock_iothread();
37
- }
38
}
39
40
void cpu_mips_irq_init_cpu(MIPSCPU *cpu)
41
--
42
2.34.1
43
44
diff view generated by jsdifflib
New patch
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
target/ppc/excp_helper.c | 11 +----------
6
1 file changed, 1 insertion(+), 10 deletions(-)
1
7
8
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/ppc/excp_helper.c
11
+++ b/target/ppc/excp_helper.c
12
@@ -XXX,XX +XXX,XX @@ static int ppc_next_unmasked_interrupt(CPUPPCState *env)
13
void ppc_maybe_interrupt(CPUPPCState *env)
14
{
15
CPUState *cs = env_cpu(env);
16
- bool locked = false;
17
-
18
- if (!qemu_mutex_iothread_locked()) {
19
- locked = true;
20
- qemu_mutex_lock_iothread();
21
- }
22
+ QEMU_IOTHREAD_LOCK_GUARD();
23
24
if (ppc_next_unmasked_interrupt(env)) {
25
cpu_interrupt(cs, CPU_INTERRUPT_HARD);
26
} else {
27
cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
28
}
29
-
30
- if (locked) {
31
- qemu_mutex_unlock_iothread();
32
- }
33
}
34
35
#if defined(TARGET_PPC64)
36
--
37
2.34.1
38
39
diff view generated by jsdifflib
New patch
1
In addition, use tcg_enabled instead of !kvm_enabled.
1
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/ppc/helper_regs.c | 14 ++++----------
8
1 file changed, 4 insertions(+), 10 deletions(-)
9
10
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/ppc/helper_regs.c
13
+++ b/target/ppc/helper_regs.c
14
@@ -XXX,XX +XXX,XX @@
15
#include "qemu/main-loop.h"
16
#include "exec/exec-all.h"
17
#include "sysemu/kvm.h"
18
+#include "sysemu/tcg.h"
19
#include "helper_regs.h"
20
#include "power8-pmu.h"
21
#include "cpu-models.h"
22
@@ -XXX,XX +XXX,XX @@ void cpu_interrupt_exittb(CPUState *cs)
23
{
24
/*
25
* We don't need to worry about translation blocks
26
- * when running with KVM.
27
+ * unless running with TCG.
28
*/
29
- if (kvm_enabled()) {
30
- return;
31
- }
32
-
33
- if (!qemu_mutex_iothread_locked()) {
34
- qemu_mutex_lock_iothread();
35
- cpu_interrupt(cs, CPU_INTERRUPT_EXITTB);
36
- qemu_mutex_unlock_iothread();
37
- } else {
38
+ if (tcg_enabled()) {
39
+ QEMU_IOTHREAD_LOCK_GUARD();
40
cpu_interrupt(cs, CPU_INTERRUPT_EXITTB);
41
}
42
}
43
--
44
2.34.1
45
46
diff view generated by jsdifflib
1
Reviewed-by: Emilio G. Cota <cota@braap.org>
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
4
---
4
target/ppc/helper.h | 2 +-
5
target/riscv/cpu_helper.c | 10 +---------
5
target/ppc/mem_helper.c | 33 ++++++++++--
6
1 file changed, 1 insertion(+), 9 deletions(-)
6
target/ppc/translate.c | 115 +++++++++++++++++++++-------------------
7
3 files changed, 88 insertions(+), 62 deletions(-)
8
7
9
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
8
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
10
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
11
--- a/target/ppc/helper.h
10
--- a/target/riscv/cpu_helper.c
12
+++ b/target/ppc/helper.h
11
+++ b/target/riscv/cpu_helper.c
13
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_4(dscliq, void, env, fprp, fprp, i32)
12
@@ -XXX,XX +XXX,XX @@ uint64_t riscv_cpu_update_mip(RISCVCPU *cpu, uint64_t mask, uint64_t value)
14
DEF_HELPER_1(tbegin, void, env)
13
CPURISCVState *env = &cpu->env;
15
DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
14
CPUState *cs = CPU(cpu);
16
15
uint64_t gein, vsgein = 0, vstip = 0, old = env->mip;
17
-#if defined(TARGET_PPC64) && defined(CONFIG_ATOMIC128)
16
- bool locked = false;
18
+#ifdef TARGET_PPC64
17
19
DEF_HELPER_FLAGS_3(lq_le_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
18
if (riscv_cpu_virt_enabled(env)) {
20
DEF_HELPER_FLAGS_3(lq_be_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
19
gein = get_field(env->hstatus, HSTATUS_VGEIN);
21
DEF_HELPER_FLAGS_5(stq_le_parallel, TCG_CALL_NO_WG,
20
@@ -XXX,XX +XXX,XX @@ uint64_t riscv_cpu_update_mip(RISCVCPU *cpu, uint64_t mask, uint64_t value)
22
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
21
mask = ((mask == MIP_VSTIP) && env->vstime_irq) ? 0 : mask;
23
index XXXXXXX..XXXXXXX 100644
22
vstip = env->vstime_irq ? MIP_VSTIP : 0;
24
--- a/target/ppc/mem_helper.c
23
25
+++ b/target/ppc/mem_helper.c
24
- if (!qemu_mutex_iothread_locked()) {
26
@@ -XXX,XX +XXX,XX @@
25
- locked = true;
27
#include "exec/cpu_ldst.h"
26
- qemu_mutex_lock_iothread();
28
#include "tcg.h"
27
- }
29
#include "internal.h"
28
+ QEMU_IOTHREAD_LOCK_GUARD();
30
+#include "qemu/atomic128.h"
29
31
30
env->mip = (env->mip & ~mask) | (value & mask);
32
//#define DEBUG_OP
31
33
32
@@ -XXX,XX +XXX,XX @@ uint64_t riscv_cpu_update_mip(RISCVCPU *cpu, uint64_t mask, uint64_t value)
34
@@ -XXX,XX +XXX,XX @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
33
cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
35
return i;
34
}
35
36
- if (locked) {
37
- qemu_mutex_unlock_iothread();
38
- }
39
-
40
return old;
36
}
41
}
37
42
38
-#if defined(TARGET_PPC64) && defined(CONFIG_ATOMIC128)
39
+#ifdef TARGET_PPC64
40
uint64_t helper_lq_le_parallel(CPUPPCState *env, target_ulong addr,
41
uint32_t opidx)
42
{
43
- Int128 ret = helper_atomic_ldo_le_mmu(env, addr, opidx, GETPC());
44
+ Int128 ret;
45
+
46
+ /* We will have raised EXCP_ATOMIC from the translator. */
47
+ assert(HAVE_ATOMIC128);
48
+ ret = helper_atomic_ldo_le_mmu(env, addr, opidx, GETPC());
49
env->retxh = int128_gethi(ret);
50
return int128_getlo(ret);
51
}
52
@@ -XXX,XX +XXX,XX @@ uint64_t helper_lq_le_parallel(CPUPPCState *env, target_ulong addr,
53
uint64_t helper_lq_be_parallel(CPUPPCState *env, target_ulong addr,
54
uint32_t opidx)
55
{
56
- Int128 ret = helper_atomic_ldo_be_mmu(env, addr, opidx, GETPC());
57
+ Int128 ret;
58
+
59
+ /* We will have raised EXCP_ATOMIC from the translator. */
60
+ assert(HAVE_ATOMIC128);
61
+ ret = helper_atomic_ldo_be_mmu(env, addr, opidx, GETPC());
62
env->retxh = int128_gethi(ret);
63
return int128_getlo(ret);
64
}
65
@@ -XXX,XX +XXX,XX @@ uint64_t helper_lq_be_parallel(CPUPPCState *env, target_ulong addr,
66
void helper_stq_le_parallel(CPUPPCState *env, target_ulong addr,
67
uint64_t lo, uint64_t hi, uint32_t opidx)
68
{
69
- Int128 val = int128_make128(lo, hi);
70
+ Int128 val;
71
+
72
+ /* We will have raised EXCP_ATOMIC from the translator. */
73
+ assert(HAVE_ATOMIC128);
74
+ val = int128_make128(lo, hi);
75
helper_atomic_sto_le_mmu(env, addr, val, opidx, GETPC());
76
}
77
78
void helper_stq_be_parallel(CPUPPCState *env, target_ulong addr,
79
uint64_t lo, uint64_t hi, uint32_t opidx)
80
{
81
- Int128 val = int128_make128(lo, hi);
82
+ Int128 val;
83
+
84
+ /* We will have raised EXCP_ATOMIC from the translator. */
85
+ assert(HAVE_ATOMIC128);
86
+ val = int128_make128(lo, hi);
87
helper_atomic_sto_be_mmu(env, addr, val, opidx, GETPC());
88
}
89
90
@@ -XXX,XX +XXX,XX @@ uint32_t helper_stqcx_le_parallel(CPUPPCState *env, target_ulong addr,
91
{
92
bool success = false;
93
94
+ /* We will have raised EXCP_ATOMIC from the translator. */
95
+ assert(HAVE_CMPXCHG128);
96
+
97
if (likely(addr == env->reserve_addr)) {
98
Int128 oldv, cmpv, newv;
99
100
@@ -XXX,XX +XXX,XX @@ uint32_t helper_stqcx_be_parallel(CPUPPCState *env, target_ulong addr,
101
{
102
bool success = false;
103
104
+ /* We will have raised EXCP_ATOMIC from the translator. */
105
+ assert(HAVE_CMPXCHG128);
106
+
107
if (likely(addr == env->reserve_addr)) {
108
Int128 oldv, cmpv, newv;
109
110
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
111
index XXXXXXX..XXXXXXX 100644
112
--- a/target/ppc/translate.c
113
+++ b/target/ppc/translate.c
114
@@ -XXX,XX +XXX,XX @@
115
#include "trace-tcg.h"
116
#include "exec/translator.h"
117
#include "exec/log.h"
118
+#include "qemu/atomic128.h"
119
120
121
#define CPU_SINGLE_STEP 0x1
122
@@ -XXX,XX +XXX,XX @@ static void gen_lq(DisasContext *ctx)
123
hi = cpu_gpr[rd];
124
125
if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
126
-#ifdef CONFIG_ATOMIC128
127
- TCGv_i32 oi = tcg_temp_new_i32();
128
- if (ctx->le_mode) {
129
- tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ, ctx->mem_idx));
130
- gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
131
+ if (HAVE_ATOMIC128) {
132
+ TCGv_i32 oi = tcg_temp_new_i32();
133
+ if (ctx->le_mode) {
134
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ, ctx->mem_idx));
135
+ gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
136
+ } else {
137
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ, ctx->mem_idx));
138
+ gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
139
+ }
140
+ tcg_temp_free_i32(oi);
141
+ tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
142
} else {
143
- tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ, ctx->mem_idx));
144
- gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
145
+ /* Restart with exclusive lock. */
146
+ gen_helper_exit_atomic(cpu_env);
147
+ ctx->base.is_jmp = DISAS_NORETURN;
148
}
149
- tcg_temp_free_i32(oi);
150
- tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
151
-#else
152
- /* Restart with exclusive lock. */
153
- gen_helper_exit_atomic(cpu_env);
154
- ctx->base.is_jmp = DISAS_NORETURN;
155
-#endif
156
} else if (ctx->le_mode) {
157
tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEQ);
158
gen_addr_add(ctx, EA, EA, 8);
159
@@ -XXX,XX +XXX,XX @@ static void gen_std(DisasContext *ctx)
160
hi = cpu_gpr[rs];
161
162
if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
163
-#ifdef CONFIG_ATOMIC128
164
- TCGv_i32 oi = tcg_temp_new_i32();
165
- if (ctx->le_mode) {
166
- tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ, ctx->mem_idx));
167
- gen_helper_stq_le_parallel(cpu_env, EA, lo, hi, oi);
168
+ if (HAVE_ATOMIC128) {
169
+ TCGv_i32 oi = tcg_temp_new_i32();
170
+ if (ctx->le_mode) {
171
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ, ctx->mem_idx));
172
+ gen_helper_stq_le_parallel(cpu_env, EA, lo, hi, oi);
173
+ } else {
174
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ, ctx->mem_idx));
175
+ gen_helper_stq_be_parallel(cpu_env, EA, lo, hi, oi);
176
+ }
177
+ tcg_temp_free_i32(oi);
178
} else {
179
- tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ, ctx->mem_idx));
180
- gen_helper_stq_be_parallel(cpu_env, EA, lo, hi, oi);
181
+ /* Restart with exclusive lock. */
182
+ gen_helper_exit_atomic(cpu_env);
183
+ ctx->base.is_jmp = DISAS_NORETURN;
184
}
185
- tcg_temp_free_i32(oi);
186
-#else
187
- /* Restart with exclusive lock. */
188
- gen_helper_exit_atomic(cpu_env);
189
- ctx->base.is_jmp = DISAS_NORETURN;
190
-#endif
191
} else if (ctx->le_mode) {
192
tcg_gen_qemu_st_i64(lo, EA, ctx->mem_idx, MO_LEQ);
193
gen_addr_add(ctx, EA, EA, 8);
194
@@ -XXX,XX +XXX,XX @@ static void gen_lqarx(DisasContext *ctx)
195
hi = cpu_gpr[rd];
196
197
if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
198
-#ifdef CONFIG_ATOMIC128
199
- TCGv_i32 oi = tcg_temp_new_i32();
200
- if (ctx->le_mode) {
201
- tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ | MO_ALIGN_16,
202
- ctx->mem_idx));
203
- gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
204
+ if (HAVE_ATOMIC128) {
205
+ TCGv_i32 oi = tcg_temp_new_i32();
206
+ if (ctx->le_mode) {
207
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ | MO_ALIGN_16,
208
+ ctx->mem_idx));
209
+ gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
210
+ } else {
211
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ | MO_ALIGN_16,
212
+ ctx->mem_idx));
213
+ gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
214
+ }
215
+ tcg_temp_free_i32(oi);
216
+ tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
217
} else {
218
- tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ | MO_ALIGN_16,
219
- ctx->mem_idx));
220
- gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
221
+ /* Restart with exclusive lock. */
222
+ gen_helper_exit_atomic(cpu_env);
223
+ ctx->base.is_jmp = DISAS_NORETURN;
224
+ tcg_temp_free(EA);
225
+ return;
226
}
227
- tcg_temp_free_i32(oi);
228
- tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
229
-#else
230
- /* Restart with exclusive lock. */
231
- gen_helper_exit_atomic(cpu_env);
232
- ctx->base.is_jmp = DISAS_NORETURN;
233
- tcg_temp_free(EA);
234
- return;
235
-#endif
236
} else if (ctx->le_mode) {
237
tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEQ | MO_ALIGN_16);
238
tcg_gen_mov_tl(cpu_reserve, EA);
239
@@ -XXX,XX +XXX,XX @@ static void gen_stqcx_(DisasContext *ctx)
240
hi = cpu_gpr[rs];
241
242
if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
243
- TCGv_i32 oi = tcg_const_i32(DEF_MEMOP(MO_Q) | MO_ALIGN_16);
244
-#ifdef CONFIG_ATOMIC128
245
- if (ctx->le_mode) {
246
- gen_helper_stqcx_le_parallel(cpu_crf[0], cpu_env, EA, lo, hi, oi);
247
+ if (HAVE_CMPXCHG128) {
248
+ TCGv_i32 oi = tcg_const_i32(DEF_MEMOP(MO_Q) | MO_ALIGN_16);
249
+ if (ctx->le_mode) {
250
+ gen_helper_stqcx_le_parallel(cpu_crf[0], cpu_env,
251
+ EA, lo, hi, oi);
252
+ } else {
253
+ gen_helper_stqcx_be_parallel(cpu_crf[0], cpu_env,
254
+ EA, lo, hi, oi);
255
+ }
256
+ tcg_temp_free_i32(oi);
257
} else {
258
- gen_helper_stqcx_le_parallel(cpu_crf[0], cpu_env, EA, lo, hi, oi);
259
+ /* Restart with exclusive lock. */
260
+ gen_helper_exit_atomic(cpu_env);
261
+ ctx->base.is_jmp = DISAS_NORETURN;
262
}
263
-#else
264
- /* Restart with exclusive lock. */
265
- gen_helper_exit_atomic(cpu_env);
266
- ctx->base.is_jmp = DISAS_NORETURN;
267
-#endif
268
tcg_temp_free(EA);
269
- tcg_temp_free_i32(oi);
270
} else {
271
TCGLabel *lab_fail = gen_new_label();
272
TCGLabel *lab_over = gen_new_label();
273
--
43
--
274
2.17.2
44
2.34.1
275
45
276
46
diff view generated by jsdifflib
1
From: "Emilio G. Cota" <cota@braap.org>
1
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
2
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
3
Consistently access u16.high with atomics to avoid
4
undefined behaviour in MTTCG.
5
6
Note that icount_decr.u16.low is only used in icount mode,
7
so regular accesses to it are OK.
8
9
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
10
Signed-off-by: Emilio G. Cota <cota@braap.org>
11
Message-Id: <20181010144853.13005-2-cota@braap.org>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
---
4
---
14
accel/tcg/tcg-all.c | 2 +-
5
hw/ppc/ppc.c | 10 +---------
15
accel/tcg/translate-all.c | 2 +-
6
1 file changed, 1 insertion(+), 9 deletions(-)
16
qom/cpu.c | 2 +-
17
3 files changed, 3 insertions(+), 3 deletions(-)
18
7
19
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
8
diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
20
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
21
--- a/accel/tcg/tcg-all.c
10
--- a/hw/ppc/ppc.c
22
+++ b/accel/tcg/tcg-all.c
11
+++ b/hw/ppc/ppc.c
23
@@ -XXX,XX +XXX,XX @@ static void tcg_handle_interrupt(CPUState *cpu, int mask)
12
@@ -XXX,XX +XXX,XX @@ void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
24
if (!qemu_cpu_is_self(cpu)) {
25
qemu_cpu_kick(cpu);
26
} else {
27
- cpu->icount_decr.u16.high = -1;
28
+ atomic_set(&cpu->icount_decr.u16.high, -1);
29
if (use_icount &&
30
!cpu->can_do_io
31
&& (mask & ~old_mask) != 0) {
32
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
33
index XXXXXXX..XXXXXXX 100644
34
--- a/accel/tcg/translate-all.c
35
+++ b/accel/tcg/translate-all.c
36
@@ -XXX,XX +XXX,XX @@ void cpu_interrupt(CPUState *cpu, int mask)
37
{
13
{
38
g_assert(qemu_mutex_iothread_locked());
14
CPUPPCState *env = &cpu->env;
39
cpu->interrupt_request |= mask;
15
unsigned int old_pending;
40
- cpu->icount_decr.u16.high = -1;
16
- bool locked = false;
41
+ atomic_set(&cpu->icount_decr.u16.high, -1);
17
18
/* We may already have the BQL if coming from the reset path */
19
- if (!qemu_mutex_iothread_locked()) {
20
- locked = true;
21
- qemu_mutex_lock_iothread();
22
- }
23
+ QEMU_IOTHREAD_LOCK_GUARD();
24
25
old_pending = env->pending_interrupts;
26
27
@@ -XXX,XX +XXX,XX @@ void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
28
29
trace_ppc_irq_set_exit(env, irq, level, env->pending_interrupts,
30
CPU(cpu)->interrupt_request);
31
-
32
- if (locked) {
33
- qemu_mutex_unlock_iothread();
34
- }
42
}
35
}
43
36
44
/*
37
/* PowerPC 6xx / 7xx internal IRQ controller */
45
diff --git a/qom/cpu.c b/qom/cpu.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/qom/cpu.c
48
+++ b/qom/cpu.c
49
@@ -XXX,XX +XXX,XX @@ static void cpu_common_reset(CPUState *cpu)
50
cpu->mem_io_pc = 0;
51
cpu->mem_io_vaddr = 0;
52
cpu->icount_extra = 0;
53
- cpu->icount_decr.u32 = 0;
54
+ atomic_set(&cpu->icount_decr.u32, 0);
55
cpu->can_do_io = 1;
56
cpu->exception_index = -1;
57
cpu->crash_occurred = false;
58
--
38
--
59
2.17.2
39
2.34.1
60
40
61
41
diff view generated by jsdifflib
1
From: "Emilio G. Cota" <cota@braap.org>
1
Narrow the scope of the lock to the actual read/write,
2
moving the cpu_transation_failed call outside the lock.
2
3
3
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Signed-off-by: Emilio G. Cota <cota@braap.org>
6
Message-Id: <20181009174557.16125-5-cota@braap.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
accel/tcg/cputlb.c | 4 ++--
7
accel/tcg/cputlb.c | 25 ++++++++-----------------
10
1 file changed, 2 insertions(+), 2 deletions(-)
8
1 file changed, 8 insertions(+), 17 deletions(-)
11
9
12
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
10
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
13
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
14
--- a/accel/tcg/cputlb.c
12
--- a/accel/tcg/cputlb.c
15
+++ b/accel/tcg/cputlb.c
13
+++ b/accel/tcg/cputlb.c
16
@@ -XXX,XX +XXX,XX @@
14
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
17
} \
15
MemoryRegionSection *section;
18
} while (0)
16
MemoryRegion *mr;
19
17
uint64_t val;
20
-#define assert_cpu_is_self(this_cpu) do { \
18
- bool locked = false;
21
+#define assert_cpu_is_self(cpu) do { \
19
MemTxResult r;
22
if (DEBUG_TLB_GATE) { \
20
23
- g_assert(!cpu->created || qemu_cpu_is_self(cpu)); \
21
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
24
+ g_assert(!(cpu)->created || qemu_cpu_is_self(cpu)); \
22
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
25
} \
23
cpu_io_recompile(cpu, retaddr);
26
} while (0)
24
}
27
25
26
- if (!qemu_mutex_iothread_locked()) {
27
- qemu_mutex_lock_iothread();
28
- locked = true;
29
+ {
30
+ QEMU_IOTHREAD_LOCK_GUARD();
31
+ r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
32
}
33
- r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
34
+
35
if (r != MEMTX_OK) {
36
hwaddr physaddr = mr_offset +
37
section->offset_within_address_space -
38
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
39
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
40
mmu_idx, full->attrs, r, retaddr);
41
}
42
- if (locked) {
43
- qemu_mutex_unlock_iothread();
44
- }
45
-
46
return val;
47
}
48
49
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
50
hwaddr mr_offset;
51
MemoryRegionSection *section;
52
MemoryRegion *mr;
53
- bool locked = false;
54
MemTxResult r;
55
56
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
57
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
58
*/
59
save_iotlb_data(cpu, section, mr_offset);
60
61
- if (!qemu_mutex_iothread_locked()) {
62
- qemu_mutex_lock_iothread();
63
- locked = true;
64
+ {
65
+ QEMU_IOTHREAD_LOCK_GUARD();
66
+ r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
67
}
68
- r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
69
+
70
if (r != MEMTX_OK) {
71
hwaddr physaddr = mr_offset +
72
section->offset_within_address_space -
73
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
74
MMU_DATA_STORE, mmu_idx, full->attrs, r,
75
retaddr);
76
}
77
- if (locked) {
78
- qemu_mutex_unlock_iothread();
79
- }
80
}
81
82
static inline target_ulong tlb_read_ofs(CPUTLBEntry *entry, size_t ofs)
28
--
83
--
29
2.17.2
84
2.34.1
30
85
31
86
diff view generated by jsdifflib
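After this change the MMIO accessors hold the iothread lock only around the dispatch itself, so the error-reporting path runs unlocked; a simplified sketch of the resulting shape (not the complete io_readx):

    #include "qemu/osdep.h"
    #include "qemu/main-loop.h"
    #include "exec/memory.h"

    static uint64_t io_read_example(MemoryRegion *mr, hwaddr mr_offset,
                                    MemOp op, MemTxAttrs attrs)
    {
        uint64_t val = 0;
        MemTxResult r;

        {
            /* lock scope limited to the device access */
            QEMU_IOTHREAD_LOCK_GUARD();
            r = memory_region_dispatch_read(mr, mr_offset, &val, op, attrs);
        }
        /* cpu_transaction_failed() would be called here, without the lock */
        return r == MEMTX_OK ? val : 0;
    }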
New patch
1
Replace goto allocate_in_reg with a boolean.
2
Remove o_preferred_regs which isn't used, except to copy.
1
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
tcg/tcg.c | 45 +++++++++++++++++++++------------------------
8
1 file changed, 21 insertions(+), 24 deletions(-)
9
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/tcg.c
13
+++ b/tcg/tcg.c
14
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
15
16
/* satisfy input constraints */
17
for (k = 0; k < nb_iargs; k++) {
18
- TCGRegSet i_preferred_regs, o_preferred_regs;
19
+ TCGRegSet i_preferred_regs;
20
+ bool allocate_new_reg;
21
22
i = def->args_ct[nb_oargs + k].sort_index;
23
arg = op->args[i];
24
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
25
continue;
26
}
27
28
- i_preferred_regs = o_preferred_regs = 0;
29
+ reg = ts->reg;
30
+ i_preferred_regs = 0;
31
+ allocate_new_reg = false;
32
+
33
if (arg_ct->ialias) {
34
- o_preferred_regs = op->output_pref[arg_ct->alias_index];
35
+ i_preferred_regs = op->output_pref[arg_ct->alias_index];
36
37
/*
38
* If the input is readonly, then it cannot also be an
39
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
40
* register and move it.
41
*/
42
if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
43
- goto allocate_in_reg;
44
+ allocate_new_reg = true;
45
+ } else if (ts->val_type == TEMP_VAL_REG) {
46
+ /*
47
+ * Check if the current register has already been
48
+ * allocated for another input.
49
+ */
50
+ allocate_new_reg = tcg_regset_test_reg(i_allocated_regs, reg);
51
}
52
-
53
- /*
54
- * Check if the current register has already been allocated
55
- * for another input aliased to an output.
56
- */
57
- if (ts->val_type == TEMP_VAL_REG) {
58
- reg = ts->reg;
59
- for (int k2 = 0; k2 < k; k2++) {
60
- int i2 = def->args_ct[nb_oargs + k2].sort_index;
61
- if (def->args_ct[i2].ialias && reg == new_args[i2]) {
62
- goto allocate_in_reg;
63
- }
64
- }
65
- }
66
- i_preferred_regs = o_preferred_regs;
67
}
68
69
- temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
70
- reg = ts->reg;
71
+ if (!allocate_new_reg) {
72
+ temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
73
+ reg = ts->reg;
74
+ allocate_new_reg = !tcg_regset_test_reg(arg_ct->regs, reg);
75
+ }
76
77
- if (!tcg_regset_test_reg(arg_ct->regs, reg)) {
78
- allocate_in_reg:
79
+ if (allocate_new_reg) {
80
/*
81
* Allocate a new register matching the constraint
82
* and move the temporary register into it.
83
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
84
temp_load(s, ts, tcg_target_available_regs[ts->type],
85
i_allocated_regs, 0);
86
reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
87
- o_preferred_regs, ts->indirect_base);
88
+ i_preferred_regs, ts->indirect_base);
89
if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
90
/*
91
* Cross register class move not supported. Sync the
92
--
93
2.34.1
94
95
diff view generated by jsdifflib
1
Reviewed-by: Emilio G. Cota <cota@braap.org>
1
The hppa host code has been removed since 2013; this
2
should have been deleted at the same time.
3
4
Fixes: 802b5081233a ("tcg-hppa: Remove tcg backend")
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
7
---
4
target/arm/helper-a64.c | 259 +++++++++++++++++++++-------------------
8
tcg/aarch64/tcg-target.h | 1 -
5
1 file changed, 133 insertions(+), 126 deletions(-)
9
tcg/arm/tcg-target.h | 1 -
10
tcg/tcg.c | 32 ++------------------------------
11
3 files changed, 2 insertions(+), 32 deletions(-)
6
12
7
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
13
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
8
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
9
--- a/target/arm/helper-a64.c
15
--- a/tcg/aarch64/tcg-target.h
10
+++ b/target/arm/helper-a64.c
16
+++ b/tcg/aarch64/tcg-target.h
11
@@ -XXX,XX +XXX,XX @@
17
@@ -XXX,XX +XXX,XX @@
12
#include "exec/exec-all.h"
18
#define TCG_TARGET_INSN_UNIT_SIZE 4
13
#include "exec/cpu_ldst.h"
19
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
14
#include "qemu/int128.h"
20
#define MAX_CODE_GEN_BUFFER_SIZE (2 * GiB)
15
+#include "qemu/atomic128.h"
21
-#undef TCG_TARGET_STACK_GROWSUP
16
#include "tcg.h"
22
17
#include "fpu/softfloat.h"
23
typedef enum {
18
#include <zlib.h> /* For crc32 */
24
TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
19
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
25
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
20
return crc32c(acc, buf, bytes) ^ 0xffffffff;
26
index XXXXXXX..XXXXXXX 100644
27
--- a/tcg/arm/tcg-target.h
28
+++ b/tcg/arm/tcg-target.h
29
@@ -XXX,XX +XXX,XX @@ extern int arm_arch;
30
31
#define use_armv7_instructions (__ARM_ARCH >= 7 || arm_arch >= 7)
32
33
-#undef TCG_TARGET_STACK_GROWSUP
34
#define TCG_TARGET_INSN_UNIT_SIZE 4
35
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
36
#define MAX_CODE_GEN_BUFFER_SIZE UINT32_MAX
37
diff --git a/tcg/tcg.c b/tcg/tcg.c
38
index XXXXXXX..XXXXXXX 100644
39
--- a/tcg/tcg.c
40
+++ b/tcg/tcg.c
41
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
42
}
43
44
if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
45
- /*
46
- * If stack grows up, then we will be placing successive
47
- * arguments at lower addresses, which means we need to
48
- * reverse the order compared to how we would normally
49
- * treat either big or little-endian. For those arguments
50
- * that will wind up in registers, this still works for
51
- * HPPA (the only current STACK_GROWSUP target) since the
52
- * argument registers are *also* allocated in decreasing
53
- * order. If another such target is added, this logic may
54
- * have to get more complicated to differentiate between
55
- * stack arguments and register arguments.
56
- */
57
-#if HOST_BIG_ENDIAN != defined(TCG_TARGET_STACK_GROWSUP)
58
- op->args[pi++] = temp_arg(args[i] + 1);
59
- op->args[pi++] = temp_arg(args[i]);
60
-#else
61
- op->args[pi++] = temp_arg(args[i]);
62
- op->args[pi++] = temp_arg(args[i] + 1);
63
-#endif
64
+ op->args[pi++] = temp_arg(args[i] + HOST_BIG_ENDIAN);
65
+ op->args[pi++] = temp_arg(args[i] + !HOST_BIG_ENDIAN);
66
real_args += 2;
67
continue;
68
}
69
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
70
return true;
21
}
71
}
22
72
23
-/* Returns 0 on success; 1 otherwise. */
73
-#ifdef TCG_TARGET_STACK_GROWSUP
24
-static uint64_t do_paired_cmpxchg64_le(CPUARMState *env, uint64_t addr,
74
-#define STACK_DIR(x) (-(x))
25
- uint64_t new_lo, uint64_t new_hi,
75
-#else
26
- bool parallel, uintptr_t ra)
76
-#define STACK_DIR(x) (x)
27
+uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
77
-#endif
28
+ uint64_t new_lo, uint64_t new_hi)
78
-
79
static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
29
{
80
{
30
- Int128 oldv, cmpv, newv;
81
const int nb_oargs = TCGOP_CALLO(op);
31
+ Int128 cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
82
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
32
+ Int128 newv = int128_make128(new_lo, new_hi);
83
stack_offset = TCG_TARGET_CALL_STACK_OFFSET;
33
+ Int128 oldv;
84
for (i = nb_regs; i < nb_iargs; i++) {
34
+ uintptr_t ra = GETPC();
85
arg = op->args[nb_oargs + i];
35
+ uint64_t o0, o1;
86
-#ifdef TCG_TARGET_STACK_GROWSUP
36
bool success;
87
- stack_offset -= sizeof(tcg_target_long);
37
38
- cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
39
- newv = int128_make128(new_lo, new_hi);
40
-
41
- if (parallel) {
42
-#ifndef CONFIG_ATOMIC128
43
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
44
-#else
45
- int mem_idx = cpu_mmu_index(env, false);
46
- TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
47
- oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
48
- success = int128_eq(oldv, cmpv);
49
-#endif
88
-#endif
50
- } else {
89
if (arg != TCG_CALL_DUMMY_ARG) {
51
- uint64_t o0, o1;
90
ts = arg_temp(arg);
52
-
91
temp_load(s, ts, tcg_target_available_regs[ts->type],
53
#ifdef CONFIG_USER_ONLY
92
s->reserved_regs, 0);
54
- /* ??? Enforce alignment. */
93
tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset);
55
- uint64_t *haddr = g2h(addr);
94
}
56
+ /* ??? Enforce alignment. */
95
-#ifndef TCG_TARGET_STACK_GROWSUP
57
+ uint64_t *haddr = g2h(addr);
96
stack_offset += sizeof(tcg_target_long);
58
59
- helper_retaddr = ra;
60
- o0 = ldq_le_p(haddr + 0);
61
- o1 = ldq_le_p(haddr + 1);
62
- oldv = int128_make128(o0, o1);
63
+ helper_retaddr = ra;
64
+ o0 = ldq_le_p(haddr + 0);
65
+ o1 = ldq_le_p(haddr + 1);
66
+ oldv = int128_make128(o0, o1);
67
68
- success = int128_eq(oldv, cmpv);
69
- if (success) {
70
- stq_le_p(haddr + 0, int128_getlo(newv));
71
- stq_le_p(haddr + 1, int128_gethi(newv));
72
- }
73
- helper_retaddr = 0;
74
-#else
75
- int mem_idx = cpu_mmu_index(env, false);
76
- TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
77
- TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
78
-
79
- o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
80
- o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
81
- oldv = int128_make128(o0, o1);
82
-
83
- success = int128_eq(oldv, cmpv);
84
- if (success) {
85
- helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
86
- helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
87
- }
88
-#endif
89
+ success = int128_eq(oldv, cmpv);
90
+ if (success) {
91
+ stq_le_p(haddr + 0, int128_getlo(newv));
92
+ stq_le_p(haddr + 1, int128_gethi(newv));
93
}
94
+ helper_retaddr = 0;
95
+#else
96
+ int mem_idx = cpu_mmu_index(env, false);
97
+ TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
98
+ TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
99
+
100
+ o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
101
+ o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
102
+ oldv = int128_make128(o0, o1);
103
+
104
+ success = int128_eq(oldv, cmpv);
105
+ if (success) {
106
+ helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
107
+ helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
108
+ }
109
+#endif
110
111
return !success;
112
}
113
114
-uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
115
- uint64_t new_lo, uint64_t new_hi)
116
-{
117
- return do_paired_cmpxchg64_le(env, addr, new_lo, new_hi, false, GETPC());
118
-}
119
-
120
uint64_t HELPER(paired_cmpxchg64_le_parallel)(CPUARMState *env, uint64_t addr,
121
uint64_t new_lo, uint64_t new_hi)
122
-{
123
- return do_paired_cmpxchg64_le(env, addr, new_lo, new_hi, true, GETPC());
124
-}
125
-
126
-static uint64_t do_paired_cmpxchg64_be(CPUARMState *env, uint64_t addr,
127
- uint64_t new_lo, uint64_t new_hi,
128
- bool parallel, uintptr_t ra)
129
{
130
Int128 oldv, cmpv, newv;
131
+ uintptr_t ra = GETPC();
132
bool success;
133
+ int mem_idx;
134
+ TCGMemOpIdx oi;
135
136
- /* high and low need to be switched here because this is not actually a
137
- * 128bit store but two doublewords stored consecutively
138
- */
139
- cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
140
- newv = int128_make128(new_hi, new_lo);
141
-
142
- if (parallel) {
143
-#ifndef CONFIG_ATOMIC128
144
+ if (!HAVE_CMPXCHG128) {
145
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
146
-#else
147
- int mem_idx = cpu_mmu_index(env, false);
148
- TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
149
- oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
150
- success = int128_eq(oldv, cmpv);
151
-#endif
152
- } else {
153
- uint64_t o0, o1;
154
-
155
-#ifdef CONFIG_USER_ONLY
156
- /* ??? Enforce alignment. */
157
- uint64_t *haddr = g2h(addr);
158
-
159
- helper_retaddr = ra;
160
- o1 = ldq_be_p(haddr + 0);
161
- o0 = ldq_be_p(haddr + 1);
162
- oldv = int128_make128(o0, o1);
163
-
164
- success = int128_eq(oldv, cmpv);
165
- if (success) {
166
- stq_be_p(haddr + 0, int128_gethi(newv));
167
- stq_be_p(haddr + 1, int128_getlo(newv));
168
- }
169
- helper_retaddr = 0;
170
-#else
171
- int mem_idx = cpu_mmu_index(env, false);
172
- TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
173
- TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
174
-
175
- o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
176
- o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
177
- oldv = int128_make128(o0, o1);
178
-
179
- success = int128_eq(oldv, cmpv);
180
- if (success) {
181
- helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
182
- helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
183
- }
184
-#endif
97
-#endif
185
}
98
}
186
99
187
+ mem_idx = cpu_mmu_index(env, false);
100
/* assign input registers */
188
+ oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
189
+
190
+ cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
191
+ newv = int128_make128(new_lo, new_hi);
192
+ oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
193
+
194
+ success = int128_eq(oldv, cmpv);
195
return !success;
196
}
197
198
uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
199
uint64_t new_lo, uint64_t new_hi)
200
{
201
- return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, false, GETPC());
202
+ /*
203
+ * High and low need to be switched here because this is not actually a
204
+ * 128bit store but two doublewords stored consecutively
205
+ */
206
+ Int128 cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
207
+ Int128 newv = int128_make128(new_lo, new_hi);
208
+ Int128 oldv;
209
+ uintptr_t ra = GETPC();
210
+ uint64_t o0, o1;
211
+ bool success;
212
+
213
+#ifdef CONFIG_USER_ONLY
214
+ /* ??? Enforce alignment. */
215
+ uint64_t *haddr = g2h(addr);
216
+
217
+ helper_retaddr = ra;
218
+ o1 = ldq_be_p(haddr + 0);
219
+ o0 = ldq_be_p(haddr + 1);
220
+ oldv = int128_make128(o0, o1);
221
+
222
+ success = int128_eq(oldv, cmpv);
223
+ if (success) {
224
+ stq_be_p(haddr + 0, int128_gethi(newv));
225
+ stq_be_p(haddr + 1, int128_getlo(newv));
226
+ }
227
+ helper_retaddr = 0;
228
+#else
229
+ int mem_idx = cpu_mmu_index(env, false);
230
+ TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
231
+ TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
232
+
233
+ o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
234
+ o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
235
+ oldv = int128_make128(o0, o1);
236
+
237
+ success = int128_eq(oldv, cmpv);
238
+ if (success) {
239
+ helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
240
+ helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
241
+ }
242
+#endif
243
+
244
+ return !success;
245
}
246
247
uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
248
- uint64_t new_lo, uint64_t new_hi)
249
+ uint64_t new_lo, uint64_t new_hi)
250
{
251
- return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
252
+ Int128 oldv, cmpv, newv;
253
+ uintptr_t ra = GETPC();
254
+ bool success;
255
+ int mem_idx;
256
+ TCGMemOpIdx oi;
257
+
258
+ if (!HAVE_CMPXCHG128) {
259
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
260
+ }
261
+
262
+ mem_idx = cpu_mmu_index(env, false);
263
+ oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
264
+
265
+ /*
266
+ * High and low need to be switched here because this is not actually a
267
+ * 128bit store but two doublewords stored consecutively
268
+ */
269
+ cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
270
+ newv = int128_make128(new_hi, new_lo);
271
+ oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
272
+
273
+ success = int128_eq(oldv, cmpv);
274
+ return !success;
275
}
276
277
/* Writes back the old data into Rs. */
278
void HELPER(casp_le_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
279
uint64_t new_lo, uint64_t new_hi)
280
{
281
- uintptr_t ra = GETPC();
282
-#ifndef CONFIG_ATOMIC128
283
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
284
-#else
285
Int128 oldv, cmpv, newv;
286
+ uintptr_t ra = GETPC();
287
+ int mem_idx;
288
+ TCGMemOpIdx oi;
289
+
290
+ if (!HAVE_CMPXCHG128) {
291
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
292
+ }
293
+
294
+ mem_idx = cpu_mmu_index(env, false);
295
+ oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
296
297
cmpv = int128_make128(env->xregs[rs], env->xregs[rs + 1]);
298
newv = int128_make128(new_lo, new_hi);
299
-
300
- int mem_idx = cpu_mmu_index(env, false);
301
- TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
302
oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
303
304
env->xregs[rs] = int128_getlo(oldv);
305
env->xregs[rs + 1] = int128_gethi(oldv);
306
-#endif
307
}
308
309
void HELPER(casp_be_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
310
uint64_t new_hi, uint64_t new_lo)
311
{
312
- uintptr_t ra = GETPC();
313
-#ifndef CONFIG_ATOMIC128
314
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
315
-#else
316
Int128 oldv, cmpv, newv;
317
+ uintptr_t ra = GETPC();
318
+ int mem_idx;
319
+ TCGMemOpIdx oi;
320
+
321
+ if (!HAVE_CMPXCHG128) {
322
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
323
+ }
324
+
325
+ mem_idx = cpu_mmu_index(env, false);
326
+ oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
327
328
cmpv = int128_make128(env->xregs[rs + 1], env->xregs[rs]);
329
newv = int128_make128(new_lo, new_hi);
330
-
331
- int mem_idx = cpu_mmu_index(env, false);
332
- TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
333
oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
334
335
env->xregs[rs + 1] = int128_getlo(oldv);
336
env->xregs[rs] = int128_gethi(oldv);
337
-#endif
338
}
339
340
/*
341
--
101
--
342
2.17.2
102
2.34.1
343
103
344
104
New patch
1
Unused since commit 7b7d8b2d9a ("tcg/tci: Use ffi for calls").
1
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/tci.c | 1 -
7
tcg/tci/tcg-target.c.inc | 4 ----
8
2 files changed, 5 deletions(-)
9
10
diff --git a/tcg/tci.c b/tcg/tci.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/tci.c
13
+++ b/tcg/tci.c
14
@@ -XXX,XX +XXX,XX @@
15
*/
16
17
#include "qemu/osdep.h"
18
-#include "tcg/tcg.h" /* MAX_OPC_PARAM_IARGS */
19
#include "exec/cpu_ldst.h"
20
#include "tcg/tcg-op.h"
21
#include "tcg/tcg-ldst.h"
22
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
23
index XXXXXXX..XXXXXXX 100644
24
--- a/tcg/tci/tcg-target.c.inc
25
+++ b/tcg/tci/tcg-target.c.inc
26
@@ -XXX,XX +XXX,XX @@ static const int tcg_target_reg_alloc_order[] = {
27
TCG_REG_R0,
28
};
29
30
-#if MAX_OPC_PARAM_IARGS != 7
31
-# error Fix needed, number of supported input arguments changed!
32
-#endif
33
-
34
/* No call arguments via registers. All will be stored on the "stack". */
35
static const int tcg_target_call_iarg_regs[] = { };
36
37
--
38
2.34.1
39
40
New patch
1
The assignment to mem_coherent should be done with any
2
modification, not simply with a newly allocated register.
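
The invariant at stake, in a minimal sketch (illustrative types only,
not the real TCGTemp):

    #include <stdbool.h>

    /* Illustrative only: a temp records whether its stack slot still
     * matches the value currently held in its host register. */
    typedef struct {
        int reg;            /* host register holding the value */
        bool mem_coherent;  /* stack slot agrees with the register */
    } ExampleTemp;

    /* Whenever the register contents are (re)computed, the stack copy
     * is stale -- even if the register itself was not newly allocated. */
    static void example_temp_modified(ExampleTemp *t)
    {
        t->mem_coherent = false;
    }
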
1
3
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
tcg/tcg.c | 4 ++--
7
1 file changed, 2 insertions(+), 2 deletions(-)
8
9
diff --git a/tcg/tcg.c b/tcg/tcg.c
10
index XXXXXXX..XXXXXXX 100644
11
--- a/tcg/tcg.c
12
+++ b/tcg/tcg.c
13
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
14
ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
15
op->output_pref[0], ots->indirect_base);
16
ots->val_type = TEMP_VAL_REG;
17
- ots->mem_coherent = 0;
18
s->reg_to_temp[ots->reg] = ots;
19
}
20
21
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
22
tcg_debug_assert(ok);
23
24
done:
25
+ ots->mem_coherent = 0;
26
if (IS_DEAD_ARG(1)) {
27
temp_dead(s, its);
28
}
29
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
30
ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
31
op->output_pref[0], ots->indirect_base);
32
ots->val_type = TEMP_VAL_REG;
33
- ots->mem_coherent = 0;
34
s->reg_to_temp[ots->reg] = ots;
35
}
36
37
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
38
return false;
39
40
done:
41
+ ots->mem_coherent = 0;
42
if (IS_DEAD_ARG(1)) {
43
temp_dead(s, itsl);
44
}
45
--
46
2.34.1
New patch
1
1
Create two new functions, set_temp_val_{reg,nonreg}.
2
Assert that the reg_to_temp mapping is correct before
3
any changes are made.
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/tcg.c | 159 +++++++++++++++++++++++++++++-------------------------
9
1 file changed, 85 insertions(+), 74 deletions(-)
10
11
diff --git a/tcg/tcg.c b/tcg/tcg.c
12
index XXXXXXX..XXXXXXX 100644
13
--- a/tcg/tcg.c
14
+++ b/tcg/tcg.c
15
@@ -XXX,XX +XXX,XX @@ static void temp_allocate_frame(TCGContext *s, TCGTemp *ts)
16
ts->mem_allocated = 1;
17
}
18
19
+/* Assign @reg to @ts, and update reg_to_temp[]. */
20
+static void set_temp_val_reg(TCGContext *s, TCGTemp *ts, TCGReg reg)
21
+{
22
+ if (ts->val_type == TEMP_VAL_REG) {
23
+ TCGReg old = ts->reg;
24
+ tcg_debug_assert(s->reg_to_temp[old] == ts);
25
+ if (old == reg) {
26
+ return;
27
+ }
28
+ s->reg_to_temp[old] = NULL;
29
+ }
30
+ tcg_debug_assert(s->reg_to_temp[reg] == NULL);
31
+ s->reg_to_temp[reg] = ts;
32
+ ts->val_type = TEMP_VAL_REG;
33
+ ts->reg = reg;
34
+}
35
+
36
+/* Assign a non-register value type to @ts, and update reg_to_temp[]. */
37
+static void set_temp_val_nonreg(TCGContext *s, TCGTemp *ts, TCGTempVal type)
38
+{
39
+ tcg_debug_assert(type != TEMP_VAL_REG);
40
+ if (ts->val_type == TEMP_VAL_REG) {
41
+ TCGReg reg = ts->reg;
42
+ tcg_debug_assert(s->reg_to_temp[reg] == ts);
43
+ s->reg_to_temp[reg] = NULL;
44
+ }
45
+ ts->val_type = type;
46
+}
47
+
48
static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
49
50
/* Mark a temporary as free or dead. If 'free_or_dead' is negative,
51
@@ -XXX,XX +XXX,XX @@ static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
52
default:
53
g_assert_not_reached();
54
}
55
- if (ts->val_type == TEMP_VAL_REG) {
56
- s->reg_to_temp[ts->reg] = NULL;
57
- }
58
- ts->val_type = new_type;
59
+ set_temp_val_nonreg(s, ts, new_type);
60
}
61
62
/* Mark a temporary as dead. */
63
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
64
default:
65
tcg_abort();
66
}
67
- ts->reg = reg;
68
- ts->val_type = TEMP_VAL_REG;
69
- s->reg_to_temp[reg] = ts;
70
+ set_temp_val_reg(s, ts, reg);
71
}
72
73
/* Save a temporary to memory. 'allocated_regs' is used in case a
74
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
75
tcg_debug_assert(!temp_readonly(ots));
76
77
/* The movi is not explicitly generated here. */
78
- if (ots->val_type == TEMP_VAL_REG) {
79
- s->reg_to_temp[ots->reg] = NULL;
80
- }
81
- ots->val_type = TEMP_VAL_CONST;
82
+ set_temp_val_nonreg(s, ots, TEMP_VAL_CONST);
83
ots->val = val;
84
ots->mem_coherent = 0;
85
if (NEED_SYNC_ARG(0)) {
86
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
87
TCGRegSet allocated_regs, preferred_regs;
88
TCGTemp *ts, *ots;
89
TCGType otype, itype;
90
+ TCGReg oreg, ireg;
91
92
allocated_regs = s->reserved_regs;
93
preferred_regs = op->output_pref[0];
94
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
95
temp_load(s, ts, tcg_target_available_regs[itype],
96
allocated_regs, preferred_regs);
97
}
98
-
99
tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
100
+ ireg = ts->reg;
101
+
102
if (IS_DEAD_ARG(0)) {
103
/* mov to a non-saved dead register makes no sense (even with
104
liveness analysis disabled). */
105
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
106
if (!ots->mem_allocated) {
107
temp_allocate_frame(s, ots);
108
}
109
- tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset);
110
+ tcg_out_st(s, otype, ireg, ots->mem_base->reg, ots->mem_offset);
111
if (IS_DEAD_ARG(1)) {
112
temp_dead(s, ts);
113
}
114
temp_dead(s, ots);
115
+ return;
116
+ }
117
+
118
+ if (IS_DEAD_ARG(1) && ts->kind != TEMP_FIXED) {
119
+ /*
120
+ * The mov can be suppressed. Kill input first, so that it
121
+ * is unlinked from reg_to_temp, then set the output to the
122
+ * reg that we saved from the input.
123
+ */
124
+ temp_dead(s, ts);
125
+ oreg = ireg;
126
} else {
127
- if (IS_DEAD_ARG(1) && ts->kind != TEMP_FIXED) {
128
- /* the mov can be suppressed */
129
- if (ots->val_type == TEMP_VAL_REG) {
130
- s->reg_to_temp[ots->reg] = NULL;
131
- }
132
- ots->reg = ts->reg;
133
- temp_dead(s, ts);
134
+ if (ots->val_type == TEMP_VAL_REG) {
135
+ oreg = ots->reg;
136
} else {
137
- if (ots->val_type != TEMP_VAL_REG) {
138
- /* When allocating a new register, make sure to not spill the
139
- input one. */
140
- tcg_regset_set_reg(allocated_regs, ts->reg);
141
- ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
142
- allocated_regs, preferred_regs,
143
- ots->indirect_base);
144
- }
145
- if (!tcg_out_mov(s, otype, ots->reg, ts->reg)) {
146
- /*
147
- * Cross register class move not supported.
148
- * Store the source register into the destination slot
149
- * and leave the destination temp as TEMP_VAL_MEM.
150
- */
151
- assert(!temp_readonly(ots));
152
- if (!ts->mem_allocated) {
153
- temp_allocate_frame(s, ots);
154
- }
155
- tcg_out_st(s, ts->type, ts->reg,
156
- ots->mem_base->reg, ots->mem_offset);
157
- ots->mem_coherent = 1;
158
- temp_free_or_dead(s, ots, -1);
159
- return;
160
- }
161
+ /* Make sure to not spill the input register during allocation. */
162
+ oreg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
163
+ allocated_regs | ((TCGRegSet)1 << ireg),
164
+ preferred_regs, ots->indirect_base);
165
}
166
- ots->val_type = TEMP_VAL_REG;
167
- ots->mem_coherent = 0;
168
- s->reg_to_temp[ots->reg] = ots;
169
- if (NEED_SYNC_ARG(0)) {
170
- temp_sync(s, ots, allocated_regs, 0, 0);
171
+ if (!tcg_out_mov(s, otype, oreg, ireg)) {
172
+ /*
173
+ * Cross register class move not supported.
174
+ * Store the source register into the destination slot
175
+ * and leave the destination temp as TEMP_VAL_MEM.
176
+ */
177
+ assert(!temp_readonly(ots));
178
+ if (!ts->mem_allocated) {
179
+ temp_allocate_frame(s, ots);
180
+ }
181
+ tcg_out_st(s, ts->type, ireg, ots->mem_base->reg, ots->mem_offset);
182
+ set_temp_val_nonreg(s, ts, TEMP_VAL_MEM);
183
+ ots->mem_coherent = 1;
184
+ return;
185
}
186
}
187
+ set_temp_val_reg(s, ots, oreg);
188
+ ots->mem_coherent = 0;
189
+
190
+ if (NEED_SYNC_ARG(0)) {
191
+ temp_sync(s, ots, allocated_regs, 0, 0);
192
+ }
193
}
194
195
/*
196
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
197
/* Allocate the output register now. */
198
if (ots->val_type != TEMP_VAL_REG) {
199
TCGRegSet allocated_regs = s->reserved_regs;
200
+ TCGReg oreg;
201
202
if (!IS_DEAD_ARG(1) && its->val_type == TEMP_VAL_REG) {
203
/* Make sure to not spill the input register. */
204
tcg_regset_set_reg(allocated_regs, its->reg);
205
}
206
- ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
207
- op->output_pref[0], ots->indirect_base);
208
- ots->val_type = TEMP_VAL_REG;
209
- s->reg_to_temp[ots->reg] = ots;
210
+ oreg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
211
+ op->output_pref[0], ots->indirect_base);
212
+ set_temp_val_reg(s, ots, oreg);
213
}
214
215
switch (its->val_type) {
216
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
217
#else
218
endian_fixup = 0;
219
#endif
220
+ /* Attempt to dup directly from the input memory slot. */
221
if (tcg_out_dupm_vec(s, vtype, vece, ots->reg, its->mem_base->reg,
222
its->mem_offset + endian_fixup)) {
223
goto done;
224
}
225
+ /* Load the input into the destination vector register. */
226
tcg_out_ld(s, itype, ots->reg, its->mem_base->reg, its->mem_offset);
227
break;
228
229
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
230
op->output_pref[k], ts->indirect_base);
231
}
232
tcg_regset_set_reg(o_allocated_regs, reg);
233
- if (ts->val_type == TEMP_VAL_REG) {
234
- s->reg_to_temp[ts->reg] = NULL;
235
- }
236
- ts->val_type = TEMP_VAL_REG;
237
- ts->reg = reg;
238
- /*
239
- * Temp value is modified, so the value kept in memory is
240
- * potentially not the same.
241
- */
242
+ set_temp_val_reg(s, ts, reg);
243
ts->mem_coherent = 0;
244
- s->reg_to_temp[reg] = ts;
245
new_args[i] = reg;
246
}
247
}
248
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
249
TCGRegSet allocated_regs = s->reserved_regs;
250
TCGRegSet dup_out_regs =
251
tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
252
+ TCGReg oreg;
253
254
/* Make sure to not spill the input registers. */
255
if (!IS_DEAD_ARG(1) && itsl->val_type == TEMP_VAL_REG) {
256
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
257
tcg_regset_set_reg(allocated_regs, itsh->reg);
258
}
259
260
- ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
261
- op->output_pref[0], ots->indirect_base);
262
- ots->val_type = TEMP_VAL_REG;
263
- s->reg_to_temp[ots->reg] = ots;
264
+ oreg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
265
+ op->output_pref[0], ots->indirect_base);
266
+ set_temp_val_reg(s, ots, oreg);
267
}
268
269
/* Promote dup2 of immediates to dupi_vec. */
270
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
271
tcg_debug_assert(!temp_readonly(ts));
272
273
reg = tcg_target_call_oarg_regs[i];
274
- tcg_debug_assert(s->reg_to_temp[reg] == NULL);
275
- if (ts->val_type == TEMP_VAL_REG) {
276
- s->reg_to_temp[ts->reg] = NULL;
277
- }
278
- ts->val_type = TEMP_VAL_REG;
279
- ts->reg = reg;
280
+ set_temp_val_reg(s, ts, reg);
281
ts->mem_coherent = 0;
282
- s->reg_to_temp[reg] = ts;
283
if (NEED_SYNC_ARG(i)) {
284
temp_sync(s, ts, allocated_regs, 0, IS_DEAD_ARG(i));
285
} else if (IS_DEAD_ARG(i)) {
286
--
287
2.34.1
288
289
1
From: "Emilio G. Cota" <cota@braap.org>
1
We now check the consistency of reg_to_temp[] with each update,
2
2
so the utility of checking consistency at the end of each
3
As far as I can tell tlb_flush does not need to be called
3
opcode is minimal. In addition, the form of this check is
4
this early. tlb_flush is eventually called after the CPU
4
quite expensive, consuming 10% of the run time of a checking-enabled build.
5
has been realized.
6
7
This change paves the way for the introduction of tlb_init,
8
which will be called from cpu_exec_realizefn.
9
5
10
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
11
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
12
Signed-off-by: Emilio G. Cota <cota@braap.org>
13
Message-Id: <20181009174557.16125-2-cota@braap.org>
14
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
8
---
16
target/alpha/cpu.c | 1 -
9
tcg/tcg.c | 76 -------------------------------------------------------
17
1 file changed, 1 deletion(-)
10
1 file changed, 76 deletions(-)
18
11
19
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
12
diff --git a/tcg/tcg.c b/tcg/tcg.c
20
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
21
--- a/target/alpha/cpu.c
14
--- a/tcg/tcg.c
22
+++ b/target/alpha/cpu.c
15
+++ b/tcg/tcg.c
23
@@ -XXX,XX +XXX,XX @@ static void alpha_cpu_initfn(Object *obj)
16
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
24
CPUAlphaState *env = &cpu->env;
17
return changes;
25
18
}
26
cs->env_ptr = env;
19
27
- tlb_flush(cs);
20
-#ifdef CONFIG_DEBUG_TCG
28
21
-static void dump_regs(TCGContext *s)
29
env->lock_addr = -1;
22
-{
30
#if defined(CONFIG_USER_ONLY)
23
- TCGTemp *ts;
24
- int i;
25
- char buf[64];
26
-
27
- for(i = 0; i < s->nb_temps; i++) {
28
- ts = &s->temps[i];
29
- printf(" %10s: ", tcg_get_arg_str_ptr(s, buf, sizeof(buf), ts));
30
- switch(ts->val_type) {
31
- case TEMP_VAL_REG:
32
- printf("%s", tcg_target_reg_names[ts->reg]);
33
- break;
34
- case TEMP_VAL_MEM:
35
- printf("%d(%s)", (int)ts->mem_offset,
36
- tcg_target_reg_names[ts->mem_base->reg]);
37
- break;
38
- case TEMP_VAL_CONST:
39
- printf("$0x%" PRIx64, ts->val);
40
- break;
41
- case TEMP_VAL_DEAD:
42
- printf("D");
43
- break;
44
- default:
45
- printf("???");
46
- break;
47
- }
48
- printf("\n");
49
- }
50
-
51
- for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
52
- if (s->reg_to_temp[i] != NULL) {
53
- printf("%s: %s\n",
54
- tcg_target_reg_names[i],
55
- tcg_get_arg_str_ptr(s, buf, sizeof(buf), s->reg_to_temp[i]));
56
- }
57
- }
58
-}
59
-
60
-static void check_regs(TCGContext *s)
61
-{
62
- int reg;
63
- int k;
64
- TCGTemp *ts;
65
- char buf[64];
66
-
67
- for (reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
68
- ts = s->reg_to_temp[reg];
69
- if (ts != NULL) {
70
- if (ts->val_type != TEMP_VAL_REG || ts->reg != reg) {
71
- printf("Inconsistency for register %s:\n",
72
- tcg_target_reg_names[reg]);
73
- goto fail;
74
- }
75
- }
76
- }
77
- for (k = 0; k < s->nb_temps; k++) {
78
- ts = &s->temps[k];
79
- if (ts->val_type == TEMP_VAL_REG
80
- && ts->kind != TEMP_FIXED
81
- && s->reg_to_temp[ts->reg] != ts) {
82
- printf("Inconsistency for temp %s:\n",
83
- tcg_get_arg_str_ptr(s, buf, sizeof(buf), ts));
84
- fail:
85
- printf("reg state:\n");
86
- dump_regs(s);
87
- tcg_abort();
88
- }
89
- }
90
-}
91
-#endif
92
-
93
static void temp_allocate_frame(TCGContext *s, TCGTemp *ts)
94
{
95
intptr_t off, size, align;
96
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
97
tcg_reg_alloc_op(s, op);
98
break;
99
}
100
-#ifdef CONFIG_DEBUG_TCG
101
- check_regs(s);
102
-#endif
103
/* Test for (pending) buffer overflow. The assumption is that any
104
one operation beginning below the high water mark cannot overrun
105
the buffer completely. Thus we can test for overflow after
31
--
106
--
32
2.17.2
107
2.34.1
33
108
34
109
New patch
1
From: Philippe Mathieu-Daudé <philmd@linaro.org>
1
2
3
In preparation for introducing paired registers,
4
massage process_op_defs()'s switch case a bit.
5
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
[PMD: Split from bigger patch, 1/3]
8
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
9
Message-Id: <20221219220925.79218-2-philmd@linaro.org>
10
---
11
tcg/tcg.c | 61 +++++++++++++++++++++++++++++++------------------------
12
1 file changed, 34 insertions(+), 27 deletions(-)
13
14
diff --git a/tcg/tcg.c b/tcg/tcg.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/tcg.c
17
+++ b/tcg/tcg.c
18
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
19
for (op = 0; op < NB_OPS; op++) {
20
TCGOpDef *def = &tcg_op_defs[op];
21
const TCGTargetOpDef *tdefs;
22
- int i, nb_args;
23
+ int i, o, nb_args;
24
25
if (def->flags & TCG_OPF_NOT_PRESENT) {
26
continue;
27
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
28
29
for (i = 0; i < nb_args; i++) {
30
const char *ct_str = tdefs->args_ct_str[i];
31
+ bool input_p = i >= def->nb_oargs;
32
+
33
/* Incomplete TCGTargetOpDef entry. */
34
tcg_debug_assert(ct_str != NULL);
35
36
- while (*ct_str != '\0') {
37
- switch(*ct_str) {
38
- case '0' ... '9':
39
- {
40
- int oarg = *ct_str - '0';
41
- tcg_debug_assert(ct_str == tdefs->args_ct_str[i]);
42
- tcg_debug_assert(oarg < def->nb_oargs);
43
- tcg_debug_assert(def->args_ct[oarg].regs != 0);
44
- def->args_ct[i] = def->args_ct[oarg];
45
- /* The output sets oalias. */
46
- def->args_ct[oarg].oalias = true;
47
- def->args_ct[oarg].alias_index = i;
48
- /* The input sets ialias. */
49
- def->args_ct[i].ialias = true;
50
- def->args_ct[i].alias_index = oarg;
51
- }
52
- ct_str++;
53
- break;
54
- case '&':
55
- def->args_ct[i].newreg = true;
56
- ct_str++;
57
- break;
58
+ switch (*ct_str) {
59
+ case '0' ... '9':
60
+ o = *ct_str - '0';
61
+ tcg_debug_assert(input_p);
62
+ tcg_debug_assert(o < def->nb_oargs);
63
+ tcg_debug_assert(def->args_ct[o].regs != 0);
64
+ tcg_debug_assert(!def->args_ct[o].oalias);
65
+ def->args_ct[i] = def->args_ct[o];
66
+ /* The output sets oalias. */
67
+ def->args_ct[o].oalias = 1;
68
+ def->args_ct[o].alias_index = i;
69
+ /* The input sets ialias. */
70
+ def->args_ct[i].ialias = 1;
71
+ def->args_ct[i].alias_index = o;
72
+ tcg_debug_assert(ct_str[1] == '\0');
73
+ continue;
74
+
75
+ case '&':
76
+ tcg_debug_assert(!input_p);
77
+ def->args_ct[i].newreg = true;
78
+ ct_str++;
79
+ break;
80
+ }
81
+
82
+ do {
83
+ switch (*ct_str) {
84
case 'i':
85
def->args_ct[i].ct |= TCG_CT_CONST;
86
- ct_str++;
87
break;
88
89
/* Include all of the target-specific constraints. */
90
91
#undef CONST
92
#define CONST(CASE, MASK) \
93
- case CASE: def->args_ct[i].ct |= MASK; ct_str++; break;
94
+ case CASE: def->args_ct[i].ct |= MASK; break;
95
#define REGS(CASE, MASK) \
96
- case CASE: def->args_ct[i].regs |= MASK; ct_str++; break;
97
+ case CASE: def->args_ct[i].regs |= MASK; break;
98
99
#include "tcg-target-con-str.h"
100
101
#undef REGS
102
#undef CONST
103
default:
104
+ case '0' ... '9':
105
+ case '&':
106
/* Typo in TCGTargetOpDef constraint. */
107
g_assert_not_reached();
108
}
109
- }
110
+ } while (*++ct_str != '\0');
111
}
112
113
/* TCGTargetOpDef entry with too much information? */
114
--
115
2.34.1
116
117
New patch
1
There are several instances where we need to be able to
2
allocate a pair of registers to related inputs/outputs.
3
Add 'p' and 'm' register constraints for this, in order to
4
be able to allocate the even/odd register first or second.
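
As a purely hypothetical illustration (the operand layout below is made
up for this example; real backends wire such strings into their own op
definition tables), "p" places an operand in the register immediately
after the previous operand's register, and "m" in the one immediately
before:

    /* Hypothetical constraint strings, illustrative only. */
    static const char * const example_pair_after[]  = { "r", "p", "r", "r" };
    static const char * const example_pair_before[] = { "r", "m", "r", "r" };
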
1
5
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
include/tcg/tcg.h | 2 +
9
tcg/tcg.c | 419 ++++++++++++++++++++++++++++++++++++++++------
10
2 files changed, 373 insertions(+), 48 deletions(-)
11
12
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
13
index XXXXXXX..XXXXXXX 100644
14
--- a/include/tcg/tcg.h
15
+++ b/include/tcg/tcg.h
16
@@ -XXX,XX +XXX,XX @@ typedef struct TCGArgConstraint {
17
unsigned ct : 16;
18
unsigned alias_index : 4;
19
unsigned sort_index : 4;
20
+ unsigned pair_index : 4;
21
+ unsigned pair : 2; /* 0: none, 1: first, 2: second, 3: second alias */
22
bool oalias : 1;
23
bool ialias : 1;
24
bool newreg : 1;
25
diff --git a/tcg/tcg.c b/tcg/tcg.c
26
index XXXXXXX..XXXXXXX 100644
27
--- a/tcg/tcg.c
28
+++ b/tcg/tcg.c
29
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
30
static int get_constraint_priority(const TCGOpDef *def, int k)
31
{
32
const TCGArgConstraint *arg_ct = &def->args_ct[k];
33
- int n;
34
+ int n = ctpop64(arg_ct->regs);
35
36
- if (arg_ct->oalias) {
37
- /* an alias is equivalent to a single register */
38
- n = 1;
39
- } else {
40
- n = ctpop64(arg_ct->regs);
41
+ /*
42
+ * Sort constraints of a single register first, which includes output
43
+ * aliases (which must exactly match the input already allocated).
44
+ */
45
+ if (n == 1 || arg_ct->oalias) {
46
+ return INT_MAX;
47
}
48
- return TCG_TARGET_NB_REGS - n + 1;
49
+
50
+ /*
51
+ * Sort register pairs next, first then second immediately after.
52
+ * Arbitrarily sort multiple pairs by the index of the first reg;
53
+ * there shouldn't be many pairs.
54
+ */
55
+ switch (arg_ct->pair) {
56
+ case 1:
57
+ case 3:
58
+ return (k + 1) * 2;
59
+ case 2:
60
+ return (arg_ct->pair_index + 1) * 2 - 1;
61
+ }
62
+
63
+ /* Finally, sort by decreasing register count. */
64
+ assert(n > 1);
65
+ return -n;
66
}
67
68
/* sort from highest priority to lowest */
69
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
70
for (op = 0; op < NB_OPS; op++) {
71
TCGOpDef *def = &tcg_op_defs[op];
72
const TCGTargetOpDef *tdefs;
73
- int i, o, nb_args;
74
+ bool saw_alias_pair = false;
75
+ int i, o, i2, o2, nb_args;
76
77
if (def->flags & TCG_OPF_NOT_PRESENT) {
78
continue;
79
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
80
/* The input sets ialias. */
81
def->args_ct[i].ialias = 1;
82
def->args_ct[i].alias_index = o;
83
+ if (def->args_ct[i].pair) {
84
+ saw_alias_pair = true;
85
+ }
86
tcg_debug_assert(ct_str[1] == '\0');
87
continue;
88
89
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
90
def->args_ct[i].newreg = true;
91
ct_str++;
92
break;
93
+
94
+ case 'p': /* plus */
95
+ /* Allocate to the register after the previous. */
96
+ tcg_debug_assert(i > (input_p ? def->nb_oargs : 0));
97
+ o = i - 1;
98
+ tcg_debug_assert(!def->args_ct[o].pair);
99
+ tcg_debug_assert(!def->args_ct[o].ct);
100
+ def->args_ct[i] = (TCGArgConstraint){
101
+ .pair = 2,
102
+ .pair_index = o,
103
+ .regs = def->args_ct[o].regs << 1,
104
+ };
105
+ def->args_ct[o].pair = 1;
106
+ def->args_ct[o].pair_index = i;
107
+ tcg_debug_assert(ct_str[1] == '\0');
108
+ continue;
109
+
110
+ case 'm': /* minus */
111
+ /* Allocate to the register before the previous. */
112
+ tcg_debug_assert(i > (input_p ? def->nb_oargs : 0));
113
+ o = i - 1;
114
+ tcg_debug_assert(!def->args_ct[o].pair);
115
+ tcg_debug_assert(!def->args_ct[o].ct);
116
+ def->args_ct[i] = (TCGArgConstraint){
117
+ .pair = 1,
118
+ .pair_index = o,
119
+ .regs = def->args_ct[o].regs >> 1,
120
+ };
121
+ def->args_ct[o].pair = 2;
122
+ def->args_ct[o].pair_index = i;
123
+ tcg_debug_assert(ct_str[1] == '\0');
124
+ continue;
125
}
126
127
do {
128
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
129
default:
130
case '0' ... '9':
131
case '&':
132
+ case 'p':
133
+ case 'm':
134
/* Typo in TCGTargetOpDef constraint. */
135
g_assert_not_reached();
136
}
137
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
138
/* TCGTargetOpDef entry with too much information? */
139
tcg_debug_assert(i == TCG_MAX_OP_ARGS || tdefs->args_ct_str[i] == NULL);
140
141
+ /*
142
+ * Fix up output pairs that are aliased with inputs.
143
+ * When we created the alias, we copied pair from the output.
144
+ * There are three cases:
145
+ * (1a) Pairs of inputs alias pairs of outputs.
146
+ * (1b) One input aliases the first of a pair of outputs.
147
+ * (2) One input aliases the second of a pair of outputs.
148
+ *
149
+ * Case 1a is handled by making sure that the pair_index'es are
150
+ * properly updated so that they appear the same as a pair of inputs.
151
+ *
152
+ * Case 1b is handled by setting the pair_index of the input to
153
+ * itself, simply so it doesn't point to an unrelated argument.
154
+ * Since we don't encounter the "second" during the input allocation
155
+ * phase, nothing happens with the second half of the input pair.
156
+ *
157
+ * Case 2 is handled by setting the second input to pair=3, the
158
+ * first output to pair=3, and the pair_index'es to match.
159
+ */
160
+ if (saw_alias_pair) {
161
+ for (i = def->nb_oargs; i < nb_args; i++) {
162
+ /*
163
+ * Since [0-9pm] must be alone in the constraint string,
164
+ * the only way they can both be set is if the pair comes
165
+ * from the output alias.
166
+ */
167
+ if (!def->args_ct[i].ialias) {
168
+ continue;
169
+ }
170
+ switch (def->args_ct[i].pair) {
171
+ case 0:
172
+ break;
173
+ case 1:
174
+ o = def->args_ct[i].alias_index;
175
+ o2 = def->args_ct[o].pair_index;
176
+ tcg_debug_assert(def->args_ct[o].pair == 1);
177
+ tcg_debug_assert(def->args_ct[o2].pair == 2);
178
+ if (def->args_ct[o2].oalias) {
179
+ /* Case 1a */
180
+ i2 = def->args_ct[o2].alias_index;
181
+ tcg_debug_assert(def->args_ct[i2].pair == 2);
182
+ def->args_ct[i2].pair_index = i;
183
+ def->args_ct[i].pair_index = i2;
184
+ } else {
185
+ /* Case 1b */
186
+ def->args_ct[i].pair_index = i;
187
+ }
188
+ break;
189
+ case 2:
190
+ o = def->args_ct[i].alias_index;
191
+ o2 = def->args_ct[o].pair_index;
192
+ tcg_debug_assert(def->args_ct[o].pair == 2);
193
+ tcg_debug_assert(def->args_ct[o2].pair == 1);
194
+ if (def->args_ct[o2].oalias) {
195
+ /* Case 1a */
196
+ i2 = def->args_ct[o2].alias_index;
197
+ tcg_debug_assert(def->args_ct[i2].pair == 1);
198
+ def->args_ct[i2].pair_index = i;
199
+ def->args_ct[i].pair_index = i2;
200
+ } else {
201
+ /* Case 2 */
202
+ def->args_ct[i].pair = 3;
203
+ def->args_ct[o2].pair = 3;
204
+ def->args_ct[i].pair_index = o2;
205
+ def->args_ct[o2].pair_index = i;
206
+ }
207
+ break;
208
+ default:
209
+ g_assert_not_reached();
210
+ }
211
+ }
212
+ }
213
+
214
/* sort the constraints (XXX: this is just an heuristic) */
215
sort_constraints(def, 0, def->nb_oargs);
216
sort_constraints(def, def->nb_oargs, def->nb_iargs);
217
@@ -XXX,XX +XXX,XX @@ static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet required_regs,
218
tcg_abort();
219
}
220
221
+static TCGReg tcg_reg_alloc_pair(TCGContext *s, TCGRegSet required_regs,
222
+ TCGRegSet allocated_regs,
223
+ TCGRegSet preferred_regs, bool rev)
224
+{
225
+ int i, j, k, fmin, n = ARRAY_SIZE(tcg_target_reg_alloc_order);
226
+ TCGRegSet reg_ct[2];
227
+ const int *order;
228
+
229
+ /* Ensure that if I is not in allocated_regs, I+1 is not either. */
230
+ reg_ct[1] = required_regs & ~(allocated_regs | (allocated_regs >> 1));
231
+ tcg_debug_assert(reg_ct[1] != 0);
232
+ reg_ct[0] = reg_ct[1] & preferred_regs;
233
+
234
+ order = rev ? indirect_reg_alloc_order : tcg_target_reg_alloc_order;
235
+
236
+ /*
237
+ * Skip the preferred_regs option if it cannot be satisfied,
238
+ * or if the preference made no difference.
239
+ */
240
+ k = reg_ct[0] == 0 || reg_ct[0] == reg_ct[1];
241
+
242
+ /*
243
+ * Minimize the number of flushes by looking for 2 free registers first,
244
+ * then a single flush, then two flushes.
245
+ */
246
+ for (fmin = 2; fmin >= 0; fmin--) {
247
+ for (j = k; j < 2; j++) {
248
+ TCGRegSet set = reg_ct[j];
249
+
250
+ for (i = 0; i < n; i++) {
251
+ TCGReg reg = order[i];
252
+
253
+ if (tcg_regset_test_reg(set, reg)) {
254
+ int f = !s->reg_to_temp[reg] + !s->reg_to_temp[reg + 1];
255
+ if (f >= fmin) {
256
+ tcg_reg_free(s, reg, allocated_regs);
257
+ tcg_reg_free(s, reg + 1, allocated_regs);
258
+ return reg;
259
+ }
260
+ }
261
+ }
262
+ }
263
+ }
264
+ tcg_abort();
265
+}
266
+
267
/* Make sure the temporary is in a register. If needed, allocate the register
268
from DESIRED while avoiding ALLOCATED. */
269
static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
270
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
271
272
/* satisfy input constraints */
273
for (k = 0; k < nb_iargs; k++) {
274
- TCGRegSet i_preferred_regs;
275
- bool allocate_new_reg;
276
+ TCGRegSet i_preferred_regs, i_required_regs;
277
+ bool allocate_new_reg, copyto_new_reg;
278
+ TCGTemp *ts2;
279
+ int i1, i2;
280
281
i = def->args_ct[nb_oargs + k].sort_index;
282
arg = op->args[i];
283
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
284
285
reg = ts->reg;
286
i_preferred_regs = 0;
287
+ i_required_regs = arg_ct->regs;
288
allocate_new_reg = false;
289
+ copyto_new_reg = false;
290
291
- if (arg_ct->ialias) {
292
+ switch (arg_ct->pair) {
293
+ case 0: /* not paired */
294
+ if (arg_ct->ialias) {
295
+ i_preferred_regs = op->output_pref[arg_ct->alias_index];
296
+
297
+ /*
298
+ * If the input is not dead after the instruction,
299
+ * we must allocate a new register and move it.
300
+ */
301
+ if (!IS_DEAD_ARG(i)) {
302
+ allocate_new_reg = true;
303
+ } else if (ts->val_type == TEMP_VAL_REG) {
304
+ /*
305
+ * Check if the current register has already been
306
+ * allocated for another input.
307
+ */
308
+ allocate_new_reg =
309
+ tcg_regset_test_reg(i_allocated_regs, reg);
310
+ }
311
+ }
312
+ if (!allocate_new_reg) {
313
+ temp_load(s, ts, i_required_regs, i_allocated_regs,
314
+ i_preferred_regs);
315
+ reg = ts->reg;
316
+ allocate_new_reg = !tcg_regset_test_reg(i_required_regs, reg);
317
+ }
318
+ if (allocate_new_reg) {
319
+ /*
320
+ * Allocate a new register matching the constraint
321
+ * and move the temporary register into it.
322
+ */
323
+ temp_load(s, ts, tcg_target_available_regs[ts->type],
324
+ i_allocated_regs, 0);
325
+ reg = tcg_reg_alloc(s, i_required_regs, i_allocated_regs,
326
+ i_preferred_regs, ts->indirect_base);
327
+ copyto_new_reg = true;
328
+ }
329
+ break;
330
+
331
+ case 1:
332
+ /* First of an input pair; if i1 == i2, the second is an output. */
333
+ i1 = i;
334
+ i2 = arg_ct->pair_index;
335
+ ts2 = i1 != i2 ? arg_temp(op->args[i2]) : NULL;
336
+
337
+ /*
338
+ * It is easier to default to allocating a new pair
339
+ * and to identify a few cases where it's not required.
340
+ */
341
+ if (arg_ct->ialias) {
342
+ i_preferred_regs = op->output_pref[arg_ct->alias_index];
343
+ if (IS_DEAD_ARG(i1) &&
344
+ IS_DEAD_ARG(i2) &&
345
+ ts->val_type == TEMP_VAL_REG &&
346
+ ts->reg < TCG_TARGET_NB_REGS - 1 &&
347
+ tcg_regset_test_reg(i_required_regs, reg) &&
348
+ !tcg_regset_test_reg(i_allocated_regs, reg) &&
349
+ !tcg_regset_test_reg(i_allocated_regs, reg + 1) &&
350
+ (ts2
351
+ ? ts2->val_type == TEMP_VAL_REG &&
352
+ ts2->reg == reg + 1
353
+ : s->reg_to_temp[reg + 1] == NULL)) {
354
+ break;
355
+ }
356
+ } else {
357
+ /* Without aliasing, the pair must also be an input. */
358
+ tcg_debug_assert(ts2);
359
+ if (ts->val_type == TEMP_VAL_REG &&
360
+ ts2->val_type == TEMP_VAL_REG &&
361
+ ts2->reg == reg + 1 &&
362
+ tcg_regset_test_reg(i_required_regs, reg)) {
363
+ break;
364
+ }
365
+ }
366
+ reg = tcg_reg_alloc_pair(s, i_required_regs, i_allocated_regs,
367
+ 0, ts->indirect_base);
368
+ goto do_pair;
369
+
370
+ case 2: /* pair second */
371
+ reg = new_args[arg_ct->pair_index] + 1;
372
+ goto do_pair;
373
+
374
+ case 3: /* ialias with second output, no first input */
375
+ tcg_debug_assert(arg_ct->ialias);
376
i_preferred_regs = op->output_pref[arg_ct->alias_index];
377
378
- /*
379
- * If the input is readonly, then it cannot also be an
380
- * output and aliased to itself. If the input is not
381
- * dead after the instruction, we must allocate a new
382
- * register and move it.
383
- */
384
- if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
385
- allocate_new_reg = true;
386
- } else if (ts->val_type == TEMP_VAL_REG) {
387
- /*
388
- * Check if the current register has already been
389
- * allocated for another input.
390
- */
391
- allocate_new_reg = tcg_regset_test_reg(i_allocated_regs, reg);
392
+ if (IS_DEAD_ARG(i) &&
393
+ ts->val_type == TEMP_VAL_REG &&
394
+ reg > 0 &&
395
+ s->reg_to_temp[reg - 1] == NULL &&
396
+ tcg_regset_test_reg(i_required_regs, reg) &&
397
+ !tcg_regset_test_reg(i_allocated_regs, reg) &&
398
+ !tcg_regset_test_reg(i_allocated_regs, reg - 1)) {
399
+ tcg_regset_set_reg(i_allocated_regs, reg - 1);
400
+ break;
401
}
402
- }
403
+ reg = tcg_reg_alloc_pair(s, i_required_regs >> 1,
404
+ i_allocated_regs, 0,
405
+ ts->indirect_base);
406
+ tcg_regset_set_reg(i_allocated_regs, reg);
407
+ reg += 1;
408
+ goto do_pair;
409
410
- if (!allocate_new_reg) {
411
- temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
412
- reg = ts->reg;
413
- allocate_new_reg = !tcg_regset_test_reg(arg_ct->regs, reg);
414
- }
415
-
416
- if (allocate_new_reg) {
417
+ do_pair:
418
/*
419
- * Allocate a new register matching the constraint
420
- * and move the temporary register into it.
421
+ * If an aliased input is not dead after the instruction,
422
+ * we must allocate a new register and move it.
423
*/
424
- temp_load(s, ts, tcg_target_available_regs[ts->type],
425
- i_allocated_regs, 0);
426
- reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
427
- i_preferred_regs, ts->indirect_base);
428
+ if (arg_ct->ialias && !IS_DEAD_ARG(i)) {
429
+ TCGRegSet t_allocated_regs = i_allocated_regs;
430
+
431
+ /*
432
+ * Because of the alias, and the continued life, make sure
433
+ * that the temp is somewhere *other* than the reg pair,
434
+ * and we get a copy in reg.
435
+ */
436
+ tcg_regset_set_reg(t_allocated_regs, reg);
437
+ tcg_regset_set_reg(t_allocated_regs, reg + 1);
438
+ if (ts->val_type == TEMP_VAL_REG && ts->reg == reg) {
439
+ /* If ts was already in reg, copy it somewhere else. */
440
+ TCGReg nr;
441
+ bool ok;
442
+
443
+ tcg_debug_assert(ts->kind != TEMP_FIXED);
444
+ nr = tcg_reg_alloc(s, tcg_target_available_regs[ts->type],
445
+ t_allocated_regs, 0, ts->indirect_base);
446
+ ok = tcg_out_mov(s, ts->type, nr, reg);
447
+ tcg_debug_assert(ok);
448
+
449
+ set_temp_val_reg(s, ts, nr);
450
+ } else {
451
+ temp_load(s, ts, tcg_target_available_regs[ts->type],
452
+ t_allocated_regs, 0);
453
+ copyto_new_reg = true;
454
+ }
455
+ } else {
456
+ /* Preferably allocate to reg, otherwise copy. */
457
+ i_required_regs = (TCGRegSet)1 << reg;
458
+ temp_load(s, ts, i_required_regs, i_allocated_regs,
459
+ i_preferred_regs);
460
+ copyto_new_reg = ts->reg != reg;
461
+ }
462
+ break;
463
+
464
+ default:
465
+ g_assert_not_reached();
466
+ }
467
+
468
+ if (copyto_new_reg) {
469
if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
470
/*
471
* Cross register class move not supported. Sync the
472
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
473
/* ENV should not be modified. */
474
tcg_debug_assert(!temp_readonly(ts));
475
476
- if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
477
- reg = new_args[arg_ct->alias_index];
478
- } else if (arg_ct->newreg) {
479
- reg = tcg_reg_alloc(s, arg_ct->regs,
480
- i_allocated_regs | o_allocated_regs,
481
- op->output_pref[k], ts->indirect_base);
482
- } else {
483
- reg = tcg_reg_alloc(s, arg_ct->regs, o_allocated_regs,
484
- op->output_pref[k], ts->indirect_base);
485
+ switch (arg_ct->pair) {
486
+ case 0: /* not paired */
487
+ if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
488
+ reg = new_args[arg_ct->alias_index];
489
+ } else if (arg_ct->newreg) {
490
+ reg = tcg_reg_alloc(s, arg_ct->regs,
491
+ i_allocated_regs | o_allocated_regs,
492
+ op->output_pref[k], ts->indirect_base);
493
+ } else {
494
+ reg = tcg_reg_alloc(s, arg_ct->regs, o_allocated_regs,
495
+ op->output_pref[k], ts->indirect_base);
496
+ }
497
+ break;
498
+
499
+ case 1: /* first of pair */
500
+ tcg_debug_assert(!arg_ct->newreg);
501
+ if (arg_ct->oalias) {
502
+ reg = new_args[arg_ct->alias_index];
503
+ break;
504
+ }
505
+ reg = tcg_reg_alloc_pair(s, arg_ct->regs, o_allocated_regs,
506
+ op->output_pref[k], ts->indirect_base);
507
+ break;
508
+
509
+ case 2: /* second of pair */
510
+ tcg_debug_assert(!arg_ct->newreg);
511
+ if (arg_ct->oalias) {
512
+ reg = new_args[arg_ct->alias_index];
513
+ } else {
514
+ reg = new_args[arg_ct->pair_index] + 1;
515
+ }
516
+ break;
517
+
518
+ case 3: /* first of pair, aliasing with a second input */
519
+ tcg_debug_assert(!arg_ct->newreg);
520
+ reg = new_args[arg_ct->pair_index] - 1;
521
+ break;
522
+
523
+ default:
524
+ g_assert_not_reached();
525
}
526
tcg_regset_set_reg(o_allocated_regs, reg);
527
set_temp_val_reg(s, ts, reg);
528
--
529
2.34.1
New patch
1
While we initialize this value in cpu_common_reset, that
2
isn't called during startup, so set it as well in init.
3
This fixes -singlestep versus the very first TB.
1
4
5
Fixes: 04f5b647ed07 ("accel/tcg: Handle -singlestep in curr_cflags")
6
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
hw/core/cpu-common.c | 1 +
10
1 file changed, 1 insertion(+)
11
12
diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
13
index XXXXXXX..XXXXXXX 100644
14
--- a/hw/core/cpu-common.c
15
+++ b/hw/core/cpu-common.c
16
@@ -XXX,XX +XXX,XX @@ static void cpu_common_initfn(Object *obj)
17
/* the default value is changed by qemu_init_vcpu() for softmmu */
18
cpu->nr_cores = 1;
19
cpu->nr_threads = 1;
20
+ cpu->cflags_next_tb = -1;
21
22
qemu_mutex_init(&cpu->work_mutex);
23
QSIMPLEQ_INIT(&cpu->work_list);
24
--
25
2.34.1
26
27
1
From: "Emilio G. Cota" <cota@braap.org>
1
Use the official extend/extract functions instead of routines
2
that will shortly be internal to tcg.
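
For reference, a minimal sketch of the official helpers used below
(assuming a translation context in which a 64-bit temp val64 exists):

    TCGv_i32 lo = tcg_temp_new_i32();
    TCGv_i32 hi = tcg_temp_new_i32();
    tcg_gen_extrl_i64_i32(lo, val64);   /* bits [31:0]  */
    tcg_gen_extrh_i64_i32(hi, val64);   /* bits [63:32] */
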
2
3
3
Currently we rely on atomic operations for cross-CPU invalidations.
4
Cc: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
4
There are two cases that these atomics miss: cross-CPU invalidations
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
can race with either (1) vCPU threads flushing their TLB, which
6
happens via memset, or (2) vCPUs calling tlb_reset_dirty on their TLB,
7
which updates .addr_write with a regular store. This results in
8
undefined behaviour, since we're mixing regular and atomic ops
9
on concurrent accesses.
10
11
Fix it by using tlb_lock, a per-vCPU lock. All updaters of tlb_table
12
and the corresponding victim cache now hold the lock.
13
The readers that do not hold tlb_lock must use atomic reads when
14
reading .addr_write, since this field can be updated by other threads;
15
the conversion to atomic reads is done in the next patch.
16
17
Note that an alternative fix would be to expand the use of atomic ops.
18
However, in the case of TLB flushes this would have a huge performance
19
impact, since (1) TLB flushes can happen very frequently and (2) we
20
currently use a full memory barrier to flush each TLB entry, and a TLB
21
has many entries. Instead, acquiring the lock is barely slower than a
22
full memory barrier since it is uncontended, and with a single lock
23
acquisition we can flush the entire TLB.
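
Reduced to its essence, the pattern is that every writer of tlb_table
and tlb_v_table takes the per-vCPU lock; a sketch distilled from the
cputlb.c hunks below (not a complete function):

    static void example_tlb_flush_all_locked(CPUArchState *env)
    {
        qemu_spin_lock(&env->tlb_lock);
        memset(env->tlb_table, -1, sizeof(env->tlb_table));
        memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
        qemu_spin_unlock(&env->tlb_lock);
    }
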
24
25
Tested-by: Alex Bennée <alex.bennee@linaro.org>
26
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
27
Signed-off-by: Emilio G. Cota <cota@braap.org>
28
Message-Id: <20181009174557.16125-6-cota@braap.org>
29
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
30
---
7
---
31
include/exec/cpu-defs.h | 3 +
8
target/sparc/translate.c | 21 ++++-----------------
32
accel/tcg/cputlb.c | 155 ++++++++++++++++++++++------------------
9
1 file changed, 4 insertions(+), 17 deletions(-)
33
2 files changed, 87 insertions(+), 71 deletions(-)
34
10
35
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
11
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
36
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
37
--- a/include/exec/cpu-defs.h
13
--- a/target/sparc/translate.c
38
+++ b/include/exec/cpu-defs.h
14
+++ b/target/sparc/translate.c
39
@@ -XXX,XX +XXX,XX @@
15
@@ -XXX,XX +XXX,XX @@ static inline void gen_update_fprs_dirty(DisasContext *dc, int rd)
40
#endif
16
/* floating point registers moves */
41
17
static TCGv_i32 gen_load_fpr_F(DisasContext *dc, unsigned int src)
42
#include "qemu/host-utils.h"
43
+#include "qemu/thread.h"
44
#include "qemu/queue.h"
45
#ifdef CONFIG_TCG
46
#include "tcg-target.h"
47
@@ -XXX,XX +XXX,XX @@ typedef struct CPUIOTLBEntry {
48
49
#define CPU_COMMON_TLB \
50
/* The meaning of the MMU modes is defined in the target code. */ \
51
+ /* tlb_lock serializes updates to tlb_table and tlb_v_table */ \
52
+ QemuSpin tlb_lock; \
53
CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE]; \
54
CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE]; \
55
CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE]; \
56
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
57
index XXXXXXX..XXXXXXX 100644
58
--- a/accel/tcg/cputlb.c
59
+++ b/accel/tcg/cputlb.c
60
@@ -XXX,XX +XXX,XX @@ QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
61
62
void tlb_init(CPUState *cpu)
63
{
18
{
64
+ CPUArchState *env = cpu->env_ptr;
19
-#if TCG_TARGET_REG_BITS == 32
65
+
20
- if (src & 1) {
66
+ qemu_spin_init(&env->tlb_lock);
21
- return TCGV_LOW(cpu_fpr[src / 2]);
67
}
22
- } else {
68
23
- return TCGV_HIGH(cpu_fpr[src / 2]);
69
/* flush_all_helper: run fn across all cpus
24
- }
70
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_nocheck(CPUState *cpu)
25
-#else
71
atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
26
TCGv_i32 ret = get_temp_i32(dc);
72
tlb_debug("(count: %zu)\n", tlb_flush_count());
27
if (src & 1) {
73
28
tcg_gen_extrl_i64_i32(ret, cpu_fpr[src / 2]);
74
+ /*
29
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 gen_load_fpr_F(DisasContext *dc, unsigned int src)
75
+ * tlb_table/tlb_v_table updates from any thread must hold tlb_lock.
30
tcg_gen_extrh_i64_i32(ret, cpu_fpr[src / 2]);
76
+ * However, updates from the owner thread (as is the case here; see the
77
+ * above assert_cpu_is_self) do not need atomic_set because all reads
78
+ * that do not hold the lock are performed by the same owner thread.
79
+ */
80
+ qemu_spin_lock(&env->tlb_lock);
81
memset(env->tlb_table, -1, sizeof(env->tlb_table));
82
memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
83
+ qemu_spin_unlock(&env->tlb_lock);
84
+
85
cpu_tb_jmp_cache_clear(cpu);
86
87
env->vtlb_index = 0;
88
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
89
90
tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
91
92
+ qemu_spin_lock(&env->tlb_lock);
93
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
94
95
if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
96
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
97
memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
98
}
99
}
31
}
100
+ qemu_spin_unlock(&env->tlb_lock);
32
return ret;
101
102
cpu_tb_jmp_cache_clear(cpu);
103
104
@@ -XXX,XX +XXX,XX @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
105
tlb_hit_page(tlb_entry->addr_code, page);
106
}
107
108
-static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong page)
109
+/* Called with tlb_lock held */
110
+static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
111
+ target_ulong page)
112
{
113
if (tlb_hit_page_anyprot(tlb_entry, page)) {
114
memset(tlb_entry, -1, sizeof(*tlb_entry));
115
}
116
}
117
118
-static inline void tlb_flush_vtlb_page(CPUArchState *env, int mmu_idx,
119
- target_ulong page)
120
+/* Called with tlb_lock held */
121
+static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
122
+ target_ulong page)
123
{
124
int k;
125
+
126
+ assert_cpu_is_self(ENV_GET_CPU(env));
127
for (k = 0; k < CPU_VTLB_SIZE; k++) {
128
- tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], page);
129
+ tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
130
}
131
}
132
133
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
134
135
addr &= TARGET_PAGE_MASK;
136
i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
137
+ qemu_spin_lock(&env->tlb_lock);
138
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
139
- tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
140
- tlb_flush_vtlb_page(env, mmu_idx, addr);
141
+ tlb_flush_entry_locked(&env->tlb_table[mmu_idx][i], addr);
142
+ tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
143
}
144
+ qemu_spin_unlock(&env->tlb_lock);
145
146
tb_flush_jmp_cache(cpu, addr);
147
}
148
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
149
tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
150
page, addr, mmu_idx_bitmap);
151
152
+ qemu_spin_lock(&env->tlb_lock);
153
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
154
if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
155
- tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
156
- tlb_flush_vtlb_page(env, mmu_idx, addr);
157
+ tlb_flush_entry_locked(&env->tlb_table[mmu_idx][page], addr);
158
+ tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
159
}
160
}
161
+ qemu_spin_unlock(&env->tlb_lock);
162
163
tb_flush_jmp_cache(cpu, addr);
164
}
165
@@ -XXX,XX +XXX,XX @@ void tlb_unprotect_code(ram_addr_t ram_addr)
166
* most usual is detecting writes to code regions which may invalidate
167
* generated code.
168
*
169
- * Because we want other vCPUs to respond to changes straight away we
170
- * update the te->addr_write field atomically. If the TLB entry has
171
- * been changed by the vCPU in the mean time we skip the update.
172
+ * Other vCPUs might be reading their TLBs during guest execution, so we update
173
+ * te->addr_write with atomic_set. We don't need to worry about this for
174
+ * oversized guests as MTTCG is disabled for them.
175
*
176
- * As this function uses atomic accesses we also need to ensure
177
- * updates to tlb_entries follow the same access rules. We don't need
178
- * to worry about this for oversized guests as MTTCG is disabled for
179
- * them.
180
+ * Called with tlb_lock held.
181
*/
182
-
183
-static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
184
- uintptr_t length)
185
+static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
186
+ uintptr_t start, uintptr_t length)
187
{
188
-#if TCG_OVERSIZED_GUEST
189
uintptr_t addr = tlb_entry->addr_write;
190
191
if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
192
addr &= TARGET_PAGE_MASK;
193
addr += tlb_entry->addend;
194
if ((addr - start) < length) {
195
+#if TCG_OVERSIZED_GUEST
196
tlb_entry->addr_write |= TLB_NOTDIRTY;
197
- }
198
- }
199
#else
200
- /* paired with atomic_mb_set in tlb_set_page_with_attrs */
201
- uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
202
- uintptr_t addr = orig_addr;
203
-
204
- if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
205
- addr &= TARGET_PAGE_MASK;
206
- addr += atomic_read(&tlb_entry->addend);
207
- if ((addr - start) < length) {
208
- uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
209
- atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
210
+ atomic_set(&tlb_entry->addr_write,
211
+ tlb_entry->addr_write | TLB_NOTDIRTY);
212
+#endif
213
}
214
}
215
-#endif
33
-#endif
216
}
34
}
217
35
218
-/* For atomic correctness when running MTTCG we need to use the right
36
static void gen_store_fpr_F(DisasContext *dc, unsigned int dst, TCGv_i32 v)
219
- * primitives when copying entries */
220
-static inline void copy_tlb_helper(CPUTLBEntry *d, CPUTLBEntry *s,
221
- bool atomic_set)
222
+/*
223
+ * Called with tlb_lock held.
224
+ * Called only from the vCPU context, i.e. the TLB's owner thread.
225
+ */
226
+static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
227
{
37
{
228
-#if TCG_OVERSIZED_GUEST
38
-#if TCG_TARGET_REG_BITS == 32
229
*d = *s;
39
- if (dst & 1) {
40
- tcg_gen_mov_i32(TCGV_LOW(cpu_fpr[dst / 2]), v);
41
- } else {
42
- tcg_gen_mov_i32(TCGV_HIGH(cpu_fpr[dst / 2]), v);
43
- }
230
-#else
44
-#else
231
- if (atomic_set) {
45
- TCGv_i64 t = (TCGv_i64)v;
232
- d->addr_read = s->addr_read;
46
+ TCGv_i64 t = tcg_temp_new_i64();
233
- d->addr_code = s->addr_code;
47
+
234
- atomic_set(&d->addend, atomic_read(&s->addend));
48
+ tcg_gen_extu_i32_i64(t, v);
235
- /* Pairs with flag setting in tlb_reset_dirty_range */
49
tcg_gen_deposit_i64(cpu_fpr[dst / 2], cpu_fpr[dst / 2], t,
236
- atomic_mb_set(&d->addr_write, atomic_read(&s->addr_write));
50
(dst & 1 ? 0 : 32), 32);
237
- } else {
238
- d->addr_read = s->addr_read;
239
- d->addr_write = atomic_read(&s->addr_write);
240
- d->addr_code = s->addr_code;
241
- d->addend = atomic_read(&s->addend);
242
- }
243
-#endif
51
-#endif
52
+ tcg_temp_free_i64(t);
53
gen_update_fprs_dirty(dc, dst);
244
}
54
}
245
55
246
/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
247
- * the target vCPU). As such care needs to be taken that we don't
248
- * dangerously race with another vCPU update. The only thing actually
249
- * updated is the target TLB entry ->addr_write flags.
250
+ * the target vCPU).
251
+ * We must take tlb_lock to avoid racing with another vCPU update. The only
252
+ * thing actually updated is the target TLB entry ->addr_write flags.
253
*/
254
void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
255
{
256
@@ -XXX,XX +XXX,XX @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
257
int mmu_idx;
258
259
env = cpu->env_ptr;
260
+ qemu_spin_lock(&env->tlb_lock);
261
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
262
unsigned int i;
263
264
for (i = 0; i < CPU_TLB_SIZE; i++) {
265
- tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
266
- start1, length);
267
+ tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
268
+ length);
269
}
270
271
for (i = 0; i < CPU_VTLB_SIZE; i++) {
272
- tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
273
- start1, length);
274
+ tlb_reset_dirty_range_locked(&env->tlb_v_table[mmu_idx][i], start1,
275
+ length);
276
}
277
}
278
+ qemu_spin_unlock(&env->tlb_lock);
279
}
280
281
-static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr)
282
+/* Called with tlb_lock held */
283
+static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
284
+ target_ulong vaddr)
285
{
286
if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY)) {
287
tlb_entry->addr_write = vaddr;
288
@@ -XXX,XX +XXX,XX @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
289
290
vaddr &= TARGET_PAGE_MASK;
291
i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
292
+ qemu_spin_lock(&env->tlb_lock);
293
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
294
- tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
295
+ tlb_set_dirty1_locked(&env->tlb_table[mmu_idx][i], vaddr);
296
}
297
298
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
299
int k;
300
for (k = 0; k < CPU_VTLB_SIZE; k++) {
301
- tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
302
+ tlb_set_dirty1_locked(&env->tlb_v_table[mmu_idx][k], vaddr);
303
}
304
}
305
+ qemu_spin_unlock(&env->tlb_lock);
306
}
307
308
/* Our TLB does not support large pages, so remember the area covered by
309
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
310
addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
311
}
312
313
- /* Make sure there's no cached translation for the new page. */
314
- tlb_flush_vtlb_page(env, mmu_idx, vaddr_page);
315
-
316
code_address = address;
317
iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
318
paddr_page, xlat, prot, &address);
319
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
320
index = (vaddr_page >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
321
te = &env->tlb_table[mmu_idx][index];
322
323
+ /*
324
+ * Hold the TLB lock for the rest of the function. We could acquire/release
325
+ * the lock several times in the function, but it is faster to amortize the
326
+ * acquisition cost by acquiring it just once. Note that this leads to
327
+ * a longer critical section, but this is not a concern since the TLB lock
328
+ * is unlikely to be contended.
329
+ */
330
+ qemu_spin_lock(&env->tlb_lock);
331
+
332
+ /* Make sure there's no cached translation for the new page. */
333
+ tlb_flush_vtlb_page_locked(env, mmu_idx, vaddr_page);
334
+
335
/*
336
* Only evict the old entry to the victim tlb if it's for a
337
* different page; otherwise just overwrite the stale data.
338
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
339
CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];
340
341
/* Evict the old entry into the victim tlb. */
342
- copy_tlb_helper(tv, te, true);
343
+ copy_tlb_helper_locked(tv, te);
344
env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
345
}
346
347
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
348
}
349
}
350
351
- /* Pairs with flag setting in tlb_reset_dirty_range */
352
- copy_tlb_helper(te, &tn, true);
353
- /* atomic_mb_set(&te->addr_write, write_address); */
354
+ copy_tlb_helper_locked(te, &tn);
355
+ qemu_spin_unlock(&env->tlb_lock);
356
}
357
358
/* Add a new TLB entry, but without specifying the memory
359
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
360
size_t elt_ofs, target_ulong page)
361
{
362
size_t vidx;
363
+
364
+ assert_cpu_is_self(ENV_GET_CPU(env));
365
for (vidx = 0; vidx < CPU_VTLB_SIZE; ++vidx) {
366
CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
367
target_ulong cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
368
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
369
/* Found entry in victim tlb, swap tlb and iotlb. */
370
CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
371
372
- copy_tlb_helper(&tmptlb, tlb, false);
373
- copy_tlb_helper(tlb, vtlb, true);
374
- copy_tlb_helper(vtlb, &tmptlb, true);
375
+ qemu_spin_lock(&env->tlb_lock);
376
+ copy_tlb_helper_locked(&tmptlb, tlb);
377
+ copy_tlb_helper_locked(tlb, vtlb);
378
+ copy_tlb_helper_locked(vtlb, &tmptlb);
379
+ qemu_spin_unlock(&env->tlb_lock);
380
381
CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
382
CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
383
--
56
--
384
2.17.2
57
2.34.1
385
58
386
59
1
Rather than test NOCHAIN before linking, do not emit the
1
Move the error-generating fallback from tcg-op.c, and
2
goto_tb opcode at all. We already do this for goto_ptr.
2
replace "_link_error" with modern QEMU_ERROR markup.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
accel/tcg/cpu-exec.c | 2 +-
7
include/tcg/tcg-op.h | 33 +++++----------------------------
7
tcg/tcg-op.c | 9 ++++++++-
8
include/tcg/tcg.h | 12 ------------
8
2 files changed, 9 insertions(+), 2 deletions(-)
9
tcg/tcg-internal.h | 14 ++++++++++++++
10
tcg/tcg-op-vec.c | 2 ++
11
tcg/tcg-op.c | 37 ++++++++++++++++++++++++++++---------
12
5 files changed, 49 insertions(+), 49 deletions(-)
9
13
10
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
14
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
11
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/cpu-exec.c
16
--- a/include/tcg/tcg-op.h
13
+++ b/accel/tcg/cpu-exec.c
17
+++ b/include/tcg/tcg-op.h
14
@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_find(CPUState *cpu,
18
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
15
}
19
tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
16
#endif
20
}
17
/* See if we can patch the calling TB. */
21
#else /* TCG_TARGET_REG_BITS == 32 */
18
- if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
22
-static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
19
+ if (last_tb) {
23
- tcg_target_long offset)
20
tb_add_jump(last_tb, tb_exit, tb);
24
-{
21
}
25
- tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
22
return tb;
26
-}
27
+void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
28
+void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
29
+void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
30
31
-static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
32
- tcg_target_long offset)
33
-{
34
- tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
35
-}
36
-
37
-static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
38
- tcg_target_long offset)
39
-{
40
- tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
41
-}
42
-
43
-static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
44
-{
45
- tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
46
- TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
47
-}
48
-
49
-static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
50
-{
51
- tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
52
- TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
53
-}
54
+void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
55
+void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
56
57
void tcg_gen_discard_i64(TCGv_i64 arg);
58
void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
59
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
60
index XXXXXXX..XXXXXXX 100644
61
--- a/include/tcg/tcg.h
62
+++ b/include/tcg/tcg.h
63
@@ -XXX,XX +XXX,XX @@ static inline TCGv_vec temp_tcgv_vec(TCGTemp *t)
64
return (TCGv_vec)temp_tcgv_i32(t);
65
}
66
67
-#if TCG_TARGET_REG_BITS == 32
68
-static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
69
-{
70
- return temp_tcgv_i32(tcgv_i64_temp(t));
71
-}
72
-
73
-static inline TCGv_i32 TCGV_HIGH(TCGv_i64 t)
74
-{
75
- return temp_tcgv_i32(tcgv_i64_temp(t) + 1);
76
-}
77
-#endif
78
-
79
static inline TCGArg tcg_get_insn_param(TCGOp *op, int arg)
80
{
81
return op->args[arg];
82
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
83
index XXXXXXX..XXXXXXX 100644
84
--- a/tcg/tcg-internal.h
85
+++ b/tcg/tcg-internal.h
86
@@ -XXX,XX +XXX,XX @@ static inline unsigned tcg_call_flags(TCGOp *op)
87
return tcg_call_info(op)->flags;
88
}
89
90
+#if TCG_TARGET_REG_BITS == 32
91
+static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
92
+{
93
+ return temp_tcgv_i32(tcgv_i64_temp(t));
94
+}
95
+static inline TCGv_i32 TCGV_HIGH(TCGv_i64 t)
96
+{
97
+ return temp_tcgv_i32(tcgv_i64_temp(t) + 1);
98
+}
99
+#else
100
+extern TCGv_i32 TCGV_LOW(TCGv_i64) QEMU_ERROR("32-bit code path is reachable");
101
+extern TCGv_i32 TCGV_HIGH(TCGv_i64) QEMU_ERROR("32-bit code path is reachable");
102
+#endif
103
+
104
#endif /* TCG_INTERNAL_H */
105
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
106
index XXXXXXX..XXXXXXX 100644
107
--- a/tcg/tcg-op-vec.c
108
+++ b/tcg/tcg-op-vec.c
109
@@ -XXX,XX +XXX,XX @@
110
#include "tcg/tcg.h"
111
#include "tcg/tcg-op.h"
112
#include "tcg/tcg-mo.h"
113
+#include "tcg-internal.h"
114
+
115
116
/* Reduce the number of ifdefs below. This assumes that all uses of
117
TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
23
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
118
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
24
index XXXXXXX..XXXXXXX 100644
119
index XXXXXXX..XXXXXXX 100644
25
--- a/tcg/tcg-op.c
120
--- a/tcg/tcg-op.c
26
+++ b/tcg/tcg-op.c
121
+++ b/tcg/tcg-op.c
27
@@ -XXX,XX +XXX,XX @@ void tcg_gen_exit_tb(TranslationBlock *tb, unsigned idx)
122
@@ -XXX,XX +XXX,XX @@
28
seen this numbered exit before, via tcg_gen_goto_tb. */
123
#include "tcg/tcg-op.h"
29
tcg_debug_assert(tcg_ctx->goto_tb_issue_mask & (1 << idx));
124
#include "tcg/tcg-mo.h"
125
#include "exec/plugin-gen.h"
126
+#include "tcg-internal.h"
127
128
-/* Reduce the number of ifdefs below. This assumes that all uses of
129
- TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
130
- the compiler can eliminate. */
131
-#if TCG_TARGET_REG_BITS == 64
132
-extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
133
-extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
134
-#define TCGV_LOW TCGV_LOW_link_error
135
-#define TCGV_HIGH TCGV_HIGH_link_error
136
-#endif
137
138
void tcg_gen_op1(TCGOpcode opc, TCGArg a1)
139
{
140
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
30
#endif
141
#endif
31
+ /* When not chaining, exit without indicating a link. */
142
}
32
+ if (qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
143
33
+ val = 0;
144
+void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
34
+ }
145
+{
35
} else {
146
+ tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
36
/* This is an exit via the exitreq label. */
147
+}
37
tcg_debug_assert(idx == TB_EXIT_REQUESTED);
148
+
38
@@ -XXX,XX +XXX,XX @@ void tcg_gen_goto_tb(unsigned idx)
149
+void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
39
tcg_debug_assert((tcg_ctx->goto_tb_issue_mask & (1 << idx)) == 0);
150
+{
40
tcg_ctx->goto_tb_issue_mask |= 1 << idx;
151
+ tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
152
+}
153
+
154
+void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
155
+{
156
+ tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
157
+}
158
+
159
void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
160
{
161
#if HOST_BIG_ENDIAN
162
@@ -XXX,XX +XXX,XX @@ void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
41
#endif
163
#endif
42
- tcg_gen_op1i(INDEX_op_goto_tb, idx);
43
+ /* When not chaining, we simply fall through to the "fallback" exit. */
44
+ if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
45
+ tcg_gen_op1i(INDEX_op_goto_tb, idx);
46
+ }
47
}
164
}
48
165
49
void tcg_gen_lookup_and_goto_ptr(void)
166
+void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
167
+{
168
+ tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
169
+ TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
170
+}
171
+
172
+void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
173
+{
174
+ tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
175
+ TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
176
+}
177
+
178
void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
179
{
180
tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
50
--
181
--
51
2.17.2
182
2.34.1
52
183
53
184
New patch
1
Record the location of a TCGTemp within a larger object.
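As a side note, here is a minimal sketch (hypothetical code, not QEMU's;
the structure is reduced to the one field relevant here) of how a recorded
sub-index lets either half of an adjacently-allocated pair find the start
of the larger object:

    #include <assert.h>

    /* Stand-in for TCGTemp, keeping only the new field. */
    typedef struct Temp {
        unsigned int temp_subindex : 1;  /* 0 = first half, 1 = second half */
    } Temp;

    /* Pairs are allocated adjacently, so subtracting the sub-index
     * recovers the first temp of the pair from either half. */
    static Temp *temp_pair_base(Temp *ts)
    {
        return ts - ts->temp_subindex;
    }

    int main(void)
    {
        Temp pair[2] = { { .temp_subindex = 0 }, { .temp_subindex = 1 } };
        assert(temp_pair_base(&pair[1]) == &pair[0]);
        return 0;
    }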
1
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
include/tcg/tcg.h | 1 +
7
tcg/tcg.c | 3 +++
8
2 files changed, 4 insertions(+)
9
10
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
11
index XXXXXXX..XXXXXXX 100644
12
--- a/include/tcg/tcg.h
13
+++ b/include/tcg/tcg.h
14
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTemp {
15
unsigned int mem_coherent:1;
16
unsigned int mem_allocated:1;
17
unsigned int temp_allocated:1;
18
+ unsigned int temp_subindex:1;
19
20
int64_t val;
21
struct TCGTemp *mem_base;
22
diff --git a/tcg/tcg.c b/tcg/tcg.c
23
index XXXXXXX..XXXXXXX 100644
24
--- a/tcg/tcg.c
25
+++ b/tcg/tcg.c
26
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
27
ts2->mem_allocated = 1;
28
ts2->mem_base = base_ts;
29
ts2->mem_offset = offset + (1 - bigendian) * 4;
30
+ ts2->temp_subindex = 1;
31
pstrcpy(buf, sizeof(buf), name);
32
pstrcat(buf, sizeof(buf), "_1");
33
ts2->name = strdup(buf);
34
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
35
ts2->base_type = TCG_TYPE_I64;
36
ts2->type = TCG_TYPE_I32;
37
ts2->temp_allocated = 1;
38
+ ts2->temp_subindex = 1;
39
ts2->kind = kind;
40
} else {
41
ts->base_type = type;
42
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_constant_internal(TCGType type, int64_t val)
43
ts2->type = TCG_TYPE_I32;
44
ts2->kind = TEMP_CONST;
45
ts2->temp_allocated = 1;
46
+ ts2->temp_subindex = 1;
47
ts2->val = val >> 32;
48
} else {
49
ts->base_type = type;
50
--
51
2.34.1
52
53
1
From: "Emilio G. Cota" <cota@braap.org>
1
The first thing that temp_sync does is check mem_coherent,
2
so there's no need for the caller to do so.
2
3
3
We forgot to initialize n in commit 15fa08f845 ("tcg: Dynamically
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
allocate TCGOps", 2017-12-29).
5
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Signed-off-by: Emilio G. Cota <cota@braap.org>
8
Message-Id: <20181010144853.13005-3-cota@braap.org>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
6
---
11
tcg/tcg.c | 2 +-
7
tcg/tcg.c | 8 ++------
12
1 file changed, 1 insertion(+), 1 deletion(-)
8
1 file changed, 2 insertions(+), 6 deletions(-)
13
9
14
diff --git a/tcg/tcg.c b/tcg/tcg.c
10
diff --git a/tcg/tcg.c b/tcg/tcg.c
15
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
16
--- a/tcg/tcg.c
12
--- a/tcg/tcg.c
17
+++ b/tcg/tcg.c
13
+++ b/tcg/tcg.c
18
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
14
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
19
15
20
#ifdef CONFIG_PROFILER
16
/* If the two inputs form one 64-bit value, try dupm_vec. */
21
{
17
if (itsl + 1 == itsh && itsl->base_type == TCG_TYPE_I64) {
22
- int n;
18
- if (!itsl->mem_coherent) {
23
+ int n = 0;
19
- temp_sync(s, itsl, s->reserved_regs, 0, 0);
24
20
- }
25
QTAILQ_FOREACH(op, &s->ops, link) {
21
- if (!itsh->mem_coherent) {
26
n++;
22
- temp_sync(s, itsh, s->reserved_regs, 0, 0);
23
- }
24
+ temp_sync(s, itsl, s->reserved_regs, 0, 0);
25
+ temp_sync(s, itsh, s->reserved_regs, 0, 0);
26
#if HOST_BIG_ENDIAN
27
TCGTemp *its = itsh;
28
#else
27
--
29
--
28
2.17.2
30
2.34.1
29
31
30
32
1
From: "Emilio G. Cota" <cota@braap.org>
1
Allocate the first of a pair at the lower address, and the
2
second of a pair at the higher address. This will make it
3
easier to find the beginning of the larger memory block.
2
4
3
Updates can come from other threads, so readers that do not
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
take tlb_lock must use atomic_read to avoid undefined
5
behaviour (UB).
6
7
This completes the conversion to tlb_lock. This conversion results
8
on average in no performance loss, as the following experiments
9
(run on an Intel i7-6700K CPU @ 4.00GHz) show.
10
11
1. aarch64 bootup+shutdown test:
12
13
- Before:
14
Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs):
15
16
7487.087786 task-clock (msec) # 0.998 CPUs utilized ( +- 0.12% )
17
31,574,905,303 cycles # 4.217 GHz ( +- 0.12% )
18
57,097,908,812 instructions # 1.81 insns per cycle ( +- 0.08% )
19
10,255,415,367 branches # 1369.747 M/sec ( +- 0.08% )
20
173,278,962 branch-misses # 1.69% of all branches ( +- 0.18% )
21
22
7.504481349 seconds time elapsed ( +- 0.14% )
23
24
- After:
25
Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs):
26
27
7462.441328 task-clock (msec) # 0.998 CPUs utilized ( +- 0.07% )
28
31,478,476,520 cycles # 4.218 GHz ( +- 0.07% )
29
57,017,330,084 instructions # 1.81 insns per cycle ( +- 0.05% )
30
10,251,929,667 branches # 1373.804 M/sec ( +- 0.05% )
31
173,023,787 branch-misses # 1.69% of all branches ( +- 0.11% )
32
33
7.474970463 seconds time elapsed ( +- 0.07% )
34
35
2. SPEC06int:
36
SPEC06int (test set)
37
[Y axis: Speedup over master]
38
1.15 +-+----+------+------+------+------+------+-------+------+------+------+------+------+------+----+-+
39
| |
40
1.1 +-+.................................+++.............................+ tlb-lock-v2 (m+++x) +-+
41
| +++ | +++ tlb-lock-v3 (spinl|ck) |
42
| +++ | | +++ +++ | | |
43
1.05 +-+....+++...........####.........|####.+++.|......|.....###....+++...........+++....###.........+-+
44
| ### ++#| # |# |# ***### +++### +++#+# | +++ | #|# ### |
45
1 +-+++***+#++++####+++#++#++++++++++#++#+*+*++#++++#+#+****+#++++###++++###++++###++++#+#++++#+#+++-+
46
| *+* # #++# *** # #### *** # * *++# ****+# *| * # ****|# |# # #|# #+# # # |
47
0.95 +-+..*.*.#....#..#.*|*..#...#..#.*|*..#.*.*..#.*|.*.#.*++*.#.*++*+#.****.#....#+#....#.#..++#.#..+-+
48
| * * # # # *|* # # # *|* # * * # *++* # * * # * * # * |* # ++# # # # *** # |
49
| * * # ++# # *+* # # # *|* # * * # * * # * * # * * # *++* # **** # ++# # * * # |
50
0.9 +-+..*.*.#...|#..#.*.*..#.++#..#.*|*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*.|*.#...|#.#..*.*.#..+-+
51
| * * # *** # * * # |# # *+* # * * # * * # * * # * * # * * # *++* # |# # * * # |
52
0.85 +-+..*.*.#..*|*..#.*.*..#.***..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.****.#..*.*.#..+-+
53
| * * # *+* # * * # *|* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # |
54
| * * # * * # * * # *+* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # |
55
0.8 +-+..*.*.#..*.*..#.*.*..#.*.*..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.*++*.#..*.*.#..+-+
56
| * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # |
57
0.75 +-+--***##--***###-***###-***###-***###-***###-****##-****##-****##-****##-****##-****##--***##--+-+
58
400.perlben401.bzip2403.gcc429.m445.gob456.hmme45462.libqua464.h26471.omnet473483.xalancbmkgeomean
59
60
png: https://imgur.com/a/BHzpPTW
61
62
Notes:
63
- tlb-lock-v2 corresponds to an implementation with a mutex.
64
- tlb-lock-v3 corresponds to the current implementation, i.e.
65
a spinlock and a single lock acquisition in tlb_set_page_with_attrs.
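To summarize the access discipline described above in runnable form, here
is a self-contained model (C11 atomics, simplified names; none of this is
QEMU code) of the rule the series converges on: entries are modified under
the per-TLB lock, and addr_write, the one field a remote vCPU may set
concurrently, is always stored and loaded atomically so lock-free readers
never see a torn value.

    #include <stdatomic.h>
    #include <stdint.h>

    #define TLB_NOTDIRTY 1u

    /* Stand-in for CPUTLBEntry, keeping only addr_write. */
    typedef struct Entry {
        _Atomic uintptr_t addr_write;
    } Entry;

    /* Cross-vCPU path: caller holds the per-TLB spinlock; the store is
     * still atomic because the owner vCPU reads without the lock. */
    static void entry_set_notdirty(Entry *e)
    {
        uintptr_t old = atomic_load_explicit(&e->addr_write,
                                             memory_order_relaxed);
        atomic_store_explicit(&e->addr_write, old | TLB_NOTDIRTY,
                              memory_order_relaxed);
    }

    /* Owner-vCPU fast path: lock-free atomic read of addr_write. */
    static uintptr_t entry_addr_write(Entry *e)
    {
        return atomic_load_explicit(&e->addr_write, memory_order_relaxed);
    }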
66
67
Signed-off-by: Emilio G. Cota <cota@braap.org>
68
Message-Id: <20181016153840.25877-1-cota@braap.org>
69
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
70
---
7
---
71
accel/tcg/softmmu_template.h | 12 ++++++------
8
tcg/tcg-internal.h | 4 ++--
72
include/exec/cpu_ldst.h | 11 ++++++++++-
9
tcg/tcg.c | 58 ++++++++++++++++++++++------------------------
73
include/exec/cpu_ldst_template.h | 2 +-
10
2 files changed, 30 insertions(+), 32 deletions(-)
74
accel/tcg/cputlb.c | 19 +++++++++++++------
75
4 files changed, 30 insertions(+), 14 deletions(-)
76
11
77
diff --git a/accel/tcg/softmmu_template.h b/accel/tcg/softmmu_template.h
12
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
78
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
79
--- a/accel/tcg/softmmu_template.h
14
--- a/tcg/tcg-internal.h
80
+++ b/accel/tcg/softmmu_template.h
15
+++ b/tcg/tcg-internal.h
81
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
16
@@ -XXX,XX +XXX,XX @@ static inline unsigned tcg_call_flags(TCGOp *op)
82
uintptr_t mmu_idx = get_mmuidx(oi);
17
#if TCG_TARGET_REG_BITS == 32
83
uintptr_t index = tlb_index(env, mmu_idx, addr);
18
static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
84
CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
19
{
85
- target_ulong tlb_addr = entry->addr_write;
20
- return temp_tcgv_i32(tcgv_i64_temp(t));
86
+ target_ulong tlb_addr = tlb_addr_write(entry);
21
+ return temp_tcgv_i32(tcgv_i64_temp(t) + HOST_BIG_ENDIAN);
87
unsigned a_bits = get_alignment_bits(get_memop(oi));
22
}
88
uintptr_t haddr;
23
static inline TCGv_i32 TCGV_HIGH(TCGv_i64 t)
89
24
{
90
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
25
- return temp_tcgv_i32(tcgv_i64_temp(t) + 1);
91
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
26
+ return temp_tcgv_i32(tcgv_i64_temp(t) + !HOST_BIG_ENDIAN);
92
mmu_idx, retaddr);
27
}
28
#else
29
extern TCGv_i32 TCGV_LOW(TCGv_i64) QEMU_ERROR("32-bit code path is reachable");
30
diff --git a/tcg/tcg.c b/tcg/tcg.c
31
index XXXXXXX..XXXXXXX 100644
32
--- a/tcg/tcg.c
33
+++ b/tcg/tcg.c
34
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
35
TCGContext *s = tcg_ctx;
36
TCGTemp *base_ts = tcgv_ptr_temp(base);
37
TCGTemp *ts = tcg_global_alloc(s);
38
- int indirect_reg = 0, bigendian = 0;
39
-#if HOST_BIG_ENDIAN
40
- bigendian = 1;
41
-#endif
42
+ int indirect_reg = 0;
43
44
switch (base_ts->kind) {
45
case TEMP_FIXED:
46
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
47
ts->indirect_reg = indirect_reg;
48
ts->mem_allocated = 1;
49
ts->mem_base = base_ts;
50
- ts->mem_offset = offset + bigendian * 4;
51
+ ts->mem_offset = offset;
52
pstrcpy(buf, sizeof(buf), name);
53
pstrcat(buf, sizeof(buf), "_0");
54
ts->name = strdup(buf);
55
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
56
ts2->indirect_reg = indirect_reg;
57
ts2->mem_allocated = 1;
58
ts2->mem_base = base_ts;
59
- ts2->mem_offset = offset + (1 - bigendian) * 4;
60
+ ts2->mem_offset = offset + 4;
61
ts2->temp_subindex = 1;
62
pstrcpy(buf, sizeof(buf), name);
63
pstrcat(buf, sizeof(buf), "_1");
64
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_constant_internal(TCGType type, int64_t val)
65
66
ts = g_hash_table_lookup(h, &val);
67
if (ts == NULL) {
68
+ int64_t *val_ptr;
69
+
70
ts = tcg_temp_alloc(s);
71
72
if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
73
TCGTemp *ts2 = tcg_temp_alloc(s);
74
75
+ tcg_debug_assert(ts2 == ts + 1);
76
+
77
ts->base_type = TCG_TYPE_I64;
78
ts->type = TCG_TYPE_I32;
79
ts->kind = TEMP_CONST;
80
ts->temp_allocated = 1;
81
- /*
82
- * Retain the full value of the 64-bit constant in the low
83
- * part, so that the hash table works. Actual uses will
84
- * truncate the value to the low part.
85
- */
86
- ts->val = val;
87
88
- tcg_debug_assert(ts2 == ts + 1);
89
ts2->base_type = TCG_TYPE_I64;
90
ts2->type = TCG_TYPE_I32;
91
ts2->kind = TEMP_CONST;
92
ts2->temp_allocated = 1;
93
ts2->temp_subindex = 1;
94
- ts2->val = val >> 32;
95
+
96
+ /*
97
+ * Retain the full value of the 64-bit constant in the low
98
+ * part, so that the hash table works. Actual uses will
99
+ * truncate the value to the low part.
100
+ */
101
+ ts[HOST_BIG_ENDIAN].val = val;
102
+ ts[!HOST_BIG_ENDIAN].val = val >> 32;
103
+ val_ptr = &ts[HOST_BIG_ENDIAN].val;
104
} else {
105
ts->base_type = type;
106
ts->type = type;
107
ts->kind = TEMP_CONST;
108
ts->temp_allocated = 1;
109
ts->val = val;
110
+ val_ptr = &ts->val;
93
}
111
}
94
- tlb_addr = entry->addr_write & ~TLB_INVALID_MASK;
112
- g_hash_table_insert(h, &ts->val, ts);
95
+ tlb_addr = tlb_addr_write(entry) & ~TLB_INVALID_MASK;
113
+ g_hash_table_insert(h, val_ptr, ts);
96
}
114
}
97
115
98
/* Handle an IO access. */
116
return ts;
99
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
117
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
100
cannot evict the first. */
118
pi = 0;
101
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
119
if (ret != NULL) {
102
entry2 = tlb_entry(env, mmu_idx, page2);
120
if (TCG_TARGET_REG_BITS < 64 && (typemask & 6) == dh_typecode_i64) {
103
- if (!tlb_hit_page(entry2->addr_write, page2)
121
-#if HOST_BIG_ENDIAN
104
+ if (!tlb_hit_page(tlb_addr_write(entry2), page2)
122
- op->args[pi++] = temp_arg(ret + 1);
105
&& !VICTIM_TLB_HIT(addr_write, page2)) {
123
- op->args[pi++] = temp_arg(ret);
106
tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
124
-#else
107
mmu_idx, retaddr);
125
op->args[pi++] = temp_arg(ret);
108
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
126
op->args[pi++] = temp_arg(ret + 1);
109
uintptr_t mmu_idx = get_mmuidx(oi);
127
-#endif
110
uintptr_t index = tlb_index(env, mmu_idx, addr);
128
nb_rets = 2;
111
CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
129
} else {
112
- target_ulong tlb_addr = entry->addr_write;
130
op->args[pi++] = temp_arg(ret);
113
+ target_ulong tlb_addr = tlb_addr_write(entry);
131
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
114
unsigned a_bits = get_alignment_bits(get_memop(oi));
115
uintptr_t haddr;
116
117
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
118
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
119
mmu_idx, retaddr);
120
}
132
}
121
- tlb_addr = entry->addr_write & ~TLB_INVALID_MASK;
133
122
+ tlb_addr = tlb_addr_write(entry) & ~TLB_INVALID_MASK;
134
if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
135
- op->args[pi++] = temp_arg(args[i] + HOST_BIG_ENDIAN);
136
- op->args[pi++] = temp_arg(args[i] + !HOST_BIG_ENDIAN);
137
+ op->args[pi++] = temp_arg(args[i]);
138
+ op->args[pi++] = temp_arg(args[i] + 1);
139
real_args += 2;
140
continue;
141
}
142
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
123
}
143
}
124
144
125
/* Handle an IO access. */
145
/* If the two inputs form one 64-bit value, try dupm_vec. */
126
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
146
- if (itsl + 1 == itsh && itsl->base_type == TCG_TYPE_I64) {
127
cannot evict the first. */
147
- temp_sync(s, itsl, s->reserved_regs, 0, 0);
128
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
148
- temp_sync(s, itsh, s->reserved_regs, 0, 0);
129
entry2 = tlb_entry(env, mmu_idx, page2);
149
-#if HOST_BIG_ENDIAN
130
- if (!tlb_hit_page(entry2->addr_write, page2)
150
- TCGTemp *its = itsh;
131
+ if (!tlb_hit_page(tlb_addr_write(entry2), page2)
151
-#else
132
&& !VICTIM_TLB_HIT(addr_write, page2)) {
152
- TCGTemp *its = itsl;
133
tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
153
-#endif
134
mmu_idx, retaddr);
154
+ if (itsl->temp_subindex == HOST_BIG_ENDIAN &&
135
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
155
+ itsh->temp_subindex == !HOST_BIG_ENDIAN &&
136
index XXXXXXX..XXXXXXX 100644
156
+ itsl == itsh + (HOST_BIG_ENDIAN ? 1 : -1)) {
137
--- a/include/exec/cpu_ldst.h
157
+ TCGTemp *its = itsl - HOST_BIG_ENDIAN;
138
+++ b/include/exec/cpu_ldst.h
139
@@ -XXX,XX +XXX,XX @@ extern __thread uintptr_t helper_retaddr;
140
/* The memory helpers for tcg-generated code need tcg_target_long etc. */
141
#include "tcg.h"
142
143
+static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
144
+{
145
+#if TCG_OVERSIZED_GUEST
146
+ return entry->addr_write;
147
+#else
148
+ return atomic_read(&entry->addr_write);
149
+#endif
150
+}
151
+
158
+
152
/* Find the TLB index corresponding to the mmu_idx + address pair. */
159
+ temp_sync(s, its + 0, s->reserved_regs, 0, 0);
153
static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
160
+ temp_sync(s, its + 1, s->reserved_regs, 0, 0);
154
target_ulong addr)
155
@@ -XXX,XX +XXX,XX @@ static inline void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
156
tlb_addr = tlbentry->addr_read;
157
break;
158
case 1:
159
- tlb_addr = tlbentry->addr_write;
160
+ tlb_addr = tlb_addr_write(tlbentry);
161
break;
162
case 2:
163
tlb_addr = tlbentry->addr_code;
164
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
165
index XXXXXXX..XXXXXXX 100644
166
--- a/include/exec/cpu_ldst_template.h
167
+++ b/include/exec/cpu_ldst_template.h
168
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
169
addr = ptr;
170
mmu_idx = CPU_MMU_INDEX;
171
entry = tlb_entry(env, mmu_idx, addr);
172
- if (unlikely(entry->addr_write !=
173
+ if (unlikely(tlb_addr_write(entry) !=
174
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
175
oi = make_memop_idx(SHIFT, mmu_idx);
176
glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
177
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
178
index XXXXXXX..XXXXXXX 100644
179
--- a/accel/tcg/cputlb.c
180
+++ b/accel/tcg/cputlb.c
181
@@ -XXX,XX +XXX,XX @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
182
target_ulong page)
183
{
184
return tlb_hit_page(tlb_entry->addr_read, page) ||
185
- tlb_hit_page(tlb_entry->addr_write, page) ||
186
+ tlb_hit_page(tlb_addr_write(tlb_entry), page) ||
187
tlb_hit_page(tlb_entry->addr_code, page);
188
}
189
190
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
191
tlb_fill(cpu, addr, size, MMU_DATA_STORE, mmu_idx, retaddr);
192
193
entry = tlb_entry(env, mmu_idx, addr);
194
- tlb_addr = entry->addr_write;
195
+ tlb_addr = tlb_addr_write(entry);
196
if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
197
/* RAM access */
198
uintptr_t haddr = addr + entry->addend;
199
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
200
assert_cpu_is_self(ENV_GET_CPU(env));
201
for (vidx = 0; vidx < CPU_VTLB_SIZE; ++vidx) {
202
CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
203
- target_ulong cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
204
+ target_ulong cmp;
205
+
161
+
206
+ /* elt_ofs might correspond to .addr_write, so use atomic_read */
162
if (tcg_out_dupm_vec(s, vtype, MO_64, ots->reg,
207
+#if TCG_OVERSIZED_GUEST
163
its->mem_base->reg, its->mem_offset)) {
208
+ cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
164
goto done;
209
+#else
210
+ cmp = atomic_read((target_ulong *)((uintptr_t)vtlb + elt_ofs));
211
+#endif
212
213
if (cmp == page) {
214
/* Found entry in victim tlb, swap tlb and iotlb. */
215
@@ -XXX,XX +XXX,XX @@ void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
216
uintptr_t index = tlb_index(env, mmu_idx, addr);
217
CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
218
219
- if (!tlb_hit(entry->addr_write, addr)) {
220
+ if (!tlb_hit(tlb_addr_write(entry), addr)) {
221
/* TLB entry is for a different page */
222
if (!VICTIM_TLB_HIT(addr_write, addr)) {
223
tlb_fill(ENV_GET_CPU(env), addr, size, MMU_DATA_STORE,
224
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
225
size_t mmu_idx = get_mmuidx(oi);
226
uintptr_t index = tlb_index(env, mmu_idx, addr);
227
CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
228
- target_ulong tlb_addr = tlbe->addr_write;
229
+ target_ulong tlb_addr = tlb_addr_write(tlbe);
230
TCGMemOp mop = get_memop(oi);
231
int a_bits = get_alignment_bits(mop);
232
int s_bits = mop & MO_SIZE;
233
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
234
tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_STORE,
235
mmu_idx, retaddr);
236
}
237
- tlb_addr = tlbe->addr_write & ~TLB_INVALID_MASK;
238
+ tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
239
}
240
241
/* Notice an IO access or a needs-MMU-lookup access */
242
--
165
--
243
2.17.2
166
2.34.1
244
167
245
168
New patch
1
The count is not itself an enumerator. Move it outside to
2
prevent the compiler from considering it with -Wswitch-enum.
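For readers unfamiliar with the idiom, a made-up example of the pattern
(not the QEMU enum): keeping the count as a plain #define, rather than a
trailing enumerator, means -Wswitch-enum no longer expects a case label
for it in every switch over the type.

    typedef enum Color {
        COLOR_RED,
        COLOR_GREEN,
        COLOR_BLUE,

        /* Number of values (integer, not an enumerator). */
    #define COLOR_COUNT (COLOR_BLUE + 1)
    } Color;

    /* With -Wswitch-enum, only the three real enumerators are expected
     * here; COLOR_COUNT is no longer part of the enum. */
    static const char *color_name(Color c)
    {
        switch (c) {
        case COLOR_RED:   return "red";
        case COLOR_GREEN: return "green";
        case COLOR_BLUE:  return "blue";
        }
        return "?";
    }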
1
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
include/tcg/tcg.h | 3 ++-
8
1 file changed, 2 insertions(+), 1 deletion(-)
9
10
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
11
index XXXXXXX..XXXXXXX 100644
12
--- a/include/tcg/tcg.h
13
+++ b/include/tcg/tcg.h
14
@@ -XXX,XX +XXX,XX @@ typedef enum TCGType {
15
TCG_TYPE_V128,
16
TCG_TYPE_V256,
17
18
- TCG_TYPE_COUNT, /* number of different types */
19
+ /* Number of different types (integer not enum) */
20
+#define TCG_TYPE_COUNT (TCG_TYPE_V256 + 1)
21
22
/* An alias for the size of the host register. */
23
#if TCG_TARGET_REG_BITS == 32
24
--
25
2.34.1
26
27
diff view generated by jsdifflib
1
From: "Emilio G. Cota" <cota@braap.org>
1
Add a helper function for computing the size of a type.
2
2
3
Paves the way for the addition of a per-TLB lock.
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
5
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
6
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Emilio G. Cota <cota@braap.org>
8
Message-Id: <20181009174557.16125-4-cota@braap.org>
9
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
10
---
5
---
11
include/exec/exec-all.h | 8 ++++++++
6
include/tcg/tcg.h | 16 ++++++++++++++++
12
accel/tcg/cputlb.c | 4 ++++
7
tcg/tcg.c | 27 ++++++++++++---------------
13
exec.c | 1 +
8
2 files changed, 28 insertions(+), 15 deletions(-)
14
3 files changed, 13 insertions(+)
15
9
16
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
10
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
17
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
18
--- a/include/exec/exec-all.h
12
--- a/include/tcg/tcg.h
19
+++ b/include/exec/exec-all.h
13
+++ b/include/tcg/tcg.h
20
@@ -XXX,XX +XXX,XX @@ void cpu_address_space_init(CPUState *cpu, int asidx,
14
@@ -XXX,XX +XXX,XX @@ typedef enum TCGType {
21
15
#endif
22
#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG)
16
} TCGType;
23
/* cputlb.c */
17
24
+/**
18
+/**
25
+ * tlb_init - initialize a CPU's TLB
19
+ * tcg_type_size
26
+ * @cpu: CPU whose TLB should be initialized
20
+ * @t: type
21
+ *
22
+ * Return the size of the type in bytes.
27
+ */
23
+ */
28
+void tlb_init(CPUState *cpu);
24
+static inline int tcg_type_size(TCGType t)
29
/**
30
* tlb_flush_page:
31
* @cpu: CPU whose TLB should be flushed
32
@@ -XXX,XX +XXX,XX @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
33
void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
34
uintptr_t retaddr);
35
#else
36
+static inline void tlb_init(CPUState *cpu)
37
+{
25
+{
38
+}
26
+ unsigned i = t;
39
static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
27
+ if (i >= TCG_TYPE_V64) {
40
{
28
+ tcg_debug_assert(i < TCG_TYPE_COUNT);
41
}
29
+ i -= TCG_TYPE_V64 - 1;
42
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
30
+ }
43
index XXXXXXX..XXXXXXX 100644
31
+ return 4 << i;
44
--- a/accel/tcg/cputlb.c
45
+++ b/accel/tcg/cputlb.c
46
@@ -XXX,XX +XXX,XX @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
47
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
48
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
49
50
+void tlb_init(CPUState *cpu)
51
+{
52
+}
32
+}
53
+
33
+
54
/* flush_all_helper: run fn across all cpus
34
/**
55
*
35
* get_alignment_bits
56
* If the wait flag is set then the src cpu's helper will be queued as
36
* @memop: MemOp value
57
diff --git a/exec.c b/exec.c
37
diff --git a/tcg/tcg.c b/tcg/tcg.c
58
index XXXXXXX..XXXXXXX 100644
38
index XXXXXXX..XXXXXXX 100644
59
--- a/exec.c
39
--- a/tcg/tcg.c
60
+++ b/exec.c
40
+++ b/tcg/tcg.c
61
@@ -XXX,XX +XXX,XX @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
41
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
62
tcg_target_initialized = true;
42
63
cc->tcg_initialize();
43
static void temp_allocate_frame(TCGContext *s, TCGTemp *ts)
64
}
44
{
65
+ tlb_init(cpu);
45
- intptr_t off, size, align;
66
46
+ int size = tcg_type_size(ts->type);
67
#ifndef CONFIG_USER_ONLY
47
+ int align;
68
if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
48
+ intptr_t off;
49
50
switch (ts->type) {
51
case TCG_TYPE_I32:
52
- size = align = 4;
53
+ align = 4;
54
break;
55
case TCG_TYPE_I64:
56
case TCG_TYPE_V64:
57
- size = align = 8;
58
+ align = 8;
59
break;
60
case TCG_TYPE_V128:
61
- size = align = 16;
62
- break;
63
case TCG_TYPE_V256:
64
/* Note that we do not require aligned storage for V256. */
65
- size = 32, align = 16;
66
+ align = 16;
67
break;
68
default:
69
g_assert_not_reached();
70
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
71
TCGRegSet dup_out_regs, dup_in_regs;
72
TCGTemp *its, *ots;
73
TCGType itype, vtype;
74
- intptr_t endian_fixup;
75
unsigned vece;
76
+ int lowpart_ofs;
77
bool ok;
78
79
ots = arg_temp(op->args[0]);
80
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
81
/* fall through */
82
83
case TEMP_VAL_MEM:
84
-#if HOST_BIG_ENDIAN
85
- endian_fixup = itype == TCG_TYPE_I32 ? 4 : 8;
86
- endian_fixup -= 1 << vece;
87
-#else
88
- endian_fixup = 0;
89
-#endif
90
- /* Attempt to dup directly from the input memory slot. */
91
+ lowpart_ofs = 0;
92
+ if (HOST_BIG_ENDIAN) {
93
+ lowpart_ofs = tcg_type_size(itype) - (1 << vece);
94
+ }
95
if (tcg_out_dupm_vec(s, vtype, vece, ots->reg, its->mem_base->reg,
96
- its->mem_offset + endian_fixup)) {
97
+ its->mem_offset + lowpart_ofs)) {
98
goto done;
99
}
100
/* Load the input into the destination vector register. */
69
--
101
--
70
2.17.2
102
2.34.1
71
103
72
104
1
Reviewed-by: Emilio G. Cota <cota@braap.org>
1
Prepare to replace a bunch of separate ifdefs with a
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
2
consistent way to describe the ABI of a function call.
3
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
7
---
5
target/arm/helper-a64.c | 16 ++++------------
8
tcg/tcg-internal.h | 15 +++++++++++++++
6
target/arm/translate-a64.c | 38 ++++++++++++++++++++++----------------
9
1 file changed, 15 insertions(+)
7
2 files changed, 26 insertions(+), 28 deletions(-)
8
10
9
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
11
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
10
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
11
--- a/target/arm/helper-a64.c
13
--- a/tcg/tcg-internal.h
12
+++ b/target/arm/helper-a64.c
14
+++ b/tcg/tcg-internal.h
13
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(paired_cmpxchg64_le_parallel)(CPUARMState *env, uint64_t addr,
14
int mem_idx;
15
TCGMemOpIdx oi;
16
17
- if (!HAVE_CMPXCHG128) {
18
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
19
- }
20
+ assert(HAVE_CMPXCHG128);
21
22
mem_idx = cpu_mmu_index(env, false);
23
oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
24
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
25
int mem_idx;
26
TCGMemOpIdx oi;
27
28
- if (!HAVE_CMPXCHG128) {
29
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
30
- }
31
+ assert(HAVE_CMPXCHG128);
32
33
mem_idx = cpu_mmu_index(env, false);
34
oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
35
@@ -XXX,XX +XXX,XX @@ void HELPER(casp_le_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
36
int mem_idx;
37
TCGMemOpIdx oi;
38
39
- if (!HAVE_CMPXCHG128) {
40
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
41
- }
42
+ assert(HAVE_CMPXCHG128);
43
44
mem_idx = cpu_mmu_index(env, false);
45
oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
46
@@ -XXX,XX +XXX,XX @@ void HELPER(casp_be_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
47
int mem_idx;
48
TCGMemOpIdx oi;
49
50
- if (!HAVE_CMPXCHG128) {
51
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
52
- }
53
+ assert(HAVE_CMPXCHG128);
54
55
mem_idx = cpu_mmu_index(env, false);
56
oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
57
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
58
index XXXXXXX..XXXXXXX 100644
59
--- a/target/arm/translate-a64.c
60
+++ b/target/arm/translate-a64.c
61
@@ -XXX,XX +XXX,XX @@
15
@@ -XXX,XX +XXX,XX @@
62
16
63
#include "trace-tcg.h"
17
#define TCG_HIGHWATER 1024
64
#include "translate-a64.h"
18
65
+#include "qemu/atomic128.h"
19
+/*
66
20
+ * Describe the calling convention of a given argument type.
67
static TCGv_i64 cpu_X[32];
21
+ */
68
static TCGv_i64 cpu_pc;
22
+typedef enum {
69
@@ -XXX,XX +XXX,XX @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
23
+ TCG_CALL_RET_NORMAL, /* by registers */
70
get_mem_index(s),
24
+} TCGCallReturnKind;
71
MO_64 | MO_ALIGN | s->be_data);
25
+
72
tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val);
26
+typedef enum {
73
- } else if (s->be_data == MO_LE) {
27
+ TCG_CALL_ARG_NORMAL, /* by registers (continuing onto stack) */
74
- if (tb_cflags(s->base.tb) & CF_PARALLEL) {
28
+ TCG_CALL_ARG_EVEN, /* like normal, but skipping odd slots */
75
+ } else if (tb_cflags(s->base.tb) & CF_PARALLEL) {
29
+ TCG_CALL_ARG_EXTEND, /* for i32, as a sign/zero-extended i64 */
76
+ if (!HAVE_CMPXCHG128) {
30
+ TCG_CALL_ARG_EXTEND_U, /* ... as a zero-extended i64 */
77
+ gen_helper_exit_atomic(cpu_env);
31
+ TCG_CALL_ARG_EXTEND_S, /* ... as a sign-extended i64 */
78
+ s->base.is_jmp = DISAS_NORETURN;
32
+} TCGCallArgumentKind;
79
+ } else if (s->be_data == MO_LE) {
33
+
80
gen_helper_paired_cmpxchg64_le_parallel(tmp, cpu_env,
34
typedef struct TCGHelperInfo {
81
cpu_exclusive_addr,
35
void *func;
82
cpu_reg(s, rt),
36
const char *name;
83
cpu_reg(s, rt2));
84
} else {
85
- gen_helper_paired_cmpxchg64_le(tmp, cpu_env, cpu_exclusive_addr,
86
- cpu_reg(s, rt), cpu_reg(s, rt2));
87
- }
88
- } else {
89
- if (tb_cflags(s->base.tb) & CF_PARALLEL) {
90
gen_helper_paired_cmpxchg64_be_parallel(tmp, cpu_env,
91
cpu_exclusive_addr,
92
cpu_reg(s, rt),
93
cpu_reg(s, rt2));
94
- } else {
95
- gen_helper_paired_cmpxchg64_be(tmp, cpu_env, cpu_exclusive_addr,
96
- cpu_reg(s, rt), cpu_reg(s, rt2));
97
}
98
+ } else if (s->be_data == MO_LE) {
99
+ gen_helper_paired_cmpxchg64_le(tmp, cpu_env, cpu_exclusive_addr,
100
+ cpu_reg(s, rt), cpu_reg(s, rt2));
101
+ } else {
102
+ gen_helper_paired_cmpxchg64_be(tmp, cpu_env, cpu_exclusive_addr,
103
+ cpu_reg(s, rt), cpu_reg(s, rt2));
104
}
105
} else {
106
tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val,
107
@@ -XXX,XX +XXX,XX @@ static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt,
108
}
109
tcg_temp_free_i64(cmp);
110
} else if (tb_cflags(s->base.tb) & CF_PARALLEL) {
111
- TCGv_i32 tcg_rs = tcg_const_i32(rs);
112
-
113
- if (s->be_data == MO_LE) {
114
- gen_helper_casp_le_parallel(cpu_env, tcg_rs, addr, t1, t2);
115
+ if (HAVE_CMPXCHG128) {
116
+ TCGv_i32 tcg_rs = tcg_const_i32(rs);
117
+ if (s->be_data == MO_LE) {
118
+ gen_helper_casp_le_parallel(cpu_env, tcg_rs, addr, t1, t2);
119
+ } else {
120
+ gen_helper_casp_be_parallel(cpu_env, tcg_rs, addr, t1, t2);
121
+ }
122
+ tcg_temp_free_i32(tcg_rs);
123
} else {
124
- gen_helper_casp_be_parallel(cpu_env, tcg_rs, addr, t1, t2);
125
+ gen_helper_exit_atomic(cpu_env);
126
+ s->base.is_jmp = DISAS_NORETURN;
127
}
128
- tcg_temp_free_i32(tcg_rs);
129
} else {
130
TCGv_i64 d1 = tcg_temp_new_i64();
131
TCGv_i64 d2 = tcg_temp_new_i64();
132
--
37
--
133
2.17.2
38
2.34.1
134
39
135
40
New patch
1
1
For 32-bit hosts when TCG_TARGET_CALL_ALIGN_ARGS was set, use
2
TCG_CALL_ARG_EVEN. For 64-bit hosts, TCG_TARGET_CALL_ALIGN_ARGS
3
was silently ignored, so always use TCG_CALL_ARG_NORMAL.
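As a rough sketch of what TCG_CALL_ARG_EVEN means for argument layout
(the helper and the slot numbering below are made up for illustration;
only the even-alignment rule comes from this patch): on a 32-bit host a
64-bit argument is bumped to the next even slot before its two halves are
assigned, so an i32 followed by an i64 occupies slots 0 and 2-3, leaving
slot 1 unused.

    /* Hypothetical helper, not part of TCG: compute the slot at which
     * the next argument starts. */
    static int next_arg_slot(int slot, int is_64bit, int even_aligned_i64)
    {
        if (is_64bit && even_aligned_i64) {
            slot += slot & 1;   /* skip an odd slot so the pair starts even */
        }
        return slot;
    }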
4
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/aarch64/tcg-target.h | 2 +-
9
tcg/arm/tcg-target.h | 2 +-
10
tcg/i386/tcg-target.h | 1 +
11
tcg/loongarch64/tcg-target.h | 2 +-
12
tcg/mips/tcg-target.h | 3 ++-
13
tcg/riscv/tcg-target.h | 6 +++++-
14
tcg/s390x/tcg-target.h | 1 +
15
tcg/sparc64/tcg-target.h | 1 +
16
tcg/tci/tcg-target.h | 5 +++++
17
tcg/tcg.c | 6 ++++--
18
tcg/ppc/tcg-target.c.inc | 21 ++++++++-------------
19
11 files changed, 30 insertions(+), 20 deletions(-)
20
21
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/aarch64/tcg-target.h
24
+++ b/tcg/aarch64/tcg-target.h
25
@@ -XXX,XX +XXX,XX @@ typedef enum {
26
/* used for function call generation */
27
#define TCG_REG_CALL_STACK TCG_REG_SP
28
#define TCG_TARGET_STACK_ALIGN 16
29
-#define TCG_TARGET_CALL_ALIGN_ARGS 1
30
#define TCG_TARGET_CALL_STACK_OFFSET 0
31
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
32
33
/* optional instructions */
34
#define TCG_TARGET_HAS_div_i32 1
35
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
36
index XXXXXXX..XXXXXXX 100644
37
--- a/tcg/arm/tcg-target.h
38
+++ b/tcg/arm/tcg-target.h
39
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
40
41
/* used for function call generation */
42
#define TCG_TARGET_STACK_ALIGN        8
43
-#define TCG_TARGET_CALL_ALIGN_ARGS    1
44
#define TCG_TARGET_CALL_STACK_OFFSET    0
45
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
46
47
/* optional instructions */
48
#define TCG_TARGET_HAS_ext8s_i32 1
49
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
50
index XXXXXXX..XXXXXXX 100644
51
--- a/tcg/i386/tcg-target.h
52
+++ b/tcg/i386/tcg-target.h
53
@@ -XXX,XX +XXX,XX @@ typedef enum {
54
#else
55
#define TCG_TARGET_CALL_STACK_OFFSET 0
56
#endif
57
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
58
59
extern bool have_bmi1;
60
extern bool have_popcnt;
61
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
62
index XXXXXXX..XXXXXXX 100644
63
--- a/tcg/loongarch64/tcg-target.h
64
+++ b/tcg/loongarch64/tcg-target.h
65
@@ -XXX,XX +XXX,XX @@ typedef enum {
66
/* used for function call generation */
67
#define TCG_REG_CALL_STACK TCG_REG_SP
68
#define TCG_TARGET_STACK_ALIGN 16
69
-#define TCG_TARGET_CALL_ALIGN_ARGS 1
70
#define TCG_TARGET_CALL_STACK_OFFSET 0
71
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
72
73
/* optional instructions */
74
#define TCG_TARGET_HAS_movcond_i32 0
75
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/tcg/mips/tcg-target.h
78
+++ b/tcg/mips/tcg-target.h
79
@@ -XXX,XX +XXX,XX @@ typedef enum {
80
#define TCG_TARGET_STACK_ALIGN 16
81
#if _MIPS_SIM == _ABIO32
82
# define TCG_TARGET_CALL_STACK_OFFSET 16
83
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
84
#else
85
# define TCG_TARGET_CALL_STACK_OFFSET 0
86
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
87
#endif
88
-#define TCG_TARGET_CALL_ALIGN_ARGS 1
89
90
/* MOVN/MOVZ instructions detection */
91
#if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
92
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
93
index XXXXXXX..XXXXXXX 100644
94
--- a/tcg/riscv/tcg-target.h
95
+++ b/tcg/riscv/tcg-target.h
96
@@ -XXX,XX +XXX,XX @@ typedef enum {
97
/* used for function call generation */
98
#define TCG_REG_CALL_STACK TCG_REG_SP
99
#define TCG_TARGET_STACK_ALIGN 16
100
-#define TCG_TARGET_CALL_ALIGN_ARGS 1
101
#define TCG_TARGET_CALL_STACK_OFFSET 0
102
+#if TCG_TARGET_REG_BITS == 32
103
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
104
+#else
105
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
106
+#endif
107
108
/* optional instructions */
109
#define TCG_TARGET_HAS_movcond_i32 0
110
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
111
index XXXXXXX..XXXXXXX 100644
112
--- a/tcg/s390x/tcg-target.h
113
+++ b/tcg/s390x/tcg-target.h
114
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
115
/* used for function call generation */
116
#define TCG_TARGET_STACK_ALIGN        8
117
#define TCG_TARGET_CALL_STACK_OFFSET    160
118
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
119
120
#define TCG_TARGET_EXTEND_ARGS 1
121
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
122
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
123
index XXXXXXX..XXXXXXX 100644
124
--- a/tcg/sparc64/tcg-target.h
125
+++ b/tcg/sparc64/tcg-target.h
126
@@ -XXX,XX +XXX,XX @@ typedef enum {
127
#define TCG_TARGET_STACK_ALIGN 16
128
#define TCG_TARGET_CALL_STACK_OFFSET (128 + 6*8 + TCG_TARGET_STACK_BIAS)
129
#define TCG_TARGET_EXTEND_ARGS 1
130
+#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
131
132
#if defined(__VIS__) && __VIS__ >= 0x300
133
#define use_vis3_instructions 1
134
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
135
index XXXXXXX..XXXXXXX 100644
136
--- a/tcg/tci/tcg-target.h
137
+++ b/tcg/tci/tcg-target.h
138
@@ -XXX,XX +XXX,XX @@ typedef enum {
139
/* Used for function call generation. */
140
#define TCG_TARGET_CALL_STACK_OFFSET 0
141
#define TCG_TARGET_STACK_ALIGN 8
142
+#if TCG_TARGET_REG_BITS == 32
143
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
144
+#else
145
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
146
+#endif
147
148
#define HAVE_TCG_QEMU_TB_EXEC
149
#define TCG_TARGET_NEED_POOL_LABELS
150
diff --git a/tcg/tcg.c b/tcg/tcg.c
151
index XXXXXXX..XXXXXXX 100644
152
--- a/tcg/tcg.c
153
+++ b/tcg/tcg.c
154
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
155
* for passing off to ffi_call.
156
*/
157
want_align = true;
158
-#elif defined(TCG_TARGET_CALL_ALIGN_ARGS)
159
+#else
160
/* Some targets want aligned 64 bit args */
161
- want_align = is_64bit;
162
+ if (is_64bit) {
163
+ want_align = TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN;
164
+ }
165
#endif
166
167
if (TCG_TARGET_REG_BITS < 64 && want_align && (real_args & 1)) {
168
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
169
index XXXXXXX..XXXXXXX 100644
170
--- a/tcg/ppc/tcg-target.c.inc
171
+++ b/tcg/ppc/tcg-target.c.inc
172
@@ -XXX,XX +XXX,XX @@
173
#endif
174
175
#ifdef _CALL_SYSV
176
-# define TCG_TARGET_CALL_ALIGN_ARGS 1
177
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
178
+#else
179
+# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
180
#endif
181
182
/* For some memory operations, we need a scratch that isn't R0. For the AIX
183
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
184
lo = lb->addrlo_reg;
185
hi = lb->addrhi_reg;
186
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
187
-#ifdef TCG_TARGET_CALL_ALIGN_ARGS
188
- arg |= 1;
189
-#endif
190
+ arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
191
tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
192
tcg_out_mov(s, TCG_TYPE_I32, arg++, lo);
193
} else {
194
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
195
lo = lb->addrlo_reg;
196
hi = lb->addrhi_reg;
197
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
198
-#ifdef TCG_TARGET_CALL_ALIGN_ARGS
199
- arg |= 1;
200
-#endif
201
+ arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
202
tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
203
tcg_out_mov(s, TCG_TYPE_I32, arg++, lo);
204
} else {
205
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
206
if (TCG_TARGET_REG_BITS == 32) {
207
switch (s_bits) {
208
case MO_64:
209
-#ifdef TCG_TARGET_CALL_ALIGN_ARGS
210
- arg |= 1;
211
-#endif
212
+ arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
213
tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
214
/* FALLTHRU */
215
case MO_32:
216
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
217
218
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
219
TCGReg arg = TCG_REG_R4;
220
-#ifdef TCG_TARGET_CALL_ALIGN_ARGS
221
- arg |= 1;
222
-#endif
223
+
224
+ arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
225
if (l->addrlo_reg != arg) {
226
tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
227
tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
228
--
229
2.34.1
230
231
New patch
1
1
For 64-bit hosts that had TCG_TARGET_EXTEND_ARGS, set
2
TCG_TARGET_CALL_ARG_I32 to TCG_CALL_ARG_EXTEND.
3
Otherwise, use TCG_CALL_ARG_NORMAL.
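A minimal sketch of what TCG_CALL_ARG_EXTEND asks of the caller (the
function below is illustrative, not a TCG API): on the affected ABIs a
32-bit argument is passed widened to 64 bits, sign- or zero-extended
according to its type.

    #include <stdint.h>

    /* Hypothetical illustration: widen a 32-bit helper argument to the
     * 64-bit register form expected by an extend-args ABI. */
    static uint64_t extend_i32_arg(uint32_t value, int is_signed)
    {
        return is_signed ? (uint64_t)(int64_t)(int32_t)value
                         : (uint64_t)value;
    }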
4
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
---
8
tcg/aarch64/tcg-target.h | 1 +
9
tcg/arm/tcg-target.h | 1 +
10
tcg/i386/tcg-target.h | 1 +
11
tcg/loongarch64/tcg-target.h | 1 +
12
tcg/mips/tcg-target.h | 1 +
13
tcg/riscv/tcg-target.h | 1 +
14
tcg/s390x/tcg-target.h | 2 +-
15
tcg/sparc64/tcg-target.h | 2 +-
16
tcg/tci/tcg-target.h | 1 +
17
tcg/tcg.c | 42 ++++++++++++++++++------------------
18
tcg/ppc/tcg-target.c.inc | 6 +++++-
19
11 files changed, 35 insertions(+), 24 deletions(-)
20
21
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
22
index XXXXXXX..XXXXXXX 100644
23
--- a/tcg/aarch64/tcg-target.h
24
+++ b/tcg/aarch64/tcg-target.h
25
@@ -XXX,XX +XXX,XX @@ typedef enum {
26
#define TCG_REG_CALL_STACK TCG_REG_SP
27
#define TCG_TARGET_STACK_ALIGN 16
28
#define TCG_TARGET_CALL_STACK_OFFSET 0
29
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
30
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
31
32
/* optional instructions */
33
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
34
index XXXXXXX..XXXXXXX 100644
35
--- a/tcg/arm/tcg-target.h
36
+++ b/tcg/arm/tcg-target.h
37
@@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions;
38
/* used for function call generation */
39
#define TCG_TARGET_STACK_ALIGN        8
40
#define TCG_TARGET_CALL_STACK_OFFSET    0
41
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
42
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
43
44
/* optional instructions */
45
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/tcg/i386/tcg-target.h
48
+++ b/tcg/i386/tcg-target.h
49
@@ -XXX,XX +XXX,XX @@ typedef enum {
50
#else
51
#define TCG_TARGET_CALL_STACK_OFFSET 0
52
#endif
53
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
54
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
55
56
extern bool have_bmi1;
57
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
58
index XXXXXXX..XXXXXXX 100644
59
--- a/tcg/loongarch64/tcg-target.h
60
+++ b/tcg/loongarch64/tcg-target.h
61
@@ -XXX,XX +XXX,XX @@ typedef enum {
62
#define TCG_REG_CALL_STACK TCG_REG_SP
63
#define TCG_TARGET_STACK_ALIGN 16
64
#define TCG_TARGET_CALL_STACK_OFFSET 0
65
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
66
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
67
68
/* optional instructions */
69
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
70
index XXXXXXX..XXXXXXX 100644
71
--- a/tcg/mips/tcg-target.h
72
+++ b/tcg/mips/tcg-target.h
73
@@ -XXX,XX +XXX,XX @@ typedef enum {
74
# define TCG_TARGET_CALL_STACK_OFFSET 0
75
# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
76
#endif
77
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
78
79
/* MOVN/MOVZ instructions detection */
80
#if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
81
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
82
index XXXXXXX..XXXXXXX 100644
83
--- a/tcg/riscv/tcg-target.h
84
+++ b/tcg/riscv/tcg-target.h
85
@@ -XXX,XX +XXX,XX @@ typedef enum {
86
#define TCG_REG_CALL_STACK TCG_REG_SP
87
#define TCG_TARGET_STACK_ALIGN 16
88
#define TCG_TARGET_CALL_STACK_OFFSET 0
89
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
90
#if TCG_TARGET_REG_BITS == 32
91
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
92
#else
93
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
94
index XXXXXXX..XXXXXXX 100644
95
--- a/tcg/s390x/tcg-target.h
96
+++ b/tcg/s390x/tcg-target.h
97
@@ -XXX,XX +XXX,XX @@ extern uint64_t s390_facilities[3];
98
/* used for function call generation */
99
#define TCG_TARGET_STACK_ALIGN        8
100
#define TCG_TARGET_CALL_STACK_OFFSET    160
101
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_EXTEND
102
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
103
104
-#define TCG_TARGET_EXTEND_ARGS 1
105
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
106
107
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
108
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
109
index XXXXXXX..XXXXXXX 100644
110
--- a/tcg/sparc64/tcg-target.h
111
+++ b/tcg/sparc64/tcg-target.h
112
@@ -XXX,XX +XXX,XX @@ typedef enum {
113
#define TCG_TARGET_STACK_BIAS 2047
114
#define TCG_TARGET_STACK_ALIGN 16
115
#define TCG_TARGET_CALL_STACK_OFFSET (128 + 6*8 + TCG_TARGET_STACK_BIAS)
116
-#define TCG_TARGET_EXTEND_ARGS 1
117
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_EXTEND
118
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
119
120
#if defined(__VIS__) && __VIS__ >= 0x300
121
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
122
index XXXXXXX..XXXXXXX 100644
123
--- a/tcg/tci/tcg-target.h
124
+++ b/tcg/tci/tcg-target.h
125
@@ -XXX,XX +XXX,XX @@ typedef enum {
126
/* Used for function call generation. */
127
#define TCG_TARGET_CALL_STACK_OFFSET 0
128
#define TCG_TARGET_STACK_ALIGN 8
129
+#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
130
#if TCG_TARGET_REG_BITS == 32
131
# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
132
#else
133
diff --git a/tcg/tcg.c b/tcg/tcg.c
134
index XXXXXXX..XXXXXXX 100644
135
--- a/tcg/tcg.c
136
+++ b/tcg/tcg.c
137
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
138
}
139
#endif
140
141
-#if defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
142
- for (i = 0; i < nargs; ++i) {
143
- int argtype = extract32(typemask, (i + 1) * 3, 3);
144
- bool is_32bit = (argtype & ~1) == dh_typecode_i32;
145
- bool is_signed = argtype & 1;
146
+ if (TCG_TARGET_CALL_ARG_I32 == TCG_CALL_ARG_EXTEND) {
147
+ for (i = 0; i < nargs; ++i) {
148
+ int argtype = extract32(typemask, (i + 1) * 3, 3);
149
+ bool is_32bit = (argtype & ~1) == dh_typecode_i32;
150
+ bool is_signed = argtype & 1;
151
152
- if (is_32bit) {
153
- TCGv_i64 temp = tcg_temp_new_i64();
154
- TCGv_i32 orig = temp_tcgv_i32(args[i]);
155
- if (is_signed) {
156
- tcg_gen_ext_i32_i64(temp, orig);
157
- } else {
158
- tcg_gen_extu_i32_i64(temp, orig);
159
+ if (is_32bit) {
160
+ TCGv_i64 temp = tcg_temp_new_i64();
161
+ TCGv_i32 orig = temp_tcgv_i32(args[i]);
162
+ if (is_signed) {
163
+ tcg_gen_ext_i32_i64(temp, orig);
164
+ } else {
165
+ tcg_gen_extu_i32_i64(temp, orig);
166
+ }
167
+ args[i] = tcgv_i64_temp(temp);
168
}
169
- args[i] = tcgv_i64_temp(temp);
170
}
171
}
172
-#endif /* TCG_TARGET_EXTEND_ARGS */
173
174
op = tcg_emit_op(INDEX_op_call);
175
176
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
177
tcg_debug_assert(TCGOP_CALLI(op) == real_args);
178
tcg_debug_assert(pi <= ARRAY_SIZE(op->args));
179
180
-#if defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
181
- for (i = 0; i < nargs; ++i) {
182
- int argtype = extract32(typemask, (i + 1) * 3, 3);
183
- bool is_32bit = (argtype & ~1) == dh_typecode_i32;
184
+ if (TCG_TARGET_CALL_ARG_I32 == TCG_CALL_ARG_EXTEND) {
185
+ for (i = 0; i < nargs; ++i) {
186
+ int argtype = extract32(typemask, (i + 1) * 3, 3);
187
+ bool is_32bit = (argtype & ~1) == dh_typecode_i32;
188
189
- if (is_32bit) {
190
- tcg_temp_free_internal(args[i]);
191
+ if (is_32bit) {
192
+ tcg_temp_free_internal(args[i]);
193
+ }
194
}
195
}
196
-#endif /* TCG_TARGET_EXTEND_ARGS */
197
}
198
199
static void tcg_reg_alloc_start(TCGContext *s)
200
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
201
index XXXXXXX..XXXXXXX 100644
202
--- a/tcg/ppc/tcg-target.c.inc
203
+++ b/tcg/ppc/tcg-target.c.inc
204
@@ -XXX,XX +XXX,XX @@
205
# endif
206
#endif
207
208
+#if TCG_TARGET_REG_BITS == 64
209
+# define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_EXTEND
210
+#else
211
+# define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
212
+#endif
213
#ifdef _CALL_SYSV
214
# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
215
#else
216
@@ -XXX,XX +XXX,XX @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
217
218
/* Parameters for function call generation, used in tcg.c. */
219
#define TCG_TARGET_STACK_ALIGN 16
220
-#define TCG_TARGET_EXTEND_ARGS 1
221
222
#ifdef _CALL_AIX
223
# define LINK_AREA_SIZE (6 * SZR)
224
--
225
2.34.1
226
227
diff view generated by jsdifflib
New patch
1
Change 32-bit tci TCG_TARGET_CALL_ARG_I32 to TCG_CALL_ARG_EVEN, to
2
force 32-bit values to be aligned to 64-bit. With a small reorg
3
to the argument processing loop, this neatly replaces an ifdef for
4
CONFIG_TCG_INTERPRETER.
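
For intuition, TCG_CALL_ARG_EVEN means an argument must start on an even
argument slot, with a dummy slot inserted as padding whenever the running
count is odd; with this change 32-bit tci applies that rule to 32-bit and
64-bit values alike, so arguments land in predictable places for ffi_call.
A stand-alone demo (hypothetical names, not QEMU code) of the layout rule:

#include <stdio.h>

enum { ARG_I32, ARG_I64 };  /* demo-only argument kinds */

int main(void)
{
    int args[] = { ARG_I32, ARG_I64, ARG_I32, ARG_I64 };
    int slot = 0;

    for (unsigned i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
        int slots_needed = (args[i] == ARG_I64) ? 2 : 1;

        /* TCG_CALL_ARG_EVEN: pad so this argument starts on an even slot. */
        if (slot & 1) {
            printf("slot %d: dummy padding\n", slot++);
        }
        printf("slot %d: start of arg %u (%d slot%s)\n", slot, i,
               slots_needed, slots_needed > 1 ? "s" : "");
        slot += slots_needed;
    }
    return 0;
}
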
1
5
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
9
tcg/tci/tcg-target.h | 3 +-
10
tcg/tcg.c | 70 ++++++++++++++++++++++++++++----------------
11
2 files changed, 47 insertions(+), 26 deletions(-)
12
13
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
14
index XXXXXXX..XXXXXXX 100644
15
--- a/tcg/tci/tcg-target.h
16
+++ b/tcg/tci/tcg-target.h
17
@@ -XXX,XX +XXX,XX @@ typedef enum {
18
/* Used for function call generation. */
19
#define TCG_TARGET_CALL_STACK_OFFSET 0
20
#define TCG_TARGET_STACK_ALIGN 8
21
-#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
22
#if TCG_TARGET_REG_BITS == 32
23
+# define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_EVEN
24
# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
25
#else
26
+# define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
27
# define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
28
#endif
29
30
diff --git a/tcg/tcg.c b/tcg/tcg.c
31
index XXXXXXX..XXXXXXX 100644
32
--- a/tcg/tcg.c
33
+++ b/tcg/tcg.c
34
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
35
real_args = 0;
36
for (i = 0; i < nargs; i++) {
37
int argtype = extract32(typemask, (i + 1) * 3, 3);
38
- bool is_64bit = (argtype & ~1) == dh_typecode_i64;
39
- bool want_align = false;
40
+ TCGCallArgumentKind kind;
41
+ TCGType type;
42
43
-#if defined(CONFIG_TCG_INTERPRETER)
44
- /*
45
- * Align all arguments, so that they land in predictable places
46
- * for passing off to ffi_call.
47
- */
48
- want_align = true;
49
-#else
50
- /* Some targets want aligned 64 bit args */
51
- if (is_64bit) {
52
- want_align = TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN;
53
- }
54
-#endif
55
-
56
- if (TCG_TARGET_REG_BITS < 64 && want_align && (real_args & 1)) {
57
- op->args[pi++] = TCG_CALL_DUMMY_ARG;
58
- real_args++;
59
+ switch (argtype) {
60
+ case dh_typecode_i32:
61
+ case dh_typecode_s32:
62
+ type = TCG_TYPE_I32;
63
+ break;
64
+ case dh_typecode_i64:
65
+ case dh_typecode_s64:
66
+ type = TCG_TYPE_I64;
67
+ break;
68
+ case dh_typecode_ptr:
69
+ type = TCG_TYPE_PTR;
70
+ break;
71
+ default:
72
+ g_assert_not_reached();
73
}
74
75
- if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
76
+ switch (type) {
77
+ case TCG_TYPE_I32:
78
+ kind = TCG_TARGET_CALL_ARG_I32;
79
+ break;
80
+ case TCG_TYPE_I64:
81
+ kind = TCG_TARGET_CALL_ARG_I64;
82
+ break;
83
+ default:
84
+ g_assert_not_reached();
85
+ }
86
+
87
+ switch (kind) {
88
+ case TCG_CALL_ARG_EVEN:
89
+ if (real_args & 1) {
90
+ op->args[pi++] = TCG_CALL_DUMMY_ARG;
91
+ real_args++;
92
+ }
93
+ /* fall through */
94
+ case TCG_CALL_ARG_NORMAL:
95
+ if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
96
+ op->args[pi++] = temp_arg(args[i]);
97
+ op->args[pi++] = temp_arg(args[i] + 1);
98
+ real_args += 2;
99
+ break;
100
+ }
101
op->args[pi++] = temp_arg(args[i]);
102
- op->args[pi++] = temp_arg(args[i] + 1);
103
- real_args += 2;
104
- continue;
105
+ real_args++;
106
+ break;
107
+ default:
108
+ g_assert_not_reached();
109
}
110
-
111
- op->args[pi++] = temp_arg(args[i]);
112
- real_args++;
113
}
114
op->args[pi++] = (uintptr_t)func;
115
op->args[pi++] = (uintptr_t)info;
116
--
117
2.34.1
118
119
diff view generated by jsdifflib
New patch
1
The function pointer is immediately after the output and input
2
operands; no need to search.
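
Condensed from the accel/tcg/plugin-gen.c hunk below (a sketch, not the
whole function): with the call's args[] holding the output operands, then
the input operands, then the function pointer and info, the pointer's slot
can be computed directly instead of scanning for the old value:

    /* args[] layout for a call: [outputs][inputs][func][info]. */
    func_idx = TCGOP_CALLO(op) + TCGOP_CALLI(op);
    *cb_idx = func_idx;
    op->args[func_idx] = (uintptr_t)func;
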
1
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
accel/tcg/plugin-gen.c | 29 +++++++++++------------------
8
1 file changed, 11 insertions(+), 18 deletions(-)
9
10
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/plugin-gen.c
13
+++ b/accel/tcg/plugin-gen.c
14
@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_st_ptr(TCGOp **begin_op, TCGOp *op)
15
static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func,
16
void *func, int *cb_idx)
17
{
18
+ TCGOp *old_op;
19
+ int func_idx;
20
+
21
/* copy all ops until the call */
22
do {
23
op = copy_op_nocheck(begin_op, op);
24
} while (op->opc != INDEX_op_call);
25
26
/* fill in the op call */
27
- op->param1 = (*begin_op)->param1;
28
- op->param2 = (*begin_op)->param2;
29
+ old_op = *begin_op;
30
+ TCGOP_CALLI(op) = TCGOP_CALLI(old_op);
31
+ TCGOP_CALLO(op) = TCGOP_CALLO(old_op);
32
tcg_debug_assert(op->life == 0);
33
- if (*cb_idx == -1) {
34
- int i;
35
36
- /*
37
- * Instead of working out the position of the callback in args[], just
38
- * look for @empty_func, since it should be a unique pointer.
39
- */
40
- for (i = 0; i < MAX_OPC_PARAM_ARGS; i++) {
41
- if ((uintptr_t)(*begin_op)->args[i] == (uintptr_t)empty_func) {
42
- *cb_idx = i;
43
- break;
44
- }
45
- }
46
- tcg_debug_assert(i < MAX_OPC_PARAM_ARGS);
47
- }
48
- op->args[*cb_idx] = (uintptr_t)func;
49
- op->args[*cb_idx + 1] = (*begin_op)->args[*cb_idx + 1];
50
+ func_idx = TCGOP_CALLO(op) + TCGOP_CALLI(op);
51
+ *cb_idx = func_idx;
52
+
53
+ op->args[func_idx] = (uintptr_t)func;
54
+ op->args[func_idx + 1] = old_op->args[func_idx + 1];
55
56
return op;
57
}
58
--
59
2.34.1
60
61
diff view generated by jsdifflib
1
From: "Emilio G. Cota" <cota@braap.org>
1
We copied all of the arguments in copy_op_nocheck.
2
We only need to replace the one argument that we change.
2
3
3
As far as I can tell tlb_flush does not need to be called
4
this early. tlb_flush is eventually called after the CPU
5
has been realized.
6
7
This change paves the way to the introduction of tlb_init,
8
which will be called from cpu_exec_realizefn.
9
10
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
11
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
12
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
13
Signed-off-by: Emilio G. Cota <cota@braap.org>
14
Message-Id: <20181009174557.16125-3-cota@braap.org>
15
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
16
---
7
---
17
target/unicore32/cpu.c | 2 --
8
accel/tcg/plugin-gen.c | 2 --
18
1 file changed, 2 deletions(-)
9
1 file changed, 2 deletions(-)
19
10
20
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
11
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
21
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
22
--- a/target/unicore32/cpu.c
13
--- a/accel/tcg/plugin-gen.c
23
+++ b/target/unicore32/cpu.c
14
+++ b/accel/tcg/plugin-gen.c
24
@@ -XXX,XX +XXX,XX @@ static void uc32_cpu_initfn(Object *obj)
15
@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func,
25
env->uncached_asr = ASR_MODE_PRIV;
16
26
env->regs[31] = 0x03000000;
17
func_idx = TCGOP_CALLO(op) + TCGOP_CALLI(op);
27
#endif
18
*cb_idx = func_idx;
28
-
19
-
29
- tlb_flush(cs);
20
op->args[func_idx] = (uintptr_t)func;
21
- op->args[func_idx + 1] = old_op->args[func_idx + 1];
22
23
return op;
30
}
24
}
31
32
static const VMStateDescription vmstate_uc32_cpu = {
33
--
25
--
34
2.17.2
26
2.34.1
35
27
36
28
diff view generated by jsdifflib
1
When op raises an exception, it may not have initialized the output
1
Better to re-use the existing function for copying ops.
2
temps that would be written back by wout or cout.
3
2
4
Reviewed-by: David Hildenbrand <david@redhat.com>
3
Acked-by: Alex Bennée <alex.bennee@linaro.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
target/s390x/translate.c | 20 +++++++++++++++-----
7
accel/tcg/plugin-gen.c | 16 ++++++++--------
8
1 file changed, 15 insertions(+), 5 deletions(-)
8
1 file changed, 8 insertions(+), 8 deletions(-)
9
9
10
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
10
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/s390x/translate.c
12
--- a/accel/tcg/plugin-gen.c
13
+++ b/target/s390x/translate.c
13
+++ b/accel/tcg/plugin-gen.c
14
@@ -XXX,XX +XXX,XX @@ struct DisasInsn {
14
@@ -XXX,XX +XXX,XX @@ static TCGOp *append_udata_cb(const struct qemu_plugin_dyn_cb *cb,
15
15
op = copy_const_ptr(&begin_op, op, cb->userp);
16
const char *name;
16
17
17
/* copy the ld_i32, but note that we only have to copy it once */
18
+ /* Pre-process arguments before HELP_OP. */
18
- begin_op = QTAILQ_NEXT(begin_op, link);
19
void (*help_in1)(DisasContext *, DisasFields *, DisasOps *);
19
- tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
20
void (*help_in2)(DisasContext *, DisasFields *, DisasOps *);
20
if (*cb_idx == -1) {
21
void (*help_prep)(DisasContext *, DisasFields *, DisasOps *);
21
- op = tcg_op_insert_after(tcg_ctx, op, INDEX_op_ld_i32);
22
+
22
- memcpy(op->args, begin_op->args, sizeof(op->args));
23
+ /*
23
+ op = copy_op(&begin_op, op, INDEX_op_ld_i32);
24
+ * Post-process output after HELP_OP.
24
+ } else {
25
+ * Note that these are not called if HELP_OP returns DISAS_NORETURN.
25
+ begin_op = QTAILQ_NEXT(begin_op, link);
26
+ */
26
+ tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
27
void (*help_wout)(DisasContext *, DisasFields *, DisasOps *);
28
void (*help_cout)(DisasContext *, DisasOps *);
29
+
30
+ /* Implement the operation itself. */
31
DisasJumpType (*help_op)(DisasContext *, DisasOps *);
32
33
uint64_t data;
34
@@ -XXX,XX +XXX,XX @@ static DisasJumpType translate_one(CPUS390XState *env, DisasContext *s)
35
if (insn->help_op) {
36
ret = insn->help_op(s, &o);
37
}
27
}
38
- if (insn->help_wout) {
28
39
- insn->help_wout(s, &f, &o);
29
/* call */
40
- }
30
@@ -XXX,XX +XXX,XX @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,
41
- if (insn->help_cout) {
31
op = copy_const_ptr(&begin_op, op, cb->userp);
42
- insn->help_cout(s, &o);
32
43
+ if (ret != DISAS_NORETURN) {
33
/* copy the ld_i32, but note that we only have to copy it once */
44
+ if (insn->help_wout) {
34
- begin_op = QTAILQ_NEXT(begin_op, link);
45
+ insn->help_wout(s, &f, &o);
35
- tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
46
+ }
36
if (*cb_idx == -1) {
47
+ if (insn->help_cout) {
37
- op = tcg_op_insert_after(tcg_ctx, op, INDEX_op_ld_i32);
48
+ insn->help_cout(s, &o);
38
- memcpy(op->args, begin_op->args, sizeof(op->args));
49
+ }
39
+ op = copy_op(&begin_op, op, INDEX_op_ld_i32);
40
+ } else {
41
+ begin_op = QTAILQ_NEXT(begin_op, link);
42
+ tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
50
}
43
}
51
44
52
/* Free any temporaries created by the helpers. */
45
/* extu_tl_i64 */
53
--
46
--
54
2.17.2
47
2.34.1
55
48
56
49
diff view generated by jsdifflib
New patch
1
1
From: Philippe Mathieu-Daudé <philmd@linaro.org>
2
3
In order to allow variable-size allocation of TCGOp, pass the number
4
of arguments we use (and would allocate) up to tcg_op_alloc().
5
6
This alters tcg_emit_op(), tcg_op_insert_before() and
7
tcg_op_insert_after() prototypes.
8
9
In tcg_op_alloc() ensure the number of arguments is in range.
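
Condensed from the hunks below, the allocation entry points now carry the
argument count, and call sites state how many slots the new op will use:

    TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
    TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op,
                                TCGOpcode opc, unsigned nargs);
    TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op,
                               TCGOpcode opc, unsigned nargs);

    /* e.g. in tcg/optimize.c: each mov inserted for addsub2 uses 2 args. */
    op2 = tcg_op_insert_before(ctx->tcg, op, 0, 2);
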
10
11
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
12
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
13
[PMD: Extracted from bigger patch]
14
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
15
Message-Id: <20221218211832.73312-2-philmd@linaro.org>
16
---
17
include/tcg/tcg-op.h | 2 +-
18
include/tcg/tcg.h | 8 +++++---
19
accel/tcg/plugin-gen.c | 5 ++++-
20
tcg/optimize.c | 4 ++--
21
tcg/tcg-op-vec.c | 8 ++++----
22
tcg/tcg-op.c | 12 ++++++------
23
tcg/tcg.c | 30 +++++++++++++++++-------------
24
7 files changed, 39 insertions(+), 30 deletions(-)
25
26
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
27
index XXXXXXX..XXXXXXX 100644
28
--- a/include/tcg/tcg-op.h
29
+++ b/include/tcg/tcg-op.h
30
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_plugin_cb_start(unsigned from, unsigned type,
31
32
static inline void tcg_gen_plugin_cb_end(void)
33
{
34
- tcg_emit_op(INDEX_op_plugin_cb_end);
35
+ tcg_emit_op(INDEX_op_plugin_cb_end, 0);
36
}
37
38
#if TARGET_LONG_BITS == 32
39
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
40
index XXXXXXX..XXXXXXX 100644
41
--- a/include/tcg/tcg.h
42
+++ b/include/tcg/tcg.h
43
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op);
44
45
void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args);
46
47
-TCGOp *tcg_emit_op(TCGOpcode opc);
48
+TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
49
void tcg_op_remove(TCGContext *s, TCGOp *op);
50
-TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc);
51
-TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc);
52
+TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op,
53
+ TCGOpcode opc, unsigned nargs);
54
+TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op,
55
+ TCGOpcode opc, unsigned nargs);
56
57
/**
58
* tcg_remove_ops_after:
59
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
60
index XXXXXXX..XXXXXXX 100644
61
--- a/accel/tcg/plugin-gen.c
62
+++ b/accel/tcg/plugin-gen.c
63
@@ -XXX,XX +XXX,XX @@ static TCGOp *rm_ops(TCGOp *op)
64
65
static TCGOp *copy_op_nocheck(TCGOp **begin_op, TCGOp *op)
66
{
67
+ unsigned nargs = ARRAY_SIZE(op->args);
68
+
69
*begin_op = QTAILQ_NEXT(*begin_op, link);
70
tcg_debug_assert(*begin_op);
71
- op = tcg_op_insert_after(tcg_ctx, op, (*begin_op)->opc);
72
+ op = tcg_op_insert_after(tcg_ctx, op, (*begin_op)->opc, nargs);
73
memcpy(op->args, (*begin_op)->args, sizeof(op->args));
74
+
75
return op;
76
}
77
78
diff --git a/tcg/optimize.c b/tcg/optimize.c
79
index XXXXXXX..XXXXXXX 100644
80
--- a/tcg/optimize.c
81
+++ b/tcg/optimize.c
82
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
83
rh = op->args[1];
84
85
/* The proper opcode is supplied by tcg_opt_gen_mov. */
86
- op2 = tcg_op_insert_before(ctx->tcg, op, 0);
87
+ op2 = tcg_op_insert_before(ctx->tcg, op, 0, 2);
88
89
tcg_opt_gen_movi(ctx, op, rl, al);
90
tcg_opt_gen_movi(ctx, op2, rh, ah);
91
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
92
rh = op->args[1];
93
94
/* The proper opcode is supplied by tcg_opt_gen_mov. */
95
- op2 = tcg_op_insert_before(ctx->tcg, op, 0);
96
+ op2 = tcg_op_insert_before(ctx->tcg, op, 0, 2);
97
98
tcg_opt_gen_movi(ctx, op, rl, l);
99
tcg_opt_gen_movi(ctx, op2, rh, h);
100
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
101
index XXXXXXX..XXXXXXX 100644
102
--- a/tcg/tcg-op-vec.c
103
+++ b/tcg/tcg-op-vec.c
104
@@ -XXX,XX +XXX,XX @@ bool tcg_can_emit_vecop_list(const TCGOpcode *list,
105
106
void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a)
107
{
108
- TCGOp *op = tcg_emit_op(opc);
109
+ TCGOp *op = tcg_emit_op(opc, 2);
110
TCGOP_VECL(op) = type - TCG_TYPE_V64;
111
TCGOP_VECE(op) = vece;
112
op->args[0] = r;
113
@@ -XXX,XX +XXX,XX @@ void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a)
114
void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece,
115
TCGArg r, TCGArg a, TCGArg b)
116
{
117
- TCGOp *op = tcg_emit_op(opc);
118
+ TCGOp *op = tcg_emit_op(opc, 3);
119
TCGOP_VECL(op) = type - TCG_TYPE_V64;
120
TCGOP_VECE(op) = vece;
121
op->args[0] = r;
122
@@ -XXX,XX +XXX,XX @@ void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece,
123
void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece,
124
TCGArg r, TCGArg a, TCGArg b, TCGArg c)
125
{
126
- TCGOp *op = tcg_emit_op(opc);
127
+ TCGOp *op = tcg_emit_op(opc, 4);
128
TCGOP_VECL(op) = type - TCG_TYPE_V64;
129
TCGOP_VECE(op) = vece;
130
op->args[0] = r;
131
@@ -XXX,XX +XXX,XX @@ void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece,
132
static void vec_gen_6(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r,
133
TCGArg a, TCGArg b, TCGArg c, TCGArg d, TCGArg e)
134
{
135
- TCGOp *op = tcg_emit_op(opc);
136
+ TCGOp *op = tcg_emit_op(opc, 6);
137
TCGOP_VECL(op) = type - TCG_TYPE_V64;
138
TCGOP_VECE(op) = vece;
139
op->args[0] = r;
140
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
141
index XXXXXXX..XXXXXXX 100644
142
--- a/tcg/tcg-op.c
143
+++ b/tcg/tcg-op.c
144
@@ -XXX,XX +XXX,XX @@
145
146
void tcg_gen_op1(TCGOpcode opc, TCGArg a1)
147
{
148
- TCGOp *op = tcg_emit_op(opc);
149
+ TCGOp *op = tcg_emit_op(opc, 1);
150
op->args[0] = a1;
151
}
152
153
void tcg_gen_op2(TCGOpcode opc, TCGArg a1, TCGArg a2)
154
{
155
- TCGOp *op = tcg_emit_op(opc);
156
+ TCGOp *op = tcg_emit_op(opc, 2);
157
op->args[0] = a1;
158
op->args[1] = a2;
159
}
160
161
void tcg_gen_op3(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3)
162
{
163
- TCGOp *op = tcg_emit_op(opc);
164
+ TCGOp *op = tcg_emit_op(opc, 3);
165
op->args[0] = a1;
166
op->args[1] = a2;
167
op->args[2] = a3;
168
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op3(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3)
169
170
void tcg_gen_op4(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3, TCGArg a4)
171
{
172
- TCGOp *op = tcg_emit_op(opc);
173
+ TCGOp *op = tcg_emit_op(opc, 4);
174
op->args[0] = a1;
175
op->args[1] = a2;
176
op->args[2] = a3;
177
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op4(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3, TCGArg a4)
178
void tcg_gen_op5(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
179
TCGArg a4, TCGArg a5)
180
{
181
- TCGOp *op = tcg_emit_op(opc);
182
+ TCGOp *op = tcg_emit_op(opc, 5);
183
op->args[0] = a1;
184
op->args[1] = a2;
185
op->args[2] = a3;
186
@@ -XXX,XX +XXX,XX @@ void tcg_gen_op5(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
187
void tcg_gen_op6(TCGOpcode opc, TCGArg a1, TCGArg a2, TCGArg a3,
188
TCGArg a4, TCGArg a5, TCGArg a6)
189
{
190
- TCGOp *op = tcg_emit_op(opc);
191
+ TCGOp *op = tcg_emit_op(opc, 6);
192
op->args[0] = a1;
193
op->args[1] = a2;
194
op->args[2] = a3;
195
diff --git a/tcg/tcg.c b/tcg/tcg.c
196
index XXXXXXX..XXXXXXX 100644
197
--- a/tcg/tcg.c
198
+++ b/tcg/tcg.c
199
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
200
and endian swap in tcg_reg_alloc_call(). */
201
void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
202
{
203
- int i, real_args, nb_rets, pi;
204
+ int i, real_args, nb_rets, pi, max_args;
205
unsigned typemask;
206
const TCGHelperInfo *info;
207
TCGOp *op;
208
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
209
}
210
}
211
212
- op = tcg_emit_op(INDEX_op_call);
213
+ max_args = ARRAY_SIZE(op->args);
214
+ op = tcg_emit_op(INDEX_op_call, max_args);
215
216
pi = 0;
217
if (ret != NULL) {
218
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
219
220
/* Make sure the fields didn't overflow. */
221
tcg_debug_assert(TCGOP_CALLI(op) == real_args);
222
- tcg_debug_assert(pi <= ARRAY_SIZE(op->args));
223
+ tcg_debug_assert(pi <= max_args);
224
225
if (TCG_TARGET_CALL_ARG_I32 == TCG_CALL_ARG_EXTEND) {
226
for (i = 0; i < nargs; ++i) {
227
@@ -XXX,XX +XXX,XX @@ void tcg_remove_ops_after(TCGOp *op)
228
}
229
}
230
231
-static TCGOp *tcg_op_alloc(TCGOpcode opc)
232
+static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs)
233
{
234
TCGContext *s = tcg_ctx;
235
TCGOp *op;
236
237
+ assert(nargs < ARRAY_SIZE(op->args));
238
if (likely(QTAILQ_EMPTY(&s->free_ops))) {
239
op = tcg_malloc(sizeof(TCGOp));
240
} else {
241
@@ -XXX,XX +XXX,XX @@ static TCGOp *tcg_op_alloc(TCGOpcode opc)
242
return op;
243
}
244
245
-TCGOp *tcg_emit_op(TCGOpcode opc)
246
+TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs)
247
{
248
- TCGOp *op = tcg_op_alloc(opc);
249
+ TCGOp *op = tcg_op_alloc(opc, nargs);
250
QTAILQ_INSERT_TAIL(&tcg_ctx->ops, op, link);
251
return op;
252
}
253
254
-TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *old_op, TCGOpcode opc)
255
+TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *old_op,
256
+ TCGOpcode opc, unsigned nargs)
257
{
258
- TCGOp *new_op = tcg_op_alloc(opc);
259
+ TCGOp *new_op = tcg_op_alloc(opc, nargs);
260
QTAILQ_INSERT_BEFORE(old_op, new_op, link);
261
return new_op;
262
}
263
264
-TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *old_op, TCGOpcode opc)
265
+TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *old_op,
266
+ TCGOpcode opc, unsigned nargs)
267
{
268
- TCGOp *new_op = tcg_op_alloc(opc);
269
+ TCGOp *new_op = tcg_op_alloc(opc, nargs);
270
QTAILQ_INSERT_AFTER(&s->ops, old_op, new_op, link);
271
return new_op;
272
}
273
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
274
TCGOpcode lopc = (arg_ts->type == TCG_TYPE_I32
275
? INDEX_op_ld_i32
276
: INDEX_op_ld_i64);
277
- TCGOp *lop = tcg_op_insert_before(s, op, lopc);
278
+ TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3);
279
280
lop->args[0] = temp_arg(dir_ts);
281
lop->args[1] = temp_arg(arg_ts->mem_base);
282
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
283
TCGOpcode sopc = (arg_ts->type == TCG_TYPE_I32
284
? INDEX_op_st_i32
285
: INDEX_op_st_i64);
286
- TCGOp *sop = tcg_op_insert_after(s, op, sopc);
287
+ TCGOp *sop = tcg_op_insert_after(s, op, sopc, 3);
288
TCGTemp *out_ts = dir_ts;
289
290
if (IS_DEAD_ARG(0)) {
291
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
292
TCGOpcode sopc = (arg_ts->type == TCG_TYPE_I32
293
? INDEX_op_st_i32
294
: INDEX_op_st_i64);
295
- TCGOp *sop = tcg_op_insert_after(s, op, sopc);
296
+ TCGOp *sop = tcg_op_insert_after(s, op, sopc, 3);
297
298
sop->args[0] = temp_arg(dir_ts);
299
sop->args[1] = temp_arg(arg_ts->mem_base);
300
--
301
2.34.1
302
303
diff view generated by jsdifflib
1
Reviewed-by: David Hildenbrand <david@redhat.com>
1
We have been allocating a worst-case number of arguments
2
to support calls. Instead, allow the size to vary.
3
By default leave space for 4 args, to maximize reuse,
4
but allow calls to increase the number of args to 32.
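
Condensed from the hunks below, args[] becomes a flexible array member of
TCGOp, and tcg_op_alloc() sizes the allocation, with a minimum of 4 slots
so that freed ops can be reused:

    typedef struct TCGOp {
        TCGOpcode opc : 8;
        unsigned nargs : 8;
        ...
        /* Arguments for the opcode, sized at allocation time. */
        TCGArg args[];
    } TCGOp;

    /* Most opcodes have 3 or 4 operands: reduce fragmentation. */
    nargs = MAX(4, nargs);
    op = tcg_malloc(sizeof(TCGOp) + sizeof(TCGArg) * nargs);
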
5
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
7
[PMD: Split patch in two]
8
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
9
Message-Id: <20221218211832.73312-3-philmd@linaro.org>
3
---
10
---
4
target/s390x/mem_helper.c | 92 +++++++++++++++++----------------------
11
include/exec/helper-head.h | 2 --
5
1 file changed, 41 insertions(+), 51 deletions(-)
12
include/tcg/tcg.h | 46 +++++++++++++-------------------------
13
accel/tcg/plugin-gen.c | 10 ++++-----
14
tcg/tcg.c | 35 +++++++++++++++++++++--------
15
4 files changed, 47 insertions(+), 46 deletions(-)
6
16
7
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
17
diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
8
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
9
--- a/target/s390x/mem_helper.c
19
--- a/include/exec/helper-head.h
10
+++ b/target/s390x/mem_helper.c
20
+++ b/include/exec/helper-head.h
11
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@
12
#include "exec/exec-all.h"
22
#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
13
#include "exec/cpu_ldst.h"
23
DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
14
#include "qemu/int128.h"
24
15
+#include "qemu/atomic128.h"
25
-/* MAX_OPC_PARAM_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
16
26
-
17
#if !defined(CONFIG_USER_ONLY)
27
#endif /* EXEC_HELPER_HEAD_H */
18
#include "hw/s390x/storage-keys.h"
28
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
19
@@ -XXX,XX +XXX,XX @@ static void do_cdsg(CPUS390XState *env, uint64_t addr,
29
index XXXXXXX..XXXXXXX 100644
20
bool fail;
30
--- a/include/tcg/tcg.h
21
31
+++ b/include/tcg/tcg.h
22
if (parallel) {
32
@@ -XXX,XX +XXX,XX @@
23
-#ifndef CONFIG_ATOMIC128
33
/* XXX: make safe guess about sizes */
24
+#if !HAVE_CMPXCHG128
34
#define MAX_OP_PER_INSTR 266
25
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
35
26
#else
36
-#if HOST_LONG_BITS == 32
27
int mem_idx = cpu_mmu_index(env, false);
37
-#define MAX_OPC_PARAM_PER_ARG 2
28
@@ -XXX,XX +XXX,XX @@ void HELPER(cdsg_parallel)(CPUS390XState *env, uint64_t addr,
38
-#else
29
static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
39
-#define MAX_OPC_PARAM_PER_ARG 1
30
uint64_t a2, bool parallel)
40
-#endif
41
-#define MAX_OPC_PARAM_IARGS 7
42
-#define MAX_OPC_PARAM_OARGS 1
43
-#define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS)
44
-
45
-/* A Call op needs up to 4 + 2N parameters on 32-bit archs,
46
- * and up to 4 + N parameters on 64-bit archs
47
- * (N = number of input arguments + output arguments). */
48
-#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
49
-
50
#define CPU_TEMP_BUF_NLONGS 128
51
#define TCG_STATIC_FRAME_SIZE (CPU_TEMP_BUF_NLONGS * sizeof(long))
52
53
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTempSet {
54
unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
55
} TCGTempSet;
56
57
-/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
58
- this imples a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
59
- There are never more than 2 outputs, which means that we can store all
60
- dead + sync data within 16 bits. */
61
-#define DEAD_ARG 4
62
-#define SYNC_ARG 1
63
-typedef uint16_t TCGLifeData;
64
+/*
65
+ * With 1 128-bit output, a 32-bit host requires 4 output parameters,
66
+ * which leaves a maximum of 28 other slots. Which is enough for 7
67
+ * 128-bit operands.
68
+ */
69
+#define DEAD_ARG (1 << 4)
70
+#define SYNC_ARG (1 << 0)
71
+typedef uint32_t TCGLifeData;
72
73
-/* The layout here is designed to avoid a bitfield crossing of
74
- a 32-bit boundary, which would cause GCC to add extra padding. */
75
typedef struct TCGOp {
76
- TCGOpcode opc : 8; /* 8 */
77
+ TCGOpcode opc : 8;
78
+ unsigned nargs : 8;
79
80
/* Parameters for this opcode. See below. */
81
- unsigned param1 : 4; /* 12 */
82
- unsigned param2 : 4; /* 16 */
83
+ unsigned param1 : 8;
84
+ unsigned param2 : 8;
85
86
/* Lifetime data of the operands. */
87
- unsigned life : 16; /* 32 */
88
+ TCGLifeData life;
89
90
/* Next and previous opcodes. */
91
QTAILQ_ENTRY(TCGOp) link;
92
93
- /* Arguments for the opcode. */
94
- TCGArg args[MAX_OPC_PARAM];
95
-
96
/* Register preferences for the output(s). */
97
TCGRegSet output_pref[2];
98
+
99
+ /* Arguments for the opcode. */
100
+ TCGArg args[];
101
} TCGOp;
102
103
#define TCGOP_CALLI(X) (X)->param1
104
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
105
index XXXXXXX..XXXXXXX 100644
106
--- a/accel/tcg/plugin-gen.c
107
+++ b/accel/tcg/plugin-gen.c
108
@@ -XXX,XX +XXX,XX @@ static TCGOp *rm_ops(TCGOp *op)
109
110
static TCGOp *copy_op_nocheck(TCGOp **begin_op, TCGOp *op)
31
{
111
{
32
-#if !defined(CONFIG_USER_ONLY) || defined(CONFIG_ATOMIC128)
112
- unsigned nargs = ARRAY_SIZE(op->args);
33
uint32_t mem_idx = cpu_mmu_index(env, false);
113
+ TCGOp *old_op = QTAILQ_NEXT(*begin_op, link);
34
-#endif
114
+ unsigned nargs = old_op->nargs;
35
uintptr_t ra = GETPC();
115
36
uint32_t fc = extract32(env->regs[0], 0, 8);
116
- *begin_op = QTAILQ_NEXT(*begin_op, link);
37
uint32_t sc = extract32(env->regs[0], 8, 8);
117
- tcg_debug_assert(*begin_op);
38
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
118
- op = tcg_op_insert_after(tcg_ctx, op, (*begin_op)->opc, nargs);
39
probe_write(env, a2, 0, mem_idx, ra);
119
- memcpy(op->args, (*begin_op)->args, sizeof(op->args));
40
#endif
120
+ *begin_op = old_op;
41
121
+ op = tcg_op_insert_after(tcg_ctx, op, old_op->opc, nargs);
42
- /* Note that the compare-and-swap is atomic, and the store is atomic, but
122
+ memcpy(op->args, old_op->args, sizeof(op->args[0]) * nargs);
43
- the complete operation is not. Therefore we do not need to assert serial
123
44
- context in order to implement this. That said, restart early if we can't
124
return op;
45
- support either operation that is supposed to be atomic. */
125
}
46
+ /*
126
diff --git a/tcg/tcg.c b/tcg/tcg.c
47
+ * Note that the compare-and-swap is atomic, and the store is atomic,
127
index XXXXXXX..XXXXXXX 100644
48
+ * but the complete operation is not. Therefore we do not need to
128
--- a/tcg/tcg.c
49
+ * assert serial context in order to implement this. That said,
129
+++ b/tcg/tcg.c
50
+ * restart early if we can't support either operation that is supposed
130
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
51
+ * to be atomic.
52
+ */
53
if (parallel) {
54
- int mask = 0;
55
-#if !defined(CONFIG_ATOMIC64)
56
- mask = -8;
57
-#elif !defined(CONFIG_ATOMIC128)
58
- mask = -16;
59
+ uint32_t max = 2;
60
+#ifdef CONFIG_ATOMIC64
61
+ max = 3;
62
#endif
63
- if (((4 << fc) | (1 << sc)) & mask) {
64
+ if ((HAVE_CMPXCHG128 ? 0 : fc + 2 > max) ||
65
+ (HAVE_ATOMIC128 ? 0 : sc > max)) {
66
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
67
}
131
}
68
}
132
}
69
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
133
70
Int128 cv = int128_make128(env->regs[r3 + 1], env->regs[r3]);
134
- max_args = ARRAY_SIZE(op->args);
71
Int128 ov;
135
+ /*
72
136
+ * A Call op needs up to 4 + 2N parameters on 32-bit archs,
73
- if (parallel) {
137
+ * and up to 4 + N parameters on 64-bit archs
74
-#ifdef CONFIG_ATOMIC128
138
+ * (N = number of input arguments + output arguments).
75
- TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
139
+ */
76
- ov = helper_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi, ra);
140
+ max_args = (64 / TCG_TARGET_REG_BITS) * nargs + 4;
77
- cc = !int128_eq(ov, cv);
141
op = tcg_emit_op(INDEX_op_call, max_args);
78
-#else
142
79
- /* Note that we asserted !parallel above. */
143
pi = 0;
80
- g_assert_not_reached();
144
@@ -XXX,XX +XXX,XX @@ void tcg_remove_ops_after(TCGOp *op)
81
-#endif
145
static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs)
82
- } else {
146
{
83
+ if (!parallel) {
147
TCGContext *s = tcg_ctx;
84
uint64_t oh = cpu_ldq_data_ra(env, a1 + 0, ra);
148
- TCGOp *op;
85
uint64_t ol = cpu_ldq_data_ra(env, a1 + 8, ra);
149
+ TCGOp *op = NULL;
86
150
87
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
151
- assert(nargs < ARRAY_SIZE(op->args));
88
152
- if (likely(QTAILQ_EMPTY(&s->free_ops))) {
89
cpu_stq_data_ra(env, a1 + 0, int128_gethi(nv), ra);
153
- op = tcg_malloc(sizeof(TCGOp));
90
cpu_stq_data_ra(env, a1 + 8, int128_getlo(nv), ra);
154
- } else {
91
+ } else if (HAVE_CMPXCHG128) {
155
- op = QTAILQ_FIRST(&s->free_ops);
92
+ TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
156
- QTAILQ_REMOVE(&s->free_ops, op, link);
93
+ ov = helper_atomic_cmpxchgo_be_mmu(env, a1, cv, nv, oi, ra);
157
+ if (unlikely(!QTAILQ_EMPTY(&s->free_ops))) {
94
+ cc = !int128_eq(ov, cv);
158
+ QTAILQ_FOREACH(op, &s->free_ops, link) {
95
+ } else {
159
+ if (nargs <= op->nargs) {
96
+ /* Note that we asserted !parallel above. */
160
+ QTAILQ_REMOVE(&s->free_ops, op, link);
97
+ g_assert_not_reached();
161
+ nargs = op->nargs;
98
}
162
+ goto found;
99
163
+ }
100
env->regs[r3 + 0] = int128_gethi(ov);
164
+ }
101
@@ -XXX,XX +XXX,XX @@ static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
102
cpu_stq_data_ra(env, a2, svh, ra);
103
break;
104
case 4:
105
- if (parallel) {
106
-#ifdef CONFIG_ATOMIC128
107
+ if (!parallel) {
108
+ cpu_stq_data_ra(env, a2 + 0, svh, ra);
109
+ cpu_stq_data_ra(env, a2 + 8, svl, ra);
110
+ } else if (HAVE_ATOMIC128) {
111
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
112
Int128 sv = int128_make128(svl, svh);
113
helper_atomic_sto_be_mmu(env, a2, sv, oi, ra);
114
-#else
115
+ } else {
116
/* Note that we asserted !parallel above. */
117
g_assert_not_reached();
118
-#endif
119
- } else {
120
- cpu_stq_data_ra(env, a2 + 0, svh, ra);
121
- cpu_stq_data_ra(env, a2 + 8, svl, ra);
122
}
123
break;
124
default:
125
@@ -XXX,XX +XXX,XX @@ static uint64_t do_lpq(CPUS390XState *env, uint64_t addr, bool parallel)
126
uintptr_t ra = GETPC();
127
uint64_t hi, lo;
128
129
- if (parallel) {
130
-#ifndef CONFIG_ATOMIC128
131
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
132
-#else
133
+ if (!parallel) {
134
+ check_alignment(env, addr, 16, ra);
135
+ hi = cpu_ldq_data_ra(env, addr + 0, ra);
136
+ lo = cpu_ldq_data_ra(env, addr + 8, ra);
137
+ } else if (HAVE_ATOMIC128) {
138
int mem_idx = cpu_mmu_index(env, false);
139
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
140
Int128 v = helper_atomic_ldo_be_mmu(env, addr, oi, ra);
141
hi = int128_gethi(v);
142
lo = int128_getlo(v);
143
-#endif
144
} else {
145
- check_alignment(env, addr, 16, ra);
146
-
147
- hi = cpu_ldq_data_ra(env, addr + 0, ra);
148
- lo = cpu_ldq_data_ra(env, addr + 8, ra);
149
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
150
}
165
}
151
166
+
152
env->retxl = lo;
167
+ /* Most opcodes have 3 or 4 operands: reduce fragmentation. */
153
@@ -XXX,XX +XXX,XX @@ static void do_stpq(CPUS390XState *env, uint64_t addr,
168
+ nargs = MAX(4, nargs);
154
{
169
+ op = tcg_malloc(sizeof(TCGOp) + sizeof(TCGArg) * nargs);
155
uintptr_t ra = GETPC();
170
+
156
171
+ found:
157
- if (parallel) {
172
memset(op, 0, offsetof(TCGOp, link));
158
-#ifndef CONFIG_ATOMIC128
173
op->opc = opc;
159
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
174
- s->nb_ops++;
160
-#else
175
+ op->nargs = nargs;
161
- int mem_idx = cpu_mmu_index(env, false);
176
162
- TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
177
+ /* Check for bitfield overflow. */
163
-
178
+ tcg_debug_assert(op->nargs == nargs);
164
- Int128 v = int128_make128(low, high);
179
+
165
- helper_atomic_sto_be_mmu(env, addr, v, oi, ra);
180
+ s->nb_ops++;
166
-#endif
181
return op;
167
- } else {
168
+ if (!parallel) {
169
check_alignment(env, addr, 16, ra);
170
-
171
cpu_stq_data_ra(env, addr + 0, high, ra);
172
cpu_stq_data_ra(env, addr + 8, low, ra);
173
+ } else if (HAVE_ATOMIC128) {
174
+ int mem_idx = cpu_mmu_index(env, false);
175
+ TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
176
+ Int128 v = int128_make128(low, high);
177
+ helper_atomic_sto_be_mmu(env, addr, v, oi, ra);
178
+ } else {
179
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
180
}
181
}
182
}
182
183
183
--
184
--
184
2.17.2
185
2.34.1
185
186
186
187
diff view generated by jsdifflib
1
Isolate the computation of an index from an address into a
1
We will shortly have the possibility of more than two outputs,
2
helper before we change that function.
2
though only for calls (for which preferences are moot). Avoid
3
direct references to op->output_pref[] when possible.
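
The accessor added in the include/tcg/tcg.h hunk below makes the bound
explicit, returning an empty preference set for output indexes beyond the
two that are tracked:

    static inline TCGRegSet output_pref(const TCGOp *op, unsigned i)
    {
        return i < ARRAY_SIZE(op->output_pref) ? op->output_pref[i] : 0;
    }
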
3
4
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
[ cota: convert tlb_vaddr_to_host; use atomic_read on addr_write ]
7
Signed-off-by: Emilio G. Cota <cota@braap.org>
8
Message-Id: <20181009175129.17888-2-cota@braap.org>
9
---
7
---
10
accel/tcg/softmmu_template.h | 64 +++++++++++++++++---------------
8
include/tcg/tcg.h | 5 +++++
11
include/exec/cpu_ldst.h | 19 ++++++++--
9
tcg/tcg.c | 34 ++++++++++++++++++----------------
12
include/exec/cpu_ldst_template.h | 25 +++++++------
10
2 files changed, 23 insertions(+), 16 deletions(-)
13
accel/tcg/cputlb.c | 60 ++++++++++++++----------------
14
4 files changed, 90 insertions(+), 78 deletions(-)
15
11
16
diff --git a/accel/tcg/softmmu_template.h b/accel/tcg/softmmu_template.h
12
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
17
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
18
--- a/accel/tcg/softmmu_template.h
14
--- a/include/tcg/tcg.h
19
+++ b/accel/tcg/softmmu_template.h
15
+++ b/include/tcg/tcg.h
20
@@ -XXX,XX +XXX,XX @@ static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
16
@@ -XXX,XX +XXX,XX @@ typedef struct TCGOp {
21
WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
17
/* Make sure operands fit in the bitfields above. */
22
TCGMemOpIdx oi, uintptr_t retaddr)
18
QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
23
{
19
24
- unsigned mmu_idx = get_mmuidx(oi);
20
+static inline TCGRegSet output_pref(const TCGOp *op, unsigned i)
25
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
21
+{
26
- target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
22
+ return i < ARRAY_SIZE(op->output_pref) ? op->output_pref[i] : 0;
27
+ uintptr_t mmu_idx = get_mmuidx(oi);
23
+}
28
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
24
+
29
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
25
typedef struct TCGProfile {
30
+ target_ulong tlb_addr = entry->ADDR_READ;
26
int64_t cpu_exec_time;
31
unsigned a_bits = get_alignment_bits(get_memop(oi));
27
int64_t tb_count1;
32
uintptr_t haddr;
28
diff --git a/tcg/tcg.c b/tcg/tcg.c
33
DATA_TYPE res;
29
index XXXXXXX..XXXXXXX 100644
34
@@ -XXX,XX +XXX,XX @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
30
--- a/tcg/tcg.c
35
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, READ_ACCESS_TYPE,
31
+++ b/tcg/tcg.c
36
mmu_idx, retaddr);
32
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
33
34
if (have_prefs) {
35
for (i = 0; i < nb_oargs; ++i) {
36
- TCGRegSet set = op->output_pref[i];
37
+ TCGRegSet set = output_pref(op, i);
38
39
if (i == 0) {
40
ne_fprintf(f, " pref=");
41
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
42
}
43
ts->state = TS_DEAD;
44
la_reset_pref(ts);
45
-
46
- /* Not used -- it will be tcg_target_call_oarg_regs[i]. */
47
- op->output_pref[i] = 0;
48
}
49
50
+ /* Not used -- it will be tcg_target_call_oarg_reg(). */
51
+ memset(op->output_pref, 0, sizeof(op->output_pref));
52
+
53
if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
54
TCG_CALL_NO_READ_GLOBALS))) {
55
la_global_kill(s, nb_globals);
56
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
57
ts = arg_temp(op->args[i]);
58
59
/* Remember the preference of the uses that followed. */
60
- op->output_pref[i] = *la_temp_pref(ts);
61
+ if (i < ARRAY_SIZE(op->output_pref)) {
62
+ op->output_pref[i] = *la_temp_pref(ts);
63
+ }
64
65
/* Output args are dead. */
66
if (ts->state & TS_DEAD) {
67
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
68
69
set &= ct->regs;
70
if (ct->ialias) {
71
- set &= op->output_pref[ct->alias_index];
72
+ set &= output_pref(op, ct->alias_index);
73
}
74
/* If the combination is not possible, restart. */
75
if (set == 0) {
76
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
77
TCGReg oreg, ireg;
78
79
allocated_regs = s->reserved_regs;
80
- preferred_regs = op->output_pref[0];
81
+ preferred_regs = output_pref(op, 0);
82
ots = arg_temp(op->args[0]);
83
ts = arg_temp(op->args[1]);
84
85
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
86
if (IS_DEAD_ARG(1)) {
87
temp_dead(s, its);
37
}
88
}
38
- tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
89
- tcg_reg_alloc_do_movi(s, ots, val, arg_life, op->output_pref[0]);
39
+ tlb_addr = entry->ADDR_READ;
90
+ tcg_reg_alloc_do_movi(s, ots, val, arg_life, output_pref(op, 0));
40
}
41
42
/* Handle an IO access. */
43
@@ -XXX,XX +XXX,XX @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
44
return res;
45
}
46
47
- haddr = addr + env->tlb_table[mmu_idx][index].addend;
48
+ haddr = addr + entry->addend;
49
#if DATA_SIZE == 1
50
res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
51
#else
52
@@ -XXX,XX +XXX,XX @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
53
WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
54
TCGMemOpIdx oi, uintptr_t retaddr)
55
{
56
- unsigned mmu_idx = get_mmuidx(oi);
57
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
58
- target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
59
+ uintptr_t mmu_idx = get_mmuidx(oi);
60
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
61
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
62
+ target_ulong tlb_addr = entry->ADDR_READ;
63
unsigned a_bits = get_alignment_bits(get_memop(oi));
64
uintptr_t haddr;
65
DATA_TYPE res;
66
@@ -XXX,XX +XXX,XX @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
67
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, READ_ACCESS_TYPE,
68
mmu_idx, retaddr);
69
}
70
- tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
71
+ tlb_addr = entry->ADDR_READ;
72
}
73
74
/* Handle an IO access. */
75
@@ -XXX,XX +XXX,XX @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
76
return res;
77
}
78
79
- haddr = addr + env->tlb_table[mmu_idx][index].addend;
80
+ haddr = addr + entry->addend;
81
res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
82
return res;
83
}
84
@@ -XXX,XX +XXX,XX @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
85
void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
86
TCGMemOpIdx oi, uintptr_t retaddr)
87
{
88
- unsigned mmu_idx = get_mmuidx(oi);
89
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
90
- target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
91
+ uintptr_t mmu_idx = get_mmuidx(oi);
92
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
93
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
94
+ target_ulong tlb_addr = entry->addr_write;
95
unsigned a_bits = get_alignment_bits(get_memop(oi));
96
uintptr_t haddr;
97
98
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
99
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
100
mmu_idx, retaddr);
101
}
102
- tlb_addr = env->tlb_table[mmu_idx][index].addr_write & ~TLB_INVALID_MASK;
103
+ tlb_addr = entry->addr_write & ~TLB_INVALID_MASK;
104
}
105
106
/* Handle an IO access. */
107
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
108
if (DATA_SIZE > 1
109
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
110
>= TARGET_PAGE_SIZE)) {
111
- int i, index2;
112
- target_ulong page2, tlb_addr2;
113
+ int i;
114
+ target_ulong page2;
115
+ CPUTLBEntry *entry2;
116
do_unaligned_access:
117
/* Ensure the second page is in the TLB. Note that the first page
118
is already guaranteed to be filled, and that the second page
119
cannot evict the first. */
120
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
121
- index2 = (page2 >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
122
- tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
123
- if (!tlb_hit_page(tlb_addr2, page2)
124
+ entry2 = tlb_entry(env, mmu_idx, page2);
125
+ if (!tlb_hit_page(entry2->addr_write, page2)
126
&& !VICTIM_TLB_HIT(addr_write, page2)) {
127
tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
128
mmu_idx, retaddr);
129
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
130
return;
91
return;
131
}
92
}
132
93
133
- haddr = addr + env->tlb_table[mmu_idx][index].addend;
94
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
134
+ haddr = addr + entry->addend;
95
tcg_regset_set_reg(allocated_regs, its->reg);
135
#if DATA_SIZE == 1
136
glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
137
#else
138
@@ -XXX,XX +XXX,XX @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
139
void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
140
TCGMemOpIdx oi, uintptr_t retaddr)
141
{
142
- unsigned mmu_idx = get_mmuidx(oi);
143
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
144
- target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
145
+ uintptr_t mmu_idx = get_mmuidx(oi);
146
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
147
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
148
+ target_ulong tlb_addr = entry->addr_write;
149
unsigned a_bits = get_alignment_bits(get_memop(oi));
150
uintptr_t haddr;
151
152
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
153
tlb_fill(ENV_GET_CPU(env), addr, DATA_SIZE, MMU_DATA_STORE,
154
mmu_idx, retaddr);
155
}
96
}
156
- tlb_addr = env->tlb_table[mmu_idx][index].addr_write & ~TLB_INVALID_MASK;
97
oreg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
157
+ tlb_addr = entry->addr_write & ~TLB_INVALID_MASK;
98
- op->output_pref[0], ots->indirect_base);
99
+ output_pref(op, 0), ots->indirect_base);
100
set_temp_val_reg(s, ots, oreg);
158
}
101
}
159
102
160
/* Handle an IO access. */
103
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
161
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
104
switch (arg_ct->pair) {
162
if (DATA_SIZE > 1
105
case 0: /* not paired */
163
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
106
if (arg_ct->ialias) {
164
>= TARGET_PAGE_SIZE)) {
107
- i_preferred_regs = op->output_pref[arg_ct->alias_index];
165
- int i, index2;
108
+ i_preferred_regs = output_pref(op, arg_ct->alias_index);
166
- target_ulong page2, tlb_addr2;
109
167
+ int i;
110
/*
168
+ target_ulong page2;
111
* If the input is not dead after the instruction,
169
+ CPUTLBEntry *entry2;
112
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
170
do_unaligned_access:
113
* and to identify a few cases where it's not required.
171
/* Ensure the second page is in the TLB. Note that the first page
114
*/
172
is already guaranteed to be filled, and that the second page
115
if (arg_ct->ialias) {
173
cannot evict the first. */
116
- i_preferred_regs = op->output_pref[arg_ct->alias_index];
174
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
117
+ i_preferred_regs = output_pref(op, arg_ct->alias_index);
175
- index2 = (page2 >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
118
if (IS_DEAD_ARG(i1) &&
176
- tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
119
IS_DEAD_ARG(i2) &&
177
- if (!tlb_hit_page(tlb_addr2, page2)
120
ts->val_type == TEMP_VAL_REG &&
178
+ entry2 = tlb_entry(env, mmu_idx, page2);
121
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
179
+ if (!tlb_hit_page(entry2->addr_write, page2)
122
180
&& !VICTIM_TLB_HIT(addr_write, page2)) {
123
case 3: /* ialias with second output, no first input */
181
tlb_fill(ENV_GET_CPU(env), page2, DATA_SIZE, MMU_DATA_STORE,
124
tcg_debug_assert(arg_ct->ialias);
182
mmu_idx, retaddr);
125
- i_preferred_regs = op->output_pref[arg_ct->alias_index];
183
@@ -XXX,XX +XXX,XX @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
126
+ i_preferred_regs = output_pref(op, arg_ct->alias_index);
184
return;
127
128
if (IS_DEAD_ARG(i) &&
129
ts->val_type == TEMP_VAL_REG &&
130
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
131
} else if (arg_ct->newreg) {
132
reg = tcg_reg_alloc(s, arg_ct->regs,
133
i_allocated_regs | o_allocated_regs,
134
- op->output_pref[k], ts->indirect_base);
135
+ output_pref(op, k), ts->indirect_base);
136
} else {
137
reg = tcg_reg_alloc(s, arg_ct->regs, o_allocated_regs,
138
- op->output_pref[k], ts->indirect_base);
139
+ output_pref(op, k), ts->indirect_base);
140
}
141
break;
142
143
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
144
break;
145
}
146
reg = tcg_reg_alloc_pair(s, arg_ct->regs, o_allocated_regs,
147
- op->output_pref[k], ts->indirect_base);
148
+ output_pref(op, k), ts->indirect_base);
149
break;
150
151
case 2: /* second of pair */
152
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
153
}
154
155
oreg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
156
- op->output_pref[0], ots->indirect_base);
157
+ output_pref(op, 0), ots->indirect_base);
158
set_temp_val_reg(s, ots, oreg);
185
}
159
}
186
160
187
- haddr = addr + env->tlb_table[mmu_idx][index].addend;
188
+ haddr = addr + entry->addend;
189
glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
190
}
191
#endif /* DATA_SIZE > 1 */
192
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
193
index XXXXXXX..XXXXXXX 100644
194
--- a/include/exec/cpu_ldst.h
195
+++ b/include/exec/cpu_ldst.h
196
@@ -XXX,XX +XXX,XX @@ extern __thread uintptr_t helper_retaddr;
197
/* The memory helpers for tcg-generated code need tcg_target_long etc. */
198
#include "tcg.h"
199
200
+/* Find the TLB index corresponding to the mmu_idx + address pair. */
201
+static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
202
+ target_ulong addr)
203
+{
204
+ return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
205
+}
206
+
207
+/* Find the TLB entry corresponding to the mmu_idx + address pair. */
208
+static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
209
+ target_ulong addr)
210
+{
211
+ return &env->tlb_table[mmu_idx][tlb_index(env, mmu_idx, addr)];
212
+}
213
+
214
#ifdef MMU_MODE0_SUFFIX
215
#define CPU_MMU_INDEX 0
216
#define MEMSUFFIX MMU_MODE0_SUFFIX
217
@@ -XXX,XX +XXX,XX @@ static inline void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
218
#if defined(CONFIG_USER_ONLY)
219
return g2h(addr);
220
#else
221
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
222
- CPUTLBEntry *tlbentry = &env->tlb_table[mmu_idx][index];
223
+ CPUTLBEntry *tlbentry = tlb_entry(env, mmu_idx, addr);
224
abi_ptr tlb_addr;
225
uintptr_t haddr;
226
227
@@ -XXX,XX +XXX,XX @@ static inline void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
228
return NULL;
229
}
230
231
- haddr = addr + env->tlb_table[mmu_idx][index].addend;
232
+ haddr = addr + tlbentry->addend;
233
return (void *)haddr;
234
#endif /* defined(CONFIG_USER_ONLY) */
235
}
236
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
237
index XXXXXXX..XXXXXXX 100644
238
--- a/include/exec/cpu_ldst_template.h
239
+++ b/include/exec/cpu_ldst_template.h
240
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
241
target_ulong ptr,
242
uintptr_t retaddr)
243
{
244
- int page_index;
245
+ CPUTLBEntry *entry;
246
RES_TYPE res;
247
target_ulong addr;
248
int mmu_idx;
249
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
250
#endif
251
252
addr = ptr;
253
- page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
254
mmu_idx = CPU_MMU_INDEX;
255
- if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
256
+ entry = tlb_entry(env, mmu_idx, addr);
257
+ if (unlikely(entry->ADDR_READ !=
258
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
259
oi = make_memop_idx(SHIFT, mmu_idx);
260
res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
261
oi, retaddr);
262
} else {
263
- uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
264
+ uintptr_t hostaddr = addr + entry->addend;
265
res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
266
}
267
return res;
268
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
269
target_ulong ptr,
270
uintptr_t retaddr)
271
{
272
- int res, page_index;
273
+ CPUTLBEntry *entry;
274
+ int res;
275
target_ulong addr;
276
int mmu_idx;
277
TCGMemOpIdx oi;
278
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
279
#endif
280
281
addr = ptr;
282
- page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
283
mmu_idx = CPU_MMU_INDEX;
284
- if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
285
+ entry = tlb_entry(env, mmu_idx, addr);
286
+ if (unlikely(entry->ADDR_READ !=
287
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
288
oi = make_memop_idx(SHIFT, mmu_idx);
289
res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
290
MMUSUFFIX)(env, addr, oi, retaddr);
291
} else {
292
- uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
293
+ uintptr_t hostaddr = addr + entry->addend;
294
res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr);
295
}
296
return res;
297
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
298
target_ulong ptr,
299
RES_TYPE v, uintptr_t retaddr)
300
{
301
- int page_index;
302
+ CPUTLBEntry *entry;
303
target_ulong addr;
304
int mmu_idx;
305
TCGMemOpIdx oi;
306
@@ -XXX,XX +XXX,XX @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
307
#endif
308
309
addr = ptr;
310
- page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
311
mmu_idx = CPU_MMU_INDEX;
312
- if (unlikely(env->tlb_table[mmu_idx][page_index].addr_write !=
313
+ entry = tlb_entry(env, mmu_idx, addr);
314
+ if (unlikely(entry->addr_write !=
315
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
316
oi = make_memop_idx(SHIFT, mmu_idx);
317
glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
318
retaddr);
319
} else {
320
- uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
321
+ uintptr_t hostaddr = addr + entry->addend;
322
glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v);
323
}
324
}
325
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
326
index XXXXXXX..XXXXXXX 100644
327
--- a/accel/tcg/cputlb.c
328
+++ b/accel/tcg/cputlb.c
329
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
330
{
331
CPUArchState *env = cpu->env_ptr;
332
target_ulong addr = (target_ulong) data.target_ptr;
333
- int i;
334
int mmu_idx;
335
336
assert_cpu_is_self(cpu);
337
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
338
}
339
340
addr &= TARGET_PAGE_MASK;
341
- i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
342
qemu_spin_lock(&env->tlb_lock);
343
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
344
- tlb_flush_entry_locked(&env->tlb_table[mmu_idx][i], addr);
345
+ tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
346
tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
347
}
348
qemu_spin_unlock(&env->tlb_lock);
349
@@ -XXX,XX +XXX,XX @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
350
target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
351
target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
352
unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
353
- int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
354
int mmu_idx;
355
356
assert_cpu_is_self(cpu);
357
358
- tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
359
- page, addr, mmu_idx_bitmap);
360
+ tlb_debug("flush page addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
361
+ addr, mmu_idx_bitmap);
362
363
qemu_spin_lock(&env->tlb_lock);
364
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
365
if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
366
- tlb_flush_entry_locked(&env->tlb_table[mmu_idx][page], addr);
367
+ tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
368
tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
369
}
370
}
371
@@ -XXX,XX +XXX,XX @@ static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
372
void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
373
{
374
CPUArchState *env = cpu->env_ptr;
375
- int i;
376
int mmu_idx;
377
378
assert_cpu_is_self(cpu);
379
380
vaddr &= TARGET_PAGE_MASK;
381
- i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
382
qemu_spin_lock(&env->tlb_lock);
383
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
384
- tlb_set_dirty1_locked(&env->tlb_table[mmu_idx][i], vaddr);
385
+ tlb_set_dirty1_locked(tlb_entry(env, mmu_idx, vaddr), vaddr);
386
}
387
388
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
389
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
390
iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
391
paddr_page, xlat, prot, &address);
392
393
- index = (vaddr_page >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
394
- te = &env->tlb_table[mmu_idx][index];
395
+ index = tlb_index(env, mmu_idx, vaddr_page);
396
+ te = tlb_entry(env, mmu_idx, vaddr_page);
397
398
/*
399
* Hold the TLB lock for the rest of the function. We could acquire/release
400
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
401
* repeat the MMU check here. This tlb_fill() call might
402
* longjump out if this access should cause a guest exception.
403
*/
404
- int index;
405
+ CPUTLBEntry *entry;
406
target_ulong tlb_addr;
407
408
tlb_fill(cpu, addr, size, MMU_DATA_LOAD, mmu_idx, retaddr);
409
410
- index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
411
- tlb_addr = env->tlb_table[mmu_idx][index].addr_read;
412
+ entry = tlb_entry(env, mmu_idx, addr);
413
+ tlb_addr = entry->addr_read;
414
if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
415
/* RAM access */
416
- uintptr_t haddr = addr + env->tlb_table[mmu_idx][index].addend;
417
+ uintptr_t haddr = addr + entry->addend;
418
419
return ldn_p((void *)haddr, size);
420
}
421
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
422
* repeat the MMU check here. This tlb_fill() call might
423
* longjump out if this access should cause a guest exception.
424
*/
425
- int index;
426
+ CPUTLBEntry *entry;
427
target_ulong tlb_addr;
428
429
tlb_fill(cpu, addr, size, MMU_DATA_STORE, mmu_idx, retaddr);
430
431
- index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
432
- tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
433
+ entry = tlb_entry(env, mmu_idx, addr);
434
+ tlb_addr = entry->addr_write;
435
if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
436
/* RAM access */
437
- uintptr_t haddr = addr + env->tlb_table[mmu_idx][index].addend;
438
+ uintptr_t haddr = addr + entry->addend;
439
440
stn_p((void *)haddr, size, val);
441
return;
442
@@ -XXX,XX +XXX,XX @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
443
*/
444
tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
445
{
446
- int mmu_idx, index;
447
+ uintptr_t mmu_idx = cpu_mmu_index(env, true);
448
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
449
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
450
void *p;
451
452
- index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
453
- mmu_idx = cpu_mmu_index(env, true);
454
- if (unlikely(!tlb_hit(env->tlb_table[mmu_idx][index].addr_code, addr))) {
455
+ if (unlikely(!tlb_hit(entry->addr_code, addr))) {
456
if (!VICTIM_TLB_HIT(addr_code, addr)) {
457
tlb_fill(ENV_GET_CPU(env), addr, 0, MMU_INST_FETCH, mmu_idx, 0);
458
}
459
- assert(tlb_hit(env->tlb_table[mmu_idx][index].addr_code, addr));
460
+ assert(tlb_hit(entry->addr_code, addr));
461
}
462
463
- if (unlikely(env->tlb_table[mmu_idx][index].addr_code &
464
- (TLB_RECHECK | TLB_MMIO))) {
465
+ if (unlikely(entry->addr_code & (TLB_RECHECK | TLB_MMIO))) {
466
/*
467
* Return -1 if we can't translate and execute from an entire
468
* page of RAM here, which will cause us to execute by loading
469
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
470
return -1;
471
}
472
473
- p = (void *)((uintptr_t)addr + env->tlb_table[mmu_idx][index].addend);
474
+ p = (void *)((uintptr_t)addr + entry->addend);
475
return qemu_ram_addr_from_host_nofail(p);
476
}
477
478
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
479
void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
480
uintptr_t retaddr)
481
{
482
- int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
483
- target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
484
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
485
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
486
487
- if (!tlb_hit(tlb_addr, addr)) {
488
+ if (!tlb_hit(entry->addr_write, addr)) {
489
/* TLB entry is for a different page */
490
if (!VICTIM_TLB_HIT(addr_write, addr)) {
491
tlb_fill(ENV_GET_CPU(env), addr, size, MMU_DATA_STORE,
492
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
493
NotDirtyInfo *ndi)
494
{
495
size_t mmu_idx = get_mmuidx(oi);
496
- size_t index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
497
- CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];
498
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
499
+ CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
500
target_ulong tlb_addr = tlbe->addr_write;
501
TCGMemOp mop = get_memop(oi);
502
int a_bits = get_alignment_bits(mop);
503
--
161
--
504
2.17.2
162
2.34.1
505
163
506
164
diff view generated by jsdifflib
New patch
1
Pre-compute the function call layout for each helper at startup.
2
Drop TCG_CALL_DUMMY_ARG, as we no longer need to leave gaps
3
in the op->args[] array. This allows several places to stop
4
checking for NULL TCGTemp, to which TCG_CALL_DUMMY_ARG mapped.
1
5
6
For tcg_gen_callN, loop over the arguments once. Allocate the TCGOp
7
for the call early but delay emitting it, collecting arguments first.
8
This allows the argument processing loop to emit code for extensions
9
and have them sequenced before the call.
10
11
For tcg_reg_alloc_call, loop over the arguments in reverse order,
12
which allows stack slots to be filled first naturally.
13
14
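To make the pre-computed layout idea concrete, here is a minimal standalone sketch. It is not the QEMU implementation: MAX_REG_ARGS, struct arg_loc and the printf "emitter" are purely illustrative. The layout is computed once per helper, and each call site only walks the cached array, in reverse, so stacked arguments are placed before register arguments.

    #include <stdio.h>

    /* Illustrative constants, not the real TCG values. */
    #define MAX_REG_ARGS 4          /* registers available for arguments */
    #define MAX_ARGS     8

    enum arg_kind { ARG_REG, ARG_STACK };

    struct arg_loc {
        enum arg_kind kind;
        int slot;                   /* register index or stack slot */
    };

    struct call_layout {
        int nargs;
        struct arg_loc in[MAX_ARGS];
    };

    /* Done once per helper at startup, so the per-call path never rescans types. */
    static void init_call_layout(struct call_layout *l, int nargs)
    {
        l->nargs = nargs;
        for (int i = 0; i < nargs; i++) {
            if (i < MAX_REG_ARGS) {
                l->in[i] = (struct arg_loc){ ARG_REG, i };
            } else {
                l->in[i] = (struct arg_loc){ ARG_STACK, i - MAX_REG_ARGS };
            }
        }
    }

    /* Per call: walk the cached layout in reverse so stack slots are filled first. */
    static void emit_call(const struct call_layout *l)
    {
        for (int i = l->nargs - 1; i >= 0; i--) {
            const struct arg_loc *loc = &l->in[i];
            if (loc->kind == ARG_STACK) {
                printf("store arg%d -> stack slot %d\n", i, loc->slot);
            } else {
                printf("move  arg%d -> reg %d\n", i, loc->slot);
            }
        }
        printf("call\n");
    }

    int main(void)
    {
        struct call_layout layout;
        init_call_layout(&layout, 6);   /* e.g. a 6-argument helper */
        emit_call(&layout);
        return 0;
    }

The reverse walk is the same design choice as in tcg_reg_alloc_call above: by the time register arguments are loaded, the stack stores are already done and cannot clobber anything.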
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
15
---
16
include/exec/helper-head.h | 2 +
17
include/tcg/tcg.h | 5 +-
18
tcg/tcg-internal.h | 22 +-
19
tcg/optimize.c | 6 +-
20
tcg/tcg.c | 609 ++++++++++++++++++++++---------------
21
5 files changed, 394 insertions(+), 250 deletions(-)
22
23
diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
24
index XXXXXXX..XXXXXXX 100644
25
--- a/include/exec/helper-head.h
26
+++ b/include/exec/helper-head.h
27
@@ -XXX,XX +XXX,XX @@
28
#define DEF_HELPER_7(name, ret, t1, t2, t3, t4, t5, t6, t7) \
29
DEF_HELPER_FLAGS_7(name, 0, ret, t1, t2, t3, t4, t5, t6, t7)
30
31
+/* MAX_CALL_IARGS must be set to n if last entry is DEF_HELPER_FLAGS_n. */
32
+
33
#endif /* EXEC_HELPER_HEAD_H */
34
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
35
index XXXXXXX..XXXXXXX 100644
36
--- a/include/tcg/tcg.h
37
+++ b/include/tcg/tcg.h
38
@@ -XXX,XX +XXX,XX @@
39
/* XXX: make safe guess about sizes */
40
#define MAX_OP_PER_INSTR 266
41
42
+#define MAX_CALL_IARGS 7
43
+
44
#define CPU_TEMP_BUF_NLONGS 128
45
#define TCG_STATIC_FRAME_SIZE (CPU_TEMP_BUF_NLONGS * sizeof(long))
46
47
@@ -XXX,XX +XXX,XX @@ typedef TCGv_ptr TCGv_env;
48
#define TCG_CALL_NO_RWG_SE (TCG_CALL_NO_RWG | TCG_CALL_NO_SE)
49
#define TCG_CALL_NO_WG_SE (TCG_CALL_NO_WG | TCG_CALL_NO_SE)
50
51
-/* Used to align parameters. See the comment before tcgv_i32_temp. */
52
-#define TCG_CALL_DUMMY_ARG ((TCGArg)0)
53
-
54
/*
55
* Flags for the bswap opcodes.
56
* If IZ, the input is zero-extended, otherwise unknown.
57
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
58
index XXXXXXX..XXXXXXX 100644
59
--- a/tcg/tcg-internal.h
60
+++ b/tcg/tcg-internal.h
61
@@ -XXX,XX +XXX,XX @@ typedef enum {
62
TCG_CALL_ARG_EXTEND_S, /* ... as a sign-extended i64 */
63
} TCGCallArgumentKind;
64
65
+typedef struct TCGCallArgumentLoc {
66
+ TCGCallArgumentKind kind : 8;
67
+ unsigned arg_slot : 8;
68
+ unsigned ref_slot : 8;
69
+ unsigned arg_idx : 4;
70
+ unsigned tmp_subindex : 2;
71
+} TCGCallArgumentLoc;
72
+
73
+/* Avoid "unsigned < 0 is always false" Werror, when iarg_regs is empty. */
74
+#define REG_P(L) \
75
+ ((int)(L)->arg_slot < (int)ARRAY_SIZE(tcg_target_call_iarg_regs))
76
+
77
typedef struct TCGHelperInfo {
78
void *func;
79
const char *name;
80
- unsigned flags;
81
- unsigned typemask;
82
+ unsigned typemask : 32;
83
+ unsigned flags : 8;
84
+ unsigned nr_in : 8;
85
+ unsigned nr_out : 8;
86
+ TCGCallReturnKind out_kind : 8;
87
+
88
+ /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
89
+ TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
90
} TCGHelperInfo;
91
92
extern TCGContext tcg_init_ctx;
93
diff --git a/tcg/optimize.c b/tcg/optimize.c
94
index XXXXXXX..XXXXXXX 100644
95
--- a/tcg/optimize.c
96
+++ b/tcg/optimize.c
97
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
98
{
99
for (int i = 0; i < nb_args; i++) {
100
TCGTemp *ts = arg_temp(op->args[i]);
101
- if (ts) {
102
- init_ts_info(ctx, ts);
103
- }
104
+ init_ts_info(ctx, ts);
105
}
106
}
107
108
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
109
110
for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
111
TCGTemp *ts = arg_temp(op->args[i]);
112
- if (ts && ts_is_copy(ts)) {
113
+ if (ts_is_copy(ts)) {
114
op->args[i] = temp_arg(find_better_copy(s, ts));
115
}
116
}
117
diff --git a/tcg/tcg.c b/tcg/tcg.c
118
index XXXXXXX..XXXXXXX 100644
119
--- a/tcg/tcg.c
120
+++ b/tcg/tcg.c
121
@@ -XXX,XX +XXX,XX @@ void tcg_pool_reset(TCGContext *s)
122
123
#include "exec/helper-proto.h"
124
125
-static const TCGHelperInfo all_helpers[] = {
126
+static TCGHelperInfo all_helpers[] = {
127
#include "exec/helper-tcg.h"
128
};
129
static GHashTable *helper_table;
130
@@ -XXX,XX +XXX,XX @@ static ffi_type * const typecode_to_ffi[8] = {
131
};
132
#endif
133
134
+typedef struct TCGCumulativeArgs {
135
+ int arg_idx; /* tcg_gen_callN args[] */
136
+ int info_in_idx; /* TCGHelperInfo in[] */
137
+ int arg_slot; /* regs+stack slot */
138
+ int ref_slot; /* stack slots for references */
139
+} TCGCumulativeArgs;
140
+
141
+static void layout_arg_even(TCGCumulativeArgs *cum)
142
+{
143
+ cum->arg_slot += cum->arg_slot & 1;
144
+}
145
+
146
+static void layout_arg_1(TCGCumulativeArgs *cum, TCGHelperInfo *info,
147
+ TCGCallArgumentKind kind)
148
+{
149
+ TCGCallArgumentLoc *loc = &info->in[cum->info_in_idx];
150
+
151
+ *loc = (TCGCallArgumentLoc){
152
+ .kind = kind,
153
+ .arg_idx = cum->arg_idx,
154
+ .arg_slot = cum->arg_slot,
155
+ };
156
+ cum->info_in_idx++;
157
+ cum->arg_slot++;
158
+}
159
+
160
+static void layout_arg_normal_n(TCGCumulativeArgs *cum,
161
+ TCGHelperInfo *info, int n)
162
+{
163
+ TCGCallArgumentLoc *loc = &info->in[cum->info_in_idx];
164
+
165
+ for (int i = 0; i < n; ++i) {
166
+ /* Layout all using the same arg_idx, adjusting the subindex. */
167
+ loc[i] = (TCGCallArgumentLoc){
168
+ .kind = TCG_CALL_ARG_NORMAL,
169
+ .arg_idx = cum->arg_idx,
170
+ .tmp_subindex = i,
171
+ .arg_slot = cum->arg_slot + i,
172
+ };
173
+ }
174
+ cum->info_in_idx += n;
175
+ cum->arg_slot += n;
176
+}
177
+
178
+static void init_call_layout(TCGHelperInfo *info)
179
+{
180
+ int max_reg_slots = ARRAY_SIZE(tcg_target_call_iarg_regs);
181
+ int max_stk_slots = TCG_STATIC_CALL_ARGS_SIZE / sizeof(tcg_target_long);
182
+ unsigned typemask = info->typemask;
183
+ unsigned typecode;
184
+ TCGCumulativeArgs cum = { };
185
+
186
+ /*
187
+ * Parse and place any function return value.
188
+ */
189
+ typecode = typemask & 7;
190
+ switch (typecode) {
191
+ case dh_typecode_void:
192
+ info->nr_out = 0;
193
+ break;
194
+ case dh_typecode_i32:
195
+ case dh_typecode_s32:
196
+ case dh_typecode_ptr:
197
+ info->nr_out = 1;
198
+ info->out_kind = TCG_CALL_RET_NORMAL;
199
+ break;
200
+ case dh_typecode_i64:
201
+ case dh_typecode_s64:
202
+ info->nr_out = 64 / TCG_TARGET_REG_BITS;
203
+ info->out_kind = TCG_CALL_RET_NORMAL;
204
+ break;
205
+ default:
206
+ g_assert_not_reached();
207
+ }
208
+ assert(info->nr_out <= ARRAY_SIZE(tcg_target_call_oarg_regs));
209
+
210
+ /*
211
+ * Parse and place function arguments.
212
+ */
213
+ for (typemask >>= 3; typemask; typemask >>= 3, cum.arg_idx++) {
214
+ TCGCallArgumentKind kind;
215
+ TCGType type;
216
+
217
+ typecode = typemask & 7;
218
+ switch (typecode) {
219
+ case dh_typecode_i32:
220
+ case dh_typecode_s32:
221
+ type = TCG_TYPE_I32;
222
+ break;
223
+ case dh_typecode_i64:
224
+ case dh_typecode_s64:
225
+ type = TCG_TYPE_I64;
226
+ break;
227
+ case dh_typecode_ptr:
228
+ type = TCG_TYPE_PTR;
229
+ break;
230
+ default:
231
+ g_assert_not_reached();
232
+ }
233
+
234
+ switch (type) {
235
+ case TCG_TYPE_I32:
236
+ switch (TCG_TARGET_CALL_ARG_I32) {
237
+ case TCG_CALL_ARG_EVEN:
238
+ layout_arg_even(&cum);
239
+ /* fall through */
240
+ case TCG_CALL_ARG_NORMAL:
241
+ layout_arg_1(&cum, info, TCG_CALL_ARG_NORMAL);
242
+ break;
243
+ case TCG_CALL_ARG_EXTEND:
244
+ kind = TCG_CALL_ARG_EXTEND_U + (typecode & 1);
245
+ layout_arg_1(&cum, info, kind);
246
+ break;
247
+ default:
248
+ qemu_build_not_reached();
249
+ }
250
+ break;
251
+
252
+ case TCG_TYPE_I64:
253
+ switch (TCG_TARGET_CALL_ARG_I64) {
254
+ case TCG_CALL_ARG_EVEN:
255
+ layout_arg_even(&cum);
256
+ /* fall through */
257
+ case TCG_CALL_ARG_NORMAL:
258
+ if (TCG_TARGET_REG_BITS == 32) {
259
+ layout_arg_normal_n(&cum, info, 2);
260
+ } else {
261
+ layout_arg_1(&cum, info, TCG_CALL_ARG_NORMAL);
262
+ }
263
+ break;
264
+ default:
265
+ qemu_build_not_reached();
266
+ }
267
+ break;
268
+
269
+ default:
270
+ g_assert_not_reached();
271
+ }
272
+ }
273
+ info->nr_in = cum.info_in_idx;
274
+
275
+ /* Validate that we didn't overrun the input array. */
276
+ assert(cum.info_in_idx <= ARRAY_SIZE(info->in));
277
+ /* Validate the backend has enough argument space. */
278
+ assert(cum.arg_slot <= max_reg_slots + max_stk_slots);
279
+ assert(cum.ref_slot <= max_stk_slots);
280
+}
281
+
282
static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
283
static void process_op_defs(TCGContext *s);
284
static TCGTemp *tcg_global_reg_new_internal(TCGContext *s, TCGType type,
285
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
286
helper_table = g_hash_table_new(NULL, NULL);
287
288
for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
289
+ init_call_layout(&all_helpers[i]);
290
g_hash_table_insert(helper_table, (gpointer)all_helpers[i].func,
291
(gpointer)&all_helpers[i]);
292
}
293
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
294
}
295
}
296
297
-/* Note: we convert the 64 bit args to 32 bit and do some alignment
298
- and endian swap. Maybe it would be better to do the alignment
299
- and endian swap in tcg_reg_alloc_call(). */
300
+static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs);
301
+
302
void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
303
{
304
- int i, real_args, nb_rets, pi, max_args;
305
- unsigned typemask;
306
const TCGHelperInfo *info;
307
+ TCGv_i64 extend_free[MAX_CALL_IARGS];
308
+ int n_extend = 0;
309
TCGOp *op;
310
+ int i, n, pi = 0, total_args;
311
312
info = g_hash_table_lookup(helper_table, (gpointer)func);
313
- typemask = info->typemask;
314
+ total_args = info->nr_out + info->nr_in + 2;
315
+ op = tcg_op_alloc(INDEX_op_call, total_args);
316
317
#ifdef CONFIG_PLUGIN
318
/* detect non-plugin helpers */
319
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
320
}
321
#endif
322
323
- if (TCG_TARGET_CALL_ARG_I32 == TCG_CALL_ARG_EXTEND) {
324
- for (i = 0; i < nargs; ++i) {
325
- int argtype = extract32(typemask, (i + 1) * 3, 3);
326
- bool is_32bit = (argtype & ~1) == dh_typecode_i32;
327
- bool is_signed = argtype & 1;
328
+ TCGOP_CALLO(op) = n = info->nr_out;
329
+ switch (n) {
330
+ case 0:
331
+ tcg_debug_assert(ret == NULL);
332
+ break;
333
+ case 1:
334
+ tcg_debug_assert(ret != NULL);
335
+ op->args[pi++] = temp_arg(ret);
336
+ break;
337
+ case 2:
338
+ tcg_debug_assert(ret != NULL);
339
+ tcg_debug_assert(ret->base_type == ret->type + 1);
340
+ tcg_debug_assert(ret->temp_subindex == 0);
341
+ op->args[pi++] = temp_arg(ret);
342
+ op->args[pi++] = temp_arg(ret + 1);
343
+ break;
344
+ default:
345
+ g_assert_not_reached();
346
+ }
347
348
- if (is_32bit) {
349
+ TCGOP_CALLI(op) = n = info->nr_in;
350
+ for (i = 0; i < n; i++) {
351
+ const TCGCallArgumentLoc *loc = &info->in[i];
352
+ TCGTemp *ts = args[loc->arg_idx] + loc->tmp_subindex;
353
+
354
+ switch (loc->kind) {
355
+ case TCG_CALL_ARG_NORMAL:
356
+ op->args[pi++] = temp_arg(ts);
357
+ break;
358
+
359
+ case TCG_CALL_ARG_EXTEND_U:
360
+ case TCG_CALL_ARG_EXTEND_S:
361
+ {
362
TCGv_i64 temp = tcg_temp_new_i64();
363
- TCGv_i32 orig = temp_tcgv_i32(args[i]);
364
- if (is_signed) {
365
+ TCGv_i32 orig = temp_tcgv_i32(ts);
366
+
367
+ if (loc->kind == TCG_CALL_ARG_EXTEND_S) {
368
tcg_gen_ext_i32_i64(temp, orig);
369
} else {
370
tcg_gen_extu_i32_i64(temp, orig);
371
}
372
- args[i] = tcgv_i64_temp(temp);
373
+ op->args[pi++] = tcgv_i64_arg(temp);
374
+ extend_free[n_extend++] = temp;
375
}
376
- }
377
- }
378
-
379
- /*
380
- * A Call op needs up to 4 + 2N parameters on 32-bit archs,
381
- * and up to 4 + N parameters on 64-bit archs
382
- * (N = number of input arguments + output arguments).
383
- */
384
- max_args = (64 / TCG_TARGET_REG_BITS) * nargs + 4;
385
- op = tcg_emit_op(INDEX_op_call, max_args);
386
-
387
- pi = 0;
388
- if (ret != NULL) {
389
- if (TCG_TARGET_REG_BITS < 64 && (typemask & 6) == dh_typecode_i64) {
390
- op->args[pi++] = temp_arg(ret);
391
- op->args[pi++] = temp_arg(ret + 1);
392
- nb_rets = 2;
393
- } else {
394
- op->args[pi++] = temp_arg(ret);
395
- nb_rets = 1;
396
- }
397
- } else {
398
- nb_rets = 0;
399
- }
400
- TCGOP_CALLO(op) = nb_rets;
401
-
402
- real_args = 0;
403
- for (i = 0; i < nargs; i++) {
404
- int argtype = extract32(typemask, (i + 1) * 3, 3);
405
- TCGCallArgumentKind kind;
406
- TCGType type;
407
-
408
- switch (argtype) {
409
- case dh_typecode_i32:
410
- case dh_typecode_s32:
411
- type = TCG_TYPE_I32;
412
break;
413
- case dh_typecode_i64:
414
- case dh_typecode_s64:
415
- type = TCG_TYPE_I64;
416
- break;
417
- case dh_typecode_ptr:
418
- type = TCG_TYPE_PTR;
419
- break;
420
- default:
421
- g_assert_not_reached();
422
- }
423
424
- switch (type) {
425
- case TCG_TYPE_I32:
426
- kind = TCG_TARGET_CALL_ARG_I32;
427
- break;
428
- case TCG_TYPE_I64:
429
- kind = TCG_TARGET_CALL_ARG_I64;
430
- break;
431
- default:
432
- g_assert_not_reached();
433
- }
434
-
435
- switch (kind) {
436
- case TCG_CALL_ARG_EVEN:
437
- if (real_args & 1) {
438
- op->args[pi++] = TCG_CALL_DUMMY_ARG;
439
- real_args++;
440
- }
441
- /* fall through */
442
- case TCG_CALL_ARG_NORMAL:
443
- if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
444
- op->args[pi++] = temp_arg(args[i]);
445
- op->args[pi++] = temp_arg(args[i] + 1);
446
- real_args += 2;
447
- break;
448
- }
449
- op->args[pi++] = temp_arg(args[i]);
450
- real_args++;
451
- break;
452
default:
453
g_assert_not_reached();
454
}
455
}
456
op->args[pi++] = (uintptr_t)func;
457
op->args[pi++] = (uintptr_t)info;
458
- TCGOP_CALLI(op) = real_args;
459
+ tcg_debug_assert(pi == total_args);
460
461
- /* Make sure the fields didn't overflow. */
462
- tcg_debug_assert(TCGOP_CALLI(op) == real_args);
463
- tcg_debug_assert(pi <= max_args);
464
+ QTAILQ_INSERT_TAIL(&tcg_ctx->ops, op, link);
465
466
- if (TCG_TARGET_CALL_ARG_I32 == TCG_CALL_ARG_EXTEND) {
467
- for (i = 0; i < nargs; ++i) {
468
- int argtype = extract32(typemask, (i + 1) * 3, 3);
469
- bool is_32bit = (argtype & ~1) == dh_typecode_i32;
470
-
471
- if (is_32bit) {
472
- tcg_temp_free_internal(args[i]);
473
- }
474
- }
475
+ tcg_debug_assert(n_extend < ARRAY_SIZE(extend_free));
476
+ for (i = 0; i < n_extend; ++i) {
477
+ tcg_temp_free_i64(extend_free[i]);
478
}
479
}
480
481
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
482
}
483
for (i = 0; i < nb_iargs; i++) {
484
TCGArg arg = op->args[nb_oargs + i];
485
- const char *t = "<dummy>";
486
- if (arg != TCG_CALL_DUMMY_ARG) {
487
- t = tcg_get_arg_str(s, buf, sizeof(buf), arg);
488
- }
489
+ const char *t = tcg_get_arg_str(s, buf, sizeof(buf), arg);
490
col += ne_fprintf(f, ",%s", t);
491
}
492
} else {
493
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
494
switch (opc) {
495
case INDEX_op_call:
496
{
497
- int call_flags;
498
- int nb_call_regs;
499
+ const TCGHelperInfo *info = tcg_call_info(op);
500
+ int call_flags = tcg_call_flags(op);
501
502
nb_oargs = TCGOP_CALLO(op);
503
nb_iargs = TCGOP_CALLI(op);
504
- call_flags = tcg_call_flags(op);
505
506
/* pure functions can be removed if their result is unused */
507
if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
508
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
509
/* Record arguments that die in this helper. */
510
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
511
ts = arg_temp(op->args[i]);
512
- if (ts && ts->state & TS_DEAD) {
513
+ if (ts->state & TS_DEAD) {
514
arg_life |= DEAD_ARG << i;
515
}
516
}
517
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
518
/* For all live registers, remove call-clobbered prefs. */
519
la_cross_call(s, nb_temps);
520
521
- nb_call_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
522
+ /*
523
+ * Input arguments are live for preceding opcodes.
524
+ *
525
+ * For those arguments that die, and will be allocated in
526
+ * registers, clear the register set for that arg, to be
527
+ * filled in below. For args that will be on the stack,
528
+ * reset to any available reg. Process arguments in reverse
529
+ * order so that if a temp is used more than once, the stack
530
+ * reset to max happens before the register reset to 0.
531
+ */
532
+ for (i = nb_iargs - 1; i >= 0; i--) {
533
+ const TCGCallArgumentLoc *loc = &info->in[i];
534
+ ts = arg_temp(op->args[nb_oargs + i]);
535
536
- /* Input arguments are live for preceding opcodes. */
537
- for (i = 0; i < nb_iargs; i++) {
538
- ts = arg_temp(op->args[i + nb_oargs]);
539
- if (ts && ts->state & TS_DEAD) {
540
- /* For those arguments that die, and will be allocated
541
- * in registers, clear the register set for that arg,
542
- * to be filled in below. For args that will be on
543
- * the stack, reset to any available reg.
544
- */
545
- *la_temp_pref(ts)
546
- = (i < nb_call_regs ? 0 :
547
- tcg_target_available_regs[ts->type]);
548
+ if (ts->state & TS_DEAD) {
549
+ switch (loc->kind) {
550
+ case TCG_CALL_ARG_NORMAL:
551
+ case TCG_CALL_ARG_EXTEND_U:
552
+ case TCG_CALL_ARG_EXTEND_S:
553
+ if (REG_P(loc)) {
554
+ *la_temp_pref(ts) = 0;
555
+ break;
556
+ }
557
+ /* fall through */
558
+ default:
559
+ *la_temp_pref(ts) =
560
+ tcg_target_available_regs[ts->type];
561
+ break;
562
+ }
563
ts->state &= ~TS_DEAD;
564
}
565
}
566
567
- /* For each input argument, add its input register to prefs.
568
- If a temp is used once, this produces a single set bit. */
569
- for (i = 0; i < MIN(nb_call_regs, nb_iargs); i++) {
570
- ts = arg_temp(op->args[i + nb_oargs]);
571
- if (ts) {
572
- tcg_regset_set_reg(*la_temp_pref(ts),
573
- tcg_target_call_iarg_regs[i]);
574
+ /*
575
+ * For each input argument, add its input register to prefs.
576
+ * If a temp is used once, this produces a single set bit;
577
+ * if a temp is used multiple times, this produces a set.
578
+ */
579
+ for (i = 0; i < nb_iargs; i++) {
580
+ const TCGCallArgumentLoc *loc = &info->in[i];
581
+ ts = arg_temp(op->args[nb_oargs + i]);
582
+
583
+ switch (loc->kind) {
584
+ case TCG_CALL_ARG_NORMAL:
585
+ case TCG_CALL_ARG_EXTEND_U:
586
+ case TCG_CALL_ARG_EXTEND_S:
587
+ if (REG_P(loc)) {
588
+ tcg_regset_set_reg(*la_temp_pref(ts),
589
+ tcg_target_call_iarg_regs[loc->arg_slot]);
590
+ }
591
+ break;
592
+ default:
593
+ break;
594
}
595
}
596
}
597
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
598
/* Make sure that input arguments are available. */
599
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
600
arg_ts = arg_temp(op->args[i]);
601
- if (arg_ts) {
602
- dir_ts = arg_ts->state_ptr;
603
- if (dir_ts && arg_ts->state == TS_DEAD) {
604
- TCGOpcode lopc = (arg_ts->type == TCG_TYPE_I32
605
- ? INDEX_op_ld_i32
606
- : INDEX_op_ld_i64);
607
- TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3);
608
+ dir_ts = arg_ts->state_ptr;
609
+ if (dir_ts && arg_ts->state == TS_DEAD) {
610
+ TCGOpcode lopc = (arg_ts->type == TCG_TYPE_I32
611
+ ? INDEX_op_ld_i32
612
+ : INDEX_op_ld_i64);
613
+ TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3);
614
615
- lop->args[0] = temp_arg(dir_ts);
616
- lop->args[1] = temp_arg(arg_ts->mem_base);
617
- lop->args[2] = arg_ts->mem_offset;
618
+ lop->args[0] = temp_arg(dir_ts);
619
+ lop->args[1] = temp_arg(arg_ts->mem_base);
620
+ lop->args[2] = arg_ts->mem_offset;
621
622
- /* Loaded, but synced with memory. */
623
- arg_ts->state = TS_MEM;
624
- }
625
+ /* Loaded, but synced with memory. */
626
+ arg_ts->state = TS_MEM;
627
}
628
}
629
630
@@ -XXX,XX +XXX,XX @@ static bool liveness_pass_2(TCGContext *s)
631
so that we reload when needed. */
632
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
633
arg_ts = arg_temp(op->args[i]);
634
- if (arg_ts) {
635
- dir_ts = arg_ts->state_ptr;
636
- if (dir_ts) {
637
- op->args[i] = temp_arg(dir_ts);
638
- changes = true;
639
- if (IS_DEAD_ARG(i)) {
640
- arg_ts->state = TS_DEAD;
641
- }
642
+ dir_ts = arg_ts->state_ptr;
643
+ if (dir_ts) {
644
+ op->args[i] = temp_arg(dir_ts);
645
+ changes = true;
646
+ if (IS_DEAD_ARG(i)) {
647
+ arg_ts->state = TS_DEAD;
648
}
649
}
650
}
651
@@ -XXX,XX +XXX,XX @@ static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
652
return true;
653
}
654
655
+static void load_arg_reg(TCGContext *s, TCGReg reg, TCGTemp *ts,
656
+ TCGRegSet allocated_regs)
657
+{
658
+ if (ts->val_type == TEMP_VAL_REG) {
659
+ if (ts->reg != reg) {
660
+ tcg_reg_free(s, reg, allocated_regs);
661
+ if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
662
+ /*
663
+ * Cross register class move not supported. Sync the
664
+ * temp back to its slot and load from there.
665
+ */
666
+ temp_sync(s, ts, allocated_regs, 0, 0);
667
+ tcg_out_ld(s, ts->type, reg,
668
+ ts->mem_base->reg, ts->mem_offset);
669
+ }
670
+ }
671
+ } else {
672
+ TCGRegSet arg_set = 0;
673
+
674
+ tcg_reg_free(s, reg, allocated_regs);
675
+ tcg_regset_set_reg(arg_set, reg);
676
+ temp_load(s, ts, arg_set, allocated_regs, 0);
677
+ }
678
+}
679
+
680
+static void load_arg_stk(TCGContext *s, int stk_slot, TCGTemp *ts,
681
+ TCGRegSet allocated_regs)
682
+{
683
+ /*
684
+ * When the destination is on the stack, load up the temp and store.
685
+ * If there are many call-saved registers, the temp might live to
686
+ * see another use; otherwise it'll be discarded.
687
+ */
688
+ temp_load(s, ts, tcg_target_available_regs[ts->type], allocated_regs, 0);
689
+ tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK,
690
+ TCG_TARGET_CALL_STACK_OFFSET +
691
+ stk_slot * sizeof(tcg_target_long));
692
+}
693
+
694
+static void load_arg_normal(TCGContext *s, const TCGCallArgumentLoc *l,
695
+ TCGTemp *ts, TCGRegSet *allocated_regs)
696
+{
697
+ if (REG_P(l)) {
698
+ TCGReg reg = tcg_target_call_iarg_regs[l->arg_slot];
699
+ load_arg_reg(s, reg, ts, *allocated_regs);
700
+ tcg_regset_set_reg(*allocated_regs, reg);
701
+ } else {
702
+ load_arg_stk(s, l->arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs),
703
+ ts, *allocated_regs);
704
+ }
705
+}
706
+
707
static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
708
{
709
const int nb_oargs = TCGOP_CALLO(op);
710
const int nb_iargs = TCGOP_CALLI(op);
711
const TCGLifeData arg_life = op->life;
712
- const TCGHelperInfo *info;
713
- int flags, nb_regs, i;
714
- TCGReg reg;
715
- TCGArg arg;
716
- TCGTemp *ts;
717
- intptr_t stack_offset;
718
- size_t call_stack_size;
719
- tcg_insn_unit *func_addr;
720
- int allocate_args;
721
- TCGRegSet allocated_regs;
722
+ const TCGHelperInfo *info = tcg_call_info(op);
723
+ TCGRegSet allocated_regs = s->reserved_regs;
724
+ int i;
725
726
- func_addr = tcg_call_func(op);
727
- info = tcg_call_info(op);
728
- flags = info->flags;
729
+ /*
730
+ * Move inputs into place in reverse order,
731
+ * so that we place stacked arguments first.
732
+ */
733
+ for (i = nb_iargs - 1; i >= 0; --i) {
734
+ const TCGCallArgumentLoc *loc = &info->in[i];
735
+ TCGTemp *ts = arg_temp(op->args[nb_oargs + i]);
736
737
- nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
738
- if (nb_regs > nb_iargs) {
739
- nb_regs = nb_iargs;
740
- }
741
-
742
- /* assign stack slots first */
743
- call_stack_size = (nb_iargs - nb_regs) * sizeof(tcg_target_long);
744
- call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) &
745
- ~(TCG_TARGET_STACK_ALIGN - 1);
746
- allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE);
747
- if (allocate_args) {
748
- /* XXX: if more than TCG_STATIC_CALL_ARGS_SIZE is needed,
749
- preallocate call stack */
750
- tcg_abort();
751
- }
752
-
753
- stack_offset = TCG_TARGET_CALL_STACK_OFFSET;
754
- for (i = nb_regs; i < nb_iargs; i++) {
755
- arg = op->args[nb_oargs + i];
756
- if (arg != TCG_CALL_DUMMY_ARG) {
757
- ts = arg_temp(arg);
758
- temp_load(s, ts, tcg_target_available_regs[ts->type],
759
- s->reserved_regs, 0);
760
- tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset);
761
- }
762
- stack_offset += sizeof(tcg_target_long);
763
- }
764
-
765
- /* assign input registers */
766
- allocated_regs = s->reserved_regs;
767
- for (i = 0; i < nb_regs; i++) {
768
- arg = op->args[nb_oargs + i];
769
- if (arg != TCG_CALL_DUMMY_ARG) {
770
- ts = arg_temp(arg);
771
- reg = tcg_target_call_iarg_regs[i];
772
-
773
- if (ts->val_type == TEMP_VAL_REG) {
774
- if (ts->reg != reg) {
775
- tcg_reg_free(s, reg, allocated_regs);
776
- if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
777
- /*
778
- * Cross register class move not supported. Sync the
779
- * temp back to its slot and load from there.
780
- */
781
- temp_sync(s, ts, allocated_regs, 0, 0);
782
- tcg_out_ld(s, ts->type, reg,
783
- ts->mem_base->reg, ts->mem_offset);
784
- }
785
- }
786
- } else {
787
- TCGRegSet arg_set = 0;
788
-
789
- tcg_reg_free(s, reg, allocated_regs);
790
- tcg_regset_set_reg(arg_set, reg);
791
- temp_load(s, ts, arg_set, allocated_regs, 0);
792
- }
793
-
794
- tcg_regset_set_reg(allocated_regs, reg);
795
+ switch (loc->kind) {
796
+ case TCG_CALL_ARG_NORMAL:
797
+ case TCG_CALL_ARG_EXTEND_U:
798
+ case TCG_CALL_ARG_EXTEND_S:
799
+ load_arg_normal(s, loc, ts, &allocated_regs);
800
+ break;
801
+ default:
802
+ g_assert_not_reached();
803
}
804
}
805
806
- /* mark dead temporaries and free the associated registers */
807
+ /* Mark dead temporaries and free the associated registers. */
808
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
809
if (IS_DEAD_ARG(i)) {
810
temp_dead(s, arg_temp(op->args[i]));
811
}
812
}
813
814
- /* clobber call registers */
815
+ /* Clobber call registers. */
816
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
817
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
818
tcg_reg_free(s, i, allocated_regs);
819
}
820
}
821
822
- /* Save globals if they might be written by the helper, sync them if
823
- they might be read. */
824
- if (flags & TCG_CALL_NO_READ_GLOBALS) {
825
+ /*
826
+ * Save globals if they might be written by the helper,
827
+ * sync them if they might be read.
828
+ */
829
+ if (info->flags & TCG_CALL_NO_READ_GLOBALS) {
830
/* Nothing to do */
831
- } else if (flags & TCG_CALL_NO_WRITE_GLOBALS) {
832
+ } else if (info->flags & TCG_CALL_NO_WRITE_GLOBALS) {
833
sync_globals(s, allocated_regs);
834
} else {
835
save_globals(s, allocated_regs);
836
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
837
gpointer hash = (gpointer)(uintptr_t)info->typemask;
838
ffi_cif *cif = g_hash_table_lookup(ffi_table, hash);
839
assert(cif != NULL);
840
- tcg_out_call(s, func_addr, cif);
841
+ tcg_out_call(s, tcg_call_func(op), cif);
842
}
843
#else
844
- tcg_out_call(s, func_addr);
845
+ tcg_out_call(s, tcg_call_func(op));
846
#endif
847
848
- /* assign output registers and emit moves if needed */
849
- for(i = 0; i < nb_oargs; i++) {
850
- arg = op->args[i];
851
- ts = arg_temp(arg);
852
+ /* Assign output registers and emit moves if needed. */
853
+ switch (info->out_kind) {
854
+ case TCG_CALL_RET_NORMAL:
855
+ for (i = 0; i < nb_oargs; i++) {
856
+ TCGTemp *ts = arg_temp(op->args[i]);
857
+ TCGReg reg = tcg_target_call_oarg_regs[i];
858
859
- /* ENV should not be modified. */
860
- tcg_debug_assert(!temp_readonly(ts));
861
+ /* ENV should not be modified. */
862
+ tcg_debug_assert(!temp_readonly(ts));
863
864
- reg = tcg_target_call_oarg_regs[i];
865
- set_temp_val_reg(s, ts, reg);
866
- ts->mem_coherent = 0;
867
+ set_temp_val_reg(s, ts, reg);
868
+ ts->mem_coherent = 0;
869
+ }
870
+ break;
871
+ default:
872
+ g_assert_not_reached();
873
+ }
874
+
875
+ /* Flush or discard output registers as needed. */
876
+ for (i = 0; i < nb_oargs; i++) {
877
+ TCGTemp *ts = arg_temp(op->args[i]);
878
if (NEED_SYNC_ARG(i)) {
879
- temp_sync(s, ts, allocated_regs, 0, IS_DEAD_ARG(i));
880
+ temp_sync(s, ts, s->reserved_regs, 0, IS_DEAD_ARG(i));
881
} else if (IS_DEAD_ARG(i)) {
882
temp_dead(s, ts);
883
}
884
--
885
2.34.1
diff view generated by jsdifflib
1
From: "Emilio G. Cota" <cota@braap.org>
1
From: Philippe Mathieu-Daudé <philmd@linaro.org>
2
2
3
When we implemented per-vCPU TCG contexts, we forgot to also
3
In the unlikely case of invalid typecode mask, the function
4
distribute the tcg_time counter, which has remained as a global
4
will abort instead of returning a NULL pointer.
5
accessed without any serialization, leading to potentially missed
6
counts.
7
5
8
Fix it by distributing the field over the TCG contexts, embedding
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
9
it into TCGProfile with a field called "cpu_exec_time", which is more
7
Message-Id: <20221111074101.2069454-27-richard.henderson@linaro.org>
10
descriptive than "tcg_time". Add a function to query this value
8
[PMD: Split from bigger patch]
11
directly, and for completeness, fill in the field in
9
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
12
tcg_profile_snapshot, even though its callers do not use it.
10
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
11
Message-Id: <20221122180804.938-2-philmd@linaro.org>
12
---
13
tcg/tcg.c | 30 ++++++++++++++++++++----------
14
1 file changed, 20 insertions(+), 10 deletions(-)
13
15
14
Signed-off-by: Emilio G. Cota <cota@braap.org>
15
Message-Id: <20181010144853.13005-5-cota@braap.org>
16
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
17
---
18
include/qemu/timer.h | 1 -
19
tcg/tcg.h | 2 ++
20
cpus.c | 3 ++-
21
monitor.c | 13 ++++++++++---
22
tcg/tcg.c | 23 +++++++++++++++++++++++
23
5 files changed, 37 insertions(+), 5 deletions(-)
24
25
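As a standalone illustration of the array-to-function conversion described in the new commit message above: the typecodes and typecode_name() below are made up for the example, while QEMU's version returns ffi_type pointers and uses g_assert_not_reached(). The point is the same, though: an out-of-range typecode aborts at the point of the bad input instead of yielding a NULL table entry that only fails later.

    #include <stdio.h>
    #include <stdlib.h>

    enum { TC_VOID, TC_I32, TC_S32, TC_I64, TC_S64, TC_PTR };

    /* Switch-based mapping: invalid input aborts immediately. */
    static const char *typecode_name(int typecode)
    {
        switch (typecode) {
        case TC_VOID: return "void";
        case TC_I32:  return "uint32";
        case TC_S32:  return "int32";
        case TC_I64:  return "uint64";
        case TC_S64:  return "int64";
        case TC_PTR:  return "pointer";
        default:      abort();   /* stand-in for g_assert_not_reached() */
        }
    }

    int main(void)
    {
        printf("%s\n", typecode_name(TC_S64));   /* prints "int64" */
        return 0;
    }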
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
26
index XXXXXXX..XXXXXXX 100644
27
--- a/include/qemu/timer.h
28
+++ b/include/qemu/timer.h
29
@@ -XXX,XX +XXX,XX @@ static inline int64_t profile_getclock(void)
30
return get_clock();
31
}
32
33
-extern int64_t tcg_time;
34
extern int64_t dev_time;
35
#endif
36
37
diff --git a/tcg/tcg.h b/tcg/tcg.h
38
index XXXXXXX..XXXXXXX 100644
39
--- a/tcg/tcg.h
40
+++ b/tcg/tcg.h
41
@@ -XXX,XX +XXX,XX @@ typedef struct TCGOp {
42
QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
43
44
typedef struct TCGProfile {
45
+ int64_t cpu_exec_time;
46
int64_t tb_count1;
47
int64_t tb_count;
48
int64_t op_count; /* total insn count */
49
@@ -XXX,XX +XXX,XX @@ int tcg_check_temp_count(void);
50
#define tcg_check_temp_count() 0
51
#endif
52
53
+int64_t tcg_cpu_exec_time(void);
54
void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf);
55
void tcg_dump_op_count(FILE *f, fprintf_function cpu_fprintf);
56
57
diff --git a/cpus.c b/cpus.c
58
index XXXXXXX..XXXXXXX 100644
59
--- a/cpus.c
60
+++ b/cpus.c
61
@@ -XXX,XX +XXX,XX @@ static int tcg_cpu_exec(CPUState *cpu)
62
ret = cpu_exec(cpu);
63
cpu_exec_end(cpu);
64
#ifdef CONFIG_PROFILER
65
- tcg_time += profile_getclock() - ti;
66
+ atomic_set(&tcg_ctx->prof.cpu_exec_time,
67
+ tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
68
#endif
69
return ret;
70
}
71
diff --git a/monitor.c b/monitor.c
72
index XXXXXXX..XXXXXXX 100644
73
--- a/monitor.c
74
+++ b/monitor.c
75
@@ -XXX,XX +XXX,XX @@
76
#include "sysemu/cpus.h"
77
#include "sysemu/iothread.h"
78
#include "qemu/cutils.h"
79
+#include "tcg/tcg.h"
80
81
#if defined(TARGET_S390X)
82
#include "hw/s390x/storage-keys.h"
83
@@ -XXX,XX +XXX,XX @@ static void hmp_info_numa(Monitor *mon, const QDict *qdict)
84
85
#ifdef CONFIG_PROFILER
86
87
-int64_t tcg_time;
88
int64_t dev_time;
89
90
static void hmp_info_profile(Monitor *mon, const QDict *qdict)
91
{
92
+ static int64_t last_cpu_exec_time;
93
+ int64_t cpu_exec_time;
94
+ int64_t delta;
95
+
96
+ cpu_exec_time = tcg_cpu_exec_time();
97
+ delta = cpu_exec_time - last_cpu_exec_time;
98
+
99
monitor_printf(mon, "async time %" PRId64 " (%0.3f)\n",
100
dev_time, dev_time / (double)NANOSECONDS_PER_SECOND);
101
monitor_printf(mon, "qemu time %" PRId64 " (%0.3f)\n",
102
- tcg_time, tcg_time / (double)NANOSECONDS_PER_SECOND);
103
- tcg_time = 0;
104
+ delta, delta / (double)NANOSECONDS_PER_SECOND);
105
+ last_cpu_exec_time = cpu_exec_time;
106
dev_time = 0;
107
}
108
#else
109
diff --git a/tcg/tcg.c b/tcg/tcg.c
16
diff --git a/tcg/tcg.c b/tcg/tcg.c
110
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
111
--- a/tcg/tcg.c
18
--- a/tcg/tcg.c
112
+++ b/tcg/tcg.c
19
+++ b/tcg/tcg.c
113
@@ -XXX,XX +XXX,XX @@
20
@@ -XXX,XX +XXX,XX @@ static GHashTable *helper_table;
114
/* Define to jump the ELF file used to communicate with GDB. */
21
#ifdef CONFIG_TCG_INTERPRETER
115
#undef DEBUG_JIT
22
static GHashTable *ffi_table;
116
23
117
+#include "qemu/error-report.h"
24
-static ffi_type * const typecode_to_ffi[8] = {
118
#include "qemu/cutils.h"
25
- [dh_typecode_void] = &ffi_type_void,
119
#include "qemu/host-utils.h"
26
- [dh_typecode_i32] = &ffi_type_uint32,
120
#include "qemu/timer.h"
27
- [dh_typecode_s32] = &ffi_type_sint32,
121
@@ -XXX,XX +XXX,XX @@ void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
28
- [dh_typecode_i64] = &ffi_type_uint64,
122
const TCGProfile *orig = &s->prof;
29
- [dh_typecode_s64] = &ffi_type_sint64,
123
30
- [dh_typecode_ptr] = &ffi_type_pointer,
124
if (counters) {
31
-};
125
+ PROF_ADD(prof, orig, cpu_exec_time);
32
+static ffi_type *typecode_to_ffi(int argmask)
126
PROF_ADD(prof, orig, tb_count1);
127
PROF_ADD(prof, orig, tb_count);
128
PROF_ADD(prof, orig, op_count);
129
@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(FILE *f, fprintf_function cpu_fprintf)
130
prof.table_op_count[i]);
131
}
132
}
133
+
134
+int64_t tcg_cpu_exec_time(void)
135
+{
33
+{
136
+ unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
34
+ switch (argmask) {
137
+ unsigned int i;
35
+ case dh_typecode_void:
138
+ int64_t ret = 0;
36
+ return &ffi_type_void;
139
+
37
+ case dh_typecode_i32:
140
+ for (i = 0; i < n_ctxs; i++) {
38
+ return &ffi_type_uint32;
141
+ const TCGContext *s = atomic_read(&tcg_ctxs[i]);
39
+ case dh_typecode_s32:
142
+ const TCGProfile *prof = &s->prof;
40
+ return &ffi_type_sint32;
143
+
41
+ case dh_typecode_i64:
144
+ ret += atomic_read(&prof->cpu_exec_time);
42
+ return &ffi_type_uint64;
43
+ case dh_typecode_s64:
44
+ return &ffi_type_sint64;
45
+ case dh_typecode_ptr:
46
+ return &ffi_type_pointer;
145
+ }
47
+ }
146
+ return ret;
48
+ g_assert_not_reached();
147
+}
148
#else
149
void tcg_dump_op_count(FILE *f, fprintf_function cpu_fprintf)
150
{
151
cpu_fprintf(f, "[TCG profiler not compiled]\n");
152
}
153
+
154
+int64_t tcg_cpu_exec_time(void)
155
+{
156
+ error_report("%s: TCG profiler not compiled", __func__);
157
+ exit(EXIT_FAILURE);
158
+}
49
+}
159
#endif
50
#endif
160
51
52
typedef struct TCGCumulativeArgs {
53
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
54
nargs = DIV_ROUND_UP(nargs, 3);
55
56
ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
57
- ca->cif.rtype = typecode_to_ffi[typemask & 7];
58
+ ca->cif.rtype = typecode_to_ffi(typemask & 7);
59
ca->cif.nargs = nargs;
60
61
if (nargs != 0) {
62
ca->cif.arg_types = ca->args;
63
for (int j = 0; j < nargs; ++j) {
64
int typecode = extract32(typemask, (j + 1) * 3, 3);
65
- ca->args[j] = typecode_to_ffi[typecode];
66
+ ca->args[j] = typecode_to_ffi(typecode);
67
}
68
}
161
69
162
--
70
--
163
2.17.2
71
2.34.1
164
72
165
73
diff view generated by jsdifflib
New patch
1
From: Philippe Mathieu-Daudé <philmd@linaro.org>
1
2
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Message-Id: <20221111074101.2069454-27-richard.henderson@linaro.org>
5
[PMD: Split from bigger patch]
6
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
7
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
8
Message-Id: <20221122180804.938-3-philmd@linaro.org>
9
---
10
tcg/tcg.c | 83 +++++++++++++++++++++++++++++--------------------------
11
1 file changed, 44 insertions(+), 39 deletions(-)
12
13
diff --git a/tcg/tcg.c b/tcg/tcg.c
14
index XXXXXXX..XXXXXXX 100644
15
--- a/tcg/tcg.c
16
+++ b/tcg/tcg.c
17
@@ -XXX,XX +XXX,XX @@ static ffi_type *typecode_to_ffi(int argmask)
18
}
19
g_assert_not_reached();
20
}
21
-#endif
22
+
23
+static void init_ffi_layouts(void)
24
+{
25
+ /* g_direct_hash/equal for direct comparisons on uint32_t. */
26
+ ffi_table = g_hash_table_new(NULL, NULL);
27
+ for (int i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
28
+ uint32_t typemask = all_helpers[i].typemask;
29
+ gpointer hash = (gpointer)(uintptr_t)typemask;
30
+ struct {
31
+ ffi_cif cif;
32
+ ffi_type *args[];
33
+ } *ca;
34
+ ffi_status status;
35
+ int nargs;
36
+
37
+ if (g_hash_table_lookup(ffi_table, hash)) {
38
+ continue;
39
+ }
40
+
41
+ /* Ignoring the return type, find the last non-zero field. */
42
+ nargs = 32 - clz32(typemask >> 3);
43
+ nargs = DIV_ROUND_UP(nargs, 3);
44
+
45
+ ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
46
+ ca->cif.rtype = typecode_to_ffi(typemask & 7);
47
+ ca->cif.nargs = nargs;
48
+
49
+ if (nargs != 0) {
50
+ ca->cif.arg_types = ca->args;
51
+ for (int j = 0; j < nargs; ++j) {
52
+ int typecode = extract32(typemask, (j + 1) * 3, 3);
53
+ ca->args[j] = typecode_to_ffi(typecode);
54
+ }
55
+ }
56
+
57
+ status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
58
+ ca->cif.rtype, ca->cif.arg_types);
59
+ assert(status == FFI_OK);
60
+
61
+ g_hash_table_insert(ffi_table, hash, (gpointer)&ca->cif);
62
+ }
63
+}
64
+#endif /* CONFIG_TCG_INTERPRETER */
65
66
typedef struct TCGCumulativeArgs {
67
int arg_idx; /* tcg_gen_callN args[] */
68
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
69
}
70
71
#ifdef CONFIG_TCG_INTERPRETER
72
- /* g_direct_hash/equal for direct comparisons on uint32_t. */
73
- ffi_table = g_hash_table_new(NULL, NULL);
74
- for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
75
- struct {
76
- ffi_cif cif;
77
- ffi_type *args[];
78
- } *ca;
79
- uint32_t typemask = all_helpers[i].typemask;
80
- gpointer hash = (gpointer)(uintptr_t)typemask;
81
- ffi_status status;
82
- int nargs;
83
-
84
- if (g_hash_table_lookup(ffi_table, hash)) {
85
- continue;
86
- }
87
-
88
- /* Ignoring the return type, find the last non-zero field. */
89
- nargs = 32 - clz32(typemask >> 3);
90
- nargs = DIV_ROUND_UP(nargs, 3);
91
-
92
- ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
93
- ca->cif.rtype = typecode_to_ffi(typemask & 7);
94
- ca->cif.nargs = nargs;
95
-
96
- if (nargs != 0) {
97
- ca->cif.arg_types = ca->args;
98
- for (int j = 0; j < nargs; ++j) {
99
- int typecode = extract32(typemask, (j + 1) * 3, 3);
100
- ca->args[j] = typecode_to_ffi(typecode);
101
- }
102
- }
103
-
104
- status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
105
- ca->cif.rtype, ca->cif.arg_types);
106
- assert(status == FFI_OK);
107
-
108
- g_hash_table_insert(ffi_table, hash, (gpointer)&ca->cif);
109
- }
110
+ init_ffi_layouts();
111
#endif
112
113
tcg_target_init(s);
114
--
115
2.34.1
116
117
diff view generated by jsdifflib
New patch
1
Instead of requiring a separate hash table lookup,
2
put a pointer to the CIF into TCGHelperInfo.
1
3
4
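A rough sketch of the caching pattern, not the QEMU code: struct call_desc and intern_desc() stand in for libffi's ffi_cif and the typemask-keyed hash table. The point is that the lookup happens once at init, and the per-call path only follows the pointer cached in the per-helper struct.

    #include <stdio.h>
    #include <string.h>

    struct call_desc {
        const char *signature;      /* one shared descriptor per distinct signature */
    };

    struct helper_info {
        const char *name;
        const char *signature;
        const struct call_desc *desc;   /* filled in once at init, read at call time */
    };

    static struct call_desc descs[16];
    static int n_descs;

    /* Dedup descriptors by signature, as the hash table keyed on typemask did. */
    static const struct call_desc *intern_desc(const char *sig)
    {
        for (int i = 0; i < n_descs; i++) {
            if (strcmp(descs[i].signature, sig) == 0) {
                return &descs[i];
            }
        }
        descs[n_descs].signature = sig;
        return &descs[n_descs++];
    }

    int main(void)
    {
        struct helper_info helpers[] = {
            { "add", "ii->i", NULL },
            { "sub", "ii->i", NULL },   /* shares a descriptor with "add" */
            { "neg", "i->i",  NULL },
        };

        /* Init time: one lookup per helper, result cached in the struct. */
        for (size_t i = 0; i < sizeof(helpers) / sizeof(helpers[0]); i++) {
            helpers[i].desc = intern_desc(helpers[i].signature);
        }

        /* "Call" time: no hash lookup, just follow the cached pointer. */
        printf("%s and %s share a descriptor: %s\n", helpers[0].name,
               helpers[1].name, helpers[0].desc == helpers[1].desc ? "yes" : "no");
        return 0;
    }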
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Message-Id: <20221111074101.2069454-27-richard.henderson@linaro.org>
6
[PMD: Split from bigger patch]
7
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
8
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
9
Message-Id: <20221122180804.938-4-philmd@linaro.org>
10
---
11
tcg/tcg-internal.h | 7 +++++++
12
tcg/tcg.c | 30 ++++++++++++++----------------
13
2 files changed, 21 insertions(+), 16 deletions(-)
14
15
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
16
index XXXXXXX..XXXXXXX 100644
17
--- a/tcg/tcg-internal.h
18
+++ b/tcg/tcg-internal.h
19
@@ -XXX,XX +XXX,XX @@
20
#ifndef TCG_INTERNAL_H
21
#define TCG_INTERNAL_H
22
23
+#ifdef CONFIG_TCG_INTERPRETER
24
+#include <ffi.h>
25
+#endif
26
+
27
#define TCG_HIGHWATER 1024
28
29
/*
30
@@ -XXX,XX +XXX,XX @@ typedef struct TCGCallArgumentLoc {
31
typedef struct TCGHelperInfo {
32
void *func;
33
const char *name;
34
+#ifdef CONFIG_TCG_INTERPRETER
35
+ ffi_cif *cif;
36
+#endif
37
unsigned typemask : 32;
38
unsigned flags : 8;
39
unsigned nr_in : 8;
40
diff --git a/tcg/tcg.c b/tcg/tcg.c
41
index XXXXXXX..XXXXXXX 100644
42
--- a/tcg/tcg.c
43
+++ b/tcg/tcg.c
44
@@ -XXX,XX +XXX,XX @@
45
#include "tcg/tcg-ldst.h"
46
#include "tcg-internal.h"
47
48
-#ifdef CONFIG_TCG_INTERPRETER
49
-#include <ffi.h>
50
-#endif
51
-
52
/* Forward declarations for functions declared in tcg-target.c.inc and
53
used here. */
54
static void tcg_target_init(TCGContext *s);
55
@@ -XXX,XX +XXX,XX @@ static TCGHelperInfo all_helpers[] = {
56
static GHashTable *helper_table;
57
58
#ifdef CONFIG_TCG_INTERPRETER
59
-static GHashTable *ffi_table;
60
-
61
static ffi_type *typecode_to_ffi(int argmask)
62
{
63
switch (argmask) {
64
@@ -XXX,XX +XXX,XX @@ static ffi_type *typecode_to_ffi(int argmask)
65
static void init_ffi_layouts(void)
66
{
67
/* g_direct_hash/equal for direct comparisons on uint32_t. */
68
- ffi_table = g_hash_table_new(NULL, NULL);
69
+ GHashTable *ffi_table = g_hash_table_new(NULL, NULL);
70
+
71
for (int i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
72
- uint32_t typemask = all_helpers[i].typemask;
73
+ TCGHelperInfo *info = &all_helpers[i];
74
+ unsigned typemask = info->typemask;
75
gpointer hash = (gpointer)(uintptr_t)typemask;
76
struct {
77
ffi_cif cif;
78
@@ -XXX,XX +XXX,XX @@ static void init_ffi_layouts(void)
79
} *ca;
80
ffi_status status;
81
int nargs;
82
+ ffi_cif *cif;
83
84
- if (g_hash_table_lookup(ffi_table, hash)) {
85
+ cif = g_hash_table_lookup(ffi_table, hash);
86
+ if (cif) {
87
+ info->cif = cif;
88
continue;
89
}
90
91
@@ -XXX,XX +XXX,XX @@ static void init_ffi_layouts(void)
92
ca->cif.rtype, ca->cif.arg_types);
93
assert(status == FFI_OK);
94
95
- g_hash_table_insert(ffi_table, hash, (gpointer)&ca->cif);
96
+ cif = &ca->cif;
97
+ info->cif = cif;
98
+ g_hash_table_insert(ffi_table, hash, (gpointer)cif);
99
}
100
+
101
+ g_hash_table_destroy(ffi_table);
102
}
103
#endif /* CONFIG_TCG_INTERPRETER */
104
105
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
106
}
107
108
#ifdef CONFIG_TCG_INTERPRETER
109
- {
110
- gpointer hash = (gpointer)(uintptr_t)info->typemask;
111
- ffi_cif *cif = g_hash_table_lookup(ffi_table, hash);
112
- assert(cif != NULL);
113
- tcg_out_call(s, tcg_call_func(op), cif);
114
- }
115
+ tcg_out_call(s, tcg_call_func(op), info->cif);
116
#else
117
tcg_out_call(s, tcg_call_func(op));
118
#endif
119
--
120
2.34.1
121
122
diff view generated by jsdifflib
1
Reviewed-by: David Hildenbrand <david@redhat.com>
1
There is only one use, and BLR is perhaps even more
2
self-documenting than CALLR.
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/s390x/mem_helper.c | 128 ++++++++++++++++++--------------------
7
tcg/aarch64/tcg-target.c.inc | 7 +------
5
1 file changed, 61 insertions(+), 67 deletions(-)
8
1 file changed, 1 insertion(+), 6 deletions(-)
6
9
7
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
10
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
8
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
9
--- a/target/s390x/mem_helper.c
12
--- a/tcg/aarch64/tcg-target.c.inc
10
+++ b/target/s390x/mem_helper.c
13
+++ b/tcg/aarch64/tcg-target.c.inc
11
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(trXX)(CPUS390XState *env, uint32_t r1, uint32_t r2,
14
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
12
return cc;
15
}
13
}
16
}
14
17
15
-static void do_cdsg(CPUS390XState *env, uint64_t addr,
18
-static inline void tcg_out_callr(TCGContext *s, TCGReg reg)
16
- uint32_t r1, uint32_t r3, bool parallel)
17
+void HELPER(cdsg)(CPUS390XState *env, uint64_t addr,
18
+ uint32_t r1, uint32_t r3)
19
{
20
uintptr_t ra = GETPC();
21
Int128 cmpv = int128_make128(env->regs[r1 + 1], env->regs[r1]);
22
Int128 newv = int128_make128(env->regs[r3 + 1], env->regs[r3]);
23
Int128 oldv;
24
+ uint64_t oldh, oldl;
25
bool fail;
26
27
- if (parallel) {
28
-#if !HAVE_CMPXCHG128
29
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
30
-#else
31
- int mem_idx = cpu_mmu_index(env, false);
32
- TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
33
- oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
34
- fail = !int128_eq(oldv, cmpv);
35
-#endif
36
- } else {
37
- uint64_t oldh, oldl;
38
+ check_alignment(env, addr, 16, ra);
39
40
- check_alignment(env, addr, 16, ra);
41
+ oldh = cpu_ldq_data_ra(env, addr + 0, ra);
42
+ oldl = cpu_ldq_data_ra(env, addr + 8, ra);
43
44
- oldh = cpu_ldq_data_ra(env, addr + 0, ra);
45
- oldl = cpu_ldq_data_ra(env, addr + 8, ra);
46
-
47
- oldv = int128_make128(oldl, oldh);
48
- fail = !int128_eq(oldv, cmpv);
49
- if (fail) {
50
- newv = oldv;
51
- }
52
-
53
- cpu_stq_data_ra(env, addr + 0, int128_gethi(newv), ra);
54
- cpu_stq_data_ra(env, addr + 8, int128_getlo(newv), ra);
55
+ oldv = int128_make128(oldl, oldh);
56
+ fail = !int128_eq(oldv, cmpv);
57
+ if (fail) {
58
+ newv = oldv;
59
}
60
61
+ cpu_stq_data_ra(env, addr + 0, int128_gethi(newv), ra);
62
+ cpu_stq_data_ra(env, addr + 8, int128_getlo(newv), ra);
63
+
64
env->cc_op = fail;
65
env->regs[r1] = int128_gethi(oldv);
66
env->regs[r1 + 1] = int128_getlo(oldv);
67
}
68
69
-void HELPER(cdsg)(CPUS390XState *env, uint64_t addr,
70
- uint32_t r1, uint32_t r3)
71
-{
19
-{
72
- do_cdsg(env, addr, r1, r3, false);
20
- tcg_out_insn(s, 3207, BLR, reg);
73
-}
21
-}
74
-
22
-
75
void HELPER(cdsg_parallel)(CPUS390XState *env, uint64_t addr,
23
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
76
uint32_t r1, uint32_t r3)
77
{
24
{
78
- do_cdsg(env, addr, r1, r3, true);
25
ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
79
+ uintptr_t ra = GETPC();
26
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
80
+ Int128 cmpv = int128_make128(env->regs[r1 + 1], env->regs[r1]);
27
tcg_out_insn(s, 3206, BL, offset);
81
+ Int128 newv = int128_make128(env->regs[r3 + 1], env->regs[r3]);
28
} else {
82
+ int mem_idx;
29
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
83
+ TCGMemOpIdx oi;
30
- tcg_out_callr(s, TCG_REG_TMP);
84
+ Int128 oldv;
31
+ tcg_out_insn(s, 3207, BLR, TCG_REG_TMP);
85
+ bool fail;
86
+
87
+ if (!HAVE_CMPXCHG128) {
88
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
89
+ }
90
+
91
+ mem_idx = cpu_mmu_index(env, false);
92
+ oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
93
+ oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
94
+ fail = !int128_eq(oldv, cmpv);
95
+
96
+ env->cc_op = fail;
97
+ env->regs[r1] = int128_gethi(oldv);
98
+ env->regs[r1 + 1] = int128_getlo(oldv);
99
}
100
101
static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1,
102
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(lra)(CPUS390XState *env, uint64_t addr)
103
#endif
104
105
/* load pair from quadword */
106
-static uint64_t do_lpq(CPUS390XState *env, uint64_t addr, bool parallel)
107
+uint64_t HELPER(lpq)(CPUS390XState *env, uint64_t addr)
108
{
109
uintptr_t ra = GETPC();
110
uint64_t hi, lo;
111
112
- if (!parallel) {
113
- check_alignment(env, addr, 16, ra);
114
- hi = cpu_ldq_data_ra(env, addr + 0, ra);
115
- lo = cpu_ldq_data_ra(env, addr + 8, ra);
116
- } else if (HAVE_ATOMIC128) {
117
+ check_alignment(env, addr, 16, ra);
118
+ hi = cpu_ldq_data_ra(env, addr + 0, ra);
119
+ lo = cpu_ldq_data_ra(env, addr + 8, ra);
120
+
121
+ env->retxl = lo;
122
+ return hi;
123
+}
124
+
125
+uint64_t HELPER(lpq_parallel)(CPUS390XState *env, uint64_t addr)
126
+{
127
+ uintptr_t ra = GETPC();
128
+ uint64_t hi, lo;
129
+
130
+ if (HAVE_ATOMIC128) {
131
int mem_idx = cpu_mmu_index(env, false);
132
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
133
Int128 v = helper_atomic_ldo_be_mmu(env, addr, oi, ra);
134
@@ -XXX,XX +XXX,XX @@ static uint64_t do_lpq(CPUS390XState *env, uint64_t addr, bool parallel)
135
return hi;
136
}
137
138
-uint64_t HELPER(lpq)(CPUS390XState *env, uint64_t addr)
139
-{
140
- return do_lpq(env, addr, false);
141
-}
142
-
143
-uint64_t HELPER(lpq_parallel)(CPUS390XState *env, uint64_t addr)
144
-{
145
- return do_lpq(env, addr, true);
146
-}
147
-
148
/* store pair to quadword */
149
-static void do_stpq(CPUS390XState *env, uint64_t addr,
150
- uint64_t low, uint64_t high, bool parallel)
151
+void HELPER(stpq)(CPUS390XState *env, uint64_t addr,
152
+ uint64_t low, uint64_t high)
153
{
154
uintptr_t ra = GETPC();
155
156
- if (!parallel) {
157
- check_alignment(env, addr, 16, ra);
158
- cpu_stq_data_ra(env, addr + 0, high, ra);
159
- cpu_stq_data_ra(env, addr + 8, low, ra);
160
- } else if (HAVE_ATOMIC128) {
161
+ check_alignment(env, addr, 16, ra);
162
+ cpu_stq_data_ra(env, addr + 0, high, ra);
163
+ cpu_stq_data_ra(env, addr + 8, low, ra);
164
+}
165
+
166
+void HELPER(stpq_parallel)(CPUS390XState *env, uint64_t addr,
167
+ uint64_t low, uint64_t high)
168
+{
169
+ uintptr_t ra = GETPC();
170
+
171
+ if (HAVE_ATOMIC128) {
172
int mem_idx = cpu_mmu_index(env, false);
173
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
174
Int128 v = int128_make128(low, high);
175
@@ -XXX,XX +XXX,XX @@ static void do_stpq(CPUS390XState *env, uint64_t addr,
176
}
32
}
177
}
33
}
178
34
179
-void HELPER(stpq)(CPUS390XState *env, uint64_t addr,
180
- uint64_t low, uint64_t high)
181
-{
182
- do_stpq(env, addr, low, high, false);
183
-}
184
-
185
-void HELPER(stpq_parallel)(CPUS390XState *env, uint64_t addr,
186
- uint64_t low, uint64_t high)
187
-{
188
- do_stpq(env, addr, low, high, true);
189
-}
190
-
191
/* Execute instruction. This instruction executes an insn modified with
192
the contents of r1. It does not change the executed instruction in memory;
193
it does not change the program counter.
194
--
35
--
195
2.17.2
36
2.34.1
196
37
197
38
1
GCC 7+ will no longer advertise support for 16-byte __atomic operations
1
This eliminates an ifdef for TCI, and will be required for
2
if only cmpxchg is supported, as for x86_64. Fortunately, x86_64 still
2
expanding the call for TCGv_i128.
3
has support for __sync_compare_and_swap_16 and we can make use of that.
4
AArch64 does not have, nor has it ever had, such support, so open-code it.
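
To make the shape of the conversion easier to follow, here is a minimal
sketch of the guard pattern the converted helpers rely on. do_cmpxchg16 is
a made-up name used only for illustration; atomic16_cmpxchg,
HAVE_CMPXCHG128 and cpu_loop_exit_atomic are the interfaces introduced or
used by the patches below, and the sketch assumes the surrounding QEMU
helper context (CPUArchState, GETPC, etc.):

    /* Illustrative sketch only -- not one of the real target helpers. */
    #include "qemu/atomic128.h"

    static Int128 do_cmpxchg16(CPUArchState *env, Int128 *haddr,
                               Int128 cmpv, Int128 newv, uintptr_t ra)
    {
        if (HAVE_CMPXCHG128) {
            /* The host provides a true 16-byte compare-and-swap. */
            return atomic16_cmpxchg(haddr, cmpv, newv);
        }
        /* Otherwise stop the world and re-execute the insn serially. */
        cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
    }

Because HAVE_CMPXCHG128 is a compile-time constant, the unused branch is
optimized away and the error-attributed fallback never survives to link time.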
5
3
6
Reviewed-by: Emilio G. Cota <cota@braap.org>
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
accel/tcg/atomic_template.h | 20 ++++-
7
tcg/tcg.c | 12 ++----------
10
include/qemu/atomic128.h | 155 ++++++++++++++++++++++++++++++++++++
8
tcg/aarch64/tcg-target.c.inc | 12 +++++++++---
11
tcg/tcg.h | 16 ++--
9
tcg/arm/tcg-target.c.inc | 10 ++++++++--
12
accel/tcg/cputlb.c | 3 +-
10
tcg/i386/tcg-target.c.inc | 5 +++--
13
accel/tcg/user-exec.c | 5 +-
11
tcg/loongarch64/tcg-target.c.inc | 7 ++++---
14
configure | 19 +++++
12
tcg/mips/tcg-target.c.inc | 3 ++-
15
6 files changed, 204 insertions(+), 14 deletions(-)
13
tcg/ppc/tcg-target.c.inc | 7 ++++---
16
create mode 100644 include/qemu/atomic128.h
14
tcg/riscv/tcg-target.c.inc | 7 ++++---
15
tcg/s390x/tcg-target.c.inc | 12 +++++++++---
16
tcg/sparc64/tcg-target.c.inc | 3 ++-
17
tcg/tci/tcg-target.c.inc | 3 ++-
18
11 files changed, 49 insertions(+), 32 deletions(-)
17
19
18
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
20
diff --git a/tcg/tcg.c b/tcg/tcg.c
19
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
20
--- a/accel/tcg/atomic_template.h
22
--- a/tcg/tcg.c
21
+++ b/accel/tcg/atomic_template.h
23
+++ b/tcg/tcg.c
22
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
24
@@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
23
DATA_TYPE ret;
25
intptr_t arg2);
24
26
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
25
ATOMIC_TRACE_RMW;
27
TCGReg base, intptr_t ofs);
26
+#if DATA_SIZE == 16
28
-#ifdef CONFIG_TCG_INTERPRETER
27
+ ret = atomic16_cmpxchg(haddr, cmpv, newv);
29
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
28
+#else
30
- ffi_cif *cif);
29
ret = atomic_cmpxchg__nocheck(haddr, cmpv, newv);
31
-#else
30
+#endif
32
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target);
31
ATOMIC_MMU_CLEANUP;
33
-#endif
32
return ret;
34
+ const TCGHelperInfo *info);
33
}
35
static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
34
36
#ifdef TCG_TARGET_NEED_LDST_LABELS
35
#if DATA_SIZE >= 16
37
static int tcg_out_ldst_finalize(TCGContext *s);
36
+#if HAVE_ATOMIC128
38
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
37
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
39
save_globals(s, allocated_regs);
38
{
40
}
39
ATOMIC_MMU_DECLS;
41
40
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
42
-#ifdef CONFIG_TCG_INTERPRETER
41
43
- tcg_out_call(s, tcg_call_func(op), info->cif);
42
ATOMIC_TRACE_LD;
44
-#else
43
- __atomic_load(haddr, &val, __ATOMIC_RELAXED);
45
- tcg_out_call(s, tcg_call_func(op));
44
+ val = atomic16_read(haddr);
46
-#endif
45
ATOMIC_MMU_CLEANUP;
47
+ tcg_out_call(s, tcg_call_func(op), info);
46
return val;
48
47
}
49
/* Assign output registers and emit moves if needed. */
48
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
50
switch (info->out_kind) {
49
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
51
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
50
52
index XXXXXXX..XXXXXXX 100644
51
ATOMIC_TRACE_ST;
53
--- a/tcg/aarch64/tcg-target.c.inc
52
- __atomic_store(haddr, &val, __ATOMIC_RELAXED);
54
+++ b/tcg/aarch64/tcg-target.c.inc
53
+ atomic16_set(haddr, val);
55
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
54
ATOMIC_MMU_CLEANUP;
56
}
55
}
57
}
56
+#endif
58
57
#else
59
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
58
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
60
+static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
59
ABI_TYPE val EXTRA_ARGS)
61
{
60
@@ -XXX,XX +XXX,XX @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
62
ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
61
DATA_TYPE ret;
63
if (offset == sextract64(offset, 0, 26)) {
62
64
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
63
ATOMIC_TRACE_RMW;
65
}
64
+#if DATA_SIZE == 16
66
}
65
+ ret = atomic16_cmpxchg(haddr, BSWAP(cmpv), BSWAP(newv));
67
66
+#else
68
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
67
ret = atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv));
69
+ const TCGHelperInfo *info)
68
+#endif
69
ATOMIC_MMU_CLEANUP;
70
return BSWAP(ret);
71
}
72
73
#if DATA_SIZE >= 16
74
+#if HAVE_ATOMIC128
75
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
76
{
77
ATOMIC_MMU_DECLS;
78
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
79
80
ATOMIC_TRACE_LD;
81
- __atomic_load(haddr, &val, __ATOMIC_RELAXED);
82
+ val = atomic16_read(haddr);
83
ATOMIC_MMU_CLEANUP;
84
return BSWAP(val);
85
}
86
@@ -XXX,XX +XXX,XX @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
87
88
ATOMIC_TRACE_ST;
89
val = BSWAP(val);
90
- __atomic_store(haddr, &val, __ATOMIC_RELAXED);
91
+ atomic16_set(haddr, val);
92
ATOMIC_MMU_CLEANUP;
93
}
94
+#endif
95
#else
96
ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
97
ABI_TYPE val EXTRA_ARGS)
98
diff --git a/include/qemu/atomic128.h b/include/qemu/atomic128.h
99
new file mode 100644
100
index XXXXXXX..XXXXXXX
101
--- /dev/null
102
+++ b/include/qemu/atomic128.h
103
@@ -XXX,XX +XXX,XX @@
104
+/*
105
+ * Simple interface for 128-bit atomic operations.
106
+ *
107
+ * Copyright (C) 2018 Linaro, Ltd.
108
+ *
109
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
110
+ * See the COPYING file in the top-level directory.
111
+ *
112
+ * See docs/devel/atomics.txt for discussion about the guarantees each
113
+ * atomic primitive is meant to provide.
114
+ */
115
+
116
+#ifndef QEMU_ATOMIC128_H
117
+#define QEMU_ATOMIC128_H
118
+
119
+/*
120
+ * GCC is a house divided about supporting large atomic operations.
121
+ *
122
+ * For hosts that only have large compare-and-swap, a legalistic reading
123
+ * of the C++ standard means that one cannot implement __atomic_read on
124
+ * read-only memory, and thus all atomic operations must synchronize
125
+ * through libatomic.
126
+ *
127
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80878
128
+ *
129
+ * This interpretation is not especially helpful for QEMU.
130
+ * For softmmu, all RAM is always read/write from the hypervisor.
131
+ * For user-only, if the guest doesn't implement such an __atomic_read
132
+ * then the host need not worry about it either.
133
+ *
134
+ * Moreover, using libatomic is not an option, because its interface is
135
+ * built for std::atomic<T>, and requires that *all* accesses to such an
136
+ * object go through the library. In our case we do not have an object
137
+ * in the C/C++ sense, but a view of memory as seen by the guest.
138
+ * The guest may issue a large atomic operation and then access those
139
+ * pieces using word-sized accesses. From the hypervisor, we have no
140
+ * way to connect those two actions.
141
+ *
142
+ * Therefore, special case each platform.
143
+ */
144
+
145
+#if defined(CONFIG_ATOMIC128)
146
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
147
+{
70
+{
148
+ return atomic_cmpxchg__nocheck(ptr, cmp, new);
71
+ tcg_out_call_int(s, target);
149
+}
150
+# define HAVE_CMPXCHG128 1
151
+#elif defined(CONFIG_CMPXCHG128)
152
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
153
+{
154
+ return __sync_val_compare_and_swap_16(ptr, cmp, new);
155
+}
156
+# define HAVE_CMPXCHG128 1
157
+#elif defined(__aarch64__)
158
+/* Through gcc 8, aarch64 has no support for 128-bit at all. */
159
+static inline Int128 atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new)
160
+{
161
+ uint64_t cmpl = int128_getlo(cmp), cmph = int128_gethi(cmp);
162
+ uint64_t newl = int128_getlo(new), newh = int128_gethi(new);
163
+ uint64_t oldl, oldh;
164
+ uint32_t tmp;
165
+
166
+ asm("0: ldaxp %[oldl], %[oldh], %[mem]\n\t"
167
+ "cmp %[oldl], %[cmpl]\n\t"
168
+ "ccmp %[oldh], %[cmph], #0, eq\n\t"
169
+ "b.ne 1f\n\t"
170
+ "stlxp %w[tmp], %[newl], %[newh], %[mem]\n\t"
171
+ "cbnz %w[tmp], 0b\n"
172
+ "1:"
173
+ : [mem] "+m"(*ptr), [tmp] "=&r"(tmp),
174
+ [oldl] "=&r"(oldl), [oldh] "=r"(oldh)
175
+ : [cmpl] "r"(cmpl), [cmph] "r"(cmph),
176
+ [newl] "r"(newl), [newh] "r"(newh)
177
+ : "memory", "cc");
178
+
179
+ return int128_make128(oldl, oldh);
180
+}
181
+# define HAVE_CMPXCHG128 1
182
+#else
183
+/* Fallback definition that must be optimized away, or error. */
184
+Int128 __attribute__((error("unsupported atomic")))
185
+ atomic16_cmpxchg(Int128 *ptr, Int128 cmp, Int128 new);
186
+# define HAVE_CMPXCHG128 0
187
+#endif /* Some definition for HAVE_CMPXCHG128 */
188
+
189
+
190
+#if defined(CONFIG_ATOMIC128)
191
+static inline Int128 atomic16_read(Int128 *ptr)
192
+{
193
+ return atomic_read__nocheck(ptr);
194
+}
72
+}
195
+
73
+
196
+static inline void atomic16_set(Int128 *ptr, Int128 val)
74
void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
75
uintptr_t jmp_rw, uintptr_t addr)
76
{
77
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
78
tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
79
tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
80
tcg_out_adr(s, TCG_REG_X3, lb->raddr);
81
- tcg_out_call(s, qemu_ld_helpers[opc & MO_SIZE]);
82
+ tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
83
if (opc & MO_SIGN) {
84
tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
85
} else {
86
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
87
tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
88
tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
89
tcg_out_adr(s, TCG_REG_X4, lb->raddr);
90
- tcg_out_call(s, qemu_st_helpers[opc & MO_SIZE]);
91
+ tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]);
92
tcg_out_goto(s, lb->raddr);
93
return true;
94
}
95
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
96
index XXXXXXX..XXXXXXX 100644
97
--- a/tcg/arm/tcg-target.c.inc
98
+++ b/tcg/arm/tcg-target.c.inc
99
@@ -XXX,XX +XXX,XX @@ static void tcg_out_goto(TCGContext *s, ARMCond cond, const tcg_insn_unit *addr)
100
* The call case is mostly used for helpers - so it's not unreasonable
101
* for them to be beyond branch range.
102
*/
103
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *addr)
104
+static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *addr)
105
{
106
intptr_t addri = (intptr_t)addr;
107
ptrdiff_t disp = tcg_pcrel_diff(s, addr);
108
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *addr)
109
tcg_out_blx_reg(s, COND_AL, TCG_REG_TMP);
110
}
111
112
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *addr,
113
+ const TCGHelperInfo *info)
197
+{
114
+{
198
+ atomic_set__nocheck(ptr, val);
115
+ tcg_out_call_int(s, addr);
199
+}
116
+}
200
+
117
+
201
+# define HAVE_ATOMIC128 1
118
static void tcg_out_goto_label(TCGContext *s, ARMCond cond, TCGLabel *l)
202
+#elif !defined(CONFIG_USER_ONLY) && defined(__aarch64__)
119
{
203
+/* We can do better than cmpxchg for AArch64. */
120
if (l->has_value) {
204
+static inline Int128 atomic16_read(Int128 *ptr)
121
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
122
argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
123
124
/* Use the canonical unsigned helpers and minimize icache usage. */
125
- tcg_out_call(s, qemu_ld_helpers[opc & MO_SIZE]);
126
+ tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
127
128
datalo = lb->datalo_reg;
129
datahi = lb->datahi_reg;
130
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
131
index XXXXXXX..XXXXXXX 100644
132
--- a/tcg/i386/tcg-target.c.inc
133
+++ b/tcg/i386/tcg-target.c.inc
134
@@ -XXX,XX +XXX,XX @@ static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
135
}
136
}
137
138
-static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
139
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
140
+ const TCGHelperInfo *info)
141
{
142
tcg_out_branch(s, 1, dest);
143
}
144
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
145
(uintptr_t)l->raddr);
146
}
147
148
- tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
149
+ tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
150
151
data_reg = l->datalo_reg;
152
switch (opc & MO_SSIZE) {
153
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
154
index XXXXXXX..XXXXXXX 100644
155
--- a/tcg/loongarch64/tcg-target.c.inc
156
+++ b/tcg/loongarch64/tcg-target.c.inc
157
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *arg, bool tail)
158
}
159
}
160
161
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
162
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg,
163
+ const TCGHelperInfo *info)
164
{
165
tcg_out_call_int(s, arg, false);
166
}
167
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
168
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A2, oi);
169
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A3, (tcg_target_long)l->raddr);
170
171
- tcg_out_call(s, qemu_ld_helpers[size]);
172
+ tcg_out_call_int(s, qemu_ld_helpers[size], false);
173
174
switch (opc & MO_SSIZE) {
175
case MO_SB:
176
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
177
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A3, oi);
178
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A4, (tcg_target_long)l->raddr);
179
180
- tcg_out_call(s, qemu_st_helpers[size]);
181
+ tcg_out_call_int(s, qemu_st_helpers[size], false);
182
183
return tcg_out_goto(s, l->raddr);
184
}
185
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
186
index XXXXXXX..XXXXXXX 100644
187
--- a/tcg/mips/tcg-target.c.inc
188
+++ b/tcg/mips/tcg-target.c.inc
189
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *arg, bool tail)
190
}
191
}
192
193
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
194
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg,
195
+ const TCGHelperInfo *info)
196
{
197
tcg_out_call_int(s, arg, false);
198
tcg_out_nop(s);
199
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
200
index XXXXXXX..XXXXXXX 100644
201
--- a/tcg/ppc/tcg-target.c.inc
202
+++ b/tcg/ppc/tcg-target.c.inc
203
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_int(TCGContext *s, int lk,
204
#endif
205
}
206
207
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target)
208
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
209
+ const TCGHelperInfo *info)
210
{
211
tcg_out_call_int(s, LK, target);
212
}
213
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
214
tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
215
tcg_out32(s, MFSPR | RT(arg) | LR);
216
217
- tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
218
+ tcg_out_call_int(s, LK, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
219
220
lo = lb->datalo_reg;
221
hi = lb->datahi_reg;
222
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
223
tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
224
tcg_out32(s, MFSPR | RT(arg) | LR);
225
226
- tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
227
+ tcg_out_call_int(s, LK, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
228
229
tcg_out_b(s, 0, lb->raddr);
230
return true;
231
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
232
index XXXXXXX..XXXXXXX 100644
233
--- a/tcg/riscv/tcg-target.c.inc
234
+++ b/tcg/riscv/tcg-target.c.inc
235
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *arg, bool tail)
236
}
237
}
238
239
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
240
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg,
241
+ const TCGHelperInfo *info)
242
{
243
tcg_out_call_int(s, arg, false);
244
}
245
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
246
tcg_out_movi(s, TCG_TYPE_PTR, a2, oi);
247
tcg_out_movi(s, TCG_TYPE_PTR, a3, (tcg_target_long)l->raddr);
248
249
- tcg_out_call(s, qemu_ld_helpers[opc & MO_SSIZE]);
250
+ tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SSIZE], false);
251
tcg_out_mov(s, (opc & MO_SIZE) == MO_64, l->datalo_reg, a0);
252
253
tcg_out_goto(s, l->raddr);
254
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
255
tcg_out_movi(s, TCG_TYPE_PTR, a3, oi);
256
tcg_out_movi(s, TCG_TYPE_PTR, a4, (tcg_target_long)l->raddr);
257
258
- tcg_out_call(s, qemu_st_helpers[opc & MO_SIZE]);
259
+ tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE], false);
260
261
tcg_out_goto(s, l->raddr);
262
return true;
263
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
264
index XXXXXXX..XXXXXXX 100644
265
--- a/tcg/s390x/tcg-target.c.inc
266
+++ b/tcg/s390x/tcg-target.c.inc
267
@@ -XXX,XX +XXX,XX @@ static void tgen_brcond(TCGContext *s, TCGType type, TCGCond c,
268
tgen_branch(s, cc, l);
269
}
270
271
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
272
+static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *dest)
273
{
274
ptrdiff_t off = tcg_pcrel_diff(s, dest) >> 1;
275
if (off == (int32_t)off) {
276
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
277
}
278
}
279
280
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
281
+ const TCGHelperInfo *info)
205
+{
282
+{
206
+ uint64_t l, h;
283
+ tcg_out_call_int(s, dest);
207
+ uint32_t tmp;
208
+
209
+ /* The load must be paired with the store to guarantee not tearing. */
210
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
211
+ "stxp %w[tmp], %[l], %[h], %[mem]\n\t"
212
+ "cbnz %w[tmp], 0b"
213
+ : [mem] "+m"(*ptr), [tmp] "=r"(tmp), [l] "=r"(l), [h] "=r"(h));
214
+
215
+ return int128_make128(l, h);
216
+}
284
+}
217
+
285
+
218
+static inline void atomic16_set(Int128 *ptr, Int128 val)
286
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data,
219
+{
287
TCGReg base, TCGReg index, int disp)
220
+ uint64_t l = int128_getlo(val), h = int128_gethi(val);
288
{
221
+ uint64_t t1, t2;
289
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
222
+
290
}
223
+ /* Load into temporaries to acquire the exclusive access lock. */
291
tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, oi);
224
+ asm("0: ldxp %[t1], %[t2], %[mem]\n\t"
292
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R5, (uintptr_t)lb->raddr);
225
+ "stxp %w[t1], %[l], %[h], %[mem]\n\t"
293
- tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
226
+ "cbnz %w[t1], 0b"
294
+ tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
227
+ : [mem] "+m"(*ptr), [t1] "=&r"(t1), [t2] "=&r"(t2)
295
tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R2);
228
+ : [l] "r"(l), [h] "r"(h));
296
229
+}
297
tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
230
+
298
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
231
+# define HAVE_ATOMIC128 1
299
}
232
+#elif !defined(CONFIG_USER_ONLY) && HAVE_CMPXCHG128
300
tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, oi);
233
+static inline Int128 atomic16_read(Int128 *ptr)
301
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R6, (uintptr_t)lb->raddr);
234
+{
302
- tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
235
+ /* Maybe replace 0 with 0, returning the old value. */
303
+ tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
236
+ return atomic16_cmpxchg(ptr, 0, 0);
304
237
+}
305
tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
238
+
306
return true;
239
+static inline void atomic16_set(Int128 *ptr, Int128 val)
307
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
240
+{
308
index XXXXXXX..XXXXXXX 100644
241
+ Int128 old = *ptr, cmp;
309
--- a/tcg/sparc64/tcg-target.c.inc
242
+ do {
310
+++ b/tcg/sparc64/tcg-target.c.inc
243
+ cmp = old;
311
@@ -XXX,XX +XXX,XX @@ static void tcg_out_call_nodelay(TCGContext *s, const tcg_insn_unit *dest,
244
+ old = atomic16_cmpxchg(ptr, cmp, val);
312
}
245
+ } while (old != cmp);
313
}
246
+}
314
247
+
315
-static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
248
+# define HAVE_ATOMIC128 1
316
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
249
+#else
317
+ const TCGHelperInfo *info)
250
+/* Fallback definitions that must be optimized away, or error. */
318
{
251
+Int128 __attribute__((error("unsupported atomic")))
319
tcg_out_call_nodelay(s, dest, false);
252
+ atomic16_read(Int128 *ptr);
320
tcg_out_nop(s);
253
+void __attribute__((error("unsupported atomic")))
321
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
254
+ atomic16_set(Int128 *ptr, Int128 val);
322
index XXXXXXX..XXXXXXX 100644
255
+# define HAVE_ATOMIC128 0
323
--- a/tcg/tci/tcg-target.c.inc
256
+#endif /* Some definition for HAVE_ATOMIC128 */
324
+++ b/tcg/tci/tcg-target.c.inc
257
+
325
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
258
+#endif /* QEMU_ATOMIC128_H */
326
}
259
diff --git a/tcg/tcg.h b/tcg/tcg.h
327
260
index XXXXXXX..XXXXXXX 100644
328
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
261
--- a/tcg/tcg.h
329
- ffi_cif *cif)
262
+++ b/tcg/tcg.h
330
+ const TCGHelperInfo *info)
263
@@ -XXX,XX +XXX,XX @@
331
{
264
#include "qemu/queue.h"
332
+ ffi_cif *cif = info->cif;
265
#include "tcg-mo.h"
333
tcg_insn_unit insn = 0;
266
#include "tcg-target.h"
334
uint8_t which;
267
+#include "qemu/int128.h"
335
268
269
/* XXX: make safe guess about sizes */
270
#define MAX_OP_PER_INSTR 266
271
@@ -XXX,XX +XXX,XX @@ GEN_ATOMIC_HELPER_ALL(xchg)
272
#undef GEN_ATOMIC_HELPER
273
#endif /* CONFIG_SOFTMMU */
274
275
-#ifdef CONFIG_ATOMIC128
276
-#include "qemu/int128.h"
277
-
278
-/* These aren't really a "proper" helpers because TCG cannot manage Int128.
279
- However, use the same format as the others, for use by the backends. */
280
+/*
281
+ * These aren't really a "proper" helpers because TCG cannot manage Int128.
282
+ * However, use the same format as the others, for use by the backends.
283
+ *
284
+ * The cmpxchg functions are only defined if HAVE_CMPXCHG128;
285
+ * the ld/st functions are only defined if HAVE_ATOMIC128,
286
+ * as defined by <qemu/atomic128.h>.
287
+ */
288
Int128 helper_atomic_cmpxchgo_le_mmu(CPUArchState *env, target_ulong addr,
289
Int128 cmpv, Int128 newv,
290
TCGMemOpIdx oi, uintptr_t retaddr);
291
@@ -XXX,XX +XXX,XX @@ void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
292
void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
293
TCGMemOpIdx oi, uintptr_t retaddr);
294
295
-#endif /* CONFIG_ATOMIC128 */
296
-
297
#endif /* TCG_H */
298
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
299
index XXXXXXX..XXXXXXX 100644
300
--- a/accel/tcg/cputlb.c
301
+++ b/accel/tcg/cputlb.c
302
@@ -XXX,XX +XXX,XX @@
303
#include "exec/log.h"
304
#include "exec/helper-proto.h"
305
#include "qemu/atomic.h"
306
+#include "qemu/atomic128.h"
307
308
/* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
309
/* #define DEBUG_TLB */
310
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
311
#include "atomic_template.h"
312
#endif
313
314
-#ifdef CONFIG_ATOMIC128
315
+#if HAVE_CMPXCHG128 || HAVE_ATOMIC128
316
#define DATA_SIZE 16
317
#include "atomic_template.h"
318
#endif
319
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
320
index XXXXXXX..XXXXXXX 100644
321
--- a/accel/tcg/user-exec.c
322
+++ b/accel/tcg/user-exec.c
323
@@ -XXX,XX +XXX,XX @@
324
#include "exec/cpu_ldst.h"
325
#include "translate-all.h"
326
#include "exec/helper-proto.h"
327
+#include "qemu/atomic128.h"
328
329
#undef EAX
330
#undef ECX
331
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
332
/* The following is only callable from other helpers, and matches up
333
with the softmmu version. */
334
335
-#ifdef CONFIG_ATOMIC128
336
+#if HAVE_ATOMIC128 || HAVE_CMPXCHG128
337
338
#undef EXTRA_ARGS
339
#undef ATOMIC_NAME
340
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
341
342
#define DATA_SIZE 16
343
#include "atomic_template.h"
344
-#endif /* CONFIG_ATOMIC128 */
345
+#endif
346
diff --git a/configure b/configure
347
index XXXXXXX..XXXXXXX 100755
348
--- a/configure
349
+++ b/configure
350
@@ -XXX,XX +XXX,XX @@ EOF
351
fi
352
fi
353
354
+cmpxchg128=no
355
+if test "$int128" = yes -a "$atomic128" = no; then
356
+ cat > $TMPC << EOF
357
+int main(void)
358
+{
359
+ unsigned __int128 x = 0, y = 0;
360
+ __sync_val_compare_and_swap_16(&x, y, x);
361
+ return 0;
362
+}
363
+EOF
364
+ if compile_prog "" "" ; then
365
+ cmpxchg128=yes
366
+ fi
367
+fi
368
+
369
#########################################
370
# See if 64-bit atomic operations are supported.
371
# Note that without __atomic builtins, we can only
372
@@ -XXX,XX +XXX,XX @@ if test "$atomic128" = "yes" ; then
373
echo "CONFIG_ATOMIC128=y" >> $config_host_mak
374
fi
375
376
+if test "$cmpxchg128" = "yes" ; then
377
+ echo "CONFIG_CMPXCHG128=y" >> $config_host_mak
378
+fi
379
+
380
if test "$atomic64" = "yes" ; then
381
echo "CONFIG_ATOMIC64=y" >> $config_host_mak
382
fi
383
--
336
--
384
2.17.2
337
2.34.1
385
338
386
339
1
From: "Emilio G. Cota" <cota@braap.org>
1
When called from syscall(), we are not within a TB and pc == 0.
2
We can skip the check for invalidating the current TB.
2
3
3
This plugs two 4-byte padding holes in TCGProfile on 64-bit hosts.
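
For readers unfamiliar with struct packing, a generic illustration of the
problem on an LP64 host follows. The field names are borrowed from
TCGProfile, but this is not the exact struct:

    #include <stdint.h>

    /* Before: each int is followed by 4 bytes of padding to realign int64_t. */
    struct before {
        int64_t op_count;
        int     op_count_max;    /* 4-byte hole follows */
        int64_t temp_count;
        int     temp_count_max;  /* 4 bytes of tail padding follow */
    };                           /* sizeof == 32 */

    /* After: the two ints share one 8-byte slot, so both holes disappear. */
    struct after {
        int64_t op_count;
        int     op_count_max;
        int     temp_count_max;
        int64_t temp_count;
    };                           /* sizeof == 24 */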
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
5
Signed-off-by: Emilio G. Cota <cota@braap.org>
6
Message-Id: <20181010144853.13005-4-cota@braap.org>
7
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
8
---
6
---
9
tcg/tcg.h | 2 +-
7
accel/tcg/tb-maint.c | 78 ++++++++++++++++++++++++--------------------
10
1 file changed, 1 insertion(+), 1 deletion(-)
8
1 file changed, 43 insertions(+), 35 deletions(-)
11
9
12
diff --git a/tcg/tcg.h b/tcg/tcg.h
10
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
13
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
14
--- a/tcg/tcg.h
12
--- a/accel/tcg/tb-maint.c
15
+++ b/tcg/tcg.h
13
+++ b/accel/tcg/tb-maint.c
16
@@ -XXX,XX +XXX,XX @@ typedef struct TCGProfile {
14
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
17
int64_t tb_count;
15
*/
18
int64_t op_count; /* total insn count */
16
bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
19
int op_count_max; /* max insn per TB */
17
{
20
- int64_t temp_count;
18
- assert(pc != 0);
21
int temp_count_max;
19
-#ifdef TARGET_HAS_PRECISE_SMC
22
+ int64_t temp_count;
20
- assert_memory_lock();
23
int64_t del_op_count;
21
- {
24
int64_t code_in_len;
22
- TranslationBlock *current_tb = tcg_tb_lookup(pc);
25
int64_t code_out_len;
23
- bool current_tb_modified = false;
24
- TranslationBlock *tb;
25
- PageForEachNext n;
26
+ TranslationBlock *current_tb;
27
+ bool current_tb_modified;
28
+ TranslationBlock *tb;
29
+ PageForEachNext n;
30
31
- addr &= TARGET_PAGE_MASK;
32
-
33
- PAGE_FOR_EACH_TB(addr, addr + TARGET_PAGE_SIZE, unused, tb, n) {
34
- if (current_tb == tb &&
35
- (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
36
- /*
37
- * If we are modifying the current TB, we must stop its
38
- * execution. We could be more precise by checking that
39
- * the modification is after the current PC, but it would
40
- * require a specialized function to partially restore
41
- * the CPU state.
42
- */
43
- current_tb_modified = true;
44
- cpu_restore_state_from_tb(current_cpu, current_tb, pc);
45
- }
46
- tb_phys_invalidate__locked(tb);
47
- }
48
-
49
- if (current_tb_modified) {
50
- /* Force execution of one insn next time. */
51
- CPUState *cpu = current_cpu;
52
- cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
53
- return true;
54
- }
55
+ /*
56
+ * Without precise smc semantics, or when outside of a TB,
57
+ * we can skip to invalidate.
58
+ */
59
+#ifndef TARGET_HAS_PRECISE_SMC
60
+ pc = 0;
61
+#endif
62
+ if (!pc) {
63
+ tb_invalidate_phys_page(addr);
64
+ return false;
65
+ }
66
+
67
+ assert_memory_lock();
68
+ current_tb = tcg_tb_lookup(pc);
69
+
70
+ addr &= TARGET_PAGE_MASK;
71
+ current_tb_modified = false;
72
+
73
+ PAGE_FOR_EACH_TB(addr, addr + TARGET_PAGE_SIZE, unused, tb, n) {
74
+ if (current_tb == tb &&
75
+ (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
76
+ /*
77
+ * If we are modifying the current TB, we must stop its
78
+ * execution. We could be more precise by checking that
79
+ * the modification is after the current PC, but it would
80
+ * require a specialized function to partially restore
81
+ * the CPU state.
82
+ */
83
+ current_tb_modified = true;
84
+ cpu_restore_state_from_tb(current_cpu, current_tb, pc);
85
+ }
86
+ tb_phys_invalidate__locked(tb);
87
+ }
88
+
89
+ if (current_tb_modified) {
90
+ /* Force execution of one insn next time. */
91
+ CPUState *cpu = current_cpu;
92
+ cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
93
+ return true;
94
}
95
-#else
96
- tb_invalidate_phys_page(addr);
97
-#endif /* TARGET_HAS_PRECISE_SMC */
98
return false;
99
}
100
#else
26
--
101
--
27
2.17.2
102
2.34.1
28
103
29
104
1
Reviewed-by: Emilio G. Cota <cota@braap.org>
1
Because we allow lockless lookups, we have to be careful
2
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
2
when it is freed. Use rcu to delay the free until safe.
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
6
---
5
target/i386/mem_helper.c | 9 ++++-----
7
accel/tcg/user-exec.c | 18 ++++++++++--------
6
1 file changed, 4 insertions(+), 5 deletions(-)
8
1 file changed, 10 insertions(+), 8 deletions(-)
7
9
8
diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
10
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
9
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
10
--- a/target/i386/mem_helper.c
12
--- a/accel/tcg/user-exec.c
11
+++ b/target/i386/mem_helper.c
13
+++ b/accel/tcg/user-exec.c
12
@@ -XXX,XX +XXX,XX @@
14
@@ -XXX,XX +XXX,XX @@
13
#include "exec/exec-all.h"
15
#include "exec/exec-all.h"
16
#include "tcg/tcg.h"
17
#include "qemu/bitops.h"
18
+#include "qemu/rcu.h"
14
#include "exec/cpu_ldst.h"
19
#include "exec/cpu_ldst.h"
15
#include "qemu/int128.h"
20
#include "exec/translate-all.h"
16
+#include "qemu/atomic128.h"
21
#include "exec/helper-proto.h"
17
#include "tcg.h"
22
@@ -XXX,XX +XXX,XX @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
18
23
}
19
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
24
20
@@ -XXX,XX +XXX,XX @@ void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
25
typedef struct PageFlagsNode {
21
26
+ struct rcu_head rcu;
22
if ((a0 & 0xf) != 0) {
27
IntervalTreeNode itree;
23
raise_exception_ra(env, EXCP0D_GPF, ra);
28
int flags;
24
- } else {
29
} PageFlagsNode;
25
-#ifndef CONFIG_ATOMIC128
30
@@ -XXX,XX +XXX,XX @@ static bool pageflags_unset(target_ulong start, target_ulong last)
26
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
31
}
27
-#else
32
} else if (p_last <= last) {
28
+ } else if (HAVE_CMPXCHG128) {
33
/* Range completely covers node -- remove it. */
29
int eflags = cpu_cc_compute_all(env, CC_OP);
34
- g_free(p);
30
35
+ g_free_rcu(p, rcu);
31
Int128 cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
36
} else {
32
@@ -XXX,XX +XXX,XX @@ void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
37
/* Truncate the node from the start. */
33
eflags &= ~CC_Z;
38
p->itree.start = last + 1;
39
@@ -XXX,XX +XXX,XX @@ static void pageflags_create_merge(target_ulong start, target_ulong last,
40
if (prev) {
41
if (next) {
42
prev->itree.last = next->itree.last;
43
- g_free(next);
44
+ g_free_rcu(next, rcu);
45
} else {
46
prev->itree.last = last;
34
}
47
}
35
CC_SRC = eflags;
48
@@ -XXX,XX +XXX,XX @@ static bool pageflags_set_clear(target_ulong start, target_ulong last,
36
-#endif
49
p->flags = merge_flags;
37
+ } else {
50
} else {
38
+ cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
51
interval_tree_remove(&p->itree, &pageflags_root);
52
- g_free(p);
53
+ g_free_rcu(p, rcu);
54
}
55
goto done;
56
}
57
@@ -XXX,XX +XXX,XX @@ static bool pageflags_set_clear(target_ulong start, target_ulong last,
58
p->flags = merge_flags;
59
} else {
60
interval_tree_remove(&p->itree, &pageflags_root);
61
- g_free(p);
62
+ g_free_rcu(p, rcu);
63
}
64
if (p_last < last) {
65
start = p_last + 1;
66
@@ -XXX,XX +XXX,XX @@ static bool pageflags_set_clear(target_ulong start, target_ulong last,
67
p->itree.start = last + 1;
68
interval_tree_insert(&p->itree, &pageflags_root);
69
} else {
70
- g_free(p);
71
+ g_free_rcu(p, rcu);
72
goto restart;
73
}
74
if (set_flags) {
75
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
76
#define TBD_MASK (TARGET_PAGE_MASK * TPD_PAGES)
77
78
typedef struct TargetPageDataNode {
79
+ struct rcu_head rcu;
80
IntervalTreeNode itree;
81
char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
82
} TargetPageDataNode;
83
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end)
84
n = next,
85
next = next ? interval_tree_iter_next(n, start, last) : NULL) {
86
target_ulong n_start, n_last, p_ofs, p_len;
87
- TargetPageDataNode *t;
88
+ TargetPageDataNode *t = container_of(n, TargetPageDataNode, itree);
89
90
if (n->start >= start && n->last <= last) {
91
interval_tree_remove(n, &targetdata_root);
92
- g_free(n);
93
+ g_free_rcu(t, rcu);
94
continue;
95
}
96
97
@@ -XXX,XX +XXX,XX @@ void page_reset_target_data(target_ulong start, target_ulong end)
98
n_last = MIN(last, n->last);
99
p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
100
101
- t = container_of(n, TargetPageDataNode, itree);
102
memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
39
}
103
}
40
}
104
}
41
#endif
42
--
105
--
43
2.17.2
106
2.34.1
44
107
45
108
1
Reviewed-by: David Hildenbrand <david@redhat.com>
1
As in page_get_flags, we need to try again with the mmap
2
lock held if we fail a page lookup.
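
The shape of that retry is roughly the following. check_one_page is a
made-up reduction for illustration; the real code below also handles the
PAGE_WRITE_ORG undo path and ranges spanning multiple nodes:

    /* Sketch only: resolve one page's flags, retrying under the mmap lock. */
    static int check_one_page(target_ulong start, target_ulong last, int flags)
    {
        int locked = have_mmap_lock(); /* tri-state: 0 unlocked, +1 global, -1 local */
        PageFlagsNode *p = pageflags_find(start, last);
        int ret = 0;

        if (!p && !locked) {
            /* Lockless lookups have false negatives; retry with the lock. */
            mmap_lock();
            locked = -1;
            p = pageflags_find(start, last);
        }
        if (!p || (flags & ~p->flags)) {
            ret = -1;
        }
        if (locked < 0) {
            mmap_unlock();
        }
        return ret;
    }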
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
---
6
---
4
target/s390x/mem_helper.c | 40 +++++++++++++++++++--------------------
7
accel/tcg/user-exec.c | 41 ++++++++++++++++++++++++++++++++++-------
5
target/s390x/translate.c | 25 +++++++++++++++++-------
8
1 file changed, 34 insertions(+), 7 deletions(-)
6
2 files changed, 38 insertions(+), 27 deletions(-)
7
9
8
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
10
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
9
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
10
--- a/target/s390x/mem_helper.c
12
--- a/accel/tcg/user-exec.c
11
+++ b/target/s390x/mem_helper.c
13
+++ b/accel/tcg/user-exec.c
12
@@ -XXX,XX +XXX,XX @@ void HELPER(cdsg_parallel)(CPUS390XState *env, uint64_t addr,
14
@@ -XXX,XX +XXX,XX @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
13
Int128 oldv;
15
int page_check_range(target_ulong start, target_ulong len, int flags)
14
bool fail;
15
16
- if (!HAVE_CMPXCHG128) {
17
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
18
- }
19
+ assert(HAVE_CMPXCHG128);
20
21
mem_idx = cpu_mmu_index(env, false);
22
oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
23
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(lpq_parallel)(CPUS390XState *env, uint64_t addr)
24
{
16
{
25
uintptr_t ra = GETPC();
17
target_ulong last;
26
uint64_t hi, lo;
18
+ int locked; /* tri-state: =0: unlocked, +1: global, -1: local */
27
+ int mem_idx;
19
+ int ret;
28
+ TCGMemOpIdx oi;
20
29
+ Int128 v;
21
if (len == 0) {
30
22
return 0; /* trivial length */
31
- if (HAVE_ATOMIC128) {
23
@@ -XXX,XX +XXX,XX @@ int page_check_range(target_ulong start, target_ulong len, int flags)
32
- int mem_idx = cpu_mmu_index(env, false);
24
return -1; /* wrap around */
33
- TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
25
}
34
- Int128 v = helper_atomic_ldo_be_mmu(env, addr, oi, ra);
26
35
- hi = int128_gethi(v);
27
+ locked = have_mmap_lock();
36
- lo = int128_getlo(v);
28
while (true) {
37
- } else {
29
PageFlagsNode *p = pageflags_find(start, last);
38
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
30
int missing;
39
- }
31
40
+ assert(HAVE_ATOMIC128);
32
if (!p) {
33
- return -1; /* entire region invalid */
34
+ if (!locked) {
35
+ /*
36
+ * Lockless lookups have false negatives.
37
+ * Retry with the lock held.
38
+ */
39
+ mmap_lock();
40
+ locked = -1;
41
+ p = pageflags_find(start, last);
42
+ }
43
+ if (!p) {
44
+ ret = -1; /* entire region invalid */
45
+ break;
46
+ }
47
}
48
if (start < p->itree.start) {
49
- return -1; /* initial bytes invalid */
50
+ ret = -1; /* initial bytes invalid */
51
+ break;
52
}
53
54
missing = flags & ~p->flags;
55
if (missing & PAGE_READ) {
56
- return -1; /* page not readable */
57
+ ret = -1; /* page not readable */
58
+ break;
59
}
60
if (missing & PAGE_WRITE) {
61
if (!(p->flags & PAGE_WRITE_ORG)) {
62
- return -1; /* page not writable */
63
+ ret = -1; /* page not writable */
64
+ break;
65
}
66
/* Asking about writable, but has been protected: undo. */
67
if (!page_unprotect(start, 0)) {
68
- return -1;
69
+ ret = -1;
70
+ break;
71
}
72
/* TODO: page_unprotect should take a range, not a single page. */
73
if (last - start < TARGET_PAGE_SIZE) {
74
- return 0; /* ok */
75
+ ret = 0; /* ok */
76
+ break;
77
}
78
start += TARGET_PAGE_SIZE;
79
continue;
80
}
81
82
if (last <= p->itree.last) {
83
- return 0; /* ok */
84
+ ret = 0; /* ok */
85
+ break;
86
}
87
start = p->itree.last + 1;
88
}
41
+
89
+
42
+ mem_idx = cpu_mmu_index(env, false);
90
+ /* Release the lock if acquired locally. */
43
+ oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
91
+ if (locked < 0) {
44
+ v = helper_atomic_ldo_be_mmu(env, addr, oi, ra);
92
+ mmap_unlock();
45
+ hi = int128_gethi(v);
93
+ }
46
+ lo = int128_getlo(v);
47
48
env->retxl = lo;
49
return hi;
50
@@ -XXX,XX +XXX,XX @@ void HELPER(stpq_parallel)(CPUS390XState *env, uint64_t addr,
51
uint64_t low, uint64_t high)
52
{
53
uintptr_t ra = GETPC();
54
+ int mem_idx;
55
+ TCGMemOpIdx oi;
56
+ Int128 v;
57
58
- if (HAVE_ATOMIC128) {
59
- int mem_idx = cpu_mmu_index(env, false);
60
- TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
61
- Int128 v = int128_make128(low, high);
62
- helper_atomic_sto_be_mmu(env, addr, v, oi, ra);
63
- } else {
64
- cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
65
- }
66
+ assert(HAVE_ATOMIC128);
67
+
68
+ mem_idx = cpu_mmu_index(env, false);
69
+ oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
70
+ v = int128_make128(low, high);
71
+ helper_atomic_sto_be_mmu(env, addr, v, oi, ra);
72
}
73
74
/* Execute instruction. This instruction executes an insn modified with
75
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
76
index XXXXXXX..XXXXXXX 100644
77
--- a/target/s390x/translate.c
78
+++ b/target/s390x/translate.c
79
@@ -XXX,XX +XXX,XX @@
80
#include "trace-tcg.h"
81
#include "exec/translator.h"
82
#include "exec/log.h"
83
+#include "qemu/atomic128.h"
84
85
86
/* Information that (most) every instruction needs to manipulate. */
87
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_cdsg(DisasContext *s, DisasOps *o)
88
int r3 = get_field(s->fields, r3);
89
int d2 = get_field(s->fields, d2);
90
int b2 = get_field(s->fields, b2);
91
+ DisasJumpType ret = DISAS_NEXT;
92
TCGv_i64 addr;
93
TCGv_i32 t_r1, t_r3;
94
95
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_cdsg(DisasContext *s, DisasOps *o)
96
addr = get_address(s, 0, b2, d2);
97
t_r1 = tcg_const_i32(r1);
98
t_r3 = tcg_const_i32(r3);
99
- if (tb_cflags(s->base.tb) & CF_PARALLEL) {
100
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
101
+ gen_helper_cdsg(cpu_env, addr, t_r1, t_r3);
102
+ } else if (HAVE_CMPXCHG128) {
103
gen_helper_cdsg_parallel(cpu_env, addr, t_r1, t_r3);
104
} else {
105
- gen_helper_cdsg(cpu_env, addr, t_r1, t_r3);
106
+ gen_helper_exit_atomic(cpu_env);
107
+ ret = DISAS_NORETURN;
108
}
109
tcg_temp_free_i64(addr);
110
tcg_temp_free_i32(t_r1);
111
tcg_temp_free_i32(t_r3);
112
113
set_cc_static(s);
114
- return DISAS_NEXT;
115
+ return ret;
94
+ return ret;
116
}
95
}
117
96
118
static DisasJumpType op_csst(DisasContext *s, DisasOps *o)
97
void page_protect(tb_page_addr_t address)
119
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_lpd(DisasContext *s, DisasOps *o)
120
121
static DisasJumpType op_lpq(DisasContext *s, DisasOps *o)
122
{
123
- if (tb_cflags(s->base.tb) & CF_PARALLEL) {
124
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
125
+ gen_helper_lpq(o->out, cpu_env, o->in2);
126
+ } else if (HAVE_ATOMIC128) {
127
gen_helper_lpq_parallel(o->out, cpu_env, o->in2);
128
} else {
129
- gen_helper_lpq(o->out, cpu_env, o->in2);
130
+ gen_helper_exit_atomic(cpu_env);
131
+ return DISAS_NORETURN;
132
}
133
return_low128(o->out2);
134
return DISAS_NEXT;
135
@@ -XXX,XX +XXX,XX @@ static DisasJumpType op_stmh(DisasContext *s, DisasOps *o)
136
137
static DisasJumpType op_stpq(DisasContext *s, DisasOps *o)
138
{
139
- if (tb_cflags(s->base.tb) & CF_PARALLEL) {
140
+ if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
141
+ gen_helper_stpq(cpu_env, o->in2, o->out2, o->out);
142
+ } else if (HAVE_ATOMIC128) {
143
gen_helper_stpq_parallel(cpu_env, o->in2, o->out2, o->out);
144
} else {
145
- gen_helper_stpq(cpu_env, o->in2, o->out2, o->out);
146
+ gen_helper_exit_atomic(cpu_env);
147
+ return DISAS_NORETURN;
148
}
149
return DISAS_NEXT;
150
}
151
--
98
--
152
2.17.2
99
2.34.1
153
100
154
101
New patch
1
1
From: Ilya Leoshkevich <iii@linux.ibm.com>
2
3
Add a test that locklessly changes and exercises page protection bits
4
from various threads. This helps catch race conditions in the VMA
5
handling.
6
7
Acked-by: Alex Bennée <alex.bennee@linaro.org>
8
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
9
Message-Id: <20221223120252.513319-1-iii@linux.ibm.com>
10
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
11
---
12
tests/tcg/multiarch/nop_func.h | 25 ++++
13
tests/tcg/multiarch/munmap-pthread.c | 16 +--
14
tests/tcg/multiarch/vma-pthread.c | 207 +++++++++++++++++++++++++++
15
tests/tcg/multiarch/Makefile.target | 3 +
16
4 files changed, 236 insertions(+), 15 deletions(-)
17
create mode 100644 tests/tcg/multiarch/nop_func.h
18
create mode 100644 tests/tcg/multiarch/vma-pthread.c
19
20
diff --git a/tests/tcg/multiarch/nop_func.h b/tests/tcg/multiarch/nop_func.h
21
new file mode 100644
22
index XXXXXXX..XXXXXXX
23
--- /dev/null
24
+++ b/tests/tcg/multiarch/nop_func.h
25
@@ -XXX,XX +XXX,XX @@
26
+/*
27
+ * No-op functions that can be safely copied.
28
+ *
29
+ * SPDX-License-Identifier: GPL-2.0-or-later
30
+ */
31
+#ifndef NOP_FUNC_H
32
+#define NOP_FUNC_H
33
+
34
+static const char nop_func[] = {
35
+#if defined(__aarch64__)
36
+ 0xc0, 0x03, 0x5f, 0xd6, /* ret */
37
+#elif defined(__alpha__)
38
+ 0x01, 0x80, 0xFA, 0x6B, /* ret */
39
+#elif defined(__arm__)
40
+ 0x1e, 0xff, 0x2f, 0xe1, /* bx lr */
41
+#elif defined(__riscv)
42
+ 0x67, 0x80, 0x00, 0x00, /* ret */
43
+#elif defined(__s390__)
44
+ 0x07, 0xfe, /* br %r14 */
45
+#elif defined(__i386__) || defined(__x86_64__)
46
+ 0xc3, /* ret */
47
+#endif
48
+};
49
+
50
+#endif
51
diff --git a/tests/tcg/multiarch/munmap-pthread.c b/tests/tcg/multiarch/munmap-pthread.c
52
index XXXXXXX..XXXXXXX 100644
53
--- a/tests/tcg/multiarch/munmap-pthread.c
54
+++ b/tests/tcg/multiarch/munmap-pthread.c
55
@@ -XXX,XX +XXX,XX @@
56
#include <sys/mman.h>
57
#include <unistd.h>
58
59
-static const char nop_func[] = {
60
-#if defined(__aarch64__)
61
- 0xc0, 0x03, 0x5f, 0xd6, /* ret */
62
-#elif defined(__alpha__)
63
- 0x01, 0x80, 0xFA, 0x6B, /* ret */
64
-#elif defined(__arm__)
65
- 0x1e, 0xff, 0x2f, 0xe1, /* bx lr */
66
-#elif defined(__riscv)
67
- 0x67, 0x80, 0x00, 0x00, /* ret */
68
-#elif defined(__s390__)
69
- 0x07, 0xfe, /* br %r14 */
70
-#elif defined(__i386__) || defined(__x86_64__)
71
- 0xc3, /* ret */
72
-#endif
73
-};
74
+#include "nop_func.h"
75
76
static void *thread_mmap_munmap(void *arg)
77
{
78
diff --git a/tests/tcg/multiarch/vma-pthread.c b/tests/tcg/multiarch/vma-pthread.c
79
new file mode 100644
80
index XXXXXXX..XXXXXXX
81
--- /dev/null
82
+++ b/tests/tcg/multiarch/vma-pthread.c
83
@@ -XXX,XX +XXX,XX @@
84
+/*
85
+ * Test that VMA updates do not race.
86
+ *
87
+ * SPDX-License-Identifier: GPL-2.0-or-later
88
+ *
89
+ * Map a contiguous chunk of RWX memory. Split it into 8 equally sized
90
+ * regions, each of which is guaranteed to have a certain combination of
91
+ * protection bits set.
92
+ *
93
+ * Reader, writer and executor threads perform the respective operations on
94
+ * pages, which are guaranteed to have the respective protection bit set.
95
+ * Two mutator threads change the non-fixed protection bits randomly.
96
+ */
97
+#include <assert.h>
98
+#include <fcntl.h>
99
+#include <pthread.h>
100
+#include <stdbool.h>
101
+#include <stdlib.h>
102
+#include <string.h>
103
+#include <stdio.h>
104
+#include <sys/mman.h>
105
+#include <unistd.h>
106
+
107
+#include "nop_func.h"
108
+
109
+#define PAGE_IDX_BITS 10
110
+#define PAGE_COUNT (1 << PAGE_IDX_BITS)
111
+#define PAGE_IDX_MASK (PAGE_COUNT - 1)
112
+#define REGION_IDX_BITS 3
113
+#define PAGE_IDX_R_MASK (1 << 7)
114
+#define PAGE_IDX_W_MASK (1 << 8)
115
+#define PAGE_IDX_X_MASK (1 << 9)
116
+#define REGION_MASK (PAGE_IDX_R_MASK | PAGE_IDX_W_MASK | PAGE_IDX_X_MASK)
117
+#define PAGES_PER_REGION (1 << (PAGE_IDX_BITS - REGION_IDX_BITS))
118
+
119
+struct context {
120
+ int pagesize;
121
+ char *ptr;
122
+ int dev_null_fd;
123
+ volatile int mutator_count;
124
+};
125
+
126
+static void *thread_read(void *arg)
127
+{
128
+ struct context *ctx = arg;
129
+ ssize_t sret;
130
+ size_t i, j;
131
+ int ret;
132
+
133
+ for (i = 0; ctx->mutator_count; i++) {
134
+ char *p;
135
+
136
+ j = (i & PAGE_IDX_MASK) | PAGE_IDX_R_MASK;
137
+ p = &ctx->ptr[j * ctx->pagesize];
138
+
139
+ /* Read directly. */
140
+ ret = memcmp(p, nop_func, sizeof(nop_func));
141
+ if (ret != 0) {
142
+ fprintf(stderr, "fail direct read %p\n", p);
143
+ abort();
144
+ }
145
+
146
+ /* Read indirectly. */
147
+ sret = write(ctx->dev_null_fd, p, 1);
148
+ if (sret != 1) {
149
+ if (sret < 0) {
150
+ fprintf(stderr, "fail indirect read %p (%m)\n", p);
151
+ } else {
152
+ fprintf(stderr, "fail indirect read %p (%zd)\n", p, sret);
153
+ }
154
+ abort();
155
+ }
156
+ }
157
+
158
+ return NULL;
159
+}
160
+
161
+static void *thread_write(void *arg)
162
+{
163
+ struct context *ctx = arg;
164
+ struct timespec *ts;
165
+ size_t i, j;
166
+ int ret;
167
+
168
+ for (i = 0; ctx->mutator_count; i++) {
169
+ j = (i & PAGE_IDX_MASK) | PAGE_IDX_W_MASK;
170
+
171
+ /* Write directly. */
172
+ memcpy(&ctx->ptr[j * ctx->pagesize], nop_func, sizeof(nop_func));
173
+
174
+ /* Write using a syscall. */
175
+ ts = (struct timespec *)(&ctx->ptr[(j + 1) * ctx->pagesize] -
176
+ sizeof(struct timespec));
177
+ ret = clock_gettime(CLOCK_REALTIME, ts);
178
+ if (ret != 0) {
179
+ fprintf(stderr, "fail indirect write %p (%m)\n", ts);
180
+ abort();
181
+ }
182
+ }
183
+
184
+ return NULL;
185
+}
186
+
187
+static void *thread_execute(void *arg)
188
+{
189
+ struct context *ctx = arg;
190
+ size_t i, j;
191
+
192
+ for (i = 0; ctx->mutator_count; i++) {
193
+ j = (i & PAGE_IDX_MASK) | PAGE_IDX_X_MASK;
194
+ ((void(*)(void))&ctx->ptr[j * ctx->pagesize])();
195
+ }
196
+
197
+ return NULL;
198
+}
199
+
200
+static void *thread_mutate(void *arg)
201
+{
202
+ size_t i, start_idx, end_idx, page_idx, tmp;
203
+ struct context *ctx = arg;
204
+ unsigned int seed;
205
+ int prot, ret;
206
+
207
+ seed = (unsigned int)time(NULL);
208
+ for (i = 0; i < 50000; i++) {
209
+ start_idx = rand_r(&seed) & PAGE_IDX_MASK;
210
+ end_idx = rand_r(&seed) & PAGE_IDX_MASK;
211
+ if (start_idx > end_idx) {
212
+ tmp = start_idx;
213
+ start_idx = end_idx;
214
+ end_idx = tmp;
215
+ }
216
+ prot = rand_r(&seed) & (PROT_READ | PROT_WRITE | PROT_EXEC);
217
+ for (page_idx = start_idx & REGION_MASK; page_idx <= end_idx;
218
+ page_idx += PAGES_PER_REGION) {
219
+ if (page_idx & PAGE_IDX_R_MASK) {
220
+ prot |= PROT_READ;
221
+ }
222
+ if (page_idx & PAGE_IDX_W_MASK) {
223
+ /* FIXME: qemu syscalls check for both read+write. */
224
+ prot |= PROT_WRITE | PROT_READ;
225
+ }
226
+ if (page_idx & PAGE_IDX_X_MASK) {
227
+ prot |= PROT_EXEC;
228
+ }
229
+ }
230
+ ret = mprotect(&ctx->ptr[start_idx * ctx->pagesize],
231
+ (end_idx - start_idx + 1) * ctx->pagesize, prot);
232
+ assert(ret == 0);
233
+ }
234
+
235
+ __atomic_fetch_sub(&ctx->mutator_count, 1, __ATOMIC_SEQ_CST);
236
+
237
+ return NULL;
238
+}
239
+
240
+int main(void)
241
+{
242
+ pthread_t threads[5];
243
+ struct context ctx;
244
+ size_t i;
245
+ int ret;
246
+
247
+ /* Without a template, nothing to test. */
248
+ if (sizeof(nop_func) == 0) {
249
+ return EXIT_SUCCESS;
250
+ }
251
+
252
+ /* Initialize memory chunk. */
253
+ ctx.pagesize = getpagesize();
254
+ ctx.ptr = mmap(NULL, PAGE_COUNT * ctx.pagesize,
255
+ PROT_READ | PROT_WRITE | PROT_EXEC,
256
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
257
+ assert(ctx.ptr != MAP_FAILED);
258
+ for (i = 0; i < PAGE_COUNT; i++) {
259
+ memcpy(&ctx.ptr[i * ctx.pagesize], nop_func, sizeof(nop_func));
260
+ }
261
+ ctx.dev_null_fd = open("/dev/null", O_WRONLY);
262
+ assert(ctx.dev_null_fd >= 0);
263
+ ctx.mutator_count = 2;
264
+
265
+ /* Start threads. */
266
+ ret = pthread_create(&threads[0], NULL, thread_read, &ctx);
267
+ assert(ret == 0);
268
+ ret = pthread_create(&threads[1], NULL, thread_write, &ctx);
269
+ assert(ret == 0);
270
+ ret = pthread_create(&threads[2], NULL, thread_execute, &ctx);
271
+ assert(ret == 0);
272
+ for (i = 3; i <= 4; i++) {
273
+ ret = pthread_create(&threads[i], NULL, thread_mutate, &ctx);
274
+ assert(ret == 0);
275
+ }
276
+
277
+ /* Wait for threads to stop. */
278
+ for (i = 0; i < sizeof(threads) / sizeof(threads[0]); i++) {
279
+ ret = pthread_join(threads[i], NULL);
280
+ assert(ret == 0);
281
+ }
282
+
283
+ /* Destroy memory chunk. */
284
+ ret = close(ctx.dev_null_fd);
285
+ assert(ret == 0);
286
+ ret = munmap(ctx.ptr, PAGE_COUNT * ctx.pagesize);
287
+ assert(ret == 0);
288
+
289
+ return EXIT_SUCCESS;
290
+}
291
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
292
index XXXXXXX..XXXXXXX 100644
293
--- a/tests/tcg/multiarch/Makefile.target
294
+++ b/tests/tcg/multiarch/Makefile.target
295
@@ -XXX,XX +XXX,XX @@ signals: LDFLAGS+=-lrt -lpthread
296
munmap-pthread: CFLAGS+=-pthread
297
munmap-pthread: LDFLAGS+=-pthread
298
299
+vma-pthread: CFLAGS+=-pthread
300
+vma-pthread: LDFLAGS+=-pthread
301
+
302
# We define the runner for test-mmap after the individual
303
# architectures have defined their supported pages sizes. If no
304
# additional page sizes are defined we only run the default test.
305
--
306
2.34.1
307
308