This is v4 of my notdirty + rom patch set with two suggested name
changes (qemu_build_not_reached, TLB_DISCARD_WRITE) from David and Alex.

r~

The following changes since commit 240ab11fb72049d6373cbbec8d788f8e411a00bc:

  Merge remote-tracking branch 'remotes/aperard/tags/pull-xen-20190924' into staging (2019-09-24 15:36:31 +0100)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20190925

for you to fetch changes up to ae57db63acf5a0399232f852acc5c1d83ef63400:

  cputlb: Pass retaddr to tb_check_watchpoint (2019-09-25 10:56:28 -0700)

----------------------------------------------------------------
Fixes for TLB_BSWAP
Conversion of NOTDIRTY and ROM handling to cputlb
Followup cleanups to cputlb

----------------------------------------------------------------
Richard Henderson (16):
      exec: Use TARGET_PAGE_BITS_MIN for TLB flags
      cputlb: Disable __always_inline__ without optimization
      qemu/compiler.h: Add qemu_build_not_reached
      cputlb: Use qemu_build_not_reached in load/store_helpers
      cputlb: Split out load/store_memop
      cputlb: Introduce TLB_BSWAP
      exec: Adjust notdirty tracing
      cputlb: Move ROM handling from I/O path to TLB path
      cputlb: Move NOTDIRTY handling from I/O path to TLB path
      cputlb: Partially inline memory_region_section_get_iotlb
      cputlb: Merge and move memory_notdirty_write_{prepare,complete}
      cputlb: Handle TLB_NOTDIRTY in probe_access
      cputlb: Remove cpu->mem_io_vaddr
      cputlb: Remove tb_invalidate_phys_page_range is_cpu_write_access
      cputlb: Pass retaddr to tb_invalidate_phys_page_fast
      cputlb: Pass retaddr to tb_check_watchpoint

 accel/tcg/translate-all.h | 8 +-
 include/exec/cpu-all.h | 23 ++-
 include/exec/cpu-common.h | 3 -
 include/exec/exec-all.h | 6 +-
 include/exec/memory-internal.h | 65 --------
 include/hw/core/cpu.h | 2 -
 include/qemu/compiler.h | 26 +++
 accel/tcg/cputlb.c | 348 +++++++++++++++++++++++++----------------
 accel/tcg/translate-all.c | 51 +++---
 exec.c | 158 +------------------
 hw/core/cpu.c | 1 -
 memory.c | 20 ---
 trace-events | 4 +-
 13 files changed, 288 insertions(+), 427 deletions(-)


The following changes since commit 3c8c36c9087da957f580a9bb5ebf7814a753d1c6:

  Merge remote-tracking branch 'remotes/kraxel/tags/ui-20201104-pull-request' into staging (2020-11-04 16:52:17 +0000)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20201104

for you to fetch changes up to c56caea3b2a4ef5d760266f554df0d92c5a45f87:

  tcg: Revert "tcg/optimize: Flush data at labels not TCG_OPF_BB_END" (2020-11-04 10:35:40 -0800)

----------------------------------------------------------------
Fix assert in set_jmp_reset_offset
Revert cross-branch optimization in tcg/optimize.c.

----------------------------------------------------------------
Richard Henderson (2):
      tcg: Remove assert from set_jmp_reset_offset
      tcg: Revert "tcg/optimize: Flush data at labels not TCG_OPF_BB_END"

 tcg/optimize.c | 35 +++++++++++++++++------------------
 tcg/tcg.c | 9 +++++----
 2 files changed, 22 insertions(+), 22 deletions(-)

diff view generated by jsdifflib
Deleted patch
These bits do not need to vary with the actual page size
used by the guest.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
13
index XXXXXXX..XXXXXXX 100644
14
--- a/include/exec/cpu-all.h
15
+++ b/include/exec/cpu-all.h
16
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
17
18
#if !defined(CONFIG_USER_ONLY)
19
20
-/* Flags stored in the low bits of the TLB virtual address. These are
21
- * defined so that fast path ram access is all zeros.
22
+/*
23
+ * Flags stored in the low bits of the TLB virtual address.
24
+ * These are defined so that fast path ram access is all zeros.
25
* The flags all must be between TARGET_PAGE_BITS and
26
* maximum address alignment bit.
27
+ *
28
+ * Use TARGET_PAGE_BITS_MIN so that these bits are constant
29
+ * when TARGET_PAGE_BITS_VARY is in effect.
30
*/
31
/* Zero if TLB entry is valid. */
32
-#define TLB_INVALID_MASK (1 << (TARGET_PAGE_BITS - 1))
33
+#define TLB_INVALID_MASK (1 << (TARGET_PAGE_BITS_MIN - 1))
34
/* Set if TLB entry references a clean RAM page. The iotlb entry will
35
contain the page physical address. */
36
-#define TLB_NOTDIRTY (1 << (TARGET_PAGE_BITS - 2))
37
+#define TLB_NOTDIRTY (1 << (TARGET_PAGE_BITS_MIN - 2))
38
/* Set if TLB entry is an IO callback. */
39
-#define TLB_MMIO (1 << (TARGET_PAGE_BITS - 3))
40
+#define TLB_MMIO (1 << (TARGET_PAGE_BITS_MIN - 3))
41
/* Set if TLB entry contains a watchpoint. */
42
-#define TLB_WATCHPOINT (1 << (TARGET_PAGE_BITS - 4))
43
+#define TLB_WATCHPOINT (1 << (TARGET_PAGE_BITS_MIN - 4))
44
45
/* Use this mask to check interception with an alignment mask
46
* in a TCG backend.
47
--
48
2.17.1
49
50
diff view generated by jsdifflib
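A side note on the patch above: with TARGET_PAGE_BITS_VARY, TARGET_PAGE_BITS is read from a runtime variable, so a flag defined from it is no longer an integer constant expression, while TARGET_PAGE_BITS_MIN is a fixed lower bound and keeps the flag values compile-time constants. A small standalone C sketch of the difference (the names below are stand-ins, not QEMU code):

    /* page_flags.c: illustrative only; names are hypothetical stand-ins. */
    extern int runtime_page_bits;            /* like TARGET_PAGE_BITS under TARGET_PAGE_BITS_VARY */
    #define PAGE_BITS_MIN 10                 /* like TARGET_PAGE_BITS_MIN: a fixed lower bound */

    #define FLAG_FROM_RUNTIME  (1 << (runtime_page_bits - 1))  /* value only known at run time */
    #define FLAG_FROM_MIN      (1 << (PAGE_BITS_MIN - 1))      /* integer constant expression */

    /* Only the constant form can be used where the compiler requires a constant: */
    _Static_assert(FLAG_FROM_MIN == 0x200, "flag bits are fixed regardless of guest page size");
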
Deleted patch
This forced inlining can result in missing symbols,
which makes a debugging build harder to follow.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/compiler.h | 11 +++++++++++
 accel/tcg/cputlb.c | 4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/qemu/compiler.h
17
+++ b/include/qemu/compiler.h
18
@@ -XXX,XX +XXX,XX @@
19
# define QEMU_NONSTRING
20
#endif
21
22
+/*
23
+ * Forced inlining may be desired to encourage constant propagation
24
+ * of function parameters. However, it can also make debugging harder,
25
+ * so disable it for a non-optimizing build.
26
+ */
27
+#if defined(__OPTIMIZE__)
28
+#define QEMU_ALWAYS_INLINE __attribute__((always_inline))
29
+#else
30
+#define QEMU_ALWAYS_INLINE
31
+#endif
32
+
33
/* Implement C11 _Generic via GCC builtins. Example:
34
*
35
* QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x)
36
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/accel/tcg/cputlb.c
39
+++ b/accel/tcg/cputlb.c
40
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
41
typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
42
TCGMemOpIdx oi, uintptr_t retaddr);
43
44
-static inline uint64_t __attribute__((always_inline))
45
+static inline uint64_t QEMU_ALWAYS_INLINE
46
load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
47
uintptr_t retaddr, MemOp op, bool code_read,
48
FullLoadHelper *full_load)
49
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
50
* Store Helpers
51
*/
52
53
-static inline void __attribute__((always_inline))
54
+static inline void QEMU_ALWAYS_INLINE
55
store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
56
TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
57
{
58
--
59
2.17.1
60
61
diff view generated by jsdifflib
Deleted patch
Use this as a compile-time assert that a particular
code path is not reachable.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/compiler.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
11
index XXXXXXX..XXXXXXX 100644
12
--- a/include/qemu/compiler.h
13
+++ b/include/qemu/compiler.h
14
@@ -XXX,XX +XXX,XX @@
15
#define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__))
16
#define QEMU_GENERIC10(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__))
17
18
+/**
19
+ * qemu_build_not_reached()
20
+ *
21
+ * The compiler, during optimization, is expected to prove that a call
22
+ * to this function cannot be reached and remove it. If the compiler
23
+ * supports QEMU_ERROR, this will be reported at compile time; otherwise
24
+ * this will be reported at link time due to the missing symbol.
25
+ */
26
+#ifdef __OPTIMIZE__
27
+extern void QEMU_NORETURN QEMU_ERROR("code path is reachable")
28
+ qemu_build_not_reached(void);
29
+#else
30
+#define qemu_build_not_reached() g_assert_not_reached()
31
+#endif
32
+
33
#endif /* COMPILER_H */
34
--
35
2.17.1
36
37
diff view generated by jsdifflib
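A rough standalone sketch of how qemu_build_not_reached() is meant to be used, mirroring the pattern the following cputlb patch applies. Everything except the qemu_build_not_reached name is hypothetical, and g_assert_not_reached() is replaced by abort() so the snippet compiles outside QEMU:

    /* build_not_reached_demo.c: build with -O2 (call proven dead) or -O0 (abort fallback). */
    #include <stdlib.h>

    #ifdef __OPTIMIZE__
    extern void qemu_build_not_reached(void) __attribute__((noreturn));
    #else
    #define qemu_build_not_reached()  abort()   /* QEMU falls back to g_assert_not_reached() */
    #endif

    /* 'size' plays the role of MemOp in load_helper: constant after inlining. */
    static inline unsigned load_bytes(const unsigned char *p, int size)
    {
        switch (size) {
        case 1:
            return p[0];
        case 2:
            return p[0] | (unsigned)p[1] << 8;
        default:
            /* With optimization and a constant 'size' this call is removed as
             * unreachable; if it survives, the undefined symbol (or the
             * QEMU_ERROR diagnostic) fails the build instead of asserting at
             * run time. */
            qemu_build_not_reached();
        }
    }

    int main(void)
    {
        return load_bytes((const unsigned char *)"ab", 2) == 0x6261 ? 0 : 1;
    }
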
Increase the current runtime assert to a compile-time assert.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
         res = ldq_le_p(haddr);
         break;
     default:
-        g_assert_not_reached();
+        qemu_build_not_reached();
     }

     return res;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
         stq_le_p(haddr, val);
         break;
     default:
-        g_assert_not_reached();
-        break;
+        qemu_build_not_reached();
     }
 }

--
2.17.1


Since 6e6c4efed99, there has been a more appropriate range check
done later at the end of tcg_gen_code. There, a failing range
check results in a returned error code, which causes the TB to
be restarted at half the size.

Reported-by: Sai Pavan Boddu <saipava@xilinx.com>
Tested-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static bool tcg_resolve_relocs(TCGContext *s)

 static void set_jmp_reset_offset(TCGContext *s, int which)
 {
-    size_t off = tcg_current_code_size(s);
-    s->tb_jmp_reset_offset[which] = off;
-    /* Make sure that we didn't overflow the stored offset. */
-    assert(s->tb_jmp_reset_offset[which] == off);
+    /*
+     * We will check for overflow at the end of the opcode loop in
+     * tcg_gen_code, where we bound tcg_current_code_size to UINT16_MAX.
+     */
+    s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
 }

 #include "tcg-target.c.inc"
--
2.25.1

diff view generated by jsdifflib
Deleted patch
We will shortly be using these more than once.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 107 +++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/accel/tcg/cputlb.c
13
+++ b/accel/tcg/cputlb.c
14
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
15
typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
16
TCGMemOpIdx oi, uintptr_t retaddr);
17
18
+static inline uint64_t QEMU_ALWAYS_INLINE
19
+load_memop(const void *haddr, MemOp op)
20
+{
21
+ switch (op) {
22
+ case MO_UB:
23
+ return ldub_p(haddr);
24
+ case MO_BEUW:
25
+ return lduw_be_p(haddr);
26
+ case MO_LEUW:
27
+ return lduw_le_p(haddr);
28
+ case MO_BEUL:
29
+ return (uint32_t)ldl_be_p(haddr);
30
+ case MO_LEUL:
31
+ return (uint32_t)ldl_le_p(haddr);
32
+ case MO_BEQ:
33
+ return ldq_be_p(haddr);
34
+ case MO_LEQ:
35
+ return ldq_le_p(haddr);
36
+ default:
37
+ qemu_build_not_reached();
38
+ }
39
+}
40
+
41
static inline uint64_t QEMU_ALWAYS_INLINE
42
load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
43
uintptr_t retaddr, MemOp op, bool code_read,
44
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
45
46
do_aligned_access:
47
haddr = (void *)((uintptr_t)addr + entry->addend);
48
- switch (op) {
49
- case MO_UB:
50
- res = ldub_p(haddr);
51
- break;
52
- case MO_BEUW:
53
- res = lduw_be_p(haddr);
54
- break;
55
- case MO_LEUW:
56
- res = lduw_le_p(haddr);
57
- break;
58
- case MO_BEUL:
59
- res = (uint32_t)ldl_be_p(haddr);
60
- break;
61
- case MO_LEUL:
62
- res = (uint32_t)ldl_le_p(haddr);
63
- break;
64
- case MO_BEQ:
65
- res = ldq_be_p(haddr);
66
- break;
67
- case MO_LEQ:
68
- res = ldq_le_p(haddr);
69
- break;
70
- default:
71
- qemu_build_not_reached();
72
- }
73
-
74
- return res;
75
+ return load_memop(haddr, op);
76
}
77
78
/*
79
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
80
* Store Helpers
81
*/
82
83
+static inline void QEMU_ALWAYS_INLINE
84
+store_memop(void *haddr, uint64_t val, MemOp op)
85
+{
86
+ switch (op) {
87
+ case MO_UB:
88
+ stb_p(haddr, val);
89
+ break;
90
+ case MO_BEUW:
91
+ stw_be_p(haddr, val);
92
+ break;
93
+ case MO_LEUW:
94
+ stw_le_p(haddr, val);
95
+ break;
96
+ case MO_BEUL:
97
+ stl_be_p(haddr, val);
98
+ break;
99
+ case MO_LEUL:
100
+ stl_le_p(haddr, val);
101
+ break;
102
+ case MO_BEQ:
103
+ stq_be_p(haddr, val);
104
+ break;
105
+ case MO_LEQ:
106
+ stq_le_p(haddr, val);
107
+ break;
108
+ default:
109
+ qemu_build_not_reached();
110
+ }
111
+}
112
+
113
static inline void QEMU_ALWAYS_INLINE
114
store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
115
TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
116
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
117
118
do_aligned_access:
119
haddr = (void *)((uintptr_t)addr + entry->addend);
120
- switch (op) {
121
- case MO_UB:
122
- stb_p(haddr, val);
123
- break;
124
- case MO_BEUW:
125
- stw_be_p(haddr, val);
126
- break;
127
- case MO_LEUW:
128
- stw_le_p(haddr, val);
129
- break;
130
- case MO_BEUL:
131
- stl_be_p(haddr, val);
132
- break;
133
- case MO_LEUL:
134
- stl_le_p(haddr, val);
135
- break;
136
- case MO_BEQ:
137
- stq_be_p(haddr, val);
138
- break;
139
- case MO_LEQ:
140
- stq_le_p(haddr, val);
141
- break;
142
- default:
143
- qemu_build_not_reached();
144
- }
145
+ store_memop(haddr, val, op);
146
}
147
148
void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
149
--
150
2.17.1
151
152
diff view generated by jsdifflib
This reverts commit cd0372c515c4732d8bd3777cdd995c139c7ed7ea.

The patch is incorrect in that it retains copies between globals and
non-local temps, and non-local temps still die at the end of the BB.

Failing test case for hppa:

    .globl  _start
_start:
    cmpiclr,=   0x24,%r19,%r0
    cmpiclr,<>  0x2f,%r19,%r19

---- 00010057 0001005b
 movi_i32 tmp0,$0x24
 sub_i32 tmp1,tmp0,r19
 mov_i32 tmp2,tmp0
 mov_i32 tmp3,r19
 movi_i32 tmp1,$0x0

---- 0001005b 0001005f
 brcond_i32 tmp2,tmp3,eq,$L1
 movi_i32 tmp0,$0x2f
 sub_i32 tmp1,tmp0,r19
 mov_i32 tmp2,tmp0
 mov_i32 tmp3,r19
 movi_i32 tmp1,$0x0
 mov_i32 r19,tmp1
 setcond_i32 psw_n,tmp2,tmp3,ne
 set_label $L1

In this case, both copies of "mov_i32 tmp3,r19" are removed. The
second because opt thought it was redundant. The first is removed
later by liveness because tmp3 is known to be dead. This leaves
the setcond_i32 with an uninitialized input.

Revert the entire patch for 5.2, and a proper optimization across
the branch may be considered for the next development cycle.

Reported-by: qemu@igor2.repo.hu
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)


Handle bswap on ram directly in load/store_helper. This fixes a
bug with the previous implementation in that one cannot use the
I/O path for RAM.

Fixes: a26fc6f5152b47f1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h | 4 ++-
 accel/tcg/cputlb.c | 72 +++++++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
46
diff --git a/tcg/optimize.c b/tcg/optimize.c
15
index XXXXXXX..XXXXXXX 100644
47
index XXXXXXX..XXXXXXX 100644
16
--- a/include/exec/cpu-all.h
48
--- a/tcg/optimize.c
17
+++ b/include/exec/cpu-all.h
49
+++ b/tcg/optimize.c
18
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
50
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
19
#define TLB_MMIO (1 << (TARGET_PAGE_BITS_MIN - 3))
51
}
20
/* Set if TLB entry contains a watchpoint. */
52
}
21
#define TLB_WATCHPOINT (1 << (TARGET_PAGE_BITS_MIN - 4))
53
}
22
+/* Set if TLB entry requires byte swap. */
54
- /* fall through */
23
+#define TLB_BSWAP (1 << (TARGET_PAGE_BITS_MIN - 5))
55
+ goto do_reset_output;
24
56
25
/* Use this mask to check interception with an alignment mask
57
default:
26
* in a TCG backend.
58
do_default:
27
*/
59
- /*
28
#define TLB_FLAGS_MASK \
60
- * Default case: we know nothing about operation (or were unable
29
- (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT)
61
- * to compute the operation result) so no propagation is done.
30
+ (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
62
- */
31
63
- for (i = 0; i < nb_oargs; i++) {
32
/**
64
- reset_temp(op->args[i]);
33
* tlb_hit_page: return true if page aligned @addr is a hit against the
65
- /*
34
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
66
- * Save the corresponding known-zero bits mask for the
35
index XXXXXXX..XXXXXXX 100644
67
- * first output argument (only one supported so far).
36
--- a/accel/tcg/cputlb.c
68
- */
37
+++ b/accel/tcg/cputlb.c
69
- if (i == 0) {
38
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
70
- arg_info(op->args[i])->mask = mask;
39
address |= TLB_INVALID_MASK;
71
+ /* Default case: we know nothing about operation (or were unable
40
}
72
+ to compute the operation result) so no propagation is done.
41
if (attrs.byte_swap) {
73
+ We trash everything if the operation is the end of a basic
42
- /* Force the access through the I/O slow path. */
74
+ block, otherwise we only trash the output args. "mask" is
43
- address |= TLB_MMIO;
75
+ the non-zero bits mask for the first output arg. */
44
+ address |= TLB_BSWAP;
76
+ if (def->flags & TCG_OPF_BB_END) {
45
}
77
+ bitmap_zero(temps_used.l, nb_temps);
46
if (!memory_region_is_ram(section->mr) &&
78
+ } else {
47
!memory_region_is_romd(section->mr)) {
79
+ do_reset_output:
48
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
80
+ for (i = 0; i < nb_oargs; i++) {
49
bool locked = false;
81
+ reset_temp(op->args[i]);
50
MemTxResult r;
82
+ /* Save the corresponding known-zero bits mask for the
51
83
+ first output argument (only one supported so far). */
52
- if (iotlbentry->attrs.byte_swap) {
84
+ if (i == 0) {
53
- op ^= MO_BSWAP;
85
+ arg_info(op->args[i])->mask = mask;
54
- }
86
+ }
87
}
88
}
89
break;
55
-
90
-
56
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
91
- case INDEX_op_set_label:
57
mr = section->mr;
92
- /* Trash everything at the start of a new extended bb. */
58
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
93
- bitmap_zero(temps_used.l, nb_temps);
59
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
94
- break;
60
bool locked = false;
61
MemTxResult r;
62
63
- if (iotlbentry->attrs.byte_swap) {
64
- op ^= MO_BSWAP;
65
- }
66
-
67
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
68
mr = section->mr;
69
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
70
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
71
wp_access, retaddr);
72
}
73
74
- if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO)) {
75
- /* I/O access */
76
+ /* Reject I/O access, or other required slow-path. */
77
+ if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
78
return NULL;
79
}
80
81
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
82
/* Handle anything that isn't just a straight memory access. */
83
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
84
CPUIOTLBEntry *iotlbentry;
85
+ bool need_swap;
86
87
/* For anything that is unaligned, recurse through full_load. */
88
if ((addr & (size - 1)) != 0) {
89
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
90
/* On watchpoint hit, this will longjmp out. */
91
cpu_check_watchpoint(env_cpu(env), addr, size,
92
iotlbentry->attrs, BP_MEM_READ, retaddr);
93
-
94
- /* The backing page may or may not require I/O. */
95
- tlb_addr &= ~TLB_WATCHPOINT;
96
- if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
97
- goto do_aligned_access;
98
- }
99
}
95
}
100
96
101
+ need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
97
/* Eliminate duplicate and redundant fence instructions. */
102
+
103
/* Handle I/O access. */
104
- return io_readx(env, iotlbentry, mmu_idx, addr,
105
- retaddr, access_type, op);
106
+ if (likely(tlb_addr & TLB_MMIO)) {
107
+ return io_readx(env, iotlbentry, mmu_idx, addr, retaddr,
108
+ access_type, op ^ (need_swap * MO_BSWAP));
109
+ }
110
+
111
+ haddr = (void *)((uintptr_t)addr + entry->addend);
112
+
113
+ /*
114
+ * Keep these two load_memop separate to ensure that the compiler
115
+ * is able to fold the entire function to a single instruction.
116
+ * There is a build-time assert inside to remind you of this. ;-)
117
+ */
118
+ if (unlikely(need_swap)) {
119
+ return load_memop(haddr, op ^ MO_BSWAP);
120
+ }
121
+ return load_memop(haddr, op);
122
}
123
124
/* Handle slow unaligned access (it spans two pages or IO). */
125
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
126
return res & MAKE_64BIT_MASK(0, size * 8);
127
}
128
129
- do_aligned_access:
130
haddr = (void *)((uintptr_t)addr + entry->addend);
131
return load_memop(haddr, op);
132
}
133
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
134
/* Handle anything that isn't just a straight memory access. */
135
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
136
CPUIOTLBEntry *iotlbentry;
137
+ bool need_swap;
138
139
/* For anything that is unaligned, recurse through byte stores. */
140
if ((addr & (size - 1)) != 0) {
141
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
142
/* On watchpoint hit, this will longjmp out. */
143
cpu_check_watchpoint(env_cpu(env), addr, size,
144
iotlbentry->attrs, BP_MEM_WRITE, retaddr);
145
-
146
- /* The backing page may or may not require I/O. */
147
- tlb_addr &= ~TLB_WATCHPOINT;
148
- if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
149
- goto do_aligned_access;
150
- }
151
}
152
153
+ need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
154
+
155
/* Handle I/O access. */
156
- io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr, op);
157
+ if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
158
+ io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
159
+ op ^ (need_swap * MO_BSWAP));
160
+ return;
161
+ }
162
+
163
+ haddr = (void *)((uintptr_t)addr + entry->addend);
164
+
165
+ /*
166
+ * Keep these two store_memop separate to ensure that the compiler
167
+ * is able to fold the entire function to a single instruction.
168
+ * There is a build-time assert inside to remind you of this. ;-)
169
+ */
170
+ if (unlikely(need_swap)) {
171
+ store_memop(haddr, val, op ^ MO_BSWAP);
172
+ } else {
173
+ store_memop(haddr, val, op);
174
+ }
175
return;
176
}
177
178
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
179
return;
180
}
181
182
- do_aligned_access:
183
haddr = (void *)((uintptr_t)addr + entry->addend);
184
store_memop(haddr, val, op);
185
}
186
--
98
--
187
2.17.1
99
2.25.1
188
100
189
101
diff view generated by jsdifflib
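One detail of the TLB_BSWAP patch above that may not be obvious is the expression op ^ (need_swap * MO_BSWAP): because need_swap is a bool, the multiplication yields either 0 or MO_BSWAP, so the XOR toggles the byte-swap bit without a branch. A tiny standalone check, where the MO_BSWAP value is only a stand-in bit rather than QEMU's real MemOp definition:

    /* bswap_toggle_demo.c: MO_BSWAP here is a hypothetical stand-in bit. */
    #include <assert.h>
    #include <stdbool.h>

    enum { MO_BSWAP = 8 };

    static int adjust_op(int op, bool need_swap)
    {
        /* need_swap is 0 or 1, so this is either op ^ 0 or op ^ MO_BSWAP. */
        return op ^ (need_swap * MO_BSWAP);
    }

    int main(void)
    {
        assert(adjust_op(2, false) == 2);              /* unchanged        */
        assert(adjust_op(2, true)  == (2 | MO_BSWAP)); /* swap bit set     */
        assert(adjust_op(2 | MO_BSWAP, true) == 2);    /* swap bit cleared */
        return 0;
    }
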
Deleted patch
The memory_region_tb_read tracepoint is unreachable, since notdirty
is supposed to apply only to writes. The memory_region_tb_write
tracepoint is mis-named, because notdirty is not only used for TB
invalidation. It is also used for e.g. VGA RAM updates and migration.

Replace memory_region_tb_write with memory_notdirty_write_access,
and place it in memory_notdirty_write_prepare where it can catch
all of the instances. Add memory_notdirty_set_dirty to log when
we no longer intercept writes to a page.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 exec.c | 3 +++
 memory.c | 4 ----
 trace-events | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/exec.c b/exec.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/exec.c
24
+++ b/exec.c
25
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
26
ndi->size = size;
27
ndi->pages = NULL;
28
29
+ trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
30
+
31
assert(tcg_enabled());
32
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
33
ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
34
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
35
/* we remove the notdirty callback only if the code has been
36
flushed */
37
if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
38
+ trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
39
tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
40
}
41
}
42
diff --git a/memory.c b/memory.c
43
index XXXXXXX..XXXXXXX 100644
44
--- a/memory.c
45
+++ b/memory.c
46
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_accessor(MemoryRegion *mr,
47
/* Accesses to code which has previously been translated into a TB show
48
* up in the MMIO path, as accesses to the io_mem_notdirty
49
* MemoryRegion. */
50
- trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
51
} else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
52
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
53
trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
54
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
55
/* Accesses to code which has previously been translated into a TB show
56
* up in the MMIO path, as accesses to the io_mem_notdirty
57
* MemoryRegion. */
58
- trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
59
} else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
60
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
61
trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
62
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
63
/* Accesses to code which has previously been translated into a TB show
64
* up in the MMIO path, as accesses to the io_mem_notdirty
65
* MemoryRegion. */
66
- trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
67
} else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
68
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
69
trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
70
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
71
/* Accesses to code which has previously been translated into a TB show
72
* up in the MMIO path, as accesses to the io_mem_notdirty
73
* MemoryRegion. */
74
- trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
75
} else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
76
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
77
trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
78
diff --git a/trace-events b/trace-events
79
index XXXXXXX..XXXXXXX 100644
80
--- a/trace-events
81
+++ b/trace-events
82
@@ -XXX,XX +XXX,XX @@ dma_map_wait(void *dbs) "dbs=%p"
83
find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64
84
find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
85
ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
86
+memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
87
+memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
88
89
# memory.c
90
memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
91
memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
92
memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
93
memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
94
-memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
95
-memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
96
memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
97
memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
98
flatview_new(void *view, void *root) "%p (root %p)"
99
--
100
2.17.1
101
102
diff view generated by jsdifflib
Deleted patch
It does not require going through the whole I/O path
in order to discard a write.

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h | 5 ++++-
 include/exec/cpu-common.h | 1 -
 accel/tcg/cputlb.c | 36 ++++++++++++++++++--------------
 exec.c | 41 +--------------------------------------
 4 files changed, 26 insertions(+), 57 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
14
index XXXXXXX..XXXXXXX 100644
15
--- a/include/exec/cpu-all.h
16
+++ b/include/exec/cpu-all.h
17
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
18
#define TLB_WATCHPOINT (1 << (TARGET_PAGE_BITS_MIN - 4))
19
/* Set if TLB entry requires byte swap. */
20
#define TLB_BSWAP (1 << (TARGET_PAGE_BITS_MIN - 5))
21
+/* Set if TLB entry writes ignored. */
22
+#define TLB_DISCARD_WRITE (1 << (TARGET_PAGE_BITS_MIN - 6))
23
24
/* Use this mask to check interception with an alignment mask
25
* in a TCG backend.
26
*/
27
#define TLB_FLAGS_MASK \
28
- (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
29
+ (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO \
30
+ | TLB_WATCHPOINT | TLB_BSWAP | TLB_DISCARD_WRITE)
31
32
/**
33
* tlb_hit_page: return true if page aligned @addr is a hit against the
34
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
35
index XXXXXXX..XXXXXXX 100644
36
--- a/include/exec/cpu-common.h
37
+++ b/include/exec/cpu-common.h
38
@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
39
40
void cpu_flush_icache_range(hwaddr start, hwaddr len);
41
42
-extern struct MemoryRegion io_mem_rom;
43
extern struct MemoryRegion io_mem_notdirty;
44
45
typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
46
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
47
index XXXXXXX..XXXXXXX 100644
48
--- a/accel/tcg/cputlb.c
49
+++ b/accel/tcg/cputlb.c
50
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
51
{
52
uintptr_t addr = tlb_entry->addr_write;
53
54
- if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
55
+ if ((addr & (TLB_INVALID_MASK | TLB_MMIO |
56
+ TLB_DISCARD_WRITE | TLB_NOTDIRTY)) == 0) {
57
addr &= TARGET_PAGE_MASK;
58
addr += tlb_entry->addend;
59
if ((addr - start) < length) {
60
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
61
address |= TLB_MMIO;
62
addend = 0;
63
} else {
64
- /* TLB_MMIO for rom/romd handled below */
65
addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
66
}
67
68
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
69
70
tn.addr_write = -1;
71
if (prot & PAGE_WRITE) {
72
- if ((memory_region_is_ram(section->mr) && section->readonly)
73
- || memory_region_is_romd(section->mr)) {
74
- /* Write access calls the I/O callback. */
75
- tn.addr_write = address | TLB_MMIO;
76
- } else if (memory_region_is_ram(section->mr)
77
- && cpu_physical_memory_is_clean(
78
- memory_region_get_ram_addr(section->mr) + xlat)) {
79
- tn.addr_write = address | TLB_NOTDIRTY;
80
- } else {
81
- tn.addr_write = address;
82
+ tn.addr_write = address;
83
+ if (memory_region_is_romd(section->mr)) {
84
+ /* Use the MMIO path so that the device can switch states. */
85
+ tn.addr_write |= TLB_MMIO;
86
+ } else if (memory_region_is_ram(section->mr)) {
87
+ if (section->readonly) {
88
+ tn.addr_write |= TLB_DISCARD_WRITE;
89
+ } else if (cpu_physical_memory_is_clean(
90
+ memory_region_get_ram_addr(section->mr) + xlat)) {
91
+ tn.addr_write |= TLB_NOTDIRTY;
92
+ }
93
}
94
if (prot & PAGE_WRITE_INV) {
95
tn.addr_write |= TLB_INVALID_MASK;
96
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
97
mr = section->mr;
98
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
99
cpu->mem_io_pc = retaddr;
100
- if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
101
+ if (mr != &io_mem_notdirty && !cpu->can_do_io) {
102
cpu_io_recompile(cpu, retaddr);
103
}
104
105
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
106
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
107
mr = section->mr;
108
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
109
- if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
110
+ if (mr != &io_mem_notdirty && !cpu->can_do_io) {
111
cpu_io_recompile(cpu, retaddr);
112
}
113
cpu->mem_io_vaddr = addr;
114
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
115
}
116
117
/* Reject I/O access, or other required slow-path. */
118
- if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
119
+ if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
120
return NULL;
121
}
122
123
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
124
return;
125
}
126
127
+ /* Ignore writes to ROM. */
128
+ if (unlikely(tlb_addr & TLB_DISCARD_WRITE)) {
129
+ return;
130
+ }
131
+
132
haddr = (void *)((uintptr_t)addr + entry->addend);
133
134
/*
135
diff --git a/exec.c b/exec.c
136
index XXXXXXX..XXXXXXX 100644
137
--- a/exec.c
138
+++ b/exec.c
139
@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
140
AddressSpace address_space_io;
141
AddressSpace address_space_memory;
142
143
-MemoryRegion io_mem_rom, io_mem_notdirty;
144
+MemoryRegion io_mem_notdirty;
145
static MemoryRegion io_mem_unassigned;
146
#endif
147
148
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
149
150
#define PHYS_SECTION_UNASSIGNED 0
151
#define PHYS_SECTION_NOTDIRTY 1
152
-#define PHYS_SECTION_ROM 2
153
154
static void io_mem_init(void);
155
static void memory_map_init(void);
156
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
157
iotlb = memory_region_get_ram_addr(section->mr) + xlat;
158
if (!section->readonly) {
159
iotlb |= PHYS_SECTION_NOTDIRTY;
160
- } else {
161
- iotlb |= PHYS_SECTION_ROM;
162
}
163
} else {
164
AddressSpaceDispatch *d;
165
@@ -XXX,XX +XXX,XX @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
166
return phys_section_add(map, &section);
167
}
168
169
-static void readonly_mem_write(void *opaque, hwaddr addr,
170
- uint64_t val, unsigned size)
171
-{
172
- /* Ignore any write to ROM. */
173
-}
174
-
175
-static bool readonly_mem_accepts(void *opaque, hwaddr addr,
176
- unsigned size, bool is_write,
177
- MemTxAttrs attrs)
178
-{
179
- return is_write;
180
-}
181
-
182
-/* This will only be used for writes, because reads are special cased
183
- * to directly access the underlying host ram.
184
- */
185
-static const MemoryRegionOps readonly_mem_ops = {
186
- .write = readonly_mem_write,
187
- .valid.accepts = readonly_mem_accepts,
188
- .endianness = DEVICE_NATIVE_ENDIAN,
189
- .valid = {
190
- .min_access_size = 1,
191
- .max_access_size = 8,
192
- .unaligned = false,
193
- },
194
- .impl = {
195
- .min_access_size = 1,
196
- .max_access_size = 8,
197
- .unaligned = false,
198
- },
199
-};
200
-
201
MemoryRegionSection *iotlb_to_section(CPUState *cpu,
202
hwaddr index, MemTxAttrs attrs)
203
{
204
@@ -XXX,XX +XXX,XX @@ MemoryRegionSection *iotlb_to_section(CPUState *cpu,
205
206
static void io_mem_init(void)
207
{
208
- memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
209
- NULL, NULL, UINT64_MAX);
210
memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
211
NULL, UINT64_MAX);
212
213
@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
214
assert(n == PHYS_SECTION_UNASSIGNED);
215
n = dummy_section(&d->map, fv, &io_mem_notdirty);
216
assert(n == PHYS_SECTION_NOTDIRTY);
217
- n = dummy_section(&d->map, fv, &io_mem_rom);
218
- assert(n == PHYS_SECTION_ROM);
219
220
d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
221
222
--
223
2.17.1
224
225
diff view generated by jsdifflib
Deleted patch
Pages that we want to track for NOTDIRTY are RAM. We do not
really need to go through the I/O path to handle them.

Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-common.h | 2 --
 accel/tcg/cputlb.c | 26 +++++++++++++++++---
 exec.c | 50 ---------------------------------------
 memory.c | 16 -------------
 4 files changed, 23 insertions(+), 71 deletions(-)

diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
16
index XXXXXXX..XXXXXXX 100644
17
--- a/include/exec/cpu-common.h
18
+++ b/include/exec/cpu-common.h
19
@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
20
21
void cpu_flush_icache_range(hwaddr start, hwaddr len);
22
23
-extern struct MemoryRegion io_mem_notdirty;
24
-
25
typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
26
27
int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
28
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
29
index XXXXXXX..XXXXXXX 100644
30
--- a/accel/tcg/cputlb.c
31
+++ b/accel/tcg/cputlb.c
32
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
33
mr = section->mr;
34
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
35
cpu->mem_io_pc = retaddr;
36
- if (mr != &io_mem_notdirty && !cpu->can_do_io) {
37
+ if (!cpu->can_do_io) {
38
cpu_io_recompile(cpu, retaddr);
39
}
40
41
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
42
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
43
mr = section->mr;
44
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
45
- if (mr != &io_mem_notdirty && !cpu->can_do_io) {
46
+ if (!cpu->can_do_io) {
47
cpu_io_recompile(cpu, retaddr);
48
}
49
cpu->mem_io_vaddr = addr;
50
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
51
need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
52
53
/* Handle I/O access. */
54
- if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
55
+ if (tlb_addr & TLB_MMIO) {
56
io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
57
op ^ (need_swap * MO_BSWAP));
58
return;
59
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
60
61
haddr = (void *)((uintptr_t)addr + entry->addend);
62
63
+ /* Handle clean RAM pages. */
64
+ if (tlb_addr & TLB_NOTDIRTY) {
65
+ NotDirtyInfo ndi;
66
+
67
+ /* We require mem_io_pc in tb_invalidate_phys_page_range. */
68
+ env_cpu(env)->mem_io_pc = retaddr;
69
+
70
+ memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
71
+ addr + iotlbentry->addr, size);
72
+
73
+ if (unlikely(need_swap)) {
74
+ store_memop(haddr, val, op ^ MO_BSWAP);
75
+ } else {
76
+ store_memop(haddr, val, op);
77
+ }
78
+
79
+ memory_notdirty_write_complete(&ndi);
80
+ return;
81
+ }
82
+
83
/*
84
* Keep these two store_memop separate to ensure that the compiler
85
* is able to fold the entire function to a single instruction.
86
diff --git a/exec.c b/exec.c
87
index XXXXXXX..XXXXXXX 100644
88
--- a/exec.c
89
+++ b/exec.c
90
@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
91
AddressSpace address_space_io;
92
AddressSpace address_space_memory;
93
94
-MemoryRegion io_mem_notdirty;
95
static MemoryRegion io_mem_unassigned;
96
#endif
97
98
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
99
} subpage_t;
100
101
#define PHYS_SECTION_UNASSIGNED 0
102
-#define PHYS_SECTION_NOTDIRTY 1
103
104
static void io_mem_init(void);
105
static void memory_map_init(void);
106
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
107
if (memory_region_is_ram(section->mr)) {
108
/* Normal RAM. */
109
iotlb = memory_region_get_ram_addr(section->mr) + xlat;
110
- if (!section->readonly) {
111
- iotlb |= PHYS_SECTION_NOTDIRTY;
112
- }
113
} else {
114
AddressSpaceDispatch *d;
115
116
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
117
}
118
}
119
120
-/* Called within RCU critical section. */
121
-static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
122
- uint64_t val, unsigned size)
123
-{
124
- NotDirtyInfo ndi;
125
-
126
- memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
127
- ram_addr, size);
128
-
129
- stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
130
- memory_notdirty_write_complete(&ndi);
131
-}
132
-
133
-static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
134
- unsigned size, bool is_write,
135
- MemTxAttrs attrs)
136
-{
137
- return is_write;
138
-}
139
-
140
-static const MemoryRegionOps notdirty_mem_ops = {
141
- .write = notdirty_mem_write,
142
- .valid.accepts = notdirty_mem_accepts,
143
- .endianness = DEVICE_NATIVE_ENDIAN,
144
- .valid = {
145
- .min_access_size = 1,
146
- .max_access_size = 8,
147
- .unaligned = false,
148
- },
149
- .impl = {
150
- .min_access_size = 1,
151
- .max_access_size = 8,
152
- .unaligned = false,
153
- },
154
-};
155
-
156
/* Generate a debug exception if a watchpoint has been hit. */
157
void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
158
MemTxAttrs attrs, int flags, uintptr_t ra)
159
@@ -XXX,XX +XXX,XX @@ static void io_mem_init(void)
160
{
161
memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
162
NULL, UINT64_MAX);
163
-
164
- /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
165
- * which can be called without the iothread mutex.
166
- */
167
- memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
168
- NULL, UINT64_MAX);
169
- memory_region_clear_global_locking(&io_mem_notdirty);
170
}
171
172
AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
173
@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
174
175
n = dummy_section(&d->map, fv, &io_mem_unassigned);
176
assert(n == PHYS_SECTION_UNASSIGNED);
177
- n = dummy_section(&d->map, fv, &io_mem_notdirty);
178
- assert(n == PHYS_SECTION_NOTDIRTY);
179
180
d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
181
182
diff --git a/memory.c b/memory.c
183
index XXXXXXX..XXXXXXX 100644
184
--- a/memory.c
185
+++ b/memory.c
186
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_accessor(MemoryRegion *mr,
187
tmp = mr->ops->read(mr->opaque, addr, size);
188
if (mr->subpage) {
189
trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
190
- } else if (mr == &io_mem_notdirty) {
191
- /* Accesses to code which has previously been translated into a TB show
192
- * up in the MMIO path, as accesses to the io_mem_notdirty
193
- * MemoryRegion. */
194
} else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
195
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
196
trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
197
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
198
r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs);
199
if (mr->subpage) {
200
trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
201
- } else if (mr == &io_mem_notdirty) {
202
- /* Accesses to code which has previously been translated into a TB show
203
- * up in the MMIO path, as accesses to the io_mem_notdirty
204
- * MemoryRegion. */
205
} else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
206
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
207
trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
208
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
209
210
if (mr->subpage) {
211
trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
212
- } else if (mr == &io_mem_notdirty) {
213
- /* Accesses to code which has previously been translated into a TB show
214
- * up in the MMIO path, as accesses to the io_mem_notdirty
215
- * MemoryRegion. */
216
} else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
217
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
218
trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
219
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
220
221
if (mr->subpage) {
222
trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
223
- } else if (mr == &io_mem_notdirty) {
224
- /* Accesses to code which has previously been translated into a TB show
225
- * up in the MMIO path, as accesses to the io_mem_notdirty
226
- * MemoryRegion. */
227
} else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
228
hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
229
trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
230
--
231
2.17.1
232
233
diff view generated by jsdifflib
Deleted patch
There is only one caller, tlb_set_page_with_attrs. We cannot
inline the entire function because the AddressSpaceDispatch
structure is private to exec.c, and cannot easily be moved to
include/exec/memory-internal.h.

Compute is_ram and is_romd once within tlb_set_page_with_attrs.
Fold the number of tests against these predicates. Compute
cpu_physical_memory_is_clean outside of the tlb lock region.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h | 6 +---
 accel/tcg/cputlb.c | 68 ++++++++++++++++++++++++++---------------
 exec.c | 22 ++-----------
 3 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
19
index XXXXXXX..XXXXXXX 100644
20
--- a/include/exec/exec-all.h
21
+++ b/include/exec/exec-all.h
22
@@ -XXX,XX +XXX,XX @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
23
hwaddr *xlat, hwaddr *plen,
24
MemTxAttrs attrs, int *prot);
25
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
26
- MemoryRegionSection *section,
27
- target_ulong vaddr,
28
- hwaddr paddr, hwaddr xlat,
29
- int prot,
30
- target_ulong *address);
31
+ MemoryRegionSection *section);
32
#endif
33
34
/* vl.c */
35
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
36
index XXXXXXX..XXXXXXX 100644
37
--- a/accel/tcg/cputlb.c
38
+++ b/accel/tcg/cputlb.c
39
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
40
MemoryRegionSection *section;
41
unsigned int index;
42
target_ulong address;
43
- target_ulong code_address;
44
+ target_ulong write_address;
45
uintptr_t addend;
46
CPUTLBEntry *te, tn;
47
hwaddr iotlb, xlat, sz, paddr_page;
48
target_ulong vaddr_page;
49
int asidx = cpu_asidx_from_attrs(cpu, attrs);
50
int wp_flags;
51
+ bool is_ram, is_romd;
52
53
assert_cpu_is_self(cpu);
54
55
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
56
if (attrs.byte_swap) {
57
address |= TLB_BSWAP;
58
}
59
- if (!memory_region_is_ram(section->mr) &&
60
- !memory_region_is_romd(section->mr)) {
61
- /* IO memory case */
62
- address |= TLB_MMIO;
63
- addend = 0;
64
- } else {
65
+
66
+ is_ram = memory_region_is_ram(section->mr);
67
+ is_romd = memory_region_is_romd(section->mr);
68
+
69
+ if (is_ram || is_romd) {
70
+ /* RAM and ROMD both have associated host memory. */
71
addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
72
+ } else {
73
+ /* I/O does not; force the host address to NULL. */
74
+ addend = 0;
75
+ }
76
+
77
+ write_address = address;
78
+ if (is_ram) {
79
+ iotlb = memory_region_get_ram_addr(section->mr) + xlat;
80
+ /*
81
+ * Computing is_clean is expensive; avoid all that unless
82
+ * the page is actually writable.
83
+ */
84
+ if (prot & PAGE_WRITE) {
85
+ if (section->readonly) {
86
+ write_address |= TLB_DISCARD_WRITE;
87
+ } else if (cpu_physical_memory_is_clean(iotlb)) {
88
+ write_address |= TLB_NOTDIRTY;
89
+ }
90
+ }
91
+ } else {
92
+ /* I/O or ROMD */
93
+ iotlb = memory_region_section_get_iotlb(cpu, section) + xlat;
94
+ /*
95
+ * Writes to romd devices must go through MMIO to enable write.
96
+ * Reads to romd devices go through the ram_ptr found above,
97
+ * but of course reads to I/O must go through MMIO.
98
+ */
99
+ write_address |= TLB_MMIO;
100
+ if (!is_romd) {
101
+ address = write_address;
102
+ }
103
}
104
105
- code_address = address;
106
- iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
107
- paddr_page, xlat, prot, &address);
108
wp_flags = cpu_watchpoint_address_matches(cpu, vaddr_page,
109
TARGET_PAGE_SIZE);
110
111
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
112
/*
113
* At this point iotlb contains a physical section number in the lower
114
* TARGET_PAGE_BITS, and either
115
- * + the ram_addr_t of the page base of the target RAM (if NOTDIRTY or ROM)
116
- * + the offset within section->mr of the page base (otherwise)
117
+ * + the ram_addr_t of the page base of the target RAM (RAM)
118
+ * + the offset within section->mr of the page base (I/O, ROMD)
119
* We subtract the vaddr_page (which is page aligned and thus won't
120
* disturb the low bits) to give an offset which can be added to the
121
* (non-page-aligned) vaddr of the eventual memory access to get
122
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
123
}
124
125
if (prot & PAGE_EXEC) {
126
- tn.addr_code = code_address;
127
+ tn.addr_code = address;
128
} else {
129
tn.addr_code = -1;
130
}
131
132
tn.addr_write = -1;
133
if (prot & PAGE_WRITE) {
134
- tn.addr_write = address;
135
- if (memory_region_is_romd(section->mr)) {
136
- /* Use the MMIO path so that the device can switch states. */
137
- tn.addr_write |= TLB_MMIO;
138
- } else if (memory_region_is_ram(section->mr)) {
139
- if (section->readonly) {
140
- tn.addr_write |= TLB_DISCARD_WRITE;
141
- } else if (cpu_physical_memory_is_clean(
142
- memory_region_get_ram_addr(section->mr) + xlat)) {
143
- tn.addr_write |= TLB_NOTDIRTY;
144
- }
145
- }
146
+ tn.addr_write = write_address;
147
if (prot & PAGE_WRITE_INV) {
148
tn.addr_write |= TLB_INVALID_MASK;
149
}
150
diff --git a/exec.c b/exec.c
151
index XXXXXXX..XXXXXXX 100644
152
--- a/exec.c
153
+++ b/exec.c
154
@@ -XXX,XX +XXX,XX @@ bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
155
156
/* Called from RCU critical section */
157
hwaddr memory_region_section_get_iotlb(CPUState *cpu,
158
- MemoryRegionSection *section,
159
- target_ulong vaddr,
160
- hwaddr paddr, hwaddr xlat,
161
- int prot,
162
- target_ulong *address)
163
+ MemoryRegionSection *section)
164
{
165
- hwaddr iotlb;
166
-
167
- if (memory_region_is_ram(section->mr)) {
168
- /* Normal RAM. */
169
- iotlb = memory_region_get_ram_addr(section->mr) + xlat;
170
- } else {
171
- AddressSpaceDispatch *d;
172
-
173
- d = flatview_to_dispatch(section->fv);
174
- iotlb = section - d->map.sections;
175
- iotlb += xlat;
176
- }
177
-
178
- return iotlb;
179
+ AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
180
+ return section - d->map.sections;
181
}
182
#endif /* defined(CONFIG_USER_ONLY) */
183
184
--
185
2.17.1
186
187
diff view generated by jsdifflib
Deleted patch
Since 9458a9a1df1a, all readers of the dirty bitmaps wait
for the rcu lock, which means that they wait until the end
of any executing TranslationBlock.

As a consequence, there is no need for the actual access
to happen in between the _prepare and _complete. Therefore,
we can improve things by merging the two functions into
notdirty_write and dropping the NotDirtyInfo structure.

In addition, the only users of notdirty_write are in cputlb.c,
so move the merged function there. Pass in the CPUIOTLBEntry
from which the ram_addr_t may be computed.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/memory-internal.h | 65 -----------------------------
 accel/tcg/cputlb.c | 76 +++++++++++++++++++---------------
 exec.c | 44 --------------------
 3 files changed, 42 insertions(+), 143 deletions(-)

diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
24
index XXXXXXX..XXXXXXX 100644
25
--- a/include/exec/memory-internal.h
26
+++ b/include/exec/memory-internal.h
27
@@ -XXX,XX +XXX,XX @@ void address_space_dispatch_free(AddressSpaceDispatch *d);
28
29
void mtree_print_dispatch(struct AddressSpaceDispatch *d,
30
MemoryRegion *root);
31
-
32
-struct page_collection;
33
-
34
-/* Opaque struct for passing info from memory_notdirty_write_prepare()
35
- * to memory_notdirty_write_complete(). Callers should treat all fields
36
- * as private, with the exception of @active.
37
- *
38
- * @active is a field which is not touched by either the prepare or
39
- * complete functions, but which the caller can use if it wishes to
40
- * track whether it has called prepare for this struct and so needs
41
- * to later call the complete function.
42
- */
43
-typedef struct {
44
- CPUState *cpu;
45
- struct page_collection *pages;
46
- ram_addr_t ram_addr;
47
- vaddr mem_vaddr;
48
- unsigned size;
49
- bool active;
50
-} NotDirtyInfo;
51
-
52
-/**
53
- * memory_notdirty_write_prepare: call before writing to non-dirty memory
54
- * @ndi: pointer to opaque NotDirtyInfo struct
55
- * @cpu: CPU doing the write
56
- * @mem_vaddr: virtual address of write
57
- * @ram_addr: the ram address of the write
58
- * @size: size of write in bytes
59
- *
60
- * Any code which writes to the host memory corresponding to
61
- * guest RAM which has been marked as NOTDIRTY must wrap those
62
- * writes in calls to memory_notdirty_write_prepare() and
63
- * memory_notdirty_write_complete():
64
- *
65
- * NotDirtyInfo ndi;
66
- * memory_notdirty_write_prepare(&ndi, ....);
67
- * ... perform write here ...
68
- * memory_notdirty_write_complete(&ndi);
69
- *
70
- * These calls will ensure that we flush any TCG translated code for
71
- * the memory being written, update the dirty bits and (if possible)
72
- * remove the slowpath callback for writing to the memory.
73
- *
74
- * This must only be called if we are using TCG; it will assert otherwise.
75
- *
76
- * We may take locks in the prepare call, so callers must ensure that
77
- * they don't exit (via longjump or otherwise) without calling complete.
78
- *
79
- * This call must only be made inside an RCU critical section.
80
- * (Note that while we're executing a TCG TB we're always in an
81
- * RCU critical section, which is likely to be the case for callers
82
- * of these functions.)
83
- */
84
-void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
85
- CPUState *cpu,
86
- vaddr mem_vaddr,
87
- ram_addr_t ram_addr,
88
- unsigned size);
89
-/**
90
- * memory_notdirty_write_complete: finish write to non-dirty memory
91
- * @ndi: pointer to the opaque NotDirtyInfo struct which was initialized
92
- * by memory_not_dirty_write_prepare().
93
- */
94
-void memory_notdirty_write_complete(NotDirtyInfo *ndi);
95
-
96
#endif
97
#endif
98
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
#include "exec/helper-proto.h"
#include "qemu/atomic.h"
#include "qemu/atomic128.h"
+#include "translate-all.h"

/* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
/* #define DEBUG_TLB */
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
return qemu_ram_addr_from_host_nofail(p);
}

+static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
+ CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
+{
+ ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
+
+ trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
+
+ if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
+ struct page_collection *pages
+ = page_collection_lock(ram_addr, ram_addr + size);
+
+ /* We require mem_io_pc in tb_invalidate_phys_page_range. */
+ cpu->mem_io_pc = retaddr;
+
+ tb_invalidate_phys_page_fast(pages, ram_addr, size);
+ page_collection_unlock(pages);
+ }
+
+ /*
+ * Set both VGA and migration bits for simplicity and to remove
+ * the notdirty callback faster.
+ */
+ cpu_physical_memory_set_dirty_range(ram_addr, size, DIRTY_CLIENTS_NOCODE);
+
+ /* We remove the notdirty callback only if the code has been flushed. */
+ if (!cpu_physical_memory_is_clean(ram_addr)) {
+ trace_memory_notdirty_set_dirty(mem_vaddr);
+ tlb_set_dirty(cpu, mem_vaddr);
+ }
+}
+
/*
* Probe for whether the specified guest access is permitted. If it is not
* permitted then an exception will be taken in the same way as if this
@@ -XXX,XX +XXX,XX @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
/* Probe for a read-modify-write atomic operation. Do not allow unaligned
* operations, or io operations to proceed. Return the host address. */
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr,
- NotDirtyInfo *ndi)
+ TCGMemOpIdx oi, uintptr_t retaddr)
{
size_t mmu_idx = get_mmuidx(oi);
uintptr_t index = tlb_index(env, mmu_idx, addr);
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,

hostaddr = (void *)((uintptr_t)addr + tlbe->addend);

- ndi->active = false;
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
- ndi->active = true;
- memory_notdirty_write_prepare(ndi, env_cpu(env), addr,
- qemu_ram_addr_from_host_nofail(hostaddr),
- 1 << s_bits);
+ notdirty_write(env_cpu(env), addr, 1 << s_bits,
+ &env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
}

return hostaddr;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
return;
}

- haddr = (void *)((uintptr_t)addr + entry->addend);
-
/* Handle clean RAM pages. */
if (tlb_addr & TLB_NOTDIRTY) {
- NotDirtyInfo ndi;
-
- /* We require mem_io_pc in tb_invalidate_phys_page_range. */
- env_cpu(env)->mem_io_pc = retaddr;
-
- memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
- addr + iotlbentry->addr, size);
-
- if (unlikely(need_swap)) {
- store_memop(haddr, val, op ^ MO_BSWAP);
- } else {
- store_memop(haddr, val, op);
- }
-
- memory_notdirty_write_complete(&ndi);
- return;
+ notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
}

+ haddr = (void *)((uintptr_t)addr + entry->addend);
+
/*
* Keep these two store_memop separate to ensure that the compiler
* is able to fold the entire function to a single instruction.
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr
#define ATOMIC_NAME(X) \
HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
-#define ATOMIC_MMU_DECLS NotDirtyInfo ndi
-#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr, &ndi)
-#define ATOMIC_MMU_CLEANUP \
- do { \
- if (unlikely(ndi.active)) { \
- memory_notdirty_write_complete(&ndi); \
- } \
- } while (0)
+#define ATOMIC_MMU_DECLS
+#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr)
+#define ATOMIC_MMU_CLEANUP

#define DATA_SIZE 1
#include "atomic_template.h"
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
#undef ATOMIC_MMU_LOOKUP
#define EXTRA_ARGS , TCGMemOpIdx oi
#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
-#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, GETPC(), &ndi)
+#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, GETPC())

#define DATA_SIZE 1
#include "atomic_template.h"
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
return block->offset + offset;
}

-/* Called within RCU critical section. */
-void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
- CPUState *cpu,
- vaddr mem_vaddr,
- ram_addr_t ram_addr,
- unsigned size)
-{
- ndi->cpu = cpu;
- ndi->ram_addr = ram_addr;
- ndi->mem_vaddr = mem_vaddr;
- ndi->size = size;
- ndi->pages = NULL;
-
- trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
-
- assert(tcg_enabled());
- if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
- ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
- tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
- }
-}
-
-/* Called within RCU critical section. */
-void memory_notdirty_write_complete(NotDirtyInfo *ndi)
-{
- if (ndi->pages) {
- assert(tcg_enabled());
- page_collection_unlock(ndi->pages);
- ndi->pages = NULL;
- }
-
- /* Set both VGA and migration bits for simplicity and to remove
- * the notdirty callback faster.
- */
- cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
- DIRTY_CLIENTS_NOCODE);
- /* we remove the notdirty callback only if the code has been
- flushed */
- if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
- trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
- tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
- }
-}
-
/* Generate a debug exception if a watchpoint has been hit. */
void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
MemTxAttrs attrs, int flags, uintptr_t ra)
--
2.17.1

We can use notdirty_write for the write and return a valid host
pointer for this case.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/cputlb.c | 26 +++++++++++++++++---------
1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
return NULL;
}

- /* Handle watchpoints. */
- if (tlb_addr & TLB_WATCHPOINT) {
- cpu_check_watchpoint(env_cpu(env), addr, size,
- env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
- wp_access, retaddr);
- }
+ if (unlikely(tlb_addr & TLB_FLAGS_MASK)) {
+ CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];

- /* Reject I/O access, or other required slow-path. */
- if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
- return NULL;
+ /* Reject I/O access, or other required slow-path. */
+ if (tlb_addr & (TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
+ return NULL;
+ }
+
+ /* Handle watchpoints. */
+ if (tlb_addr & TLB_WATCHPOINT) {
+ cpu_check_watchpoint(env_cpu(env), addr, size,
+ iotlbentry->attrs, wp_access, retaddr);
+ }
+
+ /* Handle clean RAM pages. */
+ if (tlb_addr & TLB_NOTDIRTY) {
+ notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
+ }
}

return (void *)((uintptr_t)addr + entry->addend);
--
2.17.1

With the merge of notdirty handling into store_helper,
the last user of cpu->mem_io_vaddr was removed.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/hw/core/cpu.h | 2 --
accel/tcg/cputlb.c | 2 --
hw/core/cpu.c | 1 -
3 files changed, 5 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ struct qemu_work_item;
* @next_cpu: Next CPU sharing TB cache.
* @opaque: User data.
* @mem_io_pc: Host Program Counter at which the memory was accessed.
- * @mem_io_vaddr: Target virtual address at which the memory was accessed.
* @kvm_fd: vCPU file descriptor for KVM.
* @work_mutex: Lock to prevent multiple access to queued_work_*.
* @queued_work_first: First asynchronous work pending.
@@ -XXX,XX +XXX,XX @@ struct CPUState {
* we store some rarely used information in the CPU context.
*/
uintptr_t mem_io_pc;
- vaddr mem_io_vaddr;
/*
* This is only needed for the legacy cpu_unassigned_access() hook;
* when all targets using it have been converted to use
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
cpu_io_recompile(cpu, retaddr);
}

- cpu->mem_io_vaddr = addr;
cpu->mem_io_access_type = access_type;

if (mr->global_locking && !qemu_mutex_iothread_locked()) {
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
- cpu->mem_io_vaddr = addr;
cpu->mem_io_pc = retaddr;

if (mr->global_locking && !qemu_mutex_iothread_locked()) {
diff --git a/hw/core/cpu.c b/hw/core/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu.c
+++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cpu_common_reset(CPUState *cpu)
cpu->interrupt_request = 0;
cpu->halted = 0;
cpu->mem_io_pc = 0;
- cpu->mem_io_vaddr = 0;
cpu->icount_extra = 0;
atomic_set(&cpu->icount_decr_ptr->u32, 0);
cpu->can_do_io = 1;
--
2.17.1

All callers pass false to this argument. Remove it and pass the
constant on to tb_invalidate_phys_page_range__locked.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/translate-all.h | 3 +--
accel/tcg/translate-all.c | 6 ++----
exec.c | 4 ++--
3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -XXX,XX +XXX,XX @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
void page_collection_unlock(struct page_collection *set);
void tb_invalidate_phys_page_fast(struct page_collection *pages,
tb_page_addr_t start, int len);
-void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
- int is_cpu_write_access);
+void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
void tb_check_watchpoint(CPUState *cpu);

#ifdef CONFIG_USER_ONLY
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
*
* Called with mmap_lock held for user-mode emulation
*/
-void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
- int is_cpu_write_access)
+void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
{
struct page_collection *pages;
PageDesc *p;
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
return;
}
pages = page_collection_lock(start, end);
- tb_invalidate_phys_page_range__locked(pages, p, start, end,
- is_cpu_write_access);
+ tb_invalidate_phys_page_range__locked(pages, p, start, end, 0);
page_collection_unlock(pages);
}

diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ const char *parse_cpu_option(const char *cpu_option)
void tb_invalidate_phys_addr(target_ulong addr)
{
mmap_lock();
- tb_invalidate_phys_page_range(addr, addr + 1, 0);
+ tb_invalidate_phys_page_range(addr, addr + 1);
mmap_unlock();
}

@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
return;
}
ram_addr = memory_region_get_ram_addr(mr) + addr;
- tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
+ tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
rcu_read_unlock();
}

--
2.17.1

Rather than rely on cpu->mem_io_pc, pass retaddr down directly.

Within tb_invalidate_phys_page_range__locked, the is_cpu_write_access
parameter is non-zero exactly when retaddr would be non-zero, so that
is a simple replacement.

Recognize that current_tb_not_found is true only when mem_io_pc
(and now retaddr) are also non-zero, so remove a redundant test.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/translate-all.h | 3 ++-
accel/tcg/cputlb.c | 6 +-----
accel/tcg/translate-all.c | 39 +++++++++++++++++++--------------------
3 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -XXX,XX +XXX,XX @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
tb_page_addr_t end);
void page_collection_unlock(struct page_collection *set);
void tb_invalidate_phys_page_fast(struct page_collection *pages,
- tb_page_addr_t start, int len);
+ tb_page_addr_t start, int len,
+ uintptr_t retaddr);
void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
void tb_check_watchpoint(CPUState *cpu);

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
struct page_collection *pages
= page_collection_lock(ram_addr, ram_addr + size);
-
- /* We require mem_io_pc in tb_invalidate_phys_page_range. */
- cpu->mem_io_pc = retaddr;
-
- tb_invalidate_phys_page_fast(pages, ram_addr, size);
+ tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
page_collection_unlock(pages);
}

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static void
tb_invalidate_phys_page_range__locked(struct page_collection *pages,
PageDesc *p, tb_page_addr_t start,
tb_page_addr_t end,
- int is_cpu_write_access)
+ uintptr_t retaddr)
{
TranslationBlock *tb;
tb_page_addr_t tb_start, tb_end;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
#ifdef TARGET_HAS_PRECISE_SMC
CPUState *cpu = current_cpu;
CPUArchState *env = NULL;
- int current_tb_not_found = is_cpu_write_access;
+ bool current_tb_not_found = retaddr != 0;
+ bool current_tb_modified = false;
TranslationBlock *current_tb = NULL;
- int current_tb_modified = 0;
target_ulong current_pc = 0;
target_ulong current_cs_base = 0;
uint32_t current_flags = 0;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
if (!(tb_end <= start || tb_start >= end)) {
#ifdef TARGET_HAS_PRECISE_SMC
if (current_tb_not_found) {
- current_tb_not_found = 0;
- current_tb = NULL;
- if (cpu->mem_io_pc) {
- /* now we have a real cpu fault */
- current_tb = tcg_tb_lookup(cpu->mem_io_pc);
- }
+ current_tb_not_found = false;
+ /* now we have a real cpu fault */
+ current_tb = tcg_tb_lookup(retaddr);
}
if (current_tb == tb &&
(tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
- /* If we are modifying the current TB, we must stop
- its execution. We could be more precise by checking
- that the modification is after the current PC, but it
- would require a specialized function to partially
- restore the CPU state */
-
- current_tb_modified = 1;
- cpu_restore_state_from_tb(cpu, current_tb,
- cpu->mem_io_pc, true);
+ /*
+ * If we are modifying the current TB, we must stop
+ * its execution. We could be more precise by checking
+ * that the modification is after the current PC, but it
+ * would require a specialized function to partially
+ * restore the CPU state.
+ */
+ current_tb_modified = true;
+ cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
&current_flags);
}
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_range(target_ulong start, target_ulong end)
* Call with all @pages in the range [@start, @start + len[ locked.
*/
void tb_invalidate_phys_page_fast(struct page_collection *pages,
- tb_page_addr_t start, int len)
+ tb_page_addr_t start, int len,
+ uintptr_t retaddr)
{
PageDesc *p;

@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
}
} else {
do_invalidate:
- tb_invalidate_phys_page_range__locked(pages, p, start, start + len, 1);
+ tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
+ retaddr);
}
}
#else
--
2.17.1

Fixes the previous TLB_WATCHPOINT patches because we are currently
failing to set cpu->mem_io_pc with the call to cpu_check_watchpoint.
Pass down the retaddr directly because it's readily available.

Fixes: 50b107c5d61
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/translate-all.h | 2 +-
accel/tcg/translate-all.c | 6 +++---
exec.c | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
tb_page_addr_t start, int len,
uintptr_t retaddr);
void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
-void tb_check_watchpoint(CPUState *cpu);
+void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);

#ifdef CONFIG_USER_ONLY
int page_unprotect(target_ulong address, uintptr_t pc);
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
#endif

/* user-mode: call with mmap_lock held */
-void tb_check_watchpoint(CPUState *cpu)
+void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
{
TranslationBlock *tb;

assert_memory_lock();

- tb = tcg_tb_lookup(cpu->mem_io_pc);
+ tb = tcg_tb_lookup(retaddr);
if (tb) {
/* We can use retranslation to find the PC. */
- cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc, true);
+ cpu_restore_state_from_tb(cpu, tb, retaddr, true);
tb_phys_invalidate(tb, -1);
} else {
/* The exception probably happened in a helper. The CPU state should
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
cpu->watchpoint_hit = wp;

mmap_lock();
- tb_check_watchpoint(cpu);
+ tb_check_watchpoint(cpu, ra);
if (wp->flags & BP_STOP_BEFORE_ACCESS) {
cpu->exception_index = EXCP_DEBUG;
mmap_unlock();
--
2.17.1