[RFC PATCH 1/3] accel/tcg: Fix TCG TLB coherency race with physical dirty bit clearing

Nicholas Piggin posted 3 patches 8 months, 4 weeks ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, Eduardo Habkost <eduardo@habkost.net>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Yanan Wang <wangyanan55@huawei.com>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, "Michael S. Tsirkin" <mst@redhat.com>, Peter Xu <peterx@redhat.com>, David Hildenbrand <david@redhat.com>, Nicholas Piggin <npiggin@gmail.com>, Daniel Henrique Barboza <danielhb413@gmail.com>
The TCG TLB keeps a TLB_NOTDIRTY bit that must be kept coherent
with the cpu physical dirty memory bitmaps. If any bits are clear,
TLB_NOTDIRTY *must* be set in all CPUs so that the notdirty_write()
slowpath is guaranteed to be called on a write access.

TLB_NOTDIRTY may be set even when none of the bits are clear; that
just results in a superfluous notdirty_write() call (which should
remove the bit).
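
As an illustration only, here is a minimal stand-alone model of that
invariant (made-up names, not QEMU's actual fast/slow path code): a
store may bypass the slow path only while the entry carries no
NOTDIRTY flag, so a clear dirty bitmap bit plus a flag-less entry
means dirty tracking is silently skipped:

#include <stdbool.h>
#include <stdint.h>

#define NOTDIRTY 1u                    /* flag bit kept in addr_write */

struct entry { uintptr_t addr_write; };
static bool page_dirty;                /* one page's dirty bitmap bit */

static void slow_store(struct entry *e)
{
    /* analogous to notdirty_write(): record the write, drop the flag */
    page_dirty = true;
    e->addr_write &= ~(uintptr_t)NOTDIRTY;
}

static void guest_store(struct entry *e)
{
    if (e->addr_write & NOTDIRTY) {
        slow_store(e);
    }
    /* fast path: the dirty bitmap is not touched here */
}

int main(void)
{
    struct entry e = { .addr_write = 0 };  /* buggy state: no NOTDIRTY */
    page_dirty = false;                    /* bitmap bit already clear */
    guest_store(&e);                       /* write is never recorded */
    return page_dirty ? 0 : 1;             /* exits 1: dirty update lost */
}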

There is a race between clearing physical dirty bits and setting
TLB_NOTDIRTY on one side, and setting or creating TLB entries without
the TLB_NOTDIRTY bit on the other, which can cause the bit to get lost
and notdirty_write() updates to be missed.

notdirty_write()
1. cpu_physical_memory_set_dirty_range()
2. if (!cpu_physical_memory_is_clean())
3.    tlb_set_dirty()

vs

cpu_physical_memory_test_and_clear_dirty()
A. dirty = bitmap_test_and_clear_atomic()
   if (dirty)
B.     tlb_reset_dirty_range_all()

1 executes, then 2 finds no bits clean, then A clears the dirty bit
and runs B to set TLB_NOTDIRTY on the TLB entries, then 3 clears the
TLB_NOTDIRTY bits from the TLB entries.

This results in the physical dirty bitmap having a clear bit with a
TLB entry pointing to it without TLB_NOTDIRTY, which means stores
through that TLB entry are performed without the notdirty_write() call
to set dirty bits (or invalidate TBs).
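
In timeline form (X is the vCPU taking the notdirty_write() slow path,
Y is the code clearing the dirty bitmap, e.g. a migration sync; the
numbers and letters refer to the pseudocode above):

 X: 1. cpu_physical_memory_set_dirty_range()   bits now set
 X: 2. !cpu_physical_memory_is_clean()         decides to run 3
 Y: A. bitmap_test_and_clear_atomic()          bits now clear
 Y: B. tlb_reset_dirty_range_all()             TLB_NOTDIRTY set
 X: 3. tlb_set_dirty()                         TLB_NOTDIRTY cleared again

 => dirty bitmap clear, TLB entry without TLB_NOTDIRTY; subsequent
    stores through that entry never reach notdirty_write().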

Fix this by checking the physical dirty bitmap while holding the TLB
lock that also covers TLB entry insertion / TLB_NOTDIRTY clearing.
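
In outline (the exact changes are in the cputlb.c hunks below), the
dirty bitmap check and the TLB entry update become one critical
section against tlb_reset_dirty(), which sets TLB_NOTDIRTY under the
same per-CPU TLB lock:

    qemu_spin_lock(&cpu->neg.tlb.c.lock);
    if (cpu_physical_memory_is_clean(ram_addr)) {
        /* B already ran; keep TLB_NOTDIRTY so the slow path stays armed */
        qemu_spin_unlock(&cpu->neg.tlb.c.lock);
        return;
    }
    /* ... insert the entry / clear TLB_NOTDIRTY while holding the lock ... */
    qemu_spin_unlock(&cpu->neg.tlb.c.lock);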

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Hi Thomas,

This is the other dirty bitmap race I saw, if you are hunting
migration bugs...

Thanks,
Nick

 include/exec/exec-all.h |  1 -
 accel/stubs/tcg-stub.c  |  4 ----
 accel/tcg/cputlb.c      | 47 +++++++++++++++++++++++++++++------------
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ce36bb10d4..ade81b25f0 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -654,7 +654,6 @@ static inline void mmap_unlock(void) {}
 #define WITH_MMAP_LOCK_GUARD()
 
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length);
-void tlb_set_dirty(CPUState *cpu, vaddr addr);
 
 MemoryRegionSection *
 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 8a496a2a6f..dd890d6cf6 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -18,10 +18,6 @@ void tb_flush(CPUState *cpu)
 {
 }
 
-void tlb_set_dirty(CPUState *cpu, vaddr vaddr)
-{
-}
-
 int probe_access_flags(CPUArchState *env, vaddr addr, int size,
                        MMUAccessType access_type, int mmu_idx,
                        bool nonfault, void **phost, uintptr_t retaddr)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 047cd2cc0a..078f4ef166 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1001,10 +1001,17 @@ static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
     *d = *s;
 }
 
-/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+/*
+ * This is a cross vCPU call (i.e. another vCPU resetting the flags of
  * the target vCPU).
  * We must take tlb_c.lock to avoid racing with another vCPU update. The only
  * thing actually updated is the target TLB entry ->addr_write flags.
+ *
+ * This can race between the physical memory dirty bits becoming all set and
+ * tlb_set_page_full or tlb_try_set_dirty creating dirty entries: it is
+ * harmless to set TLB_NOTDIRTY on a !clean physical page; it might just
+ * cause a notdirty_write() slowpath on the next write which clears out the
+ * spurious TLB_NOTDIRTY.
  */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
@@ -1037,9 +1044,11 @@ static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
     }
 }
 
-/* update the TLB corresponding to virtual page vaddr
-   so that it is no longer dirty */
-void tlb_set_dirty(CPUState *cpu, vaddr addr)
+/*
+ * Update the TLB corresponding to virtual page vaddr if it no longer
+ * requires the TLB_NOTDIRTY bit.
+ */
+static void tlb_try_set_dirty(CPUState *cpu, vaddr addr, ram_addr_t ram_addr)
 {
     int mmu_idx;
 
@@ -1047,6 +1056,12 @@ void tlb_set_dirty(CPUState *cpu, vaddr addr)
 
     addr &= TARGET_PAGE_MASK;
     qemu_spin_lock(&cpu->neg.tlb.c.lock);
+    if (cpu_physical_memory_is_clean(ram_addr)) {
+        /* Must be checked under lock, like tlb_set_page_full */
+        qemu_spin_unlock(&cpu->neg.tlb.c.lock);
+        return;
+    }
+
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         tlb_set_dirty1_locked(tlb_entry(cpu, mmu_idx, addr), addr);
     }
@@ -1165,6 +1180,19 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
         addend = 0;
     }
 
+    /*
+     * Hold the TLB lock for the rest of the function. We could acquire/release
+     * the lock several times in the function, but it is faster to amortize the
+     * acquisition cost by acquiring it just once. Note that this leads to
+     * a longer critical section, but this is not a concern since the TLB lock
+     * is unlikely to be contended.
+     *
+     * The TLB lock must be held over checking cpu_physical_memory_is_clean
+     * and inserting the entry into the TLB, so clearing physical memory
+     * dirty status can find and set TLB_NOTDIRTY on the entries.
+     */
+    qemu_spin_lock(&tlb->c.lock);
+
     write_flags = read_flags;
     if (is_ram) {
         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
@@ -1200,15 +1228,6 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
     index = tlb_index(cpu, mmu_idx, addr_page);
     te = tlb_entry(cpu, mmu_idx, addr_page);
 
-    /*
-     * Hold the TLB lock for the rest of the function. We could acquire/release
-     * the lock several times in the function, but it is faster to amortize the
-     * acquisition cost by acquiring it just once. Note that this leads to
-     * a longer critical section, but this is not a concern since the TLB lock
-     * is unlikely to be contended.
-     */
-    qemu_spin_lock(&tlb->c.lock);
-
     /* Note that the tlb is no longer clean.  */
     tlb->c.dirty |= 1 << mmu_idx;
 
@@ -1409,7 +1428,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
     /* We remove the notdirty callback only if the code has been flushed. */
     if (!cpu_physical_memory_is_clean(ram_addr)) {
         trace_memory_notdirty_set_dirty(mem_vaddr);
-        tlb_set_dirty(cpu, mem_vaddr);
+        tlb_try_set_dirty(cpu, mem_vaddr, ram_addr);
     }
 }
 
-- 
2.42.0