The TCG TLB keeps a TLB_NOTDIRTY bit that must be kept coherent
with the cpu physical dirty memory bitmaps. If any bits are clear,
TLB_NOTDIRTY *must* be set in all CPUs so that the nodirty_write()
slowpath is guaranteed to be called on a write access.
TLB_NOTDIRTY may be set if none of the bits are clear, it just
results in a superfluous nodirty_write() call (which should remove
the bit).
There is a race with clearing physical dirty bits and setting
TLB_NOTDIRTY vs setting or creating TLB entries without the
TLB_NOTDIRTY bit, that can cause the bit to get lost and
notdirty_write() updates to be missed.
nodirty_write()
1. cpu_physical_memory_set_dirty_range()
2. if (!cpu_physical_memory_is_clean())
3. tlb_set_dirty()
vs
cpu_physical_memory_test_and_clear_dirty()
A. dirty = bitmap_test_and_clear_atomic()
if (dirty)
B. tlb_reset_dirty_range_all()
1 executes, then 2 finds no bits clean, then A clears the dirty bit
and runs B to set TLB_NOTDIRTY on the TLB entries, then 3 clears the
TLB_NOTDIRTY bits from the TLB entries.
This results in the physical dirty bitmap having a clear bit with a
TLB entry pointing to it without TLB_NOTDIRTY, which means stores
through that TLB are performed without the notdirty_write() call to
set dirty bits (or invalidate tb).
Fix this by checking the physical dirty bitmap while holding the TLB
lock that also covers TLB entry insertion / TLB_NOTDIRTY clearing.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Hi Thomas,
This is the other dirty bitmap race I saw, if you are hunting
migration bugs...
Thanks,
Nick
include/exec/exec-all.h | 1 -
accel/stubs/tcg-stub.c | 4 ----
accel/tcg/cputlb.c | 47 +++++++++++++++++++++++++++++------------
3 files changed, 33 insertions(+), 19 deletions(-)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ce36bb10d4..ade81b25f0 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -654,7 +654,6 @@ static inline void mmap_unlock(void) {}
#define WITH_MMAP_LOCK_GUARD()
void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length);
-void tlb_set_dirty(CPUState *cpu, vaddr addr);
MemoryRegionSection *
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 8a496a2a6f..dd890d6cf6 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -18,10 +18,6 @@ void tb_flush(CPUState *cpu)
{
}
-void tlb_set_dirty(CPUState *cpu, vaddr vaddr)
-{
-}
-
int probe_access_flags(CPUArchState *env, vaddr addr, int size,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, uintptr_t retaddr)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 047cd2cc0a..078f4ef166 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1001,10 +1001,17 @@ static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
*d = *s;
}
-/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+/*
+ * This is a cross vCPU call (i.e. another vCPU resetting the flags of
* the target vCPU).
* We must take tlb_c.lock to avoid racing with another vCPU update. The only
* thing actually updated is the target TLB entry ->addr_write flags.
+ *
+ * This can race between the physical memory dirty bits becoming all set and
+ * tlb_set_page_full or tlb_try_set_dirty creating dirty entries: it is
+ * harmless to set TLB_NOTDIRTY on a !clean physical page, it might just
+ * cause a notdirty_write() slowpath on the next write which clears out the
+ * spurious TLB_NOTDIRTY.
*/
void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
{
@@ -1037,9 +1044,11 @@ static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
}
}
-/* update the TLB corresponding to virtual page vaddr
- so that it is no longer dirty */
-void tlb_set_dirty(CPUState *cpu, vaddr addr)
+/*
+ * Update the TLB corresponding to virtual page vaddr if it no longer
+ * requires the TLB_NOTDIRTY bit.
+ */
+static void tlb_try_set_dirty(CPUState *cpu, vaddr addr, ram_addr_t ram_addr)
{
int mmu_idx;
@@ -1047,6 +1056,12 @@ void tlb_set_dirty(CPUState *cpu, vaddr addr)
addr &= TARGET_PAGE_MASK;
qemu_spin_lock(&cpu->neg.tlb.c.lock);
+ if (cpu_physical_memory_is_clean(ram_addr)) {
+ /* Must be checked under lock, like tlb_set_page_full */
+ qemu_spin_unlock(&cpu->neg.tlb.c.lock);
+ return;
+ }
+
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
tlb_set_dirty1_locked(tlb_entry(cpu, mmu_idx, addr), addr);
}
@@ -1165,6 +1180,19 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
addend = 0;
}
+ /*
+ * Hold the TLB lock for the rest of the function. We could acquire/release
+ * the lock several times in the function, but it is faster to amortize the
+ * acquisition cost by acquiring it just once. Note that this leads to
+ * a longer critical section, but this is not a concern since the TLB lock
+ * is unlikely to be contended.
+ *
+ * The TLB lock must be held over checking cpu_physical_memory_is_clean
+ * and inserting the entry into the TLB, so clearing phsyical memory
+ * dirty status can find and set TLB_NOTDIRTY on the entries.
+ */
+ qemu_spin_lock(&tlb->c.lock);
+
write_flags = read_flags;
if (is_ram) {
iotlb = memory_region_get_ram_addr(section->mr) + xlat;
@@ -1200,15 +1228,6 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
index = tlb_index(cpu, mmu_idx, addr_page);
te = tlb_entry(cpu, mmu_idx, addr_page);
- /*
- * Hold the TLB lock for the rest of the function. We could acquire/release
- * the lock several times in the function, but it is faster to amortize the
- * acquisition cost by acquiring it just once. Note that this leads to
- * a longer critical section, but this is not a concern since the TLB lock
- * is unlikely to be contended.
- */
- qemu_spin_lock(&tlb->c.lock);
-
/* Note that the tlb is no longer clean. */
tlb->c.dirty |= 1 << mmu_idx;
@@ -1409,7 +1428,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
/* We remove the notdirty callback only if the code has been flushed. */
if (!cpu_physical_memory_is_clean(ram_addr)) {
trace_memory_notdirty_set_dirty(mem_vaddr);
- tlb_set_dirty(cpu, mem_vaddr);
+ tlb_try_set_dirty(cpu, mem_vaddr, ram_addr);
}
}
--
2.42.0