From: Vipin Sharma <vipinsh@google.com>
Use MMU read lock to recover TDP MMU NX huge pages. Iterate
over the huge pages list under tdp_mmu_pages_lock protection and
unaccount the page before dropping the lock.
We must not zap an SPTE if:
- The SP is a root page, as root pages have no parent SPTE.
- The SPTE does not point at the SP's page table.
If the SPTE does not point at the SP's page table, then something else
has changed the SPTE, so we cannot safely zap it.
Warn if zapping the SPTE fails while the current SPTE still points at
the SP's page table, as that should never happen.
There is always a race between dirty logging, vCPU faults, and NX huge
page recovery for backing a gfn by an NX huge page or an executable
small page. Unaccounting sooner during the list traversal increases the
window of that race. Functionally, it is okay, because accounting
doesn't protect against the iTLB multi-hit bug; it is there purely to
prevent KVM from bouncing a gfn between two page sizes. The only
downside is that a vCPU will end up doing more work in tearing down all
the child SPTEs. This should be a very rare race.
Zapping under MMU read lock unblocks vCPUs that are waiting for the MMU
read lock. This optimization is done to solve a guest jitter issue on a
Windows VM that was observing an increase in network latency.
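
Roughly, the TDP MMU side of the recovery loop ends up looking like the
sketch below. This is illustrative only: the function name is made up,
and the SRCU section, the shadow MMU path, the resched/lock-break
handling, and the invalid_list bookkeeping are omitted; the final flush
is simplified to kvm_flush_remote_tlbs().

	static void recover_tdp_mmu_nx_huge_pages_sketch(struct kvm *kvm,
							 unsigned long to_zap)
	{
		struct list_head *nx_huge_pages =
			&kvm->arch.possible_nx_huge_pages[KVM_TDP_MMU].pages;
		struct kvm_mmu_page *sp;
		bool flush = false;

		/* Read lock: vCPU faults can be handled concurrently. */
		read_lock(&kvm->mmu_lock);
		/* TDP MMU page tables are freed via RCU callback. */
		rcu_read_lock();

		for ( ; to_zap; --to_zap) {
			/* The list itself is protected by tdp_mmu_pages_lock. */
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (list_empty(nx_huge_pages)) {
				spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
				break;
			}
			sp = list_first_entry(nx_huge_pages, struct kvm_mmu_page,
					      possible_nx_huge_page_link);

			/* Unaccount before dropping the list lock, see above. */
			unaccount_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

			/* Dirty-tracked pages would be refaulted as 4KiB anyway. */
			if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
				flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
		}

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();
		read_unlock(&kvm->mmu_lock);
	}
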
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Vipin Sharma <vipinsh@google.com>
Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
arch/x86/kvm/mmu/mmu.c | 99 ++++++++++++++++++++++++--------------
arch/x86/kvm/mmu/tdp_mmu.c | 42 +++++++++++++---
2 files changed, 98 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b074f7bb5cc58..10ba328b664d7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7535,12 +7535,40 @@ static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
}
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ struct kvm_memory_slot *slot = NULL;
+
+ /*
+ * Since gfn_to_memslot() is relatively expensive, it helps to skip it if
+ * the test cannot possibly return true. On the other hand, if any
+ * memslot has logging enabled, chances are good that all of them do, in
+ * which case unaccount_nx_huge_page() is much cheaper than zapping the
+ * page.
+ *
+ * If a memslot update is in progress, reading an incorrect value of
+ * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+ * zero, gfn_to_memslot() will be done unnecessarily; if it is becoming
+ * nonzero, the page will be zapped unnecessarily. Either way, this only
+ * affects efficiency in racy situations, and not correctness.
+ */
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+ return slot && kvm_slot_dirty_track_enabled(slot);
+}
+
static void kvm_recover_nx_huge_pages(struct kvm *kvm,
enum kvm_mmu_type mmu_type)
{
unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+ bool is_tdp = mmu_type == KVM_TDP_MMU;
struct list_head *nx_huge_pages;
- struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
bool flush = false;
@@ -7549,7 +7577,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
rcu_idx = srcu_read_lock(&kvm->srcu);
- write_lock(&kvm->mmu_lock);
+ if (is_tdp)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
/*
* Zapping TDP MMU shadow pages, including the remote TLB flush, must
@@ -7559,8 +7590,13 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
rcu_read_lock();
for ( ; to_zap; --to_zap) {
- if (list_empty(nx_huge_pages))
+ if (is_tdp)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ if (list_empty(nx_huge_pages)) {
+ if (is_tdp)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
break;
+ }
/*
* We use a separate list instead of just using active_mmu_pages
@@ -7575,50 +7611,38 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
WARN_ON_ONCE(!sp->role.direct);
+ unaccount_nx_huge_page(kvm, sp);
+
+ if (is_tdp)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
/*
- * Unaccount and do not attempt to recover any NX Huge Pages
- * that are being dirty tracked, as they would just be faulted
- * back in as 4KiB pages. The NX Huge Pages in this slot will be
- * recovered, along with all the other huge pages in the slot,
- * when dirty logging is disabled.
- *
- * Since gfn_to_memslot() is relatively expensive, it helps to
- * skip it if it the test cannot possibly return true. On the
- * other hand, if any memslot has logging enabled, chances are
- * good that all of them do, in which case unaccount_nx_huge_page()
- * is much cheaper than zapping the page.
- *
- * If a memslot update is in progress, reading an incorrect value
- * of kvm->nr_memslots_dirty_logging is not a problem: if it is
- * becoming zero, gfn_to_memslot() will be done unnecessarily; if
- * it is becoming nonzero, the page will be zapped unnecessarily.
- * Either way, this only affects efficiency in racy situations,
- * and not correctness.
+ * Do not attempt to recover any NX Huge Pages that are being
+ * dirty tracked, as they would just be faulted back in as 4KiB
+ * pages. The NX Huge Pages in this slot will be recovered,
+ * along with all the other huge pages in the slot, when dirty
+ * logging is disabled.
*/
- slot = NULL;
- if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
- struct kvm_memslots *slots;
+ if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+ if (is_tdp)
+ flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+ else
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- WARN_ON_ONCE(!slot);
}
- if (slot && kvm_slot_dirty_track_enabled(slot))
- unaccount_nx_huge_page(kvm, sp);
- else if (mmu_type == KVM_TDP_MMU)
- flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
- else
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
rcu_read_unlock();
- cond_resched_rwlock_write(&kvm->mmu_lock);
- flush = false;
+ if (is_tdp)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ flush = false;
rcu_read_lock();
}
}
@@ -7626,7 +7650,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
rcu_read_unlock();
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 19907eb04a9c4..31d921705dee7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -928,21 +928,49 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
- u64 old_spte;
+ struct tdp_iter iter = {
+ .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
+ .sptep = sp->ptep,
+ .level = sp->role.level + 1,
+ .gfn = sp->gfn,
+ .as_id = kvm_mmu_page_as_id(sp),
+ };
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
+ return false;
/*
- * This helper intentionally doesn't allow zapping a root shadow page,
- * which doesn't have a parent page table and thus no associated entry.
+ * Root shadow pages don't have a parent page table and thus no
+ * associated entry, but they can never be possible NX huge pages.
*/
if (WARN_ON_ONCE(!sp->ptep))
return false;
- old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
- if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ /*
+ * Since mmu_lock is held in read mode, it's possible another task has
+ * already modified the SPTE. Zap the SPTE if and only if the SPTE
+ * points at the SP's page table, as checking shadow-present isn't
+ * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
+ * another SP. Note, spte_to_child_pt() also checks that the SPTE is
+ * shadow-present, i.e. guards against zapping a frozen SPTE.
+ */
+ if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
- SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
+ /*
+ * If a different task modified the SPTE, then it should be impossible
+ * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
+ * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
+ * creating non-leaf SPTEs, and all other bits are immutable for non-
+ * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
+ * zapping and replacement.
+ */
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+ WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+ return false;
+ }
return true;
}
--
2.50.0.rc2.692.g299adb8693-goog
Hi James,

kernel test robot noticed the following build errors:

[auto build test ERROR on 8046d29dde17002523f94d3e6e0ebe486ce52166]

url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-x86-mmu-Track-TDP-MMU-NX-huge-pages-separately/20250614-042620
base:   8046d29dde17002523f94d3e6e0ebe486ce52166
patch link:    https://lore.kernel.org/r/20250613202315.2790592-4-jthoughton%40google.com
patch subject: [PATCH v4 3/7] KVM: x86/mmu: Recover TDP MMU NX huge pages using MMU read lock
config: i386-buildonly-randconfig-003-20250614 (https://download.01.org/0day-ci/archive/20250614/202506142129.ClBlxdtW-lkp@intel.com/config)
compiler: clang version 20.1.2 (https://github.com/llvm/llvm-project 58df0ef89dd64126512e4ee27b4ac3fd8ddf6247)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250614/202506142129.ClBlxdtW-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202506142129.ClBlxdtW-lkp@intel.com/

All errors (new ones prefixed by >>):

   arch/x86/kvm/mmu/mmu.c:7570:28: error: use of undeclared identifier 'KVM_TDP_MMU'
    7570 |         bool is_tdp = mmu_type == KVM_TDP_MMU;
         |                                   ^
>> arch/x86/kvm/mmu/mmu.c:7594:25: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
    7594 |                         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         |                                    ~~~~~~~~~ ^
   arch/x86/kvm/mmu/mmu.c:7597:28: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
    7597 |                                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
         |                                              ~~~~~~~~~ ^
   arch/x86/kvm/mmu/mmu.c:7617:27: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
    7617 |                         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
         |                                      ~~~~~~~~~ ^
   4 errors generated.


vim +7594 arch/x86/kvm/mmu/mmu.c

  7565	
  7566	static void kvm_recover_nx_huge_pages(struct kvm *kvm,
  7567					      enum kvm_mmu_type mmu_type)
  7568	{
  7569		unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
> 7570		bool is_tdp = mmu_type == KVM_TDP_MMU;
  7571		struct list_head *nx_huge_pages;
  7572		struct kvm_mmu_page *sp;
  7573		LIST_HEAD(invalid_list);
  7574		bool flush = false;
  7575		int rcu_idx;
  7576	
  7577		nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
  7578	
  7579		rcu_idx = srcu_read_lock(&kvm->srcu);
  7580		if (is_tdp)
  7581			read_lock(&kvm->mmu_lock);
  7582		else
  7583			write_lock(&kvm->mmu_lock);
  7584	
  7585		/*
  7586		 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
  7587		 * be done under RCU protection, because the pages are freed via RCU
  7588		 * callback.
  7589		 */
  7590		rcu_read_lock();
  7591	
  7592		for ( ; to_zap; --to_zap) {
  7593			if (is_tdp)
> 7594				spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  7595			if (list_empty(nx_huge_pages)) {
  7596				if (is_tdp)
  7597					spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  7598				break;
  7599			}
  7600	
  7601			/*
  7602			 * We use a separate list instead of just using active_mmu_pages
  7603			 * because the number of shadow pages that be replaced with an
  7604			 * NX huge page is expected to be relatively small compared to
  7605			 * the total number of shadow pages. And because the TDP MMU
  7606			 * doesn't use active_mmu_pages.
  7607			 */
  7608			sp = list_first_entry(nx_huge_pages,
  7609					      struct kvm_mmu_page,
  7610					      possible_nx_huge_page_link);
  7611			WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
  7612			WARN_ON_ONCE(!sp->role.direct);
  7613	
  7614			unaccount_nx_huge_page(kvm, sp);
  7615	
  7616			if (is_tdp)
  7617				spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  7618	
  7619			/*
  7620			 * Do not attempt to recover any NX Huge Pages that are being
  7621			 * dirty tracked, as they would just be faulted back in as 4KiB
  7622			 * pages. The NX Huge Pages in this slot will be recovered,
  7623			 * along with all the other huge pages in the slot, when dirty
  7624			 * logging is disabled.
  7625			 */
  7626			if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
  7627				if (is_tdp)
  7628					flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
  7629				else
  7630					kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
  7631	
  7632			}
  7633	
  7634			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
  7635	
  7636			if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
  7637				kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
  7638				rcu_read_unlock();
  7639	
  7640				if (is_tdp)
  7641					cond_resched_rwlock_read(&kvm->mmu_lock);
  7642				else
  7643					cond_resched_rwlock_write(&kvm->mmu_lock);
  7644	
  7645				flush = false;
  7646				rcu_read_lock();
  7647			}
  7648		}
  7649		kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
  7650	
  7651		rcu_read_unlock();
  7652	
  7653		if (is_tdp)
  7654			read_unlock(&kvm->mmu_lock);
  7655		else
  7656			write_unlock(&kvm->mmu_lock);
  7657		srcu_read_unlock(&kvm->srcu, rcu_idx);
  7658	}
  7659	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
> All errors (new ones prefixed by >>):
>
>    arch/x86/kvm/mmu/mmu.c:7570:28: error: use of undeclared identifier 'KVM_TDP_MMU'
>     7570 |         bool is_tdp = mmu_type == KVM_TDP_MMU;
>          |                                   ^
> >> arch/x86/kvm/mmu/mmu.c:7594:25: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
>     7594 |                         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
>          |                                    ~~~~~~~~~ ^
>    arch/x86/kvm/mmu/mmu.c:7597:28: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
>     7597 |                                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>          |                                              ~~~~~~~~~ ^
>    arch/x86/kvm/mmu/mmu.c:7617:27: error: no member named 'tdp_mmu_pages_lock' in 'struct kvm_arch'
>     7617 |                         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>          |                                      ~~~~~~~~~ ^
>    4 errors generated.

Fixup for this below. I also realized that the variable name `is_tdp` is
bad/misleading, so I've changed it to `is_tdp_mmu` as part of this fixup too.

Sean/Paolo, let me know if I should just go ahead and post the fixed series,
given the size of this fixup.

I don't really like having to #ifdef all the places where we take
tdp_mmu_pages_lock, but I couldn't find a way to avoid that. Even doing

#ifndef CONFIG_X86_64
#define is_tdp_mmu false
#endif

didn't work. :(

Anyway, here's the fixup:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 10ba328b664d7..7df1b4ead705b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7564,10 +7564,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
-				      enum kvm_mmu_type mmu_type)
+				      const enum kvm_mmu_type mmu_type)
 {
 	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
-	bool is_tdp = mmu_type == KVM_TDP_MMU;
+	bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
 	struct list_head *nx_huge_pages;
 	struct kvm_mmu_page *sp;
 	LIST_HEAD(invalid_list);
@@ -7577,7 +7577,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
-	if (is_tdp)
+	if (is_tdp_mmu)
 		read_lock(&kvm->mmu_lock);
 	else
 		write_lock(&kvm->mmu_lock);
@@ -7590,11 +7590,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 	rcu_read_lock();
 
 	for ( ; to_zap; --to_zap) {
-		if (is_tdp)
+#ifdef CONFIG_X86_64
+		if (is_tdp_mmu)
 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+#endif
 		if (list_empty(nx_huge_pages)) {
-			if (is_tdp)
+#ifdef CONFIG_X86_64
+			if (is_tdp_mmu)
 				spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+#endif
 			break;
 		}
 
@@ -7613,8 +7617,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 
 		unaccount_nx_huge_page(kvm, sp);
 
-		if (is_tdp)
+#ifdef CONFIG_X86_64
+		if (is_tdp_mmu)
 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+#endif
 
 		/*
 		 * Do not attempt to recover any NX Huge Pages that are being
@@ -7624,7 +7630,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 		 * logging is disabled.
 		 */
 		if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
-			if (is_tdp)
+			if (is_tdp_mmu)
 				flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
 			else
 				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -7637,7 +7643,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 			rcu_read_unlock();
 
-			if (is_tdp)
+			if (is_tdp_mmu)
 				cond_resched_rwlock_read(&kvm->mmu_lock);
 			else
 				cond_resched_rwlock_write(&kvm->mmu_lock);
@@ -7650,7 +7656,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 
 	rcu_read_unlock();
 
-	if (is_tdp)
+	if (is_tdp_mmu)
 		read_unlock(&kvm->mmu_lock);
 	else
 		write_unlock(&kvm->mmu_lock);
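
For completeness, the reason the #define trick can't work: even when
is_tdp_mmu is a compile-time constant false, the dead branch still has to
parse and type-check, and on 32-bit struct kvm_arch simply has no
tdp_mmu_pages_lock member. A stand-alone illustration (hypothetical types
and names, not kernel code) of the same failure mode:

	struct arch32 { int common; };               /* think: 32-bit struct kvm_arch        */
	struct arch64 { int common; int big_lock; }; /* think: CONFIG_X86_64 struct kvm_arch */

	#define HAVE_BIG_LOCK 0                      /* think: #define is_tdp_mmu false      */

	static int demo(struct arch32 *arch)
	{
		if (HAVE_BIG_LOCK)
			/* Intentional compile error: no member named 'big_lock',
			 * even though the branch is unreachable. */
			return arch->big_lock;
		return arch->common;
	}

The member reference has to be removed by the preprocessor, which is why
the fixup wraps those statements in #ifdef CONFIG_X86_64.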