[PATCH v4 1/7] KVM: x86/mmu: Track TDP MMU NX huge pages separately

Posted by James Houghton 3 months, 4 weeks ago
From: Vipin Sharma <vipinsh@google.com>

Introduce struct kvm_possible_nx_huge_pages to track the list of
possible NX huge pages and the number of pages on the list.

When calculating how many pages to zap, use the new per-MMU-type counts
instead of kvm->stat.nx_lpage_splits, which is now the sum of the two
new counts.
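
As an illustrative sketch of the resulting accounting (not something
this patch adds; the helper name is made up), the old stat remains the
sum of the new per-MMU counts:

	/*
	 * Sketch only: the global stat stays equal to the sum of the
	 * per-MMU counts, since track/untrack update both together.
	 */
	static void check_nx_counts(struct kvm *kvm)
	{
		u64 sum = kvm->arch.possible_nx_huge_pages[KVM_SHADOW_MMU].nr_pages;

	#ifdef CONFIG_X86_64
		sum += kvm->arch.possible_nx_huge_pages[KVM_TDP_MMU].nr_pages;
	#endif
		WARN_ON_ONCE(kvm->stat.nx_lpage_splits != sum);
	}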

Suggested-by: Sean Christopherson <seanjc@google.com>
Suggested-by: David Matlack <dmatlack@google.com>
Signed-off-by: Vipin Sharma <vipinsh@google.com>
Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/x86/include/asm/kvm_host.h | 39 ++++++++++++++--------
 arch/x86/kvm/mmu/mmu.c          | 58 +++++++++++++++++++++------------
 arch/x86/kvm/mmu/mmu_internal.h |  7 ++--
 arch/x86/kvm/mmu/tdp_mmu.c      |  4 +--
 4 files changed, 71 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4a391929cdba..9df15c9717771 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1334,6 +1334,30 @@ enum kvm_apicv_inhibit {
 	__APICV_INHIBIT_REASON(SEV),			\
 	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
 
+struct kvm_possible_nx_huge_pages {
+	/*
+	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
+	 * replaced by an NX huge page.  A shadow page is on this list if its
+	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+	 * and there are no other conditions that prevent a huge page, e.g.
+	 * the backing host page is huge, dirty logging is not enabled for its
+	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
+	 * guarantee an NX huge page will be created in its stead, e.g. if the
+	 * guest attempts to execute from the region then KVM obviously can't
+	 * create an NX huge page (without hanging the guest).
+	 */
+	struct list_head pages;
+	u64 nr_pages;
+};
+
+enum kvm_mmu_type {
+	KVM_SHADOW_MMU,
+#ifdef CONFIG_X86_64
+	KVM_TDP_MMU,
+#endif
+	KVM_NR_MMU_TYPES,
+};
+
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
@@ -1346,18 +1370,7 @@ struct kvm_arch {
 	bool pre_fault_allowed;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	struct list_head active_mmu_pages;
-	/*
-	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
-	 * replaced by an NX huge page.  A shadow page is on this list if its
-	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
-	 * and there are no other conditions that prevent a huge page, e.g.
-	 * the backing host page is huge, dirty logging is not enabled for its
-	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
-	 * guarantee an NX huge page will be created in its stead, e.g. if the
-	 * guest attempts to execute from the region then KVM obviously can't
-	 * create an NX huge page (without hanging the guest).
-	 */
-	struct list_head possible_nx_huge_pages;
+	struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
 #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
 	struct kvm_page_track_notifier_head track_notifier_head;
 #endif
@@ -1516,7 +1529,7 @@ struct kvm_arch {
 	 * is held in read mode:
 	 *  - tdp_mmu_roots (above)
 	 *  - the link field of kvm_mmu_page structs used by the TDP MMU
-	 *  - possible_nx_huge_pages;
+	 *  - possible_nx_huge_pages[KVM_TDP_MMU];
 	 *  - the possible_nx_huge_page_link field of kvm_mmu_page structs used
 	 *    by the TDP MMU
 	 * Because the lock is only taken within the MMU lock, strictly
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e06e2e89a8fa..f44d7f3acc179 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -65,9 +65,9 @@ int __read_mostly nx_huge_pages = -1;
 static uint __read_mostly nx_huge_pages_recovery_period_ms;
 #ifdef CONFIG_PREEMPT_RT
 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
-static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+unsigned int __read_mostly nx_huge_pages_recovery_ratio;
 #else
-static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+unsigned int __read_mostly nx_huge_pages_recovery_ratio = 60;
 #endif
 
 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
 }
 
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				 enum kvm_mmu_type mmu_type)
 {
 	/*
 	 * If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		return;
 
 	++kvm->stat.nx_lpage_splits;
+	++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_add_tail(&sp->possible_nx_huge_page_link,
-		      &kvm->arch.possible_nx_huge_pages);
+		      &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
 }
 
 static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	sp->nx_huge_page_disallowed = true;
 
 	if (nx_huge_page_possible)
-		track_possible_nx_huge_page(kvm, sp);
+		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				   enum kvm_mmu_type mmu_type)
 {
 	if (list_empty(&sp->possible_nx_huge_page_link))
 		return;
 
 	--kvm->stat.nx_lpage_splits;
+	--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_del_init(&sp->possible_nx_huge_page_link);
 }
 
@@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	sp->nx_huge_page_disallowed = false;
 
-	untrack_possible_nx_huge_page(kvm, sp);
+	untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -6684,9 +6688,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
 void kvm_mmu_init_vm(struct kvm *kvm)
 {
+	int i;
+
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
 	if (tdp_mmu_enabled)
@@ -7519,16 +7526,27 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 	return err;
 }
 
-static void kvm_recover_nx_huge_pages(struct kvm *kvm)
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+					  enum kvm_mmu_type mmu_type)
+{
+	unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+	unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+				      enum kvm_mmu_type mmu_type)
 {
-	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+	struct list_head *nx_huge_pages;
 	struct kvm_memory_slot *slot;
-	int rcu_idx;
 	struct kvm_mmu_page *sp;
-	unsigned int ratio;
 	LIST_HEAD(invalid_list);
 	bool flush = false;
-	ulong to_zap;
+	int rcu_idx;
+
+	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
 	write_lock(&kvm->mmu_lock);
@@ -7540,10 +7558,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	 */
 	rcu_read_lock();
 
-	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
-	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
 	for ( ; to_zap; --to_zap) {
-		if (list_empty(&kvm->arch.possible_nx_huge_pages))
+		if (list_empty(nx_huge_pages))
 			break;
 
 		/*
@@ -7553,7 +7569,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 		 * the total number of shadow pages.  And because the TDP MMU
 		 * doesn't use active_mmu_pages.
 		 */
-		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
+		sp = list_first_entry(nx_huge_pages,
 				      struct kvm_mmu_page,
 				      possible_nx_huge_page_link);
 		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
@@ -7590,7 +7606,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 
 		if (slot && kvm_slot_dirty_track_enabled(slot))
 			unaccount_nx_huge_page(kvm, sp);
-		else if (is_tdp_mmu_page(sp))
+		else if (mmu_type == KVM_TDP_MMU)
 			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
 		else
 			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -7621,9 +7637,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
 	struct kvm *kvm = data;
+	long remaining_time;
 	bool enabled;
 	uint period;
-	long remaining_time;
+	int i;
 
 	enabled = calc_nx_huge_pages_recovery_period(&period);
 	if (!enabled)
@@ -7638,7 +7655,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	kvm_recover_nx_huge_pages(kvm);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		kvm_recover_nx_huge_pages(kvm, i);
 	kvm->arch.nx_huge_page_last = get_jiffies_64();
 	return true;
 }
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de624..a8fd2de13f707 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -413,7 +413,10 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
 
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				 enum kvm_mmu_type mmu_type);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				   enum kvm_mmu_type mmu_type);
 
+extern unsigned int nx_huge_pages_recovery_ratio;
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1f..48b070f9f4e13 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	sp->nx_huge_page_disallowed = false;
-	untrack_possible_nx_huge_page(kvm, sp);
+	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 }
 
@@ -1303,7 +1303,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		    fault->req_level >= iter.level) {
 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 			if (sp->nx_huge_page_disallowed)
-				track_possible_nx_huge_page(kvm, sp);
+				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 		}
 	}
-- 
2.50.0.rc2.692.g299adb8693-goog
Re: [PATCH v4 1/7] KVM: x86/mmu: Track TDP MMU NX huge pages separately
Posted by kernel test robot 3 months, 3 weeks ago
Hi James,

kernel test robot noticed the following build errors:

[auto build test ERROR on 8046d29dde17002523f94d3e6e0ebe486ce52166]

url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-x86-mmu-Track-TDP-MMU-NX-huge-pages-separately/20250614-042620
base:   8046d29dde17002523f94d3e6e0ebe486ce52166
patch link:    https://lore.kernel.org/r/20250613202315.2790592-2-jthoughton%40google.com
patch subject: [PATCH v4 1/7] KVM: x86/mmu: Track TDP MMU NX huge pages separately
config: i386-randconfig-003-20250614 (https://download.01.org/0day-ci/archive/20250614/202506142050.kfDUdARX-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250614/202506142050.kfDUdARX-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202506142050.kfDUdARX-lkp@intel.com/

All errors (new ones prefixed by >>):

   arch/x86/kvm/mmu/mmu.c: In function 'kvm_recover_nx_huge_pages':
>> arch/x86/kvm/mmu/mmu.c:7609:38: error: 'KVM_TDP_MMU' undeclared (first use in this function)
    7609 |                 else if (mmu_type == KVM_TDP_MMU)
         |                                      ^~~~~~~~~~~
   arch/x86/kvm/mmu/mmu.c:7609:38: note: each undeclared identifier is reported only once for each function it appears in


vim +/KVM_TDP_MMU +7609 arch/x86/kvm/mmu/mmu.c

  7537	
  7538	static void kvm_recover_nx_huge_pages(struct kvm *kvm,
  7539					      enum kvm_mmu_type mmu_type)
  7540	{
  7541		unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
  7542		struct list_head *nx_huge_pages;
  7543		struct kvm_memory_slot *slot;
  7544		struct kvm_mmu_page *sp;
  7545		LIST_HEAD(invalid_list);
  7546		bool flush = false;
  7547		int rcu_idx;
  7548	
  7549		nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
  7550	
  7551		rcu_idx = srcu_read_lock(&kvm->srcu);
  7552		write_lock(&kvm->mmu_lock);
  7553	
  7554		/*
  7555		 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
  7556		 * be done under RCU protection, because the pages are freed via RCU
  7557		 * callback.
  7558		 */
  7559		rcu_read_lock();
  7560	
  7561		for ( ; to_zap; --to_zap) {
  7562			if (list_empty(nx_huge_pages))
  7563				break;
  7564	
  7565			/*
  7566			 * We use a separate list instead of just using active_mmu_pages
  7567			 * because the number of shadow pages that can be replaced with an
  7568			 * NX huge page is expected to be relatively small compared to
  7569			 * the total number of shadow pages.  And because the TDP MMU
  7570			 * doesn't use active_mmu_pages.
  7571			 */
  7572			sp = list_first_entry(nx_huge_pages,
  7573					      struct kvm_mmu_page,
  7574					      possible_nx_huge_page_link);
  7575			WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
  7576			WARN_ON_ONCE(!sp->role.direct);
  7577	
  7578			/*
  7579			 * Unaccount and do not attempt to recover any NX Huge Pages
  7580			 * that are being dirty tracked, as they would just be faulted
  7581			 * back in as 4KiB pages. The NX Huge Pages in this slot will be
  7582			 * recovered, along with all the other huge pages in the slot,
  7583			 * when dirty logging is disabled.
  7584			 *
  7585			 * Since gfn_to_memslot() is relatively expensive, it helps to
  7586			 * skip it if the test cannot possibly return true.  On the
  7587			 * other hand, if any memslot has logging enabled, chances are
  7588			 * good that all of them do, in which case unaccount_nx_huge_page()
  7589			 * is much cheaper than zapping the page.
  7590			 *
  7591			 * If a memslot update is in progress, reading an incorrect value
  7592			 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
  7593			 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
  7594			 * it is becoming nonzero, the page will be zapped unnecessarily.
  7595			 * Either way, this only affects efficiency in racy situations,
  7596			 * and not correctness.
  7597			 */
  7598			slot = NULL;
  7599			if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
  7600				struct kvm_memslots *slots;
  7601	
  7602				slots = kvm_memslots_for_spte_role(kvm, sp->role);
  7603				slot = __gfn_to_memslot(slots, sp->gfn);
  7604				WARN_ON_ONCE(!slot);
  7605			}
  7606	
  7607			if (slot && kvm_slot_dirty_track_enabled(slot))
  7608				unaccount_nx_huge_page(kvm, sp);
> 7609			else if (mmu_type == KVM_TDP_MMU)
  7610				flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
  7611			else
  7612				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
  7613			WARN_ON_ONCE(sp->nx_huge_page_disallowed);
  7614	
  7615			if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
  7616				kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
  7617				rcu_read_unlock();
  7618	
  7619				cond_resched_rwlock_write(&kvm->mmu_lock);
  7620				flush = false;
  7621	
  7622				rcu_read_lock();
  7623			}
  7624		}
  7625		kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
  7626	
  7627		rcu_read_unlock();
  7628	
  7629		write_unlock(&kvm->mmu_lock);
  7630		srcu_read_unlock(&kvm->srcu, rcu_idx);
  7631	}
  7632	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v4 1/7] KVM: x86/mmu: Track TDP MMU NX huge pages separately
Posted by James Houghton 3 months, 3 weeks ago
> All errors (new ones prefixed by >>):
> 
>    arch/x86/kvm/mmu/mmu.c: In function 'kvm_recover_nx_huge_pages':
> >> arch/x86/kvm/mmu/mmu.c:7609:38: error: 'KVM_TDP_MMU' undeclared (first use in this function)
>     7609 |                 else if (mmu_type == KVM_TDP_MMU)
>          |                                      ^~~~~~~~~~~
>    arch/x86/kvm/mmu/mmu.c:7609:38: note: each undeclared identifier is reported only once for each function it appears in

Sorry for not trying to build on i386. :(
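
The problem is that KVM_TDP_MMU is only declared when CONFIG_X86_64 is
set, but kvm_recover_nx_huge_pages() compares mmu_type against it
unconditionally, so the i386 build breaks.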

Fixup for this, as Sean originally had[1]:

[1]: https://lore.kernel.org/kvm/ZyJCjJx2lxnEnDwa@google.com/

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9df15c9717771..d544a269c1920 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1358,6 +1358,10 @@ enum kvm_mmu_type {
 	KVM_NR_MMU_TYPES,
 };
 
+#ifndef CONFIG_X86_64
+#define KVM_TDP_MMU -1
+#endif
+
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
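
With KVM_TDP_MMU defined as -1 for 32-bit builds, the comparison in
kvm_recover_nx_huge_pages() compiles but can never be true: the
recovery loop only iterates over [0, KVM_NR_MMU_TYPES), and on
!CONFIG_X86_64 that range contains only KVM_SHADOW_MMU.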