Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
leafs within the mirror root.

Before zapping a specific GFN range in the mirror root, split any huge leaf
that intersects with the boundary of the GFN range to ensure that the
subsequent zap operation does not impact any GFN outside the specified
range. This is crucial for the mirror root as the private page table
requires the guest's ACCEPT operation after faulting back a GFN.

This function should be called while kvm->mmu_lock is held for writing. The
kvm->mmu_lock is temporarily released to allocate memory for sp for split.
The only expected error is -ENOMEM.

Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
the mirror root affects a GFN outside the specified range.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
arch/x86/kvm/mmu/mmu.c | 21 +++++++
arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/mmu/tdp_mmu.h | 1 +
include/linux/kvm_host.h | 1 +
4 files changed, 136 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0e227199d73e..0d49c69b6b55 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
start, end - 1, can_yield, true, flush);
}
+/*
+ * Split large leafs at the boundary of the specified range for the mirror root
+ *
+ * Return value:
+ * 0 : success, no flush is required;
+ * 1 : success, flush is required;
+ * <0: failure.
+ */
+int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool ret = 0;
+
+ lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
+ lockdep_is_held(&kvm->slots_lock));
+
+ if (tdp_mmu_enabled)
+ ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
+
+ return ret;
+}
+
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool flush = false;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0f683753a7bb..d3fba5d11ea2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
@@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
return true;
}
+static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
+ struct tdp_iter *iter, gfn_t start, gfn_t end)
+{
+ if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
+ return false;
+
+ /* Fully contained, no need to split */
+ if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
+ return false;
+
+ return true;
+}
+
/*
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
@@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
+ WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
+
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
@@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
return 0;
}
-static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool shared);
-
/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
@@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return ret;
}
+/*
+ * Split large leafs at the boundary of the specified range for the mirror root
+ */
+static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool *flush)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+
+ WARN_ON_ONCE(!can_yield);
+
+ if (!is_mirror_sp(root))
+ return 0;
+
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
+retry:
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
+ *flush = false;
+ continue;
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !iter_split_required(kvm, root, &iter, start, end))
+ continue;
+
+ if (!sp) {
+ rcu_read_unlock();
+
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split(true);
+
+ write_lock(&kvm->mmu_lock);
+
+ if (!sp) {
+ trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
+ iter.level, -ENOMEM);
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
+ }
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))
+ goto retry;
+
+ sp = NULL;
+ /*
+ * Set yielded in case after splitting to a lower level,
+ * the new iter requires further splitting.
+ */
+ iter.yielded = true;
+ *flush = true;
+ }
+
+ rcu_read_unlock();
+
+ /* Leave it here though it should be impossible for the mirror root */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+ return 0;
+}
+
+int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+ int ret;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
+ ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
+ range->may_block, &flush);
+ if (ret < 0) {
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return ret;
+ }
+ }
+ return flush;
+}
+
/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..806a21d4f0e3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
enum kvm_tdp_mmu_root_types root_types);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
+int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 655d36e1f4db..19d7a577e7ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -272,6 +272,7 @@ struct kvm_gfn_range {
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
#endif
enum {
--
2.43.2
On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
> leafs within the mirror root.
>
> Before zapping a specific GFN range in the mirror root, split any huge leaf
> that intersects with the boundary of the GFN range to ensure that the
> subsequent zap operation does not impact any GFN outside the specified
> range. This is crucial for the mirror root as the private page table
> requires the guest's ACCEPT operation after faulting back a GFN.
>
> This function should be called while kvm->mmu_lock is held for writing. The
> kvm->mmu_lock is temporarily released to allocate memory for sp for split.
> The only expected error is -ENOMEM.
>
> Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
> the mirror root affects a GFN outside the specified range.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> ---
> arch/x86/kvm/mmu/mmu.c | 21 +++++++
> arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/mmu/tdp_mmu.h | 1 +
> include/linux/kvm_host.h | 1 +
> 4 files changed, 136 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0e227199d73e..0d49c69b6b55 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
> start, end - 1, can_yield, true, flush);
> }
>
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + *
> + * Return value:
> + * 0 : success, no flush is required;
> + * 1 : success, flush is required;
> + * <0: failure.
> + */
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> + bool ret = 0;
> +
> + lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
> + lockdep_is_held(&kvm->slots_lock));
> +
> + if (tdp_mmu_enabled)
> + ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
> +
> + return ret;
> +}
> +
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> {
> bool flush = false;
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 0f683753a7bb..d3fba5d11ea2 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> u64 old_spte, u64 new_spte, int level,
> bool shared);
>
> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> + struct kvm_mmu_page *sp, bool shared);
> static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
> static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
>
> @@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> return true;
> }
>
> +static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
> + struct tdp_iter *iter, gfn_t start, gfn_t end)
> +{
> + if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
> + return false;
> +
> + /* Fully contained, no need to split */
> + if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
> + return false;
> +
> + return true;
> +}
> +
> /*
> * If can_yield is true, will release the MMU lock and reschedule if the
> * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> !is_last_spte(iter.old_spte, iter.level))
> continue;
>
> + WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> +
Kind of unrelated change? But good idea. Maybe for another patch.
> tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>
> /*
> @@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
> return 0;
> }
>
> -static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> - struct kvm_mmu_page *sp, bool shared);
> -
> /*
> * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
> * page tables and SPTEs to translate the faulting guest physical address.
> @@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> return ret;
> }
>
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + */
> +static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> + gfn_t start, gfn_t end, bool can_yield, bool *flush)
> +{
> + struct kvm_mmu_page *sp = NULL;
> + struct tdp_iter iter;
> +
> + WARN_ON_ONCE(!can_yield);
Why pass this in then?
> +
> + if (!is_mirror_sp(root))
> + return 0;
What is special about mirror roots here?
> +
> + end = min(end, tdp_mmu_max_gfn_exclusive());
> +
> + lockdep_assert_held_write(&kvm->mmu_lock);
> +
> + rcu_read_lock();
> +
> + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
> +retry:
> + if (can_yield &&
Do we need this part of the conditional based on the above?
> + tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
> + *flush = false;
> + continue;
> + }
> +
> + if (!is_shadow_present_pte(iter.old_spte) ||
> + !is_last_spte(iter.old_spte, iter.level) ||
> + !iter_split_required(kvm, root, &iter, start, end))
> + continue;
> +
> + if (!sp) {
> + rcu_read_unlock();
> +
> + write_unlock(&kvm->mmu_lock);
> +
> + sp = tdp_mmu_alloc_sp_for_split(true);
> +
> + write_lock(&kvm->mmu_lock);
> +
> + if (!sp) {
> + trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
> + iter.level, -ENOMEM);
> + return -ENOMEM;
> + }
> + rcu_read_lock();
> +
> + iter.yielded = true;
> + continue;
> + }
> + tdp_mmu_init_child_sp(sp, &iter);
> +
> + if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))
I think it can't fail when you hold mmu write lock.
> + goto retry;
> +
> + sp = NULL;
> + /*
> + * Set yielded in case after splitting to a lower level,
> > + * the new iter requires further splitting.
> + */
> + iter.yielded = true;
> + *flush = true;
> + }
> +
> + rcu_read_unlock();
> +
> + /* Leave it here though it should be impossible for the mirror root */
> + if (sp)
> + tdp_mmu_free_sp(sp);
What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
this to an optimization patch at the end?
Or what about just two calls to tdp_mmu_split_huge_pages_root() at the
boundaries?
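Something like this, maybe (untested; assumes tdp_mmu_split_huge_pages_root()
keeps its current signature, and an already-aligned boundary would get split
needlessly unless guarded):

        ret = tdp_mmu_split_huge_pages_root(kvm, root, start, start + 1,
                                            PG_LEVEL_4K, false);
        if (!ret)
                ret = tdp_mmu_split_huge_pages_root(kvm, root, end - 1, end,
                                                    PG_LEVEL_4K, false);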
> + return 0;
> +}
> +
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> + enum kvm_tdp_mmu_root_types types;
> + struct kvm_mmu_page *root;
> + bool flush = false;
> + int ret;
> +
> + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
What is the reason for KVM_INVALID_ROOTS in this case?
> +
> + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
It would be better to check for mirror roots here, instead of inside
tdp_mmu_split_boundary_leafs().
> + ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
> + range->may_block, &flush);
> + if (ret < 0) {
> + if (flush)
> + kvm_flush_remote_tlbs(kvm);
> +
> + return ret;
> + }
> + }
> + return flush;
> +}
> +
> /* Used by mmu notifier via kvm_unmap_gfn_range() */
> bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
> bool flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 52acf99d40a0..806a21d4f0e3 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
> void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
> enum kvm_tdp_mmu_root_types root_types);
> void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
>
> int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 655d36e1f4db..19d7a577e7ed 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -272,6 +272,7 @@ struct kvm_gfn_range {
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
> #endif
>
> enum {
On Wed, May 14, 2025 at 06:56:26AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> > Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
> > leafs within the mirror root.
> >
> > Before zapping a specific GFN range in the mirror root, split any huge leaf
> > that intersects with the boundary of the GFN range to ensure that the
> > subsequent zap operation does not impact any GFN outside the specified
> > range. This is crucial for the mirror root as the private page table
> > requires the guest's ACCEPT operation after faulting back a GFN.
> >
> > This function should be called while kvm->mmu_lock is held for writing. The
> > kvm->mmu_lock is temporarily released to allocate memory for sp for split.
> > The only expected error is -ENOMEM.
> >
> > Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
> > the mirror root affects a GFN outside the specified range.
> >
> > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > ---
> > arch/x86/kvm/mmu/mmu.c | 21 +++++++
> > arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
> > arch/x86/kvm/mmu/tdp_mmu.h | 1 +
> > include/linux/kvm_host.h | 1 +
> > 4 files changed, 136 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 0e227199d73e..0d49c69b6b55 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
> > start, end - 1, can_yield, true, flush);
> > }
> >
> > +/*
> > + * Split large leafs at the boundary of the specified range for the mirror root
> > + *
> > + * Return value:
> > + * 0 : success, no flush is required;
> > + * 1 : success, flush is required;
> > + * <0: failure.
> > + */
> > +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
> > +{
> > + bool ret = 0;
> > +
> > + lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
> > + lockdep_is_held(&kvm->slots_lock));
> > +
> > + if (tdp_mmu_enabled)
> > + ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
> > +
> > + return ret;
> > +}
> > +
> > bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> > {
> > bool flush = false;
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 0f683753a7bb..d3fba5d11ea2 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> > u64 old_spte, u64 new_spte, int level,
> > bool shared);
> >
> > +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> > + struct kvm_mmu_page *sp, bool shared);
> > static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
> > static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
> >
> > @@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > return true;
> > }
> >
> > +static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
> > + struct tdp_iter *iter, gfn_t start, gfn_t end)
> > +{
> > + if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
> > + return false;
> > +
> > + /* Fully contained, no need to split */
> > + if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > /*
> > * If can_yield is true, will release the MMU lock and reschedule if the
> > * scheduler needs the CPU or there is contention on the MMU lock. If this
> > @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > !is_last_spte(iter.old_spte, iter.level))
> > continue;
> >
> > + WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> > +
>
> Kind of unrelated change? But good idea. Maybe for another patch.
Yes, will move it to a separate patch in a formal version.
As an initial RFC, I hoped to show the related changes in one patch to give the
whole picture.
> > tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> >
> > /*
> > @@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
> > return 0;
> > }
> >
> > -static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> > - struct kvm_mmu_page *sp, bool shared);
> > -
> > /*
> > * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
> > * page tables and SPTEs to translate the faulting guest physical address.
> > @@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > return ret;
> > }
> >
> > +/*
> > + * Split large leafs at the boundary of the specified range for the mirror root
> > + */
> > +static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t start, gfn_t end, bool can_yield, bool *flush)
> > +{
> > + struct kvm_mmu_page *sp = NULL;
> > + struct tdp_iter iter;
> > +
> > + WARN_ON_ONCE(!can_yield);
>
> Why pass this in then?
Right, can move the warning up to the caller.
Currently the only callers of kvm_split_boundary_leafs() are
kvm_arch_pre_set_memory_attributes() and kvm_gmem_punch_hole(), so can_yield is
always true.
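e.g., in the caller (untested sketch):

        /*
         * Splitting may drop mmu_lock to allocate memory, so the caller must
         * be able to block.
         */
        WARN_ON_ONCE(!range->may_block);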
> > +
> > + if (!is_mirror_sp(root))
> > + return 0;
>
> What is special about mirror roots here?
Hmm, I thought only the mirror root needs splitting before zapping of the
S-EPT, because faulting a GFN back requires the guest's acceptance. Other roots
can tolerate zapping of a larger range than required.
Maybe AMD guys can shout out if I'm wrong.
> > + end = min(end, tdp_mmu_max_gfn_exclusive());
> > +
> > + lockdep_assert_held_write(&kvm->mmu_lock);
> > +
> > + rcu_read_lock();
> > +
> > + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
> > +retry:
> > + if (can_yield &&
>
> Do we need this part of the conditional based on the above?
No need if we don't pass in can_yield.
> > + tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
> > + *flush = false;
> > + continue;
> > + }
> > +
> > + if (!is_shadow_present_pte(iter.old_spte) ||
> > + !is_last_spte(iter.old_spte, iter.level) ||
> > + !iter_split_required(kvm, root, &iter, start, end))
> > + continue;
> > +
> > + if (!sp) {
> > + rcu_read_unlock();
> > +
> > + write_unlock(&kvm->mmu_lock);
> > +
> > + sp = tdp_mmu_alloc_sp_for_split(true);
> > +
> > + write_lock(&kvm->mmu_lock);
> > +
> > + if (!sp) {
> > + trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
> > + iter.level, -ENOMEM);
> > + return -ENOMEM;
> > + }
> > + rcu_read_lock();
> > +
> > + iter.yielded = true;
> > + continue;
> > + }
> > + tdp_mmu_init_child_sp(sp, &iter);
> > +
> > + if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))
>
> I think it can't fail when you hold mmu write lock.
You are right!
Thanks for catching it.
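Will drop the retry then and just assert it, e.g. (untested):

        tdp_mmu_init_child_sp(sp, &iter);
        /* Cannot fail: mmu_lock is held for write, nothing can race the split. */
        KVM_BUG_ON(tdp_mmu_split_huge_page(kvm, &iter, sp, false), kvm);
        sp = NULL;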
> > + goto retry;
> > +
> > + sp = NULL;
> > + /*
> > + * Set yielded in case after splitting to a lower level,
> > > + * the new iter requires further splitting.
> > + */
> > + iter.yielded = true;
> > + *flush = true;
> > + }
> > +
> > + rcu_read_unlock();
> > +
> > + /* Leave it here though it should be impossible for the mirror root */
> > + if (sp)
> > + tdp_mmu_free_sp(sp);
>
> What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
> this to an optimization patch at the end?
>
> Or what about just two calls to tdp_mmu_split_huge_pages_root() at the
> boundaries?
Though the two generally look the same, relying on
tdp_mmu_split_huge_pages_root() will create several minor changes scattered
throughout tdp_mmu_split_huge_pages_root(),
e.g. updating flush after tdp_mmu_iter_cond_resched(), checking
iter_split_required(), and setting "iter.yielded = true".

So, it may be hard to review as an initial RFC.

I prefer to do that after Paolo and Sean have taken a look at it :)
> > + return 0;
> > +}
> > +
> > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> > +{
> > + enum kvm_tdp_mmu_root_types types;
> > + struct kvm_mmu_page *root;
> > + bool flush = false;
> > + int ret;
> > +
> > + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
>
> What is the reason for KVM_INVALID_ROOTS in this case?
I wanted to keep consistent with that in kvm_tdp_mmu_unmap_gfn_range().
Yes, we can remove the KVM_INVALID_ROOTS.
> > +
> > + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
>
> It would be better to check for mirror roots here, instead of inside
> tdp_mmu_split_boundary_leafs().
Ok.
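i.e., something like this in kvm_tdp_mmu_gfn_range_split_boundary(), with the
is_mirror_sp() check dropped from tdp_mmu_split_boundary_leafs() (sketch):

        __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
                /* Only the mirror root needs boundary splitting before the zap. */
                if (!is_mirror_sp(root))
                        continue;

                ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start,
                                                   range->end, range->may_block,
                                                   &flush);
                ...
        }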
>
> > + ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
> > + range->may_block, &flush);
> > + if (ret < 0) {
> > + if (flush)
> > + kvm_flush_remote_tlbs(kvm);
> > +
> > + return ret;
> > + }
> > + }
> > + return flush;
> > +}
> > +
> > /* Used by mmu notifier via kvm_unmap_gfn_range() */
> > bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
> > bool flush)
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> > index 52acf99d40a0..806a21d4f0e3 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.h
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> > @@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
> > void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
> > enum kvm_tdp_mmu_root_types root_types);
> > void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
> > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> > int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 655d36e1f4db..19d7a577e7ed 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -272,6 +272,7 @@ struct kvm_gfn_range {
> > bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
> > #endif
> >
> > enum {
>
On Fri, May 16, 2025 at 03:46:53PM +0800, Yan Zhao wrote:
> > > + goto retry;
> > > +
> > > + sp = NULL;
> > > + /*
> > > + * Set yielded in case after splitting to a lower level,
> > > + * the new iter requires further splitting.
> > > + */
> > > + iter.yielded = true;
> > > + *flush = true;
> > > + }
> > > +
> > > + rcu_read_unlock();
> > > +
> > > + /* Leave it here though it should be impossible for the mirror root */
> > > + if (sp)
> > > + tdp_mmu_free_sp(sp);
> >
> > What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
> > this to an optimization patch at the end?
> >
> > Or what about just two calls to tdp_mmu_split_huge_pages_root() at the
> > boundaries?
> Though the two generally look the same, relying on
> tdp_mmu_split_huge_pages_root() will create several minor changes scattered
> throughout tdp_mmu_split_huge_pages_root(),
> e.g. updating flush after tdp_mmu_iter_cond_resched(), checking
> iter_split_required(), and setting "iter.yielded = true".
>
> So, it may be hard to review as an initial RFC.
>
> I prefer to do that after Paolo and Sean have taken a look at it :)
Oh, I might have misunderstood your meaning.
Yes, if necessary, we can provide a separate patch at the end to combine the
code of tdp_mmu_split_huge_pages_root() and tdp_mmu_split_boundary_leafs().
On Fri, 2025-05-16 at 19:44 +0800, Yan Zhao wrote:
> > > What do you think about relying on tdp_mmu_split_huge_pages_root() and
> > > moving
> > > this to an optimization patch at the end?
> > >
> > > Or what about just two calls to tdp_mmu_split_huge_pages_root() at the
> > > boundaries?
> > Though the two generally look the same, relying on
> > tdp_mmu_split_huge_pages_root() will create several minor changes scattered
> > throughout tdp_mmu_split_huge_pages_root(),
> > e.g. updating flush after tdp_mmu_iter_cond_resched(), checking
> > iter_split_required(), and setting "iter.yielded = true".
> >
> > So, it may be hard to review as an initial RFC.
> >
> > I prefer to do that after Paolo and Sean have taken a look at it :)
>
> Oh, I might have misunderstood your meaning.
> Yes, if necessary, we can provide a separate patch at the end to combine the
> code of tdp_mmu_split_huge_pages_root() and tdp_mmu_split_boundary_leafs().

Hmm, I'm not sure if they will look at this version or wait until Intel folks
work through it a bit.

As for reviewability, the log could simply explain that
tdp_mmu_split_huge_pages_root() is the simple option and an optimization patch
will follow. I think it's helpful to separate optimization from implementation.
It can be confusing what is for which purpose.
On Fri, May 16, 2025 at 03:46:53PM +0800, Yan Zhao wrote:
> On Wed, May 14, 2025 at 06:56:26AM +0800, Edgecombe, Rick P wrote:
> > On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> > > /*
> > > * If can_yield is true, will release the MMU lock and reschedule if the
> > > * scheduler needs the CPU or there is contention on the MMU lock. If this
> > > @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > > !is_last_spte(iter.old_spte, iter.level))
> > > continue;
> > >
> > > + WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> > > +
> >
> > Kind of unrelated change? But good idea. Maybe for another patch.
> Yes, will move it to a separate patch in a formal version.
> As an initial RFC, I hoped to show the related changes in one patch to give the
> whole picture.
>
> > > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> > > +{
> > > + enum kvm_tdp_mmu_root_types types;
> > > + struct kvm_mmu_page *root;
> > > + bool flush = false;
> > > + int ret;
> > > +
> > > + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
> >
> > What is the reason for KVM_INVALID_ROOTS in this case?
> I wanted to keep consistent with that in kvm_tdp_mmu_unmap_gfn_range().
With this consistency, we can warn in tdp_mmu_zap_leafs() as below though
there should be no invalid mirror root.
WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
On Fri, 2025-05-16 at 16:03 +0800, Yan Zhao wrote:
> >
> > > > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct
> > > > kvm_gfn_range *range)
> > > > +{
> > > > + enum kvm_tdp_mmu_root_types types;
> > > > + struct kvm_mmu_page *root;
> > > > + bool flush = false;
> > > > + int ret;
> > > > +
> > > > + types = kvm_gfn_range_filter_to_root_types(kvm, range-
> > > > >attr_filter) | KVM_INVALID_ROOTS;
> > >
> > > What is the reason for KVM_INVALID_ROOTS in this case?
> > I wanted to keep consistent with that in kvm_tdp_mmu_unmap_gfn_range().
Yea, lack of consistency would raise other questions.
> With this consistency, we can warn in tdp_mmu_zap_leafs() as below though
> there should be no invalid mirror root.
>
> WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
>
Hmm, let's be clear about the logic. This is essentially a mirror TDP only
function, and there we don't have the same invalid root scenarios as the more
complicated cases. I'm not exactly sure how we could hit the warning if they
didn't match. I guess a hole punch on the fd while the TD is getting torn down?
Let's comment the reasoning at least.
On Sat, May 17, 2025 at 06:27:10AM +0800, Edgecombe, Rick P wrote:
> On Fri, 2025-05-16 at 16:03 +0800, Yan Zhao wrote:
> > >
> > > > > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct
> > > > > kvm_gfn_range *range)
> > > > > +{
> > > > > + enum kvm_tdp_mmu_root_types types;
> > > > > + struct kvm_mmu_page *root;
> > > > > + bool flush = false;
> > > > > + int ret;
> > > > > +
> > > > > + types = kvm_gfn_range_filter_to_root_types(kvm, range-
> > > > > >attr_filter) | KVM_INVALID_ROOTS;
> > > >
> > > > What is the reason for KVM_INVALID_ROOTS in this case?
> > > I wanted to keep consistent with that in kvm_tdp_mmu_unmap_gfn_range().
>
> Yea, lack of consistency would raise other questions.
>
> > With this consistency, we can warn in tdp_mmu_zap_leafs() as below though
> > there should be no invalid mirror root.
> >
> > WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> >
>
> Hmm, let's be clear about the logic. This is essentially a mirror TDP only
> function, and there we don't have the same invalid root scenarios as the more
> complicated cases. I'm not exactly sure how we could hit the warning if they
> didn't match. I guess a hole punch on the fd while the TD is getting torn down?
In practice, the warning shouldn't be hit because the mirror root should only
be invalidated after the gmem fd is destroyed.
> Let's comment the reasoning at least.
Will do.
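Maybe something like this above the types assignment (will adjust the wording):

        /*
         * Include invalid roots for consistency with
         * kvm_tdp_mmu_unmap_gfn_range(). In practice the mirror root is only
         * invalidated when the TD is torn down, after the gmem fd has been
         * released, so no boundary leaf in an invalid mirror root is expected
         * to need splitting.
         */
        types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) |
                KVM_INVALID_ROOTS;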