[RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory

Yan Zhao posted 21 patches 7 months, 3 weeks ago
Only 20 patches received!
There is a newer version of this series
[RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory
Posted by Yan Zhao 7 months, 3 weeks ago
Splitting of huge leafs in the mirror page table for kvm_gmem_punch_hole().

Enhance kvm_gmem_invalidate_begin() to invoke kvm_split_boundary_leafs()
to split boundary huge leafs before calling kvm_mmu_unmap_gfn_range() to do
the real zapping. As kvm_split_boundary_leafs() may fail due to out of
memory, propagate the error so that kvm_gmem_punch_hole() fails as well.

Splitting huge boundary leafs in the mirror page table is not required for
kvm_gmem_release() as the entire page table is to be zapped; it's also not
required for kvm_gmem_error_folio() as a SPTE must not map more than one
physical folio.

Note: as the kvm_gmem_punch_hole() may request to zap several GFN ranges,
if an out-of-memory error occurs during the splitting of a GFN range, some
previous GFN ranges may have been successfully split and zapped.

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 virt/kvm/guest_memfd.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 4bb140e7f30d..008061734ac5 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -292,13 +292,14 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, int
 	return folio;
 }
 
-static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
-				      pgoff_t end)
+static int kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+				     pgoff_t end, bool need_split)
 {
 	bool flush = false, found_memslot = false;
 	struct kvm_memory_slot *slot;
 	struct kvm *kvm = gmem->kvm;
 	unsigned long index;
+	int ret = 0;
 
 	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
 		pgoff_t pgoff = slot->gmem.pgoff;
@@ -319,14 +320,23 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
 			kvm_mmu_invalidate_begin(kvm);
 		}
 
+		if (need_split) {
+			ret = kvm_split_boundary_leafs(kvm, &gfn_range);
+			if (ret < 0)
+				goto out;
+
+			flush |= ret;
+		}
 		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
 	}
 
+out:
 	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 
 	if (found_memslot)
 		KVM_MMU_UNLOCK(kvm);
+	return 0;
 }
 
 static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
@@ -347,6 +357,7 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	loff_t size = i_size_read(inode);
 	pgoff_t start, end;
 	struct kvm_gmem *gmem;
+	int ret = 0;
 
 	if (offset > size)
 		return 0;
@@ -361,18 +372,22 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	 */
 	filemap_invalidate_lock(inode->i_mapping);
 
-	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
+	list_for_each_entry(gmem, gmem_list, entry) {
+		ret = kvm_gmem_invalidate_begin(gmem, start, end, true);
+		if (ret < 0)
+			goto out;
+	}
 
 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
 	kvm_gmem_mark_range_unprepared(inode, start, end - start);
 
+out:
 	list_for_each_entry(gmem, gmem_list, entry)
 		kvm_gmem_invalidate_end(gmem, start, end);
 
 	filemap_invalidate_unlock(inode->i_mapping);
 
-	return 0;
+	return ret;
 }
 
 static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
@@ -440,7 +455,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
 	 * memory, as its lifetime is associated with the inode, not the file.
 	 */
-	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+	kvm_gmem_invalidate_begin(gmem, 0, -1ul, false);
 	kvm_gmem_invalidate_end(gmem, 0, -1ul);
 
 	list_del(&gmem->entry);
@@ -524,8 +539,9 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 	start = folio->index;
 	end = start + folio_nr_pages(folio);
 
+	/* The size of the SEPT will not exceed the size of the folio */
 	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
+		kvm_gmem_invalidate_begin(gmem, start, end, false);
 
 	/*
 	 * Do not truncate the range, what action is taken in response to the
-- 
2.43.2
Re: [RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory
Posted by Edgecombe, Rick P 7 months ago
On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> +static int kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> +				     pgoff_t end, bool need_split)
>  {
>  	bool flush = false, found_memslot = false;
>  	struct kvm_memory_slot *slot;
>  	struct kvm *kvm = gmem->kvm;
>  	unsigned long index;
> +	int ret = 0;
>  
>  	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
>  		pgoff_t pgoff = slot->gmem.pgoff;
> @@ -319,14 +320,23 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>  			kvm_mmu_invalidate_begin(kvm);
>  		}
>  
> +		if (need_split) {
> +			ret = kvm_split_boundary_leafs(kvm, &gfn_range);

What is the effect for other guestmemfd users? SEV doesn't need this, right? Oh
I see, down in tdp_mmu_split_boundary_leafs() it bails on non-mirror roots. I
don't like the naming then. It sounds deterministic, but it's really only
necessary splits for certain VM types.

I guess it all depends on how well teaching kvm_mmu_unmap_gfn_range() to fail
goes. But otherwise, we should call it like kvm_prepare_zap_range() or
something. And have it make it clearly do nothing for non-TDX high up where it's
easy to see.

> +			if (ret < 0)
> +				goto out;
> +
> +			flush |= ret;
> +		}
>  		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
>  	}
>  
> +out:
>  	if (flush)
>  		kvm_flush_remote_tlbs(kvm);
>  
>  	if (found_memslot)
>  		KVM_MMU_UNLOCK(kvm);
> +	

Re: [RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory
Posted by Yan Zhao 7 months ago
On Wed, May 14, 2025 at 06:59:01AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> > +static int kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> > +				     pgoff_t end, bool need_split)
> >  {
> >  	bool flush = false, found_memslot = false;
> >  	struct kvm_memory_slot *slot;
> >  	struct kvm *kvm = gmem->kvm;
> >  	unsigned long index;
> > +	int ret = 0;
> >  
> >  	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> >  		pgoff_t pgoff = slot->gmem.pgoff;
> > @@ -319,14 +320,23 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> >  			kvm_mmu_invalidate_begin(kvm);
> >  		}
> >  
> > +		if (need_split) {
> > +			ret = kvm_split_boundary_leafs(kvm, &gfn_range);
> 
> What is the effect for other guestmemfd users? SEV doesn't need this, right? Oh
> I see, down in tdp_mmu_split_boundary_leafs() it bails on non-mirror roots. I
> don't like the naming then. It sounds deterministic, but it's really only
> necessary splits for certain VM types.
Right, kvm_split_boundary_leafs() only takes effect on the mirror root.

> I guess it all depends on how well teaching kvm_mmu_unmap_gfn_range() to fail
> goes. But otherwise, we should call it like kvm_prepare_zap_range() or
Hmm, if we call it kvm_prepare_zap_range(), we have to invoke it for all zaps.
However, apart from kvm_gmem_punch_hole(), the other two callers,
kvm_gmem_error_folio() and kvm_gmem_release(), have no need to perform
splitting before zapping.
Passing in the invalidation reason to kvm_gmem_invalidate_begin() also makes
things complicated.

> something. And have it make it clearly do nothing for non-TDX high up where it's
> easy to see.
Would a name like kvm_split_boundary_leafs_for_mirror() be too TDX specific?

If we name it kvm_split_boundary_leafs(), SEV can simply remove the bail-out
in the future if they want to.

> 
> > +			if (ret < 0)
> > +				goto out;
> > +
> > +			flush |= ret;
> > +		}
> >  		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> >  	}
> >  
> > +out:
> >  	if (flush)
> >  		kvm_flush_remote_tlbs(kvm);
> >  
> >  	if (found_memslot)
> >  		KVM_MMU_UNLOCK(kvm);
> > +	
> 
Re: [RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory
Posted by Francesco Lavra 7 months, 3 weeks ago
On 2025-04-24 at 3:08, Yan Zhao wrote:
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 4bb140e7f30d..008061734ac5 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -292,13 +292,14 @@ static struct folio *kvm_gmem_get_folio(struct
> inode *inode, pgoff_t index, int
>  	return folio;
>  }
>  
> -static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t
> start,
> -				      pgoff_t end)
> +static int kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t
> start,
> +				     pgoff_t end, bool need_split)
>  {
>  	bool flush = false, found_memslot = false;
>  	struct kvm_memory_slot *slot;
>  	struct kvm *kvm = gmem->kvm;
>  	unsigned long index;
> +	int ret = 0;
>  
>  	xa_for_each_range(&gmem->bindings, index, slot, start, end -
> 1) {
>  		pgoff_t pgoff = slot->gmem.pgoff;
> @@ -319,14 +320,23 @@ static void kvm_gmem_invalidate_begin(struct
> kvm_gmem *gmem, pgoff_t start,
>  			kvm_mmu_invalidate_begin(kvm);
>  		}
>  
> +		if (need_split) {
> +			ret = kvm_split_boundary_leafs(kvm,
> &gfn_range);
> +			if (ret < 0)
> +				goto out;
> +
> +			flush |= ret;
> +		}
>  		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
>  	}
>  
> +out:
>  	if (flush)
>  		kvm_flush_remote_tlbs(kvm);
>  
>  	if (found_memslot)
>  		KVM_MMU_UNLOCK(kvm);
> +	return 0;

Should return ret, not 0
Re: [RFC PATCH 19/21] KVM: gmem: Split huge boundary leafs for punch hole of private memory
Posted by Yan Zhao 7 months, 3 weeks ago
On Thu, Apr 24, 2025 at 12:19:32PM +0200, Francesco Lavra wrote:
> On 2025-04-24 at 3:08, Yan Zhao wrote:
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index 4bb140e7f30d..008061734ac5 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -292,13 +292,14 @@ static struct folio *kvm_gmem_get_folio(struct
> > inode *inode, pgoff_t index, int
> >  	return folio;
> >  }
> >  
> > -static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t
> > start,
> > -				      pgoff_t end)
> > +static int kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t
> > start,
> > +				     pgoff_t end, bool need_split)
> >  {
> >  	bool flush = false, found_memslot = false;
> >  	struct kvm_memory_slot *slot;
> >  	struct kvm *kvm = gmem->kvm;
> >  	unsigned long index;
> > +	int ret = 0;
> >  
> >  	xa_for_each_range(&gmem->bindings, index, slot, start, end -
> > 1) {
> >  		pgoff_t pgoff = slot->gmem.pgoff;
> > @@ -319,14 +320,23 @@ static void kvm_gmem_invalidate_begin(struct
> > kvm_gmem *gmem, pgoff_t start,
> >  			kvm_mmu_invalidate_begin(kvm);
> >  		}
> >  
> > +		if (need_split) {
> > +			ret = kvm_split_boundary_leafs(kvm,
> > &gfn_range);
> > +			if (ret < 0)
> > +				goto out;
> > +
> > +			flush |= ret;
> > +		}
> >  		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> >  	}
> >  
> > +out:
> >  	if (flush)
> >  		kvm_flush_remote_tlbs(kvm);
> >  
> >  	if (found_memslot)
> >  		KVM_MMU_UNLOCK(kvm);
> > +	return 0;
> 
> Should return ret, not 0
Yes, thank you for the correction!