guest_memfd currently uses the folio uptodate flag to track:

1) whether or not a page has been cleared before initial usage
2) whether or not the architecture hooks have been issued to put the
   page in a private state as defined by the architecture

In practice, 2) is only actually being tracked for SEV-SNP VMs, and
there do not seem to be any plans/reasons that would suggest this will
change in the future, so this additional tracking/complexity is not
really providing any general benefit to guest_memfd users.

Future plans around in-place conversion and hugepage support will make
the burden of tracking this information within guest_memfd even more
complex. There, the per-folio uptodate flag is planned to be used purely
to track the initial clearing of folios, whereas conversion operations
could trigger multiple transitions between 'prepared' and 'unprepared'
and thus need separate tracking. In addition, preparation generally
happens at fault time, on the "read-side" of any global locks that might
protect state tracked by guest_memfd, so updating the "preparedness"
state as part of handling a fault may require more complex locking
schemes to allow concurrent handling of page faults for multiple vCPUs.

Instead of keeping this current/future complexity within guest_memfd for
what is essentially just SEV-SNP, drop the tracking for 2) and trigger
the arch-specific preparation hooks unconditionally on every fault, so
the arch-specific hooks can check the preparation state directly and
decide whether or not a folio still needs additional preparation. In the
case of SEV-SNP, the preparation state is already checked via the
preparation hooks to avoid double-preparation, so nothing extra needs to
be done there.
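
As a purely illustrative sketch (not part of this patch; the helper
names below are made up), the point is that an arch hook which checks
the page's current state itself is safe to call on every fault:

	/*
	 * Hypothetical, idempotent arch hook: because it queries the page's
	 * current state (e.g. the RMP entry on SEV-SNP), calling it again
	 * for an already-prepared page is simply a no-op.
	 */
	static int arch_gmem_prepare_page(kvm_pfn_t pfn, gfn_t gfn, int max_order)
	{
		if (arch_page_is_private(pfn))	/* already prepared */
			return 0;

		return arch_make_page_private(pfn, gfn, max_order);
	}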
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
virt/kvm/guest_memfd.c | 44 ++++++++++++------------------------------
1 file changed, 12 insertions(+), 32 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9dafa44838fe..8b1248f42aae 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
-{
- folio_mark_uptodate(folio);
-}
-
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, struct folio *folio)
{
- unsigned long nr_pages, i;
pgoff_t index;
- int r;
-
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
/*
* Preparing huge folios should always be safe, since it should
@@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
index = kvm_gmem_get_index(slot, gfn);
index = ALIGN_DOWN(index, folio_nr_pages(folio));
- r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
- if (!r)
- kvm_gmem_mark_prepared(folio);
- return r;
+ return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}
/*
@@ -429,7 +415,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
- kvm_gmem_mark_prepared(folio);
+ folio_mark_uptodate(folio);
}
vmf->page = folio_file_page(folio, vmf->pgoff);
@@ -766,7 +752,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
static struct folio *__kvm_gmem_get_pfn(struct file *file,
struct kvm_memory_slot *slot,
pgoff_t index, kvm_pfn_t *pfn,
- bool *is_prepared, int *max_order)
+ int *max_order)
{
struct file *slot_file = READ_ONCE(slot->gmem.file);
struct gmem_file *f = file->private_data;
@@ -796,7 +782,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
if (max_order)
*max_order = 0;
- *is_prepared = folio_test_uptodate(folio);
return folio;
}
@@ -806,19 +791,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
struct folio *folio;
- bool is_prepared = false;
int r = 0;
CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
- folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+ folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (!is_prepared)
- r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ folio_mark_uptodate(folio);
+ }
+
+ r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
folio_unlock(folio);
@@ -861,7 +849,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
struct folio *folio;
gfn_t gfn = start_gfn + i;
pgoff_t index = kvm_gmem_get_index(slot, gfn);
- bool is_prepared = false;
kvm_pfn_t pfn;
if (signal_pending(current)) {
@@ -869,19 +856,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
break;
}
- folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL);
+ folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
break;
}
- if (is_prepared) {
- folio_unlock(folio);
- folio_put(folio);
- ret = -EEXIST;
- break;
- }
-
folio_unlock(folio);
ret = -EINVAL;
@@ -893,7 +873,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
p = src ? src + i * PAGE_SIZE : NULL;
ret = post_populate(kvm, gfn, pfn, p, opaque);
if (!ret)
- kvm_gmem_mark_prepared(folio);
+ folio_mark_uptodate(folio);
put_folio_and_exit:
folio_put(folio);
--
2.25.1
> /*
> * Process @folio, which contains @gfn, so that the guest can use it.
> * The folio must be locked and the gfn must be contained in @slot.
> @@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
> static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> gfn_t gfn, struct folio *folio)
> {
> - unsigned long nr_pages, i;
> pgoff_t index;
> - int r;
> -
> - nr_pages = folio_nr_pages(folio);
> - for (i = 0; i < nr_pages; i++)
> - clear_highpage(folio_page(folio, i));
>
Here the entire folio is cleared, but ...
[...]
> - folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> if (IS_ERR(folio))
> return PTR_ERR(folio);
>
> - if (!is_prepared)
> - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> + if (!folio_test_uptodate(folio)) {
> + clear_highpage(folio_page(folio, 0));
> + folio_mark_uptodate(folio);
> + }
... here only the first page is cleared.
I understand there's no huge folio coming out of gmem now, but since
both __kvm_gmem_get_pfn() and kvm_gmem_get_pfn() still have @max_order
as an output, it's kinda inconsistent here.

I also see kvm_gmem_fault_user_mapping() only clears the first page,
but I think that already assumes the folio can never be huge currently?
Given this, and the fact that the first patch of this series has
introduced

	WARN_ON_ONCE(folio_order(folio));

in kvm_gmem_get_folio(), I think it's fine to only clear the first page,
but for the sake of consistency, perhaps we should just remove @max_order
from __kvm_gmem_get_pfn() and kvm_gmem_get_pfn()?

Then we can handle the huge folio logic when that comes into play.
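
(Just to sketch what I mean, untested and the exact prototype may differ
slightly:

 static struct folio *__kvm_gmem_get_pfn(struct file *file,
 					 struct kvm_memory_slot *slot,
-					 pgoff_t index, kvm_pfn_t *pfn,
-					 int *max_order)
+					 pgoff_t index, kvm_pfn_t *pfn)
 {

with the same treatment for kvm_gmem_get_pfn(), and callers assuming
order 0 until huge folio support actually shows up.)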
Btw:

I actually looked into the RFC v1 discussion, and the code there does
loop to clear all pages in the folio. There were some other discussions,
but AFAICT they were more related to issues around "marking the entire
folio as uptodate while only one page is processed in post_populate()".
Btw2:

There was also discussion that clearing the page isn't required for TDX.
To that end, maybe we can move the page clearing out of gmem common code
and into SEV code, e.g., as part of "folio preparation"?
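
(Roughly what I have in mind, with made-up helper names; the clearing
would move into the SEV-side prepare path:

	/* Hypothetical SEV-side helper: clear the page as part of preparation. */
	static int sev_prepare_and_clear_page(kvm_pfn_t pfn, gfn_t gfn)
	{
		clear_highpage(pfn_to_page(pfn));	/* previously done in common gmem code */
		return sev_make_page_private(pfn, gfn);	/* made-up helper */
	}
)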
On Thu, Dec 18, 2025 at 10:53:14PM +0000, Huang, Kai wrote:
>
> > /*
> > * Process @folio, which contains @gfn, so that the guest can use it.
> > * The folio must be locked and the gfn must be contained in @slot.
> > @@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
> > static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> > gfn_t gfn, struct folio *folio)
> > {
> > - unsigned long nr_pages, i;
> > pgoff_t index;
> > - int r;
> > -
> > - nr_pages = folio_nr_pages(folio);
> > - for (i = 0; i < nr_pages; i++)
> > - clear_highpage(folio_page(folio, i));
> >
>
> Here the entire folio is cleared, but ...
>
> [...]
>
> > - folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > if (IS_ERR(folio))
> > return PTR_ERR(folio);
> >
> > - if (!is_prepared)
> > - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > + if (!folio_test_uptodate(folio)) {
> > + clear_highpage(folio_page(folio, 0));
> > + folio_mark_uptodate(folio);
> > + }
>
> ... here only the first page is cleared.
>
> I understand currently there's no huge folio coming out of gmem now, but
> since both __kvm_gmem_get_pfn() and kvm_gmem_get_pfn() still have
> @max_level as output, it's kinda inconsistent here.
>
> I also see kvm_gmem_fault_user_mapping() only clears the first page too,
> but I think that already has assumption that folio can never be huge
> currently?
>
> Given this, and the fact that the first patch of this series has
> introduced
>
> WARN_ON_ONCE(folio_order(folio));
>
> in kvm_gmem_get_folio(), I think it's fine to only clear the first page,
> but for the sake of consistency, perhaps we should just remove @max_order
> from __kvm_gmem_get_pfn() and kvm_gmem_get_pfn()?
>
> Then we can handle huge folio logic when that comes to play.
Sean had mentioned during the PUCK call prior to this that he was okay
with stripping out traces of hugepage support from the
kvm_gmem_populate() path, since it brings unnecessary complexity for a
use-case we may never support. We will, however, eventually support
hugepages outside the kvm_gmem_populate() path, and the bits and pieces
of the API that plumb those details into KVM MMU code are more useful to
keep around, since the existing hugepage support in KVM MMU makes it
clearer where/when we'll need them. So I'm not sure we gain much from
the churn of stripping them out. If those interfaces prove insufficient
as part of wiring up hugepage support, I wouldn't be opposed to
similarly adding pre-patches to strip them out for a cleaner base
implementation, but I don't think that needs to be part of this series,
which is focused more on the population path than on fault handling at
run-time, so I've left things as-is for v3 for now.
>
> Btw:
>
> I actually looked into the RFC v1 discussion but the code there actually
> does a loop to clear all pages in the folio. There were some other
The thinking there was to try not to actively break the hugepage-related
bits that were already in place, but since we decided to implement the
hugepage-related handling for kvm_gmem_populate() as a clean follow-up,
there's no need to consider the hugepage case and loop through the
folio's pages here.
> discussions about AFAICT they were more related to issues regarding to
> "mark entire folio as uptodate while only one page is processed in
> post_populate()".
>
> Btw2:
>
> There was also discussion that clearing page isn't required for TDX. To
> that end, maybe we can remove clearing page from gmem common code but to
> SEV code, e.g., as part of "folio preparation"?
I think we are considering this approach for TDX, and there was some
discussion of having gmem-internal flags to select this type of
handling, but I think that would make more sense as part of the
TDX-specific enablement of in-place conversion. I'm planning to post the
SNP-specific in-place conversion patches on top of this series, so maybe
we can consider this in response to that series or as part of the
TDX-specific enablement.
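
(For reference, the rough shape of the gmem-internal flag idea, with
names invented here: common code would skip the clear when the arch
initializes pages itself, e.g. something like

	/*
	 * Hypothetical gmem-internal flag: set for VM types (e.g. TDX) where
	 * the arch-side page-add already initializes memory, so common gmem
	 * code can skip the explicit clear.
	 */
	if (!(f->flags & GMEM_ARCH_INITIALIZES_PAGES) &&
	    !folio_test_uptodate(folio))
		clear_highpage(folio_page(folio, 0));

but as mentioned, that seems better hashed out alongside the
TDX-specific enablement.)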
-Mike
> guest_memfd currently uses the folio uptodate flag to track:
>
> 1) whether or not a page has been cleared before initial usage
> 2) whether or not the architecture hooks have been issued to put the
> page in a private state as defined by the architecture
>
> In practice, 2) is only actually being tracked for SEV-SNP VMs, and
> there do not seem to be any plans/reasons that would suggest this will
> change in the future, so this additional tracking/complexity is not
> really providing any general benefit to guest_memfd users. Future plans
> around in-place conversion and hugepage support, where the per-folio
> uptodate flag is planned to be used purely to track the initial clearing
> of folios, whereas conversion operations could trigger multiple
> transitions between 'prepared' and 'unprepared' and thus need separate
> tracking, will make the burden of tracking this information within
> guest_memfd even more complex, since preparation generally happens
> during fault time, on the "read-side" of any global locks that might
> protect state tracked by guest_memfd, and so may require more complex
> locking schemes to allow for concurrent handling of page faults for
> multiple vCPUs where the "preparedness" state tracked by guest_memfd
> might need to be updated as part of handling the fault.
>
> Instead of keeping this current/future complexity within guest_memfd for
> what is essentially just SEV-SNP, just drop the tracking for 2) and have
> the arch-specific preparation hooks get triggered unconditionally on
> every fault so the arch-specific hooks can check the preparation state
> directly and decide whether or not a folio still needs additional
> preparation. In the case of SEV-SNP, the preparation state is already
> checked again via the preparation hooks to avoid double-preparation, so
> nothing extra needs to be done to update the handling of things there.
>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
> ---
> virt/kvm/guest_memfd.c | 44 ++++++++++++------------------------------
> 1 file changed, 12 insertions(+), 32 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 9dafa44838fe..8b1248f42aae 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
> return 0;
> }
>
> -static inline void kvm_gmem_mark_prepared(struct folio *folio)
> -{
> - folio_mark_uptodate(folio);
> -}
> -
> /*
> * Process @folio, which contains @gfn, so that the guest can use it.
> * The folio must be locked and the gfn must be contained in @slot.
> @@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
> static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> gfn_t gfn, struct folio *folio)
> {
> - unsigned long nr_pages, i;
> pgoff_t index;
> - int r;
> -
> - nr_pages = folio_nr_pages(folio);
> - for (i = 0; i < nr_pages; i++)
> - clear_highpage(folio_page(folio, i));
>
> /*
> * Preparing huge folios should always be safe, since it should
> @@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
> index = kvm_gmem_get_index(slot, gfn);
> index = ALIGN_DOWN(index, folio_nr_pages(folio));
> - r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> - if (!r)
> - kvm_gmem_mark_prepared(folio);
>
> - return r;
> + return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> }
>
> /*
> @@ -429,7 +415,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>
> if (!folio_test_uptodate(folio)) {
> clear_highpage(folio_page(folio, 0));
> - kvm_gmem_mark_prepared(folio);
> + folio_mark_uptodate(folio);
> }
>
> vmf->page = folio_file_page(folio, vmf->pgoff);
> @@ -766,7 +752,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> static struct folio *__kvm_gmem_get_pfn(struct file *file,
> struct kvm_memory_slot *slot,
> pgoff_t index, kvm_pfn_t *pfn,
> - bool *is_prepared, int *max_order)
> + int *max_order)
> {
> struct file *slot_file = READ_ONCE(slot->gmem.file);
> struct gmem_file *f = file->private_data;
> @@ -796,7 +782,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
> if (max_order)
> *max_order = 0;
>
> - *is_prepared = folio_test_uptodate(folio);
> return folio;
> }
>
> @@ -806,19 +791,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> {
> pgoff_t index = kvm_gmem_get_index(slot, gfn);
> struct folio *folio;
> - bool is_prepared = false;
> int r = 0;
>
> CLASS(gmem_get_file, file)(slot);
> if (!file)
> return -EFAULT;
>
> - folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> if (IS_ERR(folio))
> return PTR_ERR(folio);
>
> - if (!is_prepared)
> - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> + if (!folio_test_uptodate(folio)) {
> + clear_highpage(folio_page(folio, 0));
> + folio_mark_uptodate(folio);
> + }
> +
> + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>
> folio_unlock(folio);
>
> @@ -861,7 +849,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> struct folio *folio;
> gfn_t gfn = start_gfn + i;
> pgoff_t index = kvm_gmem_get_index(slot, gfn);
> - bool is_prepared = false;
> kvm_pfn_t pfn;
>
> if (signal_pending(current)) {
> @@ -869,19 +856,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> break;
> }
>
> - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL);
> + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
> if (IS_ERR(folio)) {
> ret = PTR_ERR(folio);
> break;
> }
>
> - if (is_prepared) {
> - folio_unlock(folio);
> - folio_put(folio);
> - ret = -EEXIST;
> - break;
> - }
> -
> folio_unlock(folio);
>
> ret = -EINVAL;
> @@ -893,7 +873,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> p = src ? src + i * PAGE_SIZE : NULL;
> ret = post_populate(kvm, gfn, pfn, p, opaque);
> if (!ret)
> - kvm_gmem_mark_prepared(folio);
> + folio_mark_uptodate(folio);
>
> put_folio_and_exit:
> folio_put(folio);
On Mon, Dec 15, 2025 at 7:35 AM Michael Roth <michael.roth@amd.com> wrote:
>
> guest_memfd currently uses the folio uptodate flag to track:
>
> [...]
>
> Signed-off-by: Michael Roth <michael.roth@amd.com>

Reviewed-By: Vishal Annapurve <vannapurve@google.com>
Tested-By: Vishal Annapurve <vannapurve@google.com>