[v1] KVM: guest_memfd: Rework preparation/population flows in prep for in-place conversion

[PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Michael Roth 2 months, 3 weeks ago

guest_memfd currently uses the folio uptodate flag to track:

  1) whether or not a page has been cleared before initial usage
  2) whether or not the architecture hooks have been issued to put the
     page in a private state as defined by the architecture

In practice, 2) is only actually being tracked for SEV-SNP VMs, and
there do not seem to be any plans/reasons that would suggest this will
change in the future, so this additional tracking/complexity is not
really providing any general benefit to guest_memfd users. Future plans
around in-place conversion and hugepage support, where the per-folio
uptodate flag is planned to be used purely to track the initial clearing
of folios, whereas conversion operations could trigger multiple
transitions between 'prepared' and 'unprepared' and thus need separate
tracking, will make the burden of tracking this information within
guest_memfd even more complex, since preparation generally happens
during fault time, on the "read-side" of any global locks that might
protect state tracked by guest_memfd, and so may require more complex
locking schemes to allow for concurrent handling of page faults for
multiple vCPUs where the "preparedness" state tracked by guest_memfd
might need to be updated as part of handling the fault.

Instead of keeping this current/future complexity within guest_memfd for
what is essentially just SEV-SNP, just drop the tracking for 2) and have
the arch-specific preparation hooks get triggered unconditionally on
every fault so the arch-specific hooks can check the preparation state
directly and decide whether or not a folio still needs additional
preparation. In the case of SEV-SNP, the preparation state is already
checked again via the preparation hooks to avoid double-preparation, so
nothing extra needs to be done to update the handling of things there.

Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 virt/kvm/guest_memfd.c | 47 ++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fdaea3422c30..9160379df378 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 	return 0;
 }
 
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
-{
-	folio_mark_uptodate(folio);
-}
-
 /*
  * Process @folio, which contains @gfn, so that the guest can use it.
  * The folio must be locked and the gfn must be contained in @slot.
@@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 				  gfn_t gfn, struct folio *folio)
 {
-	unsigned long nr_pages, i;
 	pgoff_t index;
-	int r;
-
-	nr_pages = folio_nr_pages(folio);
-	for (i = 0; i < nr_pages; i++)
-		clear_highpage(folio_page(folio, i));
 
 	/*
 	 * Preparing huge folios should always be safe, since it should
@@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
 	index = kvm_gmem_get_index(slot, gfn);
 	index = ALIGN_DOWN(index, folio_nr_pages(folio));
-	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
-	if (!r)
-		kvm_gmem_mark_prepared(folio);
 
-	return r;
+	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
 }
 
 /*
@@ -420,7 +406,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 
 	if (!folio_test_uptodate(folio)) {
 		clear_highpage(folio_page(folio, 0));
-		kvm_gmem_mark_prepared(folio);
+		folio_mark_uptodate(folio);
 	}
 
 	vmf->page = folio_file_page(folio, vmf->pgoff);
@@ -757,7 +743,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 static struct folio *__kvm_gmem_get_pfn(struct file *file,
 					struct kvm_memory_slot *slot,
 					pgoff_t index, kvm_pfn_t *pfn,
-					bool *is_prepared, int *max_order)
+					int *max_order)
 {
 	struct file *slot_file = READ_ONCE(slot->gmem.file);
 	struct gmem_file *f = file->private_data;
@@ -787,7 +773,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 	if (max_order)
 		*max_order = 0;
 
-	*is_prepared = folio_test_uptodate(folio);
 	return folio;
 }
 
@@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 {
 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
 	struct folio *folio;
-	bool is_prepared = false;
 	int r = 0;
 
 	CLASS(gmem_get_file, file)(slot);
 	if (!file)
 		return -EFAULT;
 
-	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
-	if (!is_prepared)
-		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+	if (!folio_test_uptodate(folio)) {
+		unsigned long i, nr_pages = folio_nr_pages(folio);
+
+		for (i = 0; i < nr_pages; i++)
+			clear_highpage(folio_page(folio, i));
+		folio_mark_uptodate(folio);
+	}
+
+	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
 
 	folio_unlock(folio);
 
@@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 		struct folio *folio;
 		gfn_t gfn = start_gfn + i;
 		pgoff_t index = kvm_gmem_get_index(slot, gfn);
-		bool is_prepared = false;
 		kvm_pfn_t pfn;
 
 		if (signal_pending(current)) {
@@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 			break;
 		}
 
-		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
+		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
 		if (IS_ERR(folio)) {
 			ret = PTR_ERR(folio);
 			break;
 		}
 
-		if (is_prepared) {
-			folio_unlock(folio);
-			folio_put(folio);
-			ret = -EEXIST;
-			break;
-		}
-
 		folio_unlock(folio);
 		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
 			(npages - i) < (1 << max_order));
@@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 		p = src ? src + i * PAGE_SIZE : NULL;
 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
 		if (!ret)
-			kvm_gmem_mark_prepared(folio);
+			folio_mark_uptodate(folio);
 
 put_folio_and_exit:
 		folio_put(folio);
-- 
2.25.1

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months, 2 weeks ago

On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  {
>  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
>  	struct folio *folio;
> -	bool is_prepared = false;
>  	int r = 0;
>  
>  	CLASS(gmem_get_file, file)(slot);
>  	if (!file)
>  		return -EFAULT;
>  
> -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
>  	if (IS_ERR(folio))
>  		return PTR_ERR(folio);
>  
> -	if (!is_prepared)
> -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +	if (!folio_test_uptodate(folio)) {
> +		unsigned long i, nr_pages = folio_nr_pages(folio);
> +
> +		for (i = 0; i < nr_pages; i++)
> +			clear_highpage(folio_page(folio, i));
> +		folio_mark_uptodate(folio);
Here, the entire folio is cleared only when the folio is not marked uptodate.
Then, please check my questions at the bottom

> +	}
> +
> +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>  
>  	folio_unlock(folio);
>  
> @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  		struct folio *folio;
>  		gfn_t gfn = start_gfn + i;
>  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> -		bool is_prepared = false;
>  		kvm_pfn_t pfn;
>  
>  		if (signal_pending(current)) {
> @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  			break;
>  		}
>  
> -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
>  		if (IS_ERR(folio)) {
>  			ret = PTR_ERR(folio);
>  			break;
>  		}
>  
> -		if (is_prepared) {
> -			folio_unlock(folio);
> -			folio_put(folio);
> -			ret = -EEXIST;
> -			break;
> -		}
> -
>  		folio_unlock(folio);
>  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
>  			(npages - i) < (1 << max_order));
TDX could hit this warning easily when npages == 1, max_order == 9.

> @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  		p = src ? src + i * PAGE_SIZE : NULL;
>  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
>  		if (!ret)
> -			kvm_gmem_mark_prepared(folio);
> +			folio_mark_uptodate(folio);
As also asked in [1], why is the entire folio marked as uptodate here? Why does
kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
uptodate?

It's possible (at least for TDX) that a huge folio is only partially populated
by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
while GFN 0x820 is faulted after TD is running. However, these two GFNs can
belong to the same folio of order 9.

Note: the current code should not impact TDX. I'm just asking out of curiosity:)

[1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Michael Roth 2 months, 2 weeks ago

On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >  {
> >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> >  	struct folio *folio;
> > -	bool is_prepared = false;
> >  	int r = 0;
> >  
> >  	CLASS(gmem_get_file, file)(slot);
> >  	if (!file)
> >  		return -EFAULT;
> >  
> > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> >  	if (IS_ERR(folio))
> >  		return PTR_ERR(folio);
> >  
> > -	if (!is_prepared)
> > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > +	if (!folio_test_uptodate(folio)) {
> > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > +
> > +		for (i = 0; i < nr_pages; i++)
> > +			clear_highpage(folio_page(folio, i));
> > +		folio_mark_uptodate(folio);
> Here, the entire folio is cleared only when the folio is not marked uptodate.
> Then, please check my questions at the bottom

Yes, in this patch at least where I tried to mirror the current logic. I
would not be surprised if we need to rework things for inplace/hugepage
support though, but decoupling 'preparation' from the uptodate flag is
the main goal here.

> 
> > +	}
> > +
> > +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> >  
> >  	folio_unlock(folio);
> >  
> > @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  		struct folio *folio;
> >  		gfn_t gfn = start_gfn + i;
> >  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > -		bool is_prepared = false;
> >  		kvm_pfn_t pfn;
> >  
> >  		if (signal_pending(current)) {
> > @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  			break;
> >  		}
> >  
> > -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> > +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> >  		if (IS_ERR(folio)) {
> >  			ret = PTR_ERR(folio);
> >  			break;
> >  		}
> >  
> > -		if (is_prepared) {
> > -			folio_unlock(folio);
> > -			folio_put(folio);
> > -			ret = -EEXIST;
> > -			break;
> > -		}
> > -
> >  		folio_unlock(folio);
> >  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> >  			(npages - i) < (1 << max_order));
> TDX could hit this warning easily when npages == 1, max_order == 9.

Yes, this will need to change to handle that. I don't think I had to
change this for previous iterations of SNP hugepage support, but
there are definitely cases where a sub-2M range might get populated 
even though it's backed by a 2M folio, so I'm not sure why I didn't
hit it there.

But I'm taking Sean's cue on touching as little of the existing
hugepage logic as possible in this particular series so we can revisit
the remaining changes with some better context.

> 
> > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  		p = src ? src + i * PAGE_SIZE : NULL;
> >  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> >  		if (!ret)
> > -			kvm_gmem_mark_prepared(folio);
> > +			folio_mark_uptodate(folio);
> As also asked in [1], why is the entire folio marked as uptodate here? Why does
> kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> uptodate?

Quoting your example from[1] for more context:

> I also have a question about this patch:
> 
> Suppose there's a 2MB huge folio A, where
> A1 and A2 are 4KB pages belonging to folio A.
> 
> (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
>     It adds page A1 and invokes folio_mark_uptodate() on folio A.

In SNP hugepage patchset you responded to, it would only mark A1 as
prepared/cleared. There was 4K-granularity tracking added to handle this.
There was an odd subtlety in that series though: it was defaulting to the
folio_order() for the prep-tracking/post-populate, but it would then clamp
it down based on the max order possible according whether that particular
order was a homogenous range of KVM_MEMORY_ATTRIBUTE_PRIVATE. Which is not
a great way to handle things, and I don't remember if I'd actually intended
to implement it that way or not... that's probably why I never tripped over
the WARN_ON() above, now that I think of it.

But neither of these these apply to any current plans for hugepage support
that I'm aware of, so probably not worth working through what that series
did and look at this from a fresh perspective.

> 
> (2) kvm_gmem_get_pfn() later faults in page A2.
>     As folio A is uptodate, clear_highpage() is not invoked on page A2.
>     kvm_gmem_prepare_folio() is invoked on the whole folio A.
> 
> (2) could occur at least in TDX when only a part the 2MB page is added as guest
> initial memory.
> 
> My questions:
> - Would (2) occur on SEV?
> - If it does, is the lack of clear_highpage() on A2 a problem ?
> - Is invoking gmem_prepare on page A1 a problem?

Assuming this patch goes upstream in some form, we will now have the
following major differences versus previous code:

  1) uptodate flag only tracks whether a folio has been cleared
  2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
     the architecture can handle it's own tracking at whatever granularity
     it likes.

My hope is that 1) can similarly be done in such a way that gmem does not
need to track things at sub-hugepage granularity and necessitate the need
for some new data structure/state/flag to track sub-page status.

My understanding based on prior discussion in guest_memfd calls was that
it would be okay to go ahead and clear the entire folio at initial allocation
time, and basically never mess with it again. It was also my understanding
that for TDX it might even be optimal to completely skip clearing the folio
if it is getting mapped into SecureEPT as a hugepage since the TDX module
would handle that, but that maybe conversely after private->shared there
would be some need to reclear... I'll try to find that discussion and
refresh. Vishal I believe suggested some flags to provide more control over
this behavior.

> 
> It's possible (at least for TDX) that a huge folio is only partially populated
> by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
> huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
> while GFN 0x820 is faulted after TD is running. However, these two GFNs can
> belong to the same folio of order 9.

Would the above scheme of clearing the entire folio up front and not re-clearing
at fault time work for this case?

Thanks,

Mike

> 
> Note: the current code should not impact TDX. I'm just asking out of curiosity:)
> 
> [1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/
> 
>

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months, 2 weeks ago

On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > >  {
> > >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > >  	struct folio *folio;
> > > -	bool is_prepared = false;
> > >  	int r = 0;
> > >  
> > >  	CLASS(gmem_get_file, file)(slot);
> > >  	if (!file)
> > >  		return -EFAULT;
> > >  
> > > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > >  	if (IS_ERR(folio))
> > >  		return PTR_ERR(folio);
> > >  
> > > -	if (!is_prepared)
> > > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > +	if (!folio_test_uptodate(folio)) {
> > > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > > +
> > > +		for (i = 0; i < nr_pages; i++)
> > > +			clear_highpage(folio_page(folio, i));
> > > +		folio_mark_uptodate(folio);
> > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > Then, please check my questions at the bottom
> 
> Yes, in this patch at least where I tried to mirror the current logic. I
> would not be surprised if we need to rework things for inplace/hugepage
> support though, but decoupling 'preparation' from the uptodate flag is
> the main goal here.
Could you elaborate a little why the decoupling is needed if it's not for
hugepage?


> > > +	}
> > > +
> > > +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > >  
> > >  	folio_unlock(folio);
> > >  
> > > @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > >  		struct folio *folio;
> > >  		gfn_t gfn = start_gfn + i;
> > >  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > -		bool is_prepared = false;
> > >  		kvm_pfn_t pfn;
> > >  
> > >  		if (signal_pending(current)) {
> > > @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > >  			break;
> > >  		}
> > >  
> > > -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> > > +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> > >  		if (IS_ERR(folio)) {
> > >  			ret = PTR_ERR(folio);
> > >  			break;
> > >  		}
> > >  
> > > -		if (is_prepared) {
> > > -			folio_unlock(folio);
> > > -			folio_put(folio);
> > > -			ret = -EEXIST;
> > > -			break;
> > > -		}
> > > -
> > >  		folio_unlock(folio);
> > >  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > >  			(npages - i) < (1 << max_order));
> > TDX could hit this warning easily when npages == 1, max_order == 9.
> 
> Yes, this will need to change to handle that. I don't think I had to
> change this for previous iterations of SNP hugepage support, but
> there are definitely cases where a sub-2M range might get populated 
> even though it's backed by a 2M folio, so I'm not sure why I didn't
> hit it there.
> 
> But I'm taking Sean's cue on touching as little of the existing
> hugepage logic as possible in this particular series so we can revisit
> the remaining changes with some better context.
Frankly, I don't understand why this patch 1 is required if we only want "moving
GUP out of post_populate()" to work for 4KB folios.

> > 
> > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > >  		p = src ? src + i * PAGE_SIZE : NULL;
> > >  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > >  		if (!ret)
> > > -			kvm_gmem_mark_prepared(folio);
> > > +			folio_mark_uptodate(folio);
> > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > uptodate?
> 
> Quoting your example from[1] for more context:
> 
> > I also have a question about this patch:
> > 
> > Suppose there's a 2MB huge folio A, where
> > A1 and A2 are 4KB pages belonging to folio A.
> > 
> > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> 
> In SNP hugepage patchset you responded to, it would only mark A1 as
You mean code in
https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?

> prepared/cleared. There was 4K-granularity tracking added to handle this.
I don't find the code that marks only A1 as "prepared/cleared".
Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
to mark the entire folio A as uptodate.

However, according to your statement below that "uptodate flag only tracks
whether a folio has been cleared", I don't follow why and where the entire folio
A would be cleared if kvm_gmem_populate() only adds page A1.

> There was an odd subtlety in that series though: it was defaulting to the
> folio_order() for the prep-tracking/post-populate, but it would then clamp
> it down based on the max order possible according whether that particular
> order was a homogenous range of KVM_MEMORY_ATTRIBUTE_PRIVATE. Which is not
> a great way to handle things, and I don't remember if I'd actually intended
> to implement it that way or not... that's probably why I never tripped over
> the WARN_ON() above, now that I think of it.
> 
> But neither of these these apply to any current plans for hugepage support
> that I'm aware of, so probably not worth working through what that series
> did and look at this from a fresh perspective.
> 
> > 
> > (2) kvm_gmem_get_pfn() later faults in page A2.
> >     As folio A is uptodate, clear_highpage() is not invoked on page A2.
> >     kvm_gmem_prepare_folio() is invoked on the whole folio A.
> > 
> > (2) could occur at least in TDX when only a part the 2MB page is added as guest
> > initial memory.
> > 
> > My questions:
> > - Would (2) occur on SEV?
> > - If it does, is the lack of clear_highpage() on A2 a problem ?
> > - Is invoking gmem_prepare on page A1 a problem?
> 
> Assuming this patch goes upstream in some form, we will now have the
> following major differences versus previous code:
> 
>   1) uptodate flag only tracks whether a folio has been cleared
>   2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
>      the architecture can handle it's own tracking at whatever granularity
>      it likes.
2) looks good to me.

> My hope is that 1) can similarly be done in such a way that gmem does not
> need to track things at sub-hugepage granularity and necessitate the need
> for some new data structure/state/flag to track sub-page status.
I actually don't understand what uptodate flag helps gmem to track.
Why can't clear_highpage() be done inside arch specific code? TDX doesn't need
this clearing after all.

> My understanding based on prior discussion in guest_memfd calls was that
> it would be okay to go ahead and clear the entire folio at initial allocation
> time, and basically never mess with it again. It was also my understanding
That's where I don't follow in this patch.
I don't see where the entire folio A is cleared if it's only partially mapped by
kvm_gmem_populate(). kvm_gmem_get_pfn() won't clear folio A either due to
kvm_gmem_populate() has set the uptodate flag.

> that for TDX it might even be optimal to completely skip clearing the folio
> if it is getting mapped into SecureEPT as a hugepage since the TDX module
> would handle that, but that maybe conversely after private->shared there
> would be some need to reclear... I'll try to find that discussion and
> refresh. Vishal I believe suggested some flags to provide more control over
> this behavior.
> 
> > 
> > It's possible (at least for TDX) that a huge folio is only partially populated
> > by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
> > huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
> > while GFN 0x820 is faulted after TD is running. However, these two GFNs can
> > belong to the same folio of order 9.
> 
> Would the above scheme of clearing the entire folio up front and not re-clearing
> at fault time work for this case?
This case doesn't affect TDX, because TDX clearing private pages internally in
SEAM APIs. So, as long as kvm_gmem_get_pfn() does not invoke clear_highpage()
after making a folio private, it works fine for TDX.

I was just trying to understand why SNP needs the clearing of entire folio in
kvm_gmem_get_pfn() while I don't see how the entire folio is cleared when it's
partially mapped in kvm_gmem_populate().
Also, I'm wondering if it would be better if SNP could move the clearing of
folio into something like kvm_arch_gmem_clear(), just as kvm_arch_gmem_prepare()
which is always invoked by kvm_gmem_get_pfn() and the architecture can handle
it's own tracking at whatever granularity.

 
> > Note: the current code should not impact TDX. I'm just asking out of curiosity:)
> > 
> > [1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/
> > 
> >

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Michael Roth 2 months, 1 week ago

On Tue, Nov 25, 2025 at 11:13:25AM +0800, Yan Zhao wrote:
> On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> > On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > >  {
> > > >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > >  	struct folio *folio;
> > > > -	bool is_prepared = false;
> > > >  	int r = 0;
> > > >  
> > > >  	CLASS(gmem_get_file, file)(slot);
> > > >  	if (!file)
> > > >  		return -EFAULT;
> > > >  
> > > > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > > >  	if (IS_ERR(folio))
> > > >  		return PTR_ERR(folio);
> > > >  
> > > > -	if (!is_prepared)
> > > > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > +	if (!folio_test_uptodate(folio)) {
> > > > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > > > +
> > > > +		for (i = 0; i < nr_pages; i++)
> > > > +			clear_highpage(folio_page(folio, i));
> > > > +		folio_mark_uptodate(folio);
> > > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > > Then, please check my questions at the bottom
> > 
> > Yes, in this patch at least where I tried to mirror the current logic. I
> > would not be surprised if we need to rework things for inplace/hugepage
> > support though, but decoupling 'preparation' from the uptodate flag is
> > the main goal here.
> Could you elaborate a little why the decoupling is needed if it's not for
> hugepage?

For instance, for in-place conversion:

  1. initial allocation: clear, set uptodate, fault in as private
  2. private->shared: call invalidate hook, fault in as shared
  3. shared->private: call prep hook, fault in as private

Here, 2/3 need to track where the current state is shared/private in
order to make appropriate architecture-specific changes (e.g. RMP table
updates). But we want to allow for non-destructive in-place conversion,
where a page is 'uptodate', but not in the desired shared/private state.
So 'uptodate' becomes a separate piece of state, which is still
reasonable for gmem to track in the current 4K-only implementation, and
provides for a reasonable approach to upstreaming in-place conversion,
which isn't far off for either SNP or TDX.

For hugepages, we'll have other things to consider, but those things are
probably still somewhat far off, and so we shouldn't block steps toward
in-place conversion based on uncertainty around hugepages. I think it's
gotten enough attention at least that we know it *can* work, e.g. even
if we take the inefficient/easy route of zero'ing the whole folio on
initial access, setting it uptodate, and never doing anything with 
uptodate again, it's still a usable implementation.

> 
> 
> > > > +	}
> > > > +
> > > > +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > >  
> > > >  	folio_unlock(folio);
> > > >  
> > > > @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >  		struct folio *folio;
> > > >  		gfn_t gfn = start_gfn + i;
> > > >  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > > -		bool is_prepared = false;
> > > >  		kvm_pfn_t pfn;
> > > >  
> > > >  		if (signal_pending(current)) {
> > > > @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >  			break;
> > > >  		}
> > > >  
> > > > -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> > > > +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> > > >  		if (IS_ERR(folio)) {
> > > >  			ret = PTR_ERR(folio);
> > > >  			break;
> > > >  		}
> > > >  
> > > > -		if (is_prepared) {
> > > > -			folio_unlock(folio);
> > > > -			folio_put(folio);
> > > > -			ret = -EEXIST;
> > > > -			break;
> > > > -		}
> > > > -
> > > >  		folio_unlock(folio);
> > > >  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > > >  			(npages - i) < (1 << max_order));
> > > TDX could hit this warning easily when npages == 1, max_order == 9.
> > 
> > Yes, this will need to change to handle that. I don't think I had to
> > change this for previous iterations of SNP hugepage support, but
> > there are definitely cases where a sub-2M range might get populated 
> > even though it's backed by a 2M folio, so I'm not sure why I didn't
> > hit it there.
> > 
> > But I'm taking Sean's cue on touching as little of the existing
> > hugepage logic as possible in this particular series so we can revisit
> > the remaining changes with some better context.
> Frankly, I don't understand why this patch 1 is required if we only want "moving
> GUP out of post_populate()" to work for 4KB folios.

Above I outlined one of the use-cases for in-place conversion.

During the 2 PUCK sessions prior to this RFC, Sean also mentioned some
potential that other deadlocks might exist in current code due to
how the locking is currently handled, and that we should consider this
as a general cleanup against current kvm/next, but I leave that to Sean
to elaborate on.

Personally I think this series makes sense against kvm/next regardless:
tracking preparation in gmem is basically already broken: everyone ignores
it except SNP, so it was never performing that duty as-designed. So we
are now simplying uptodate flag to no longer include this extra
state-tracking, and leaving it for architecture-specific tracking. I
can't see that be anything but beneficial to future gmem changes.

> 
> > > 
> > > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >  		p = src ? src + i * PAGE_SIZE : NULL;
> > > >  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > >  		if (!ret)
> > > > -			kvm_gmem_mark_prepared(folio);
> > > > +			folio_mark_uptodate(folio);
> > > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > > uptodate?
> > 
> > Quoting your example from[1] for more context:
> > 
> > > I also have a question about this patch:
> > > 
> > > Suppose there's a 2MB huge folio A, where
> > > A1 and A2 are 4KB pages belonging to folio A.
> > > 
> > > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> > >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> > 
> > In SNP hugepage patchset you responded to, it would only mark A1 as
> You mean code in
> https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?

No, sorry, I got my references mixed up. The only publically-posted
version of SNP hugepage support is the THP series that does not involve
in-place conversion, and that's what I was referencing. It's there where
per-4K bitmap was added to track preparation, and in that series
page-clearing/preparation are still coupled to some degree so per-4K
tracking of page-clearing was still possible and that's how it was
handled:

  https://github.com/AMDESE/linux/blob/snp-prepare-thp-rfc1/virt/kvm/guest_memfd.c#L992

but that can be considered an abandoned approach so I wouldn't spend
much time referencing that.

> 
> > prepared/cleared. There was 4K-granularity tracking added to handle this.
> I don't find the code that marks only A1 as "prepared/cleared".
> Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
> to mark the entire folio A as uptodate.
> 
> However, according to your statement below that "uptodate flag only tracks
> whether a folio has been cleared", I don't follow why and where the entire folio
> A would be cleared if kvm_gmem_populate() only adds page A1.
> 
> > There was an odd subtlety in that series though: it was defaulting to the
> > folio_order() for the prep-tracking/post-populate, but it would then clamp
> > it down based on the max order possible according whether that particular
> > order was a homogenous range of KVM_MEMORY_ATTRIBUTE_PRIVATE. Which is not
> > a great way to handle things, and I don't remember if I'd actually intended
> > to implement it that way or not... that's probably why I never tripped over
> > the WARN_ON() above, now that I think of it.
> > 
> > But neither of these these apply to any current plans for hugepage support
> > that I'm aware of, so probably not worth working through what that series
> > did and look at this from a fresh perspective.
> > 
> > > 
> > > (2) kvm_gmem_get_pfn() later faults in page A2.
> > >     As folio A is uptodate, clear_highpage() is not invoked on page A2.
> > >     kvm_gmem_prepare_folio() is invoked on the whole folio A.
> > > 
> > > (2) could occur at least in TDX when only a part the 2MB page is added as guest
> > > initial memory.
> > > 
> > > My questions:
> > > - Would (2) occur on SEV?
> > > - If it does, is the lack of clear_highpage() on A2 a problem ?
> > > - Is invoking gmem_prepare on page A1 a problem?
> > 
> > Assuming this patch goes upstream in some form, we will now have the
> > following major differences versus previous code:
> > 
> >   1) uptodate flag only tracks whether a folio has been cleared
> >   2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
> >      the architecture can handle it's own tracking at whatever granularity
> >      it likes.
> 2) looks good to me.
> 
> > My hope is that 1) can similarly be done in such a way that gmem does not
> > need to track things at sub-hugepage granularity and necessitate the need
> > for some new data structure/state/flag to track sub-page status.
> I actually don't understand what uptodate flag helps gmem to track.
> Why can't clear_highpage() be done inside arch specific code? TDX doesn't need
> this clearing after all.

It could. E.g. via the kernel-internal gmem flag that I mentioned in my
earlier reply, or some alternative. 

In the context of this series, uptodate flag continues to instruct
kvm_gmem_get_pfn() that it doesn't not need to re-clear pages, because
a prior kvm_gmem_get_pfn() or kvm_gmem_populate() already initialized
the folio, and it is no longer tied to any notion of
preparedness-tracking.

What use uptodate will have in the context of hugepages: I'm not sure.
For non-in-place conversion, it's tempting to just let it continue to be
per-folio and require clearing the whole folio on initial access, but
it's not efficient. It may make sense to farm it out to
post-populate/prep hooks instead, as you're suggesting for TDX.

But then, for in-place conversion, you have to deal with pages initially
faulted in as shared. They might be split prior to initial access as a
private page, where we can't assume TDX will have scrubbed things. So in
that case it might still make sense to rely on it.

Definitely things that require some more thought. But having it inextricably
tied to preparedness just makes preparation tracking similarly more
complicated as it pulls it back into gmem when that does not seem to be
the direction any architectures other SNP have/want to go.

> 
> > My understanding based on prior discussion in guest_memfd calls was that
> > it would be okay to go ahead and clear the entire folio at initial allocation
> > time, and basically never mess with it again. It was also my understanding
> That's where I don't follow in this patch.
> I don't see where the entire folio A is cleared if it's only partially mapped by
> kvm_gmem_populate(). kvm_gmem_get_pfn() won't clear folio A either due to
> kvm_gmem_populate() has set the uptodate flag.
> 
> > that for TDX it might even be optimal to completely skip clearing the folio
> > if it is getting mapped into SecureEPT as a hugepage since the TDX module
> > would handle that, but that maybe conversely after private->shared there
> > would be some need to reclear... I'll try to find that discussion and
> > refresh. Vishal I believe suggested some flags to provide more control over
> > this behavior.
> > 
> > > 
> > > It's possible (at least for TDX) that a huge folio is only partially populated
> > > by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
> > > huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
> > > while GFN 0x820 is faulted after TD is running. However, these two GFNs can
> > > belong to the same folio of order 9.
> > 
> > Would the above scheme of clearing the entire folio up front and not re-clearing
> > at fault time work for this case?
> This case doesn't affect TDX, because TDX clearing private pages internally in
> SEAM APIs. So, as long as kvm_gmem_get_pfn() does not invoke clear_highpage()
> after making a folio private, it works fine for TDX.
> 
> I was just trying to understand why SNP needs the clearing of entire folio in
> kvm_gmem_get_pfn() while I don't see how the entire folio is cleared when it's
> partially mapped in kvm_gmem_populate().
> Also, I'm wondering if it would be better if SNP could move the clearing of
> folio into something like kvm_arch_gmem_clear(), just as kvm_arch_gmem_prepare()
> which is always invoked by kvm_gmem_get_pfn() and the architecture can handle
> it's own tracking at whatever granularity.

Possibly, but I touched elsewhere on where in-place conversion might
trip up this approach. At least decoupling them allows for the prep side
of things to be moved to architecture-specific tracking. We can deal
with uptodate separately I think.

-Mike

> 
>  
> > > Note: the current code should not impact TDX. I'm just asking out of curiosity:)
> > > 
> > > [1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/
> > > 
> > >

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months, 1 week ago

On Mon, Dec 01, 2025 at 05:44:47PM -0600, Michael Roth wrote:
> On Tue, Nov 25, 2025 at 11:13:25AM +0800, Yan Zhao wrote:
> > On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> > > On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > > > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > >  {
> > > > >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > > >  	struct folio *folio;
> > > > > -	bool is_prepared = false;
> > > > >  	int r = 0;
> > > > >  
> > > > >  	CLASS(gmem_get_file, file)(slot);
> > > > >  	if (!file)
> > > > >  		return -EFAULT;
> > > > >  
> > > > > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > > > >  	if (IS_ERR(folio))
> > > > >  		return PTR_ERR(folio);
> > > > >  
> > > > > -	if (!is_prepared)
> > > > > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > > +	if (!folio_test_uptodate(folio)) {
> > > > > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > > > > +
> > > > > +		for (i = 0; i < nr_pages; i++)
> > > > > +			clear_highpage(folio_page(folio, i));
> > > > > +		folio_mark_uptodate(folio);
> > > > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > > > Then, please check my questions at the bottom
> > > 
> > > Yes, in this patch at least where I tried to mirror the current logic. I
> > > would not be surprised if we need to rework things for inplace/hugepage
> > > support though, but decoupling 'preparation' from the uptodate flag is
> > > the main goal here.
> > Could you elaborate a little why the decoupling is needed if it's not for
> > hugepage?
> 
> For instance, for in-place conversion:
> 
>   1. initial allocation: clear, set uptodate, fault in as private
>   2. private->shared: call invalidate hook, fault in as shared
>   3. shared->private: call prep hook, fault in as private
> 
> Here, 2/3 need to track where the current state is shared/private in
> order to make appropriate architecture-specific changes (e.g. RMP table
> updates). But we want to allow for non-destructive in-place conversion,
> where a page is 'uptodate', but not in the desired shared/private state.
> So 'uptodate' becomes a separate piece of state, which is still
> reasonable for gmem to track in the current 4K-only implementation, and
> provides for a reasonable approach to upstreaming in-place conversion,
> which isn't far off for either SNP or TDX.
To me, "1. initial allocation: clear, set uptodate" is more appropriate to
be done in kvm_gmem_get_folio(), instead of in kvm_gmem_get_pfn().

With it, below looks reasonable to me.
> For hugepages, we'll have other things to consider, but those things are
> probably still somewhat far off, and so we shouldn't block steps toward
> in-place conversion based on uncertainty around hugepages. I think it's
> gotten enough attention at least that we know it *can* work, e.g. even
> if we take the inefficient/easy route of zero'ing the whole folio on
> initial access, setting it uptodate, and never doing anything with 
> uptodate again, it's still a usable implementation.

<...>
> > > Assuming this patch goes upstream in some form, we will now have the
> > > following major differences versus previous code:
> > > 
> > >   1) uptodate flag only tracks whether a folio has been cleared
> > >   2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
> > >      the architecture can handle it's own tracking at whatever granularity
> > >      it likes.
> > 2) looks good to me.
> > 
> > > My hope is that 1) can similarly be done in such a way that gmem does not
> > > need to track things at sub-hugepage granularity and necessitate the need
> > > for some new data structure/state/flag to track sub-page status.
> > I actually don't understand what uptodate flag helps gmem to track.
> > Why can't clear_highpage() be done inside arch specific code? TDX doesn't need
> > this clearing after all.
> 
> It could. E.g. via the kernel-internal gmem flag that I mentioned in my
> earlier reply, or some alternative. 
> 
> In the context of this series, uptodate flag continues to instruct
> kvm_gmem_get_pfn() that it doesn't not need to re-clear pages, because
> a prior kvm_gmem_get_pfn() or kvm_gmem_populate() already initialized
> the folio, and it is no longer tied to any notion of
> preparedness-tracking.
> 
> What use uptodate will have in the context of hugepages: I'm not sure.
> For non-in-place conversion, it's tempting to just let it continue to be
> per-folio and require clearing the whole folio on initial access, but
> it's not efficient. It may make sense to farm it out to
> post-populate/prep hooks instead, as you're suggesting for TDX.
> 
> But then, for in-place conversion, you have to deal with pages initially
> faulted in as shared. They might be split prior to initial access as a
> private page, where we can't assume TDX will have scrubbed things. So in
> that case it might still make sense to rely on it.
> 
> Definitely things that require some more thought. But having it inextricably
> tied to preparedness just makes preparation tracking similarly more
> complicated as it pulls it back into gmem when that does not seem to be
> the direction any architectures other SNP have/want to go.
> 
> > 
> > > My understanding based on prior discussion in guest_memfd calls was that
> > > it would be okay to go ahead and clear the entire folio at initial allocation
> > > time, and basically never mess with it again. It was also my understanding
> > That's where I don't follow in this patch.
> > I don't see where the entire folio A is cleared if it's only partially mapped by
> > kvm_gmem_populate(). kvm_gmem_get_pfn() won't clear folio A either due to
> > kvm_gmem_populate() has set the uptodate flag.
> > 
> > > that for TDX it might even be optimal to completely skip clearing the folio
> > > if it is getting mapped into SecureEPT as a hugepage since the TDX module
> > > would handle that, but that maybe conversely after private->shared there
> > > would be some need to reclear... I'll try to find that discussion and
> > > refresh. Vishal I believe suggested some flags to provide more control over
> > > this behavior.
> > > 
> > > > 
> > > > It's possible (at least for TDX) that a huge folio is only partially populated
> > > > by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
> > > > huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
> > > > while GFN 0x820 is faulted after TD is running. However, these two GFNs can
> > > > belong to the same folio of order 9.
> > > 
> > > Would the above scheme of clearing the entire folio up front and not re-clearing
> > > at fault time work for this case?
> > This case doesn't affect TDX, because TDX clearing private pages internally in
> > SEAM APIs. So, as long as kvm_gmem_get_pfn() does not invoke clear_highpage()
> > after making a folio private, it works fine for TDX.
> > 
> > I was just trying to understand why SNP needs the clearing of entire folio in
> > kvm_gmem_get_pfn() while I don't see how the entire folio is cleared when it's
> > partially mapped in kvm_gmem_populate().
> > Also, I'm wondering if it would be better if SNP could move the clearing of
> > folio into something like kvm_arch_gmem_clear(), just as kvm_arch_gmem_prepare()
> > which is always invoked by kvm_gmem_get_pfn() and the architecture can handle
> > it's own tracking at whatever granularity.
> 
> Possibly, but I touched elsewhere on where in-place conversion might
> trip up this approach. At least decoupling them allows for the prep side
> of things to be moved to architecture-specific tracking. We can deal
> with uptodate separately I think.
> 
> -Mike
> 
> > 
> >  
> > > > Note: the current code should not impact TDX. I'm just asking out of curiosity:)
> > > > 
> > > > [1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/
> > > > 
> > > >

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Michael Roth 2 months, 1 week ago

On Tue, Dec 02, 2025 at 05:17:20PM +0800, Yan Zhao wrote:
> On Mon, Dec 01, 2025 at 05:44:47PM -0600, Michael Roth wrote:
> > On Tue, Nov 25, 2025 at 11:13:25AM +0800, Yan Zhao wrote:
> > > On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> > > > On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > > > > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > > > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > > >  {
> > > > > >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > > > >  	struct folio *folio;
> > > > > > -	bool is_prepared = false;
> > > > > >  	int r = 0;
> > > > > >  
> > > > > >  	CLASS(gmem_get_file, file)(slot);
> > > > > >  	if (!file)
> > > > > >  		return -EFAULT;
> > > > > >  
> > > > > > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > > > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > > > > >  	if (IS_ERR(folio))
> > > > > >  		return PTR_ERR(folio);
> > > > > >  
> > > > > > -	if (!is_prepared)
> > > > > > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > > > +	if (!folio_test_uptodate(folio)) {
> > > > > > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > > > > > +
> > > > > > +		for (i = 0; i < nr_pages; i++)
> > > > > > +			clear_highpage(folio_page(folio, i));
> > > > > > +		folio_mark_uptodate(folio);
> > > > > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > > > > Then, please check my questions at the bottom
> > > > 
> > > > Yes, in this patch at least where I tried to mirror the current logic. I
> > > > would not be surprised if we need to rework things for inplace/hugepage
> > > > support though, but decoupling 'preparation' from the uptodate flag is
> > > > the main goal here.
> > > Could you elaborate a little why the decoupling is needed if it's not for
> > > hugepage?
> > 
> > For instance, for in-place conversion:
> > 
> >   1. initial allocation: clear, set uptodate, fault in as private
> >   2. private->shared: call invalidate hook, fault in as shared
> >   3. shared->private: call prep hook, fault in as private
> > 
> > Here, 2/3 need to track where the current state is shared/private in
> > order to make appropriate architecture-specific changes (e.g. RMP table
> > updates). But we want to allow for non-destructive in-place conversion,
> > where a page is 'uptodate', but not in the desired shared/private state.
> > So 'uptodate' becomes a separate piece of state, which is still
> > reasonable for gmem to track in the current 4K-only implementation, and
> > provides for a reasonable approach to upstreaming in-place conversion,
> > which isn't far off for either SNP or TDX.
> To me, "1. initial allocation: clear, set uptodate" is more appropriate to
> be done in kvm_gmem_get_folio(), instead of in kvm_gmem_get_pfn().

The downside is that preallocating originally involved just
preallocating, and zero'ing would happen lazily during fault time. (or
upfront via KVM_PRE_FAULT_MEMORY).

But in the context of the hugetlb RFC, it certainly looks cleaner to do
it there, since it could be done before any potential splitting activity,
so then all the tail pages can inherit that initial uptodate flag.

We'd probably want to weigh that these trade-offs based on how it will
affect hugepages, but that would be clearer in the context of a new
posting of hugepage support on top of these changes. So I think it's
better to address that change as a follow-up so we can consider it with
more context.

> 
> With it, below looks reasonable to me.
> > For hugepages, we'll have other things to consider, but those things are
> > probably still somewhat far off, and so we shouldn't block steps toward
> > in-place conversion based on uncertainty around hugepages. I think it's
> > gotten enough attention at least that we know it *can* work, e.g. even
> > if we take the inefficient/easy route of zero'ing the whole folio on
> > initial access, setting it uptodate, and never doing anything with 
> > uptodate again, it's still a usable implementation.

To me as well. But in the context of this series, against kvm/next, it
creates userspace-visible changes to pre-allocation behavior that we
can't justify in the context of this series alone, so as above I think
that's something to save for hugepage follow-up.

FWIW though, I ended up taking this approach for the hugetlb-based test
branch, to address the fact (after you reminded me) that I wasn't fully
zero'ing folio's in the kvm_gmem_populate() path:

  https://github.com/AMDESE/linux/commit/253fb30b076d81232deba0b02db070d5bc2b90b5

So maybe for your testing you could do similar. For newer hugepage
support I'll likely do similar, but I don't think any of this reasoning
or changes makes sense to people reviewing this series without already
being aware of hugepage plans/development, so that's why I'm trying to
keep this series more focused on in-place conversion enablement, because
hugepage plans might be massively reworked for next posting based on LPC
talks and changes in direction mentioned in recent guest_memfd calls and
we are basically just hand-waving about what it will look like at this
point while blocking other efforts.

-Mike

> 
> <...>
> > > > Assuming this patch goes upstream in some form, we will now have the
> > > > following major differences versus previous code:
> > > > 
> > > >   1) uptodate flag only tracks whether a folio has been cleared
> > > >   2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
> > > >      the architecture can handle it's own tracking at whatever granularity
> > > >      it likes.
> > > 2) looks good to me.
> > > 
> > > > My hope is that 1) can similarly be done in such a way that gmem does not
> > > > need to track things at sub-hugepage granularity and necessitate the need
> > > > for some new data structure/state/flag to track sub-page status.
> > > I actually don't understand what uptodate flag helps gmem to track.
> > > Why can't clear_highpage() be done inside arch specific code? TDX doesn't need
> > > this clearing after all.
> > 
> > It could. E.g. via the kernel-internal gmem flag that I mentioned in my
> > earlier reply, or some alternative. 
> > 
> > In the context of this series, uptodate flag continues to instruct
> > kvm_gmem_get_pfn() that it doesn't not need to re-clear pages, because
> > a prior kvm_gmem_get_pfn() or kvm_gmem_populate() already initialized
> > the folio, and it is no longer tied to any notion of
> > preparedness-tracking.
> > 
> > What use uptodate will have in the context of hugepages: I'm not sure.
> > For non-in-place conversion, it's tempting to just let it continue to be
> > per-folio and require clearing the whole folio on initial access, but
> > it's not efficient. It may make sense to farm it out to
> > post-populate/prep hooks instead, as you're suggesting for TDX.
> > 
> > But then, for in-place conversion, you have to deal with pages initially
> > faulted in as shared. They might be split prior to initial access as a
> > private page, where we can't assume TDX will have scrubbed things. So in
> > that case it might still make sense to rely on it.
> > 
> > Definitely things that require some more thought. But having it inextricably
> > tied to preparedness just makes preparation tracking similarly more
> > complicated as it pulls it back into gmem when that does not seem to be
> > the direction any architectures other SNP have/want to go.
> > 
> > > 
> > > > My understanding based on prior discussion in guest_memfd calls was that
> > > > it would be okay to go ahead and clear the entire folio at initial allocation
> > > > time, and basically never mess with it again. It was also my understanding
> > > That's where I don't follow in this patch.
> > > I don't see where the entire folio A is cleared if it's only partially mapped by
> > > kvm_gmem_populate(). kvm_gmem_get_pfn() won't clear folio A either due to
> > > kvm_gmem_populate() has set the uptodate flag.
> > > 
> > > > that for TDX it might even be optimal to completely skip clearing the folio
> > > > if it is getting mapped into SecureEPT as a hugepage since the TDX module
> > > > would handle that, but that maybe conversely after private->shared there
> > > > would be some need to reclear... I'll try to find that discussion and
> > > > refresh. Vishal I believe suggested some flags to provide more control over
> > > > this behavior.
> > > > 
> > > > > 
> > > > > It's possible (at least for TDX) that a huge folio is only partially populated
> > > > > by kvm_gmem_populate(). Then kvm_gmem_get_pfn() faults in another part of the
> > > > > huge folio. For example, in TDX, GFN 0x81f belongs to the init memory region,
> > > > > while GFN 0x820 is faulted after TD is running. However, these two GFNs can
> > > > > belong to the same folio of order 9.
> > > > 
> > > > Would the above scheme of clearing the entire folio up front and not re-clearing
> > > > at fault time work for this case?
> > > This case doesn't affect TDX, because TDX clearing private pages internally in
> > > SEAM APIs. So, as long as kvm_gmem_get_pfn() does not invoke clear_highpage()
> > > after making a folio private, it works fine for TDX.
> > > 
> > > I was just trying to understand why SNP needs the clearing of entire folio in
> > > kvm_gmem_get_pfn() while I don't see how the entire folio is cleared when it's
> > > partially mapped in kvm_gmem_populate().
> > > Also, I'm wondering if it would be better if SNP could move the clearing of
> > > folio into something like kvm_arch_gmem_clear(), just as kvm_arch_gmem_prepare()
> > > which is always invoked by kvm_gmem_get_pfn() and the architecture can handle
> > > it's own tracking at whatever granularity.
> > 
> > Possibly, but I touched elsewhere on where in-place conversion might
> > trip up this approach. At least decoupling them allows for the prep side
> > of things to be moved to architecture-specific tracking. We can deal
> > with uptodate separately I think.
> > 
> > -Mike
> > 
> > > 
> > >  
> > > > > Note: the current code should not impact TDX. I'm just asking out of curiosity:)
> > > > > 
> > > > > [1] https://lore.kernel.org/all/aQ3uj4BZL6uFQzrD@yzhao56-desk.sh.intel.com/
> > > > > 
> > > > >  
>

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months ago

On Wed, Dec 03, 2025 at 07:47:17AM -0600, Michael Roth wrote:
> On Tue, Dec 02, 2025 at 05:17:20PM +0800, Yan Zhao wrote:
> > On Mon, Dec 01, 2025 at 05:44:47PM -0600, Michael Roth wrote:
> > > On Tue, Nov 25, 2025 at 11:13:25AM +0800, Yan Zhao wrote:
> > > > On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> > > > > On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > > > > > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > > > > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > > > >  {
> > > > > > >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > > > > >  	struct folio *folio;
> > > > > > > -	bool is_prepared = false;
> > > > > > >  	int r = 0;
> > > > > > >  
> > > > > > >  	CLASS(gmem_get_file, file)(slot);
> > > > > > >  	if (!file)
> > > > > > >  		return -EFAULT;
> > > > > > >  
> > > > > > > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > > > > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > > > > > >  	if (IS_ERR(folio))
> > > > > > >  		return PTR_ERR(folio);
> > > > > > >  
> > > > > > > -	if (!is_prepared)
> > > > > > > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > > > > +	if (!folio_test_uptodate(folio)) {
> > > > > > > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > > > > > > +
> > > > > > > +		for (i = 0; i < nr_pages; i++)
> > > > > > > +			clear_highpage(folio_page(folio, i));
> > > > > > > +		folio_mark_uptodate(folio);
> > > > > > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > > > > > Then, please check my questions at the bottom
> > > > > 
> > > > > Yes, in this patch at least where I tried to mirror the current logic. I
> > > > > would not be surprised if we need to rework things for inplace/hugepage
> > > > > support though, but decoupling 'preparation' from the uptodate flag is
> > > > > the main goal here.
> > > > Could you elaborate a little why the decoupling is needed if it's not for
> > > > hugepage?
> > > 
> > > For instance, for in-place conversion:
> > > 
> > >   1. initial allocation: clear, set uptodate, fault in as private
> > >   2. private->shared: call invalidate hook, fault in as shared
> > >   3. shared->private: call prep hook, fault in as private
> > > 
> > > Here, 2/3 need to track where the current state is shared/private in
> > > order to make appropriate architecture-specific changes (e.g. RMP table
> > > updates). But we want to allow for non-destructive in-place conversion,
> > > where a page is 'uptodate', but not in the desired shared/private state.
> > > So 'uptodate' becomes a separate piece of state, which is still
> > > reasonable for gmem to track in the current 4K-only implementation, and
> > > provides for a reasonable approach to upstreaming in-place conversion,
> > > which isn't far off for either SNP or TDX.
> > To me, "1. initial allocation: clear, set uptodate" is more appropriate to
> > be done in kvm_gmem_get_folio(), instead of in kvm_gmem_get_pfn().
> 
> The downside is that preallocating originally involved just
> preallocating, and zero'ing would happen lazily during fault time. (or
> upfront via KVM_PRE_FAULT_MEMORY).
> 
> But in the context of the hugetlb RFC, it certainly looks cleaner to do
> it there, since it could be done before any potential splitting activity,
> so then all the tail pages can inherit that initial uptodate flag.
> 
> We'd probably want to weigh that these trade-offs based on how it will
> affect hugepages, but that would be clearer in the context of a new
> posting of hugepage support on top of these changes. So I think it's
> better to address that change as a follow-up so we can consider it with
> more context.
> 
> > 
> > With it, below looks reasonable to me.
> > > For hugepages, we'll have other things to consider, but those things are
> > > probably still somewhat far off, and so we shouldn't block steps toward
> > > in-place conversion based on uncertainty around hugepages. I think it's
> > > gotten enough attention at least that we know it *can* work, e.g. even
> > > if we take the inefficient/easy route of zero'ing the whole folio on
> > > initial access, setting it uptodate, and never doing anything with 
> > > uptodate again, it's still a usable implementation.
> 
> To me as well. But in the context of this series, against kvm/next, it
> creates userspace-visible changes to pre-allocation behavior that we
> can't justify in the context of this series alone, so as above I think
> that's something to save for hugepage follow-up.
> 
> FWIW though, I ended up taking this approach for the hugetlb-based test
> branch, to address the fact (after you reminded me) that I wasn't fully
> zero'ing folio's in the kvm_gmem_populate() path:
> 
>   https://github.com/AMDESE/linux/commit/253fb30b076d81232deba0b02db070d5bc2b90b5
> 
> So maybe for your testing you could do similar. For newer hugepage
> support I'll likely do similar, but I don't think any of this reasoning
> or changes makes sense to people reviewing this series without already
> being aware of hugepage plans/development, so that's why I'm trying to
> keep this series more focused on in-place conversion enablement, because
> hugepage plans might be massively reworked for next posting based on LPC
> talks and changes in direction mentioned in recent guest_memfd calls and
> we are basically just hand-waving about what it will look like at this
> point while blocking other efforts.
> 
Got it. Thanks for the explanation!

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Vishal Annapurve 2 months, 1 week ago

On Mon, Nov 24, 2025 at 7:15 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Fri, Nov 21, 2025 at 06:43:14AM -0600, Michael Roth wrote:
> > On Thu, Nov 20, 2025 at 05:12:55PM +0800, Yan Zhao wrote:
> > > On Thu, Nov 13, 2025 at 05:07:57PM -0600, Michael Roth wrote:
> > > > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > >  {
> > > >   pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > >   struct folio *folio;
> > > > - bool is_prepared = false;
> > > >   int r = 0;
> > > >
> > > >   CLASS(gmem_get_file, file)(slot);
> > > >   if (!file)
> > > >           return -EFAULT;
> > > >
> > > > - folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> > > >   if (IS_ERR(folio))
> > > >           return PTR_ERR(folio);
> > > >
> > > > - if (!is_prepared)
> > > > -         r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > + if (!folio_test_uptodate(folio)) {
> > > > +         unsigned long i, nr_pages = folio_nr_pages(folio);
> > > > +
> > > > +         for (i = 0; i < nr_pages; i++)
> > > > +                 clear_highpage(folio_page(folio, i));
> > > > +         folio_mark_uptodate(folio);
> > > Here, the entire folio is cleared only when the folio is not marked uptodate.
> > > Then, please check my questions at the bottom
> >
> > Yes, in this patch at least where I tried to mirror the current logic. I
> > would not be surprised if we need to rework things for inplace/hugepage
> > support though, but decoupling 'preparation' from the uptodate flag is
> > the main goal here.
> Could you elaborate a little why the decoupling is needed if it's not for
> hugepage?

IMO, decoupling is useful in general and we don't necessarily need to
wait till hugepage support lands to clean up this logic. Current
preparation logic has created some confusion regarding multiple
features for guest_memfd under discussion such as generic write, uffd
support, and direct map removal. It would be useful to simplify the
guest_memfd logic in this regard.

>
>
> > > > + }
> > > > +
> > > > + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > >
> > > >   folio_unlock(folio);
> > > >
> > > > @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >           struct folio *folio;
> > > >           gfn_t gfn = start_gfn + i;
> > > >           pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > > > -         bool is_prepared = false;
> > > >           kvm_pfn_t pfn;
> > > >
> > > >           if (signal_pending(current)) {
> > > > @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >                   break;
> > > >           }
> > > >
> > > > -         folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> > > > +         folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> > > >           if (IS_ERR(folio)) {
> > > >                   ret = PTR_ERR(folio);
> > > >                   break;
> > > >           }
> > > >
> > > > -         if (is_prepared) {
> > > > -                 folio_unlock(folio);
> > > > -                 folio_put(folio);
> > > > -                 ret = -EEXIST;
> > > > -                 break;
> > > > -         }
> > > > -
> > > >           folio_unlock(folio);
> > > >           WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > > >                   (npages - i) < (1 << max_order));
> > > TDX could hit this warning easily when npages == 1, max_order == 9.
> >
> > Yes, this will need to change to handle that. I don't think I had to
> > change this for previous iterations of SNP hugepage support, but
> > there are definitely cases where a sub-2M range might get populated
> > even though it's backed by a 2M folio, so I'm not sure why I didn't
> > hit it there.
> >
> > But I'm taking Sean's cue on touching as little of the existing
> > hugepage logic as possible in this particular series so we can revisit
> > the remaining changes with some better context.
> Frankly, I don't understand why this patch 1 is required if we only want "moving
> GUP out of post_populate()" to work for 4KB folios.
>
> > >
> > > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >           p = src ? src + i * PAGE_SIZE : NULL;
> > > >           ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > >           if (!ret)
> > > > -                 kvm_gmem_mark_prepared(folio);
> > > > +                 folio_mark_uptodate(folio);
> > > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > > uptodate?
> >
> > Quoting your example from[1] for more context:
> >
> > > I also have a question about this patch:
> > >
> > > Suppose there's a 2MB huge folio A, where
> > > A1 and A2 are 4KB pages belonging to folio A.
> > >
> > > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> > >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> >
> > In SNP hugepage patchset you responded to, it would only mark A1 as
> You mean code in
> https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?
>
> > prepared/cleared. There was 4K-granularity tracking added to handle this.
> I don't find the code that marks only A1 as "prepared/cleared".
> Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
> to mark the entire folio A as uptodate.
>
> However, according to your statement below that "uptodate flag only tracks
> whether a folio has been cleared", I don't follow why and where the entire folio
> A would be cleared if kvm_gmem_populate() only adds page A1.

I think kvm_gmem_populate() is currently only used by SNP and TDX
logic, I don't see an issue with marking the complete folio as
uptodate even if its partially updated by kvm_gmem_populate() paths as
the private memory will eventually get initialized anyways.

>
> > There was an odd subtlety in that series though: it was defaulting to the
> > folio_order() for the prep-tracking/post-populate, but it would then clamp
> > it down based on the max order possible according whether that particular
> > order was a homogenous range of KVM_MEMORY_ATTRIBUTE_PRIVATE. Which is not
> > a great way to handle things, and I don't remember if I'd actually intended
> > to implement it that way or not... that's probably why I never tripped over
> > the WARN_ON() above, now that I think of it.
> >
> > But neither of these these apply to any current plans for hugepage support
> > that I'm aware of, so probably not worth working through what that series
> > did and look at this from a fresh perspective.
> >
> > >
> > > (2) kvm_gmem_get_pfn() later faults in page A2.
> > >     As folio A is uptodate, clear_highpage() is not invoked on page A2.
> > >     kvm_gmem_prepare_folio() is invoked on the whole folio A.
> > >
> > > (2) could occur at least in TDX when only a part the 2MB page is added as guest
> > > initial memory.
> > >
> > > My questions:
> > > - Would (2) occur on SEV?
> > > - If it does, is the lack of clear_highpage() on A2 a problem ?
> > > - Is invoking gmem_prepare on page A1 a problem?
> >
> > Assuming this patch goes upstream in some form, we will now have the
> > following major differences versus previous code:
> >
> >   1) uptodate flag only tracks whether a folio has been cleared
> >   2) gmem always calls kvm_arch_gmem_prepare() via kvm_gmem_get_pfn() and
> >      the architecture can handle it's own tracking at whatever granularity
> >      it likes.
> 2) looks good to me.
>
> > My hope is that 1) can similarly be done in such a way that gmem does not
> > need to track things at sub-hugepage granularity and necessitate the need
> > for some new data structure/state/flag to track sub-page status.
> I actually don't understand what uptodate flag helps gmem to track.
> Why can't clear_highpage() be done inside arch specific code? TDX doesn't need
> this clearing after all.

Target audience for guest_memfd includes non-confidential VMs as well.
Inline with shmem and other filesystems, guest_memfd should clear
pages on fault before handing them out to the users. There should be a
way to opt-out of this behavior for certain private faults like for
SNP/TDX and possibly for CCA as well.

>
> > My understanding based on prior discussion in guest_memfd calls was that
> > it would be okay to go ahead and clear the entire folio at initial allocation
> > time, and basically never mess with it again. It was also my understanding
> That's where I don't follow in this patch.
> I don't see where the entire folio A is cleared if it's only partially mapped by
> kvm_gmem_populate(). kvm_gmem_get_pfn() won't clear folio A either due to
> kvm_gmem_populate() has set the uptodate flag.

Since kvm_gmem_populate() is specific to SNP and TDX VMs, I don't
think this behavior is concerning.

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months, 1 week ago

On Sun, Nov 30, 2025 at 05:35:41PM -0800, Vishal Annapurve wrote:
> On Mon, Nov 24, 2025 at 7:15 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > >           p = src ? src + i * PAGE_SIZE : NULL;
> > > > >           ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > > >           if (!ret)
> > > > > -                 kvm_gmem_mark_prepared(folio);
> > > > > +                 folio_mark_uptodate(folio);
> > > > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > > > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > > > uptodate?
> > >
> > > Quoting your example from[1] for more context:
> > >
> > > > I also have a question about this patch:
> > > >
> > > > Suppose there's a 2MB huge folio A, where
> > > > A1 and A2 are 4KB pages belonging to folio A.
> > > >
> > > > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> > > >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> > >
> > > In SNP hugepage patchset you responded to, it would only mark A1 as
> > You mean code in
> > https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?
> >
> > > prepared/cleared. There was 4K-granularity tracking added to handle this.
> > I don't find the code that marks only A1 as "prepared/cleared".
> > Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
> > to mark the entire folio A as uptodate.
> >
> > However, according to your statement below that "uptodate flag only tracks
> > whether a folio has been cleared", I don't follow why and where the entire folio
> > A would be cleared if kvm_gmem_populate() only adds page A1.
> 
> I think kvm_gmem_populate() is currently only used by SNP and TDX
> logic, I don't see an issue with marking the complete folio as
> uptodate even if its partially updated by kvm_gmem_populate() paths as
> the private memory will eventually get initialized anyways.
Still using the above example,
If only page A1 is passed to sev_gmem_post_populate(), will SNP initialize the
entire folio A?
- if yes, could you kindly point me to the code that does this? .
- if sev_gmem_post_populate() only initializes page A1, after marking the
  complete folio A as uptodate in kvm_gmem_populate(), later faulting in page A2
  in kvm_gmem_get_pfn() will not clear page A2 by invoking clear_highpage(),
  since the entire folio A is uptodate. I don't understand why this is OK.
  Or what's the purpose of invoking clear_highpage() on other folios?

Thanks
Yan

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Vishal Annapurve 2 months, 1 week ago

On Sun, Nov 30, 2025 at 6:53 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Sun, Nov 30, 2025 at 05:35:41PM -0800, Vishal Annapurve wrote:
> > On Mon, Nov 24, 2025 at 7:15 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > > > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > > >           p = src ? src + i * PAGE_SIZE : NULL;
> > > > > >           ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > > > >           if (!ret)
> > > > > > -                 kvm_gmem_mark_prepared(folio);
> > > > > > +                 folio_mark_uptodate(folio);
> > > > > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > > > > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > > > > uptodate?
> > > >
> > > > Quoting your example from[1] for more context:
> > > >
> > > > > I also have a question about this patch:
> > > > >
> > > > > Suppose there's a 2MB huge folio A, where
> > > > > A1 and A2 are 4KB pages belonging to folio A.
> > > > >
> > > > > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> > > > >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> > > >
> > > > In SNP hugepage patchset you responded to, it would only mark A1 as
> > > You mean code in
> > > https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?
> > >
> > > > prepared/cleared. There was 4K-granularity tracking added to handle this.
> > > I don't find the code that marks only A1 as "prepared/cleared".
> > > Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
> > > to mark the entire folio A as uptodate.
> > >
> > > However, according to your statement below that "uptodate flag only tracks
> > > whether a folio has been cleared", I don't follow why and where the entire folio
> > > A would be cleared if kvm_gmem_populate() only adds page A1.
> >
> > I think kvm_gmem_populate() is currently only used by SNP and TDX
> > logic, I don't see an issue with marking the complete folio as
> > uptodate even if its partially updated by kvm_gmem_populate() paths as
> > the private memory will eventually get initialized anyways.
> Still using the above example,
> If only page A1 is passed to sev_gmem_post_populate(), will SNP initialize the
> entire folio A?
> - if yes, could you kindly point me to the code that does this? .
> - if sev_gmem_post_populate() only initializes page A1, after marking the
>   complete folio A as uptodate in kvm_gmem_populate(), later faulting in page A2
>   in kvm_gmem_get_pfn() will not clear page A2 by invoking clear_highpage(),
>   since the entire folio A is uptodate. I don't understand why this is OK.
>   Or what's the purpose of invoking clear_highpage() on other folios?

I think sev_gmem_post_populate() only initializes the ranges marked
for snp_launch_update(). Since the current code lacks a hugepage
provider, the kvm_gmem_populate() doesn't need to explicitly clear
anything for 4K backings during kvm_gmem_populate().

I see your point. Once a hugepage provider lands, kvm_gmem_populate()
can first invoke clear_highpage() or an equivalent API on a complete
huge folio before calling the architecture-specific post-populate hook
to keep the implementation consistent.

Subsequently, we need to figure out a way to avoid this clearing for
SNP/TDX/CCA private faults.

>
> Thanks
> Yan

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Yan Zhao 2 months, 1 week ago

On Mon, Dec 01, 2025 at 11:33:18AM -0800, Vishal Annapurve wrote:
> On Sun, Nov 30, 2025 at 6:53 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >
> > On Sun, Nov 30, 2025 at 05:35:41PM -0800, Vishal Annapurve wrote:
> > > On Mon, Nov 24, 2025 at 7:15 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > > > > > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > > > >           p = src ? src + i * PAGE_SIZE : NULL;
> > > > > > >           ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > > > > >           if (!ret)
> > > > > > > -                 kvm_gmem_mark_prepared(folio);
> > > > > > > +                 folio_mark_uptodate(folio);
> > > > > > As also asked in [1], why is the entire folio marked as uptodate here? Why does
> > > > > > kvm_gmem_get_pfn() clear all pages of a huge folio when the folio isn't marked
> > > > > > uptodate?
> > > > >
> > > > > Quoting your example from[1] for more context:
> > > > >
> > > > > > I also have a question about this patch:
> > > > > >
> > > > > > Suppose there's a 2MB huge folio A, where
> > > > > > A1 and A2 are 4KB pages belonging to folio A.
> > > > > >
> > > > > > (1) kvm_gmem_populate() invokes __kvm_gmem_get_pfn() and gets folio A.
> > > > > >     It adds page A1 and invokes folio_mark_uptodate() on folio A.
> > > > >
> > > > > In SNP hugepage patchset you responded to, it would only mark A1 as
> > > > You mean code in
> > > > https://github.com/amdese/linux/commits/snp-inplace-conversion-rfc1 ?
> > > >
> > > > > prepared/cleared. There was 4K-granularity tracking added to handle this.
> > > > I don't find the code that marks only A1 as "prepared/cleared".
> > > > Instead, I just found folio_mark_uptodate() is invoked by kvm_gmem_populate()
> > > > to mark the entire folio A as uptodate.
> > > >
> > > > However, according to your statement below that "uptodate flag only tracks
> > > > whether a folio has been cleared", I don't follow why and where the entire folio
> > > > A would be cleared if kvm_gmem_populate() only adds page A1.
> > >
> > > I think kvm_gmem_populate() is currently only used by SNP and TDX
> > > logic, I don't see an issue with marking the complete folio as
> > > uptodate even if its partially updated by kvm_gmem_populate() paths as
> > > the private memory will eventually get initialized anyways.
> > Still using the above example,
> > If only page A1 is passed to sev_gmem_post_populate(), will SNP initialize the
> > entire folio A?
> > - if yes, could you kindly point me to the code that does this? .
> > - if sev_gmem_post_populate() only initializes page A1, after marking the
> >   complete folio A as uptodate in kvm_gmem_populate(), later faulting in page A2
> >   in kvm_gmem_get_pfn() will not clear page A2 by invoking clear_highpage(),
> >   since the entire folio A is uptodate. I don't understand why this is OK.
> >   Or what's the purpose of invoking clear_highpage() on other folios?
> 
> I think sev_gmem_post_populate() only initializes the ranges marked
> for snp_launch_update(). Since the current code lacks a hugepage
> provider, the kvm_gmem_populate() doesn't need to explicitly clear
> anything for 4K backings during kvm_gmem_populate().
> 
> I see your point. Once a hugepage provider lands, kvm_gmem_populate()
> can first invoke clear_highpage() or an equivalent API on a complete
> huge folio before calling the architecture-specific post-populate hook
> to keep the implementation consistent.
Maybe clear_highpage() in kvm_gmem_get_folio()?

When in-place copy in kvm_gmem_populate() comes, kvm_gmem_get_folio() can be
invoked first for shared memory, so clear_highpage() there is before userspace
writes to shared memory. No clear_highpage() is required when kvm_gmem_populate()
invokes __kvm_gmem_get_pfn() to get the folio again.

> Subsequently, we need to figure out a way to avoid this clearing for
> SNP/TDX/CCA private faults.
> 
> >
> > Thanks
> > Yan

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Ackerley Tng 2 months, 3 weeks ago

Michael Roth <michael.roth@amd.com> writes:

> guest_memfd currently uses the folio uptodate flag to track:
>
>   1) whether or not a page has been cleared before initial usage
>   2) whether or not the architecture hooks have been issued to put the
>      page in a private state as defined by the architecture
>
> In practice, 2) is only actually being tracked for SEV-SNP VMs, and
> there do not seem to be any plans/reasons that would suggest this will
> change in the future, so this additional tracking/complexity is not
> really providing any general benefit to guest_memfd users. Future plans
> around in-place conversion and hugepage support, where the per-folio
> uptodate flag is planned to be used purely to track the initial clearing
> of folios, whereas conversion operations could trigger multiple
> transitions between 'prepared' and 'unprepared' and thus need separate
> tracking, will make the burden of tracking this information within
> guest_memfd even more complex, since preparation generally happens
> during fault time, on the "read-side" of any global locks that might
> protect state tracked by guest_memfd, and so may require more complex
> locking schemes to allow for concurrent handling of page faults for
> multiple vCPUs where the "preparedness" state tracked by guest_memfd
> might need to be updated as part of handling the fault.
>
> Instead of keeping this current/future complexity within guest_memfd for
> what is essentially just SEV-SNP, just drop the tracking for 2) and have
> the arch-specific preparation hooks get triggered unconditionally on
> every fault so the arch-specific hooks can check the preparation state
> directly and decide whether or not a folio still needs additional
> preparation. In the case of SEV-SNP, the preparation state is already
> checked again via the preparation hooks to avoid double-preparation, so
> nothing extra needs to be done to update the handling of things there.
>

This looks good to me, thanks!

What do you think of moving preparation (or SNP-specific work) to be
done when the page is actually mapped by KVM instead? So whatever's done
in preparation is now called from KVM instead of within guest_memfd [1]?

I'm concerned about how this preparation needs to be done for the entire
folio. With huge pages, could it be weird if actually only one page in
the huge page is faulted in, and hence only that one page needs to be
prepared, instead of the entire huge page?

In the other series [2], there was a part about how guest_memfd should
invalidate the shared status on conversion from private to shared. Is
that still an intended step, after this series to remove preparation
tracking?

[1] https://lore.kernel.org/all/diqzcy7op5wg.fsf@google.com/
[2] https://lore.kernel.org/all/20250613005400.3694904-4-michael.roth@amd.com/

> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
>  virt/kvm/guest_memfd.c | 47 ++++++++++++++----------------------------
>  1 file changed, 15 insertions(+), 32 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index fdaea3422c30..9160379df378 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>  	return 0;
>  }
>  
> -static inline void kvm_gmem_mark_prepared(struct folio *folio)
> -{
> -	folio_mark_uptodate(folio);
> -}
> -
>  /*
>   * Process @folio, which contains @gfn, so that the guest can use it.
>   * The folio must be locked and the gfn must be contained in @slot.
> @@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
>  static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				  gfn_t gfn, struct folio *folio)
>  {
> -	unsigned long nr_pages, i;
>  	pgoff_t index;
> -	int r;
> -
> -	nr_pages = folio_nr_pages(folio);
> -	for (i = 0; i < nr_pages; i++)
> -		clear_highpage(folio_page(folio, i));
>  
>  	/*
>  	 * Preparing huge folios should always be safe, since it should
> @@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
>  	index = kvm_gmem_get_index(slot, gfn);
>  	index = ALIGN_DOWN(index, folio_nr_pages(folio));
> -	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> -	if (!r)
> -		kvm_gmem_mark_prepared(folio);
>  
> -	return r;
> +	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
>  }
>  
>  /*
> @@ -420,7 +406,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>  
>  	if (!folio_test_uptodate(folio)) {
>  		clear_highpage(folio_page(folio, 0));
> -		kvm_gmem_mark_prepared(folio);
> +		folio_mark_uptodate(folio);
>  	}
>  
>  	vmf->page = folio_file_page(folio, vmf->pgoff);
> @@ -757,7 +743,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
>  static struct folio *__kvm_gmem_get_pfn(struct file *file,
>  					struct kvm_memory_slot *slot,
>  					pgoff_t index, kvm_pfn_t *pfn,
> -					bool *is_prepared, int *max_order)
> +					int *max_order)
>  {
>  	struct file *slot_file = READ_ONCE(slot->gmem.file);
>  	struct gmem_file *f = file->private_data;
> @@ -787,7 +773,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
>  	if (max_order)
>  		*max_order = 0;
>  
> -	*is_prepared = folio_test_uptodate(folio);
>  	return folio;
>  }
>  
> @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  {
>  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
>  	struct folio *folio;
> -	bool is_prepared = false;
>  	int r = 0;
>  
>  	CLASS(gmem_get_file, file)(slot);
>  	if (!file)
>  		return -EFAULT;
>  
> -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
>  	if (IS_ERR(folio))
>  		return PTR_ERR(folio);
>  
> -	if (!is_prepared)
> -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +	if (!folio_test_uptodate(folio)) {
> +		unsigned long i, nr_pages = folio_nr_pages(folio);
> +
> +		for (i = 0; i < nr_pages; i++)
> +			clear_highpage(folio_page(folio, i));
> +		folio_mark_uptodate(folio);
> +	}
> +
> +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>  
>  	folio_unlock(folio);
>  
> @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  		struct folio *folio;
>  		gfn_t gfn = start_gfn + i;
>  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> -		bool is_prepared = false;
>  		kvm_pfn_t pfn;
>  
>  		if (signal_pending(current)) {
> @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  			break;
>  		}
>  
> -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
>  		if (IS_ERR(folio)) {
>  			ret = PTR_ERR(folio);
>  			break;
>  		}
>  
> -		if (is_prepared) {
> -			folio_unlock(folio);
> -			folio_put(folio);
> -			ret = -EEXIST;
> -			break;
> -		}
> -
>  		folio_unlock(folio);
>  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
>  			(npages - i) < (1 << max_order));
> @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  		p = src ? src + i * PAGE_SIZE : NULL;
>  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
>  		if (!ret)
> -			kvm_gmem_mark_prepared(folio);
> +			folio_mark_uptodate(folio);
>  
>  put_folio_and_exit:
>  		folio_put(folio);
> -- 
> 2.25.1

Re: [PATCH 1/3] KVM: guest_memfd: Remove preparation tracking

Posted by Michael Roth 2 months, 3 weeks ago

On Mon, Nov 17, 2025 at 03:58:46PM -0800, Ackerley Tng wrote:
> Michael Roth <michael.roth@amd.com> writes:
> 
> > guest_memfd currently uses the folio uptodate flag to track:
> >
> >   1) whether or not a page has been cleared before initial usage
> >   2) whether or not the architecture hooks have been issued to put the
> >      page in a private state as defined by the architecture
> >
> > In practice, 2) is only actually being tracked for SEV-SNP VMs, and
> > there do not seem to be any plans/reasons that would suggest this will
> > change in the future, so this additional tracking/complexity is not
> > really providing any general benefit to guest_memfd users. Future plans
> > around in-place conversion and hugepage support, where the per-folio
> > uptodate flag is planned to be used purely to track the initial clearing
> > of folios, whereas conversion operations could trigger multiple
> > transitions between 'prepared' and 'unprepared' and thus need separate
> > tracking, will make the burden of tracking this information within
> > guest_memfd even more complex, since preparation generally happens
> > during fault time, on the "read-side" of any global locks that might
> > protect state tracked by guest_memfd, and so may require more complex
> > locking schemes to allow for concurrent handling of page faults for
> > multiple vCPUs where the "preparedness" state tracked by guest_memfd
> > might need to be updated as part of handling the fault.
> >
> > Instead of keeping this current/future complexity within guest_memfd for
> > what is essentially just SEV-SNP, just drop the tracking for 2) and have
> > the arch-specific preparation hooks get triggered unconditionally on
> > every fault so the arch-specific hooks can check the preparation state
> > directly and decide whether or not a folio still needs additional
> > preparation. In the case of SEV-SNP, the preparation state is already
> > checked again via the preparation hooks to avoid double-preparation, so
> > nothing extra needs to be done to update the handling of things there.
> >
> 
> This looks good to me, thanks!
> 
> What do you think of moving preparation (or SNP-specific work) to be
> done when the page is actually mapped by KVM instead? So whatever's done
> in preparation is now called from KVM instead of within guest_memfd [1]?

Now that preparation tracking is removed, it is now completely decoupled
from the kvm_gmem_populate() path and fully contained in
kvm_gmem_get_pfn(), where it becomes a lot more straightforward to move
this into the KVM MMU fault path.

But gmem currently also handles the inverse operation via the
gmem_invalidate() hooks, which is driven separately from the KVM MMU
notifiers. And it's not so simple to just plumb it into those paths,
but invalidation in this sense involves clearing the 'validated' bit
in the RMP table for the page, which is a destructive operation, whereas
the notifiers as they exist today can be using for non-destructive
operations like simply rebuilding stage2 mappings. So we'd probably need
to think through what that would look like if we really want to move
preparation/un-preparation out of gmem.

So I think it makes sense to consider this patch as-is as a stepping
stone toward that, but I don't have any objection to going that
direction. Curious what others have to say though.

> 
> I'm concerned about how this preparation needs to be done for the entire
> folio. With huge pages, could it be weird if actually only one page in
> the huge page is faulted in, and hence only that one page needs to be
> prepared, instead of the entire huge page?

In previous iterations of THP support for SNP[1] I think this worked out
okay. You'd prepare optimistically prepare the whole huge folio, and if KVM
mapped it as, say, 4K, you'd get an RMP fault and PSMASH the RMP table
to smaller 4K/prepare entries. But that was before in-place conversion
was in the picture, so we didn't have to worry about ever converting
those other prepared entries to a shared state, so you could defer
everything until folio cleanup. For in-place we'd need to take the
memory attributes for the range we are mapping into account and clamp
the range down to a smaller order accordingly before issuing the prepare
hook. But I think it would still be doable.

Maybe more directly would be to let KVM MMU tell us the max mapping
level it will be using so we can just defer all the attribute handling
to KVM. But this same approach could still be done with gmem issuing
the prepare hooks in the case of in-place conversion. So I think it's
doable either way... hard to tell what approach is cleaner without some
hugepage patches on top. I'm still trying to get update THP on top of
your in-place conversion patches posted and maybe it'll be easier to see
what things would look like in that context.

[1] https://lore.kernel.org/kvm/20241212063635.712877-1-michael.roth@amd.com/

> 
> In the other series [2], there was a part about how guest_memfd should
> invalidate the shared status on conversion from private to shared. Is
> that still an intended step, after this series to remove preparation
> tracking?

Yes, I was still planning to have gmem drive prepare/invalidate where
needed. If we move things out to MMU that will require some rethinking
however.


Thanks,

Mike

> 
> [1] https://lore.kernel.org/all/diqzcy7op5wg.fsf@google.com/
> [2] https://lore.kernel.org/all/20250613005400.3694904-4-michael.roth@amd.com/
> 
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---
> >  virt/kvm/guest_memfd.c | 47 ++++++++++++++----------------------------
> >  1 file changed, 15 insertions(+), 32 deletions(-)
> >
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index fdaea3422c30..9160379df378 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
> >  	return 0;
> >  }
> >  
> > -static inline void kvm_gmem_mark_prepared(struct folio *folio)
> > -{
> > -	folio_mark_uptodate(folio);
> > -}
> > -
> >  /*
> >   * Process @folio, which contains @gfn, so that the guest can use it.
> >   * The folio must be locked and the gfn must be contained in @slot.
> > @@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
> >  static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >  				  gfn_t gfn, struct folio *folio)
> >  {
> > -	unsigned long nr_pages, i;
> >  	pgoff_t index;
> > -	int r;
> > -
> > -	nr_pages = folio_nr_pages(folio);
> > -	for (i = 0; i < nr_pages; i++)
> > -		clear_highpage(folio_page(folio, i));
> >  
> >  	/*
> >  	 * Preparing huge folios should always be safe, since it should
> > @@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >  	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
> >  	index = kvm_gmem_get_index(slot, gfn);
> >  	index = ALIGN_DOWN(index, folio_nr_pages(folio));
> > -	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> > -	if (!r)
> > -		kvm_gmem_mark_prepared(folio);
> >  
> > -	return r;
> > +	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> >  }
> >  
> >  /*
> > @@ -420,7 +406,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> >  
> >  	if (!folio_test_uptodate(folio)) {
> >  		clear_highpage(folio_page(folio, 0));
> > -		kvm_gmem_mark_prepared(folio);
> > +		folio_mark_uptodate(folio);
> >  	}
> >  
> >  	vmf->page = folio_file_page(folio, vmf->pgoff);
> > @@ -757,7 +743,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> >  static struct folio *__kvm_gmem_get_pfn(struct file *file,
> >  					struct kvm_memory_slot *slot,
> >  					pgoff_t index, kvm_pfn_t *pfn,
> > -					bool *is_prepared, int *max_order)
> > +					int *max_order)
> >  {
> >  	struct file *slot_file = READ_ONCE(slot->gmem.file);
> >  	struct gmem_file *f = file->private_data;
> > @@ -787,7 +773,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
> >  	if (max_order)
> >  		*max_order = 0;
> >  
> > -	*is_prepared = folio_test_uptodate(folio);
> >  	return folio;
> >  }
> >  
> > @@ -797,19 +782,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >  {
> >  	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> >  	struct folio *folio;
> > -	bool is_prepared = false;
> >  	int r = 0;
> >  
> >  	CLASS(gmem_get_file, file)(slot);
> >  	if (!file)
> >  		return -EFAULT;
> >  
> > -	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > +	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> >  	if (IS_ERR(folio))
> >  		return PTR_ERR(folio);
> >  
> > -	if (!is_prepared)
> > -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > +	if (!folio_test_uptodate(folio)) {
> > +		unsigned long i, nr_pages = folio_nr_pages(folio);
> > +
> > +		for (i = 0; i < nr_pages; i++)
> > +			clear_highpage(folio_page(folio, i));
> > +		folio_mark_uptodate(folio);
> > +	}
> > +
> > +	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> >  
> >  	folio_unlock(folio);
> >  
> > @@ -852,7 +843,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  		struct folio *folio;
> >  		gfn_t gfn = start_gfn + i;
> >  		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> > -		bool is_prepared = false;
> >  		kvm_pfn_t pfn;
> >  
> >  		if (signal_pending(current)) {
> > @@ -860,19 +850,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  			break;
> >  		}
> >  
> > -		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
> > +		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> >  		if (IS_ERR(folio)) {
> >  			ret = PTR_ERR(folio);
> >  			break;
> >  		}
> >  
> > -		if (is_prepared) {
> > -			folio_unlock(folio);
> > -			folio_put(folio);
> > -			ret = -EEXIST;
> > -			break;
> > -		}
> > -
> >  		folio_unlock(folio);
> >  		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> >  			(npages - i) < (1 << max_order));
> > @@ -889,7 +872,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >  		p = src ? src + i * PAGE_SIZE : NULL;
> >  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> >  		if (!ret)
> > -			kvm_gmem_mark_prepared(folio);
> > +			folio_mark_uptodate(folio);
> >  
> >  put_folio_and_exit:
> >  		folio_put(folio);
> > -- 
> > 2.25.1
>

[PATCH 1/3] KVM: guest_memfd: Remove preparation tracking
[PATCH 2/3] KVM: TDX: Document alignment requirements for KVM_TDX_INIT_MEM_REGION
[PATCH 3/3] KVM: guest_memfd: GUP source pages prior to populating guest memory