guest_memfd currently uses the folio uptodate flag to track:

 1) whether or not a page had been cleared before initial usage
 2) whether or not the architecture hooks have been issued to put the
    page in a private state as defined by the architecture

In practice, 2) is only actually being tracked for SEV-SNP VMs, and
there do not seem to be any plans/reasons that would suggest this will
change in the future, so this additional tracking/complexity is not
really providing any general benefit to guest_memfd users.

Future plans around in-place conversion and hugepage support will make
the burden of tracking this information within guest_memfd even more
complex: there, the per-folio uptodate flag is planned to be used purely
to track the initial clearing of folios, whereas conversion operations
could trigger multiple transitions between 'prepared' and 'unprepared'
and thus need separate tracking. Since preparation generally happens at
fault time, on the "read-side" of any global locks that might protect
state tracked by guest_memfd, this may also require more complex locking
schemes to allow for concurrent handling of page faults for multiple
vCPUs where the "preparedness" state tracked by guest_memfd might need
to be updated as part of handling the fault.

Instead of keeping this current/future complexity within guest_memfd for
what is essentially just SEV-SNP, just drop the tracking for 2) and have
the arch-specific preparation hooks get triggered unconditionally on
every fault, so the arch-specific hooks can check the preparation state
directly and decide whether or not a folio still needs additional
preparation. In the case of SEV-SNP, the preparation state is already
checked again via the preparation hooks to avoid double-preparation, so
nothing extra needs to be done to update the handling of things there.
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
virt/kvm/guest_memfd.c | 47 ++++++++++++++----------------------------
1 file changed, 15 insertions(+), 32 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 35f94a288e52..cc93c502b5d8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -421,11 +421,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
-{
- folio_mark_uptodate(folio);
-}
-
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -435,13 +430,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, struct folio *folio)
{
- unsigned long nr_pages, i;
pgoff_t index;
- int r;
-
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
/*
* Preparing huge folios should always be safe, since it should
@@ -459,11 +448,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
index = gfn - slot->base_gfn + slot->gmem.pgoff;
index = ALIGN_DOWN(index, 1 << folio_order(folio));
- r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
- if (!r)
- kvm_gmem_mark_prepared(folio);
- return r;
+ return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}
static int __kvm_gmem_filemap_add_folio(struct address_space *mapping,
@@ -808,7 +794,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
- kvm_gmem_mark_prepared(folio);
+ folio_mark_uptodate(folio);
}
vmf->page = folio_file_page(folio, vmf->pgoff);
@@ -1306,7 +1292,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
static struct folio *__kvm_gmem_get_pfn(struct file *file,
struct kvm_memory_slot *slot,
pgoff_t index, kvm_pfn_t *pfn,
- bool *is_prepared, int *max_order)
+ int *max_order)
{
struct file *gmem_file = READ_ONCE(slot->gmem.file);
struct kvm_gmem *gmem = file->private_data;
@@ -1337,7 +1323,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
if (max_order)
*max_order = 0;
- *is_prepared = folio_test_uptodate(folio);
return folio;
}
@@ -1348,7 +1333,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index = kvm_gmem_get_index(slot, gfn);
struct file *file = kvm_gmem_get_file(slot);
struct folio *folio;
- bool is_prepared = false;
int r = 0;
if (!file)
@@ -1356,14 +1340,21 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
- folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+ folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
if (IS_ERR(folio)) {
r = PTR_ERR(folio);
goto out;
}
- if (!is_prepared)
- r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+ if (!folio_test_uptodate(folio)) {
+ unsigned long i, nr_pages = folio_nr_pages(folio);
+
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+ folio_mark_uptodate(folio);
+ }
+
+ r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
folio_unlock(folio);
@@ -1420,7 +1411,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
struct folio *folio;
gfn_t gfn = start_gfn + i;
pgoff_t index = kvm_gmem_get_index(slot, gfn);
- bool is_prepared = false;
kvm_pfn_t pfn;
if (signal_pending(current)) {
@@ -1428,19 +1418,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
break;
}
- folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
+ folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
break;
}
- if (is_prepared) {
- folio_unlock(folio);
- folio_put(folio);
- ret = -EEXIST;
- break;
- }
-
folio_unlock(folio);
WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
(npages - i) < (1 << max_order));
@@ -1457,7 +1440,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
p = src ? src + i * PAGE_SIZE : NULL;
ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
if (!ret)
- kvm_gmem_mark_prepared(folio);
+ folio_mark_uptodate(folio);
put_folio_and_exit:
folio_put(folio);
--
2.25.1
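
For context on the commit message's claim that SEV-SNP already re-checks
preparation state in its hook, here is a rough, illustrative sketch of how an
arch prepare hook can stay idempotent by consulting the RMP before issuing an
RMPUPDATE. snp_lookup_rmpentry(), rmp_make_private(), gfn_to_gpa() and
to_kvm_svm() are existing helpers, but the function below is a simplified
assumption, not the actual arch/x86/kvm/svm/sev.c implementation.

/*
 * Illustrative sketch only: an arch "prepare" hook that is safe to call
 * on every fault because it checks the RMP entry first and skips pages
 * that are already assigned to the guest (i.e. already prepared).
 */
static int snp_prepare_pfn_sketch(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  int level)
{
	bool assigned;
	int rmp_level, rc;

	rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
	if (rc)
		return rc;

	/* Already private in the RMP, e.g. from a previous fault: done. */
	if (assigned)
		return 0;

	/* Transition the page to a guest-owned (private) RMP state. */
	return rmp_make_private(pfn, gfn_to_gpa(gfn), level,
				to_kvm_svm(kvm)->sev_info.asid, false);
}
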
Michael Roth <michael.roth@amd.com> writes:

> guest_memfd currently uses the folio uptodate flag to track:
>
> [...snip...]
>
> @@ -459,11 +448,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,

While working on HugeTLB support for guest_memfd, I added a test that
tries to map a non-huge-page-aligned gmem.pgoff to a huge-page aligned
gfn.

I understand that config would destroy the performance advantages of
huge pages, but I think the test is necessary since Yan brought up the
use case here [1].

The conclusion in that thread, I believe, was to allow binding of
unaligned GFNs to offsets, but disallow large pages in that case. The
next series for guest_memfd HugeTLB support will include a fix similar
to this [2].

While testing, I hit this WARN_ON with a non-huge-page-aligned
gmem.pgoff.

> 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));

Do you all think this WARN_ON can be removed?

Also, do you think kvm_gmem_prepare_folio()'s interface should perhaps be
changed to take pfn, gfn, nr_pages (PAGE_SIZE pages) and level?

I think taking a folio is kind of awkward since we're not really setting
up the folio, we're setting up something mapping-related for the
folio. Also, kvm_gmem_invalidate() doesn't take folios, which is more
aligned with invalidating mappings rather than something folio-related.

[1] https://lore.kernel.org/all/aA7UXI0NB7oQQrL2@yzhao56-desk.sh.intel.com/
[2] https://github.com/googleprodkernel/linux-cc/commit/371ed9281e0c9ba41cfdc20b48a6c5566f61a7df

> 	index = gfn - slot->base_gfn + slot->gmem.pgoff;
> 	index = ALIGN_DOWN(index, 1 << folio_order(folio));
> -	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> -	if (!r)
> -		kvm_gmem_mark_prepared(folio);
>
> -	return r;
> +	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
> }
>
> [...snip...]
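
For concreteness, a minimal sketch of the pfn/gfn/nr_pages style of interface
being suggested above. The name kvm_gmem_prepare_range() and its signature are
hypothetical; only kvm_arch_gmem_prepare() is an existing hook.

/*
 * Hypothetical sketch: prepare a pfn/gfn range rather than a folio.
 * Neither the name nor the signature comes from this series.
 */
static int kvm_gmem_prepare_range(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  unsigned long nr_pages)
{
	/* nr_pages is assumed to be a power of two covering one mapping. */
	return kvm_arch_gmem_prepare(kvm, gfn, pfn, ilog2(nr_pages));
}
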
On Mon, Aug 25, 2025 at 04:08:19PM -0700, Ackerley Tng wrote:
> Michael Roth <michael.roth@amd.com> writes:
>
> > [...snip...]
> >
> > @@ -459,11 +448,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>
> While working on HugeTLB support for guest_memfd, I added a test that
> tries to map a non-huge-page-aligned gmem.pgoff to a huge-page aligned
> gfn.
>
> I understand that config would destroy the performance advantages of
> huge pages, but I think the test is necessary since Yan brought up the
> use case here [1].
>
> The conclusion in that thread, I believe, was to allow binding of
> unaligned GFNs to offsets, but disallow large pages in that case. The
> next series for guest_memfd HugeTLB support will include a fix similar
> to this [2].
>
> While testing, I hit this WARN_ON with a non-huge-page-aligned
> gmem.pgoff.
>
> > 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
>
> Do you all think this WARN_ON can be removed?

I think so.. I actually ended up dropping this WARN_ON() for a similar
reason:

https://github.com/AMDESE/linux/commit/c654cd144ad0d823f4db8793ebf9b43a3e8a7c48

but in that case it was to deal with memslots where most of the GPA
ranges are huge-page aligned to the gmemfd, and it's just that the start/end
GPA ranges have been split up and associated with other memslots. In that case
I still try to allow hugepages but force order 0 in kvm_gmem_get_pfn()
for the start/end ranges.

I haven't really considered the case where entire GPA range is misaligned
with gmemfd hugepage offsets but the proposed handling seems reasonable
to me... I need to take a closer look at whether the above-mentioned
logic is at odds with what is/will be implemented in
kvm_alloc_memslot_metadata() however as that seems a bit more restrictive.

Thanks,

Mike

> Also, do you think kvm_gmem_prepare_folio()'s interface should perhaps be
> changed to take pfn, gfn, nr_pages (PAGE_SIZE pages) and level?
>
> I think taking a folio is kind of awkward since we're not really setting
> up the folio, we're setting up something mapping-related for the
> folio. Also, kvm_gmem_invalidate() doesn't take folios, which is more
> aligned with invalidating mappings rather than something folio-related.
>
> [1] https://lore.kernel.org/all/aA7UXI0NB7oQQrL2@yzhao56-desk.sh.intel.com/
> [2] https://github.com/googleprodkernel/linux-cc/commit/371ed9281e0c9ba41cfdc20b48a6c5566f61a7df
>
> [...snip...]
Michael Roth <michael.roth@amd.com> writes:

> On Mon, Aug 25, 2025 at 04:08:19PM -0700, Ackerley Tng wrote:
>> Michael Roth <michael.roth@amd.com> writes:
>>
>> [...snip...]
>>
>> While testing, I hit this WARN_ON with a non-huge-page-aligned
>> gmem.pgoff.
>>
>> > 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
>>
>> Do you all think this WARN_ON can be removed?
>
> I think so.. I actually ended up dropping this WARN_ON() for a similar
> reason:
>

Thanks for confirming!

> https://github.com/AMDESE/linux/commit/c654cd144ad0d823f4db8793ebf9b43a3e8a7c48
>
> but in that case it was to deal with memslots where most of the GPA
> ranges are huge-page aligned to the gmemfd, and it's just that the start/end
> GPA ranges have been split up and associated with other memslots. In that case
> I still try to allow hugepages but force order 0 in kvm_gmem_get_pfn()
> for the start/end ranges.
>
> I haven't really considered the case where entire GPA range is misaligned
> with gmemfd hugepage offsets but the proposed handling seems reasonable
> to me... I need to take a closer look at whether the above-mentioned
> logic is at odds with what is/will be implemented in
> kvm_alloc_memslot_metadata() however as that seems a bit more restrictive.
>

Does this help? [1] (from a WIP patch series).

KVM already checks that the guest base address (base_gfn) and the
userspace virtual address (userspace_addr) are aligned relative to each
other for each large page level. If they are not, large pages are
disabled for the entire memory slot. [1] extends that same check to
slot->base_gfn and slot->gmem.pgoff.

Hence, guest_memfd is letting KVM manage the mapping. guest_memfd
reports max_order based on what it knows (folio size, and folio size is
also determined by shareability), and KVM manages the mapping after
taking into account lpage_info in addition to max_order.

[1] https://github.com/googleprodkernel/linux-cc/commit/371ed9281e0c9ba41cfdc20b48a6c5566f61a7df

> Thanks,
>
> Mike
>
>> [...snip...]
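
As a rough sketch of the alignment rule described above: the base_gfn
vs. userspace_addr check already exists in kvm_alloc_memslot_metadata();
extending it to gmem.pgoff as in [1] is shown here as an assumption, and the
helper name is made up for illustration.

/*
 * Sketch only: return true if large pages of @level must be disallowed
 * because the gfn, the userspace address, or (hypothetically) the
 * guest_memfd offset are misaligned with respect to each other.
 */
static bool gmem_misaligned_for_level(struct kvm_memory_slot *slot, int level)
{
	unsigned long mask = KVM_PAGES_PER_HPAGE(level) - 1;
	gfn_t ugfn = slot->userspace_addr >> PAGE_SHIFT;

	return ((slot->base_gfn ^ ugfn) & mask) ||
	       ((slot->base_gfn ^ slot->gmem.pgoff) & mask);
}
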
Ackerley Tng <ackerleytng@google.com> writes:

> Michael Roth <michael.roth@amd.com> writes:
>
>> On Mon, Aug 25, 2025 at 04:08:19PM -0700, Ackerley Tng wrote:
>>> Michael Roth <michael.roth@amd.com> writes:
>>>
>>> [...snip...]
>>>
>>> While testing, I hit this WARN_ON with a non-huge-page-aligned
>>> gmem.pgoff.
>>>
>>> > 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
>>>
>>> Do you all think this WARN_ON can be removed?
>>
>> I think so.. I actually ended up dropping this WARN_ON() for a similar
>> reason:
>>
>
> Thanks for confirming!
>

Dropping this WARN_ON() actually further highlights the importance of
separating preparedness from folio flags (and the folio).

With huge pages being supported in guest_memfd, it's possible for just
part of a folio to be mapped into the stage 2 page tables. One example
of this is if userspace were to request populating just 2M in a 1G
page. If preparedness were recorded in folio flags, then the entire 1G
would be considered prepared even though only 2M of that page was
prepared (updated in RMP tables).

So I do support making the uptodate flag only mean zeroed, and taking
preparedness out of the picture.

With this change, kvm_gmem_prepare_folio() and
__kvm_gmem_prepare_folio() seem to be misnomers, since conceptually
we're not preparing a folio: we can't assume that we're always preparing
a whole folio once huge pages are in the picture.

What do you all think of taking this even further? Instead of keeping
kvm_gmem_prepare_folio() within guest_memfd, what if we

1. Focus on preparing pfn ranges (retaining kvm_arch_gmem_prepare() is
   good) and not folios

2. More clearly and directly associate preparing pfns with mapping
   (rather than with getting a folio to be mapped) into stage 2 page
   tables

What I have in mind for (2) is to update kvm_tdp_mmu_map() to do an
arch-specific call, when fault->is_private, to call
kvm_arch_gmem_prepare() just before mapping the pfns and when the
mapping level is known.

The cleanup counterpart would then be to call kvm_arch_gmem_invalidate()
somewhere in tdp_mmu_zap_leafs().

kvm_arch_gmem_prepare() and kvm_arch_gmem_invalidate() would then drop
out of guest_memfd and be moved back into the core of KVM.

Technically these two functions don't even need to have gmem in the name
since any memory can be prepared in the SNP sense, though for the
foreseeable future gmem is the only memory supported for private memory
in CoCo VMs.

Also, to push this along a little, I feel that this series does a few
things. What do you all think of re-focusing this series (or a part of
this series) as "Separating SNP preparation from guest_memfd" or
"Separating arch-specific preparation from guest_memfd"?

>> [...snip...]
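
A rough sketch of where such a call might sit: kvm_arch_gmem_prepare() and the
kvm_page_fault fields referenced here exist today, but the helper name and its
placement in the fault path are hypothetical.

/*
 * Hypothetical helper, called from kvm_tdp_mmu_map() once the final
 * mapping level is known, so the arch hook prepares exactly the range
 * that is about to be mapped.
 */
static int kvm_mmu_prepare_private_mapping(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault)
{
	if (!fault->is_private)
		return 0;

	return kvm_arch_gmem_prepare(vcpu->kvm, fault->gfn, fault->pfn,
				     KVM_HPAGE_GFN_SHIFT(fault->goal_level));
}
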
Ackerley Tng <ackerleytng@google.com> writes:

> Ackerley Tng <ackerleytng@google.com> writes:
>
>> [...snip...]
>
> Dropping this WARN_ON() actually further highlights the importance of
> separating preparedness from folio flags (and the folio).
>
> [...snip...]
>
> What do you all think of taking this even further? Instead of keeping
> kvm_gmem_prepare_folio() within guest_memfd, what if we
>
> 1. Focus on preparing pfn ranges (retaining kvm_arch_gmem_prepare() is
>    good) and not folios
>
> 2. More clearly and directly associate preparing pfns with mapping
>    (rather than with getting a folio to be mapped) into stage 2 page
>    tables
>

Thought about this a little more and maybe this is not quite accurate
either. On a conversion, for SNP, does the memory actually need to be
unmapped from the NPTs, or would it be possible to just flip the C bit?

If conversion only involves flipping the C bit and updating RMP tables,
then perhaps preparation and invalidation shouldn't be associated with
mapping, but directly with conversions, or setting page private/shared
state.

> What I have in mind for (2) is to update kvm_tdp_mmu_map() to do an
> arch-specific call, when fault->is_private, to call
> kvm_arch_gmem_prepare() just before mapping the pfns and when the
> mapping level is known.
>
> The cleanup counterpart would then be to call kvm_arch_gmem_invalidate()
> somewhere in tdp_mmu_zap_leafs().
>
> kvm_arch_gmem_prepare() and kvm_arch_gmem_invalidate() would then drop
> out of guest_memfd and be moved back into the core of KVM.
>
> Technically these two functions don't even need to have gmem in the name
> since any memory can be prepared in the SNP sense, though for the
> foreseeable future gmem is the only memory supported for private memory
> in CoCo VMs.
>
> Also, to push this along a little, I feel that this series does a few
> things. What do you all think of re-focusing this series (or a part of
> this series) as "Separating SNP preparation from guest_memfd" or
> "Separating arch-specific preparation from guest_memfd"?
>
>>> [...snip...]
On Thu, Jun 12, 2025 at 5:55 PM Michael Roth <michael.roth@amd.com> wrote:
>
> guest_memfd currently uses the folio uptodate flag to track:
>
> 1) whether or not a page had been cleared before initial usage
> 2) whether or not the architecture hooks have been issued to put the
>    page in a private state as defined by the architecture
>
> [...snip...]
>
> In the case of SEV-SNP, the preparation state is already
> checked again via the preparation hooks to avoid double-preparation, so
> nothing extra needs to be done to update the handling of things there.
>

I believe this patch doesn't need to depend on stage1/stage2 and can
be sent directly for review on kvm tip, is that right?

This update paired with zeroing modifications[1] will make uptodate
flag redundant for guest_memfd memory.

[1] https://lore.kernel.org/lkml/CAGtprH-+gPN8J_RaEit=M_ErHWTmFHeCipC6viT6PHhG3ELg6A@mail.gmail.com/
On Tue, Jul 15, 2025 at 05:47:31AM -0700, Vishal Annapurve wrote:
> On Thu, Jun 12, 2025 at 5:55 PM Michael Roth <michael.roth@amd.com> wrote:
> >
> > [...snip...]
>
> I believe this patch doesn't need to depend on stage1/stage2 and can
> be sent directly for review on kvm tip, is that right?

Yes, this was actually tested initially against kvm/next and should not
cause issues. I wanted to post the change in the context of in-place
conversion/hugetlb work to help motivate why we're considering the
change, but ideally we'd get this one applied soon-ish, since the
question of "how to track preparation state" seems to be throwing a
wrench into all the planning activities, and at the end of the day only
SNP is making use of it, so it seems to be becoming more trouble than
it's worth at a fairly fast pace.

-Mike

> This update paired with zeroing modifications[1] will make uptodate
> flag redundant for guest_memfd memory.
>
> [1] https://lore.kernel.org/lkml/CAGtprH-+gPN8J_RaEit=M_ErHWTmFHeCipC6viT6PHhG3ELg6A@mail.gmail.com/