When guest_memfd becomes a library, a callback will need to be made to
the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
state. This is currently being done as part of the .free_folio() address
space op, but this callback shouldn't assume that the mapping still
exists. The guest_memfd library will need the mapping to still exist to
look up its operations table.

The .release_folio() and .invalidate_folio() address space ops can serve
the same purpose here. The key difference between release_folio() and
free_folio() is whether the mapping is still valid at the time of the
callback. This approach was discussed in the thread linked below, but it
was not taken because free_folio() was easier to implement.

Link: https://lore.kernel.org/kvm/20231016115028.996656-1-michael.roth@amd.com/
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
virt/kvm/guest_memfd.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 47a9f68f7b247f4cba0c958b4c7cd9458e7c46b4..13f83ad8a4c26ba82aca4f2684f22044abb4bc19 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -358,22 +358,35 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 }
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_free_folio(struct folio *folio)
+static bool kvm_gmem_release_folio(struct folio *folio, gfp_t gfp)
 {
 	struct page *page = folio_page(folio, 0);
 	kvm_pfn_t pfn = page_to_pfn(page);
 	int order = folio_order(folio);
 
 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+
+	return true;
+}
+
+static void kvm_gmem_invalidate_folio(struct folio *folio, size_t offset,
+				      size_t len)
+{
+	WARN_ON_ONCE(offset != 0);
+	WARN_ON_ONCE(len != folio_size(folio));
+
+	if (offset == 0 && len == folio_size(folio))
+		filemap_release_folio(folio, 0);
 }
 #endif
 
 static const struct address_space_operations kvm_gmem_aops = {
 	.dirty_folio = noop_dirty_folio,
-	.migrate_folio = kvm_gmem_migrate_folio,
+	.migrate_folio = kvm_gmem_migrate_folio,
 	.error_remove_folio = kvm_gmem_error_folio,
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-	.free_folio = kvm_gmem_free_folio,
+	.release_folio = kvm_gmem_release_folio,
+	.invalidate_folio = kvm_gmem_invalidate_folio,
 #endif
 };
--
2.34.1
On 13.11.24 23:34, Elliot Berman wrote:
> When guest_memfd becomes a library, a callback will need to be made to
> the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
> state. This is currently being done as part of the .free_folio() address
> space op, but this callback shouldn't assume that the mapping still
> exists. The guest_memfd library will need the mapping to still exist to
> look up its operations table.

I assume you mean that the mapping is no longer set for the folio (it
sure still exists, because we are getting a callback from it :) )?

Staring at filemap_remove_folio(), this is exactly what happens:

We remember folio->mapping, call __filemap_remove_folio(), and then call
filemap_free_folio() where we zap folio->mapping via page_cache_delete().

Maybe it's easier+cleaner to also forward the mapping to the
free_folio() callback, just like we do with filemap_free_folio()? Would
that help?

CCing Willy if that would be a reasonable extension of the free_folio()
callback.

-- 
Cheers,

David / dhildenb
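(For reference, a simplified sketch of the removal path described above, paraphrased for illustration rather than the exact mm/filemap.c source and with locking omitted: by the time the free_folio() callback runs, folio->mapping has already been cleared, and only the mapping pointer remembered inside the filemap code still identifies the owning address_space.)

void filemap_remove_folio(struct folio *folio)
{
	/* Remember the owning address_space before removal tears it down. */
	struct address_space *mapping = folio->mapping;

	/* Removal from the page cache; page_cache_delete() clears folio->mapping. */
	__filemap_remove_folio(folio, NULL);

	/* Only the local 'mapping' still identifies the owner at this point. */
	filemap_free_folio(mapping, folio);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	/*
	 * Today the callback receives only the folio, whose ->mapping is
	 * already NULL -- which is exactly what the patch works around.
	 */
	if (mapping->a_ops->free_folio)
		mapping->a_ops->free_folio(folio);

	folio_put(folio);
}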
On 15.11.24 11:58, David Hildenbrand wrote:
> CCing Willy if that would be a reasonable extension of the free_folio()
> callback.

Now really CCing him. :)

-- 
Cheers,

David / dhildenb
On Fri, Nov 15, 2024 at 11:58:59AM +0100, David Hildenbrand wrote:
> On 15.11.24 11:58, David Hildenbrand wrote:
> > On 13.11.24 23:34, Elliot Berman wrote:
> > > When guest_memfd becomes a library, a callback will need to be made to
> > > the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
> > > state. This is currently being done as part of the .free_folio() address
> > > space op, but this callback shouldn't assume that the mapping still
> > > exists. The guest_memfd library will need the mapping to still exist to
> > > look up its operations table.
> >
> > I assume you mean that the mapping is no longer set for the folio (it
> > sure still exists, because we are getting a callback from it :) )?
> >
> > Staring at filemap_remove_folio(), this is exactly what happens:
> >
> > We remember folio->mapping, call __filemap_remove_folio(), and then call
> > filemap_free_folio() where we zap folio->mapping via page_cache_delete().
> >
> > Maybe it's easier+cleaner to also forward the mapping to the
> > free_folio() callback, just like we do with filemap_free_folio()? Would
> > that help?
> >
> > CCing Willy if that would be a reasonable extension of the free_folio()
> > callback.

I like this approach too. It would avoid the checks we have to do in the
invalidate_folio() callback and is cleaner.

- Elliot
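(A purely hypothetical sketch of why the forwarded mapping would help a future guest_memfd library: with the extended callback it could still reach a per-owner operations table even though folio->mapping has already been cleared. The gmem_owner_ops structure and the use of inode->i_private below are illustrative assumptions, not existing code.)

/*
 * Hypothetical guest_memfd-library callback (illustrative only): the ops
 * table and its location in inode->i_private are assumed for this sketch.
 */
struct gmem_owner_ops {
	/* e.g. KVM SEV: transition pages back to hypervisor-owned/shared */
	void (*invalidate)(unsigned long start_pfn, unsigned long nr_pages);
};

static void gmem_lib_free_folio(struct address_space *mapping,
				struct folio *folio)
{
	/* folio->mapping is already NULL here; use the forwarded mapping. */
	struct gmem_owner_ops *ops = mapping->host->i_private;
	unsigned long pfn = folio_pfn(folio);

	if (ops && ops->invalidate)
		ops->invalidate(pfn, folio_nr_pages(folio));
}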
On 15.11.24 21:13, Elliot Berman wrote:
> On Fri, Nov 15, 2024 at 11:58:59AM +0100, David Hildenbrand wrote:
>> On 15.11.24 11:58, David Hildenbrand wrote:
>>> Maybe it's easier+cleaner to also forward the mapping to the
>>> free_folio() callback, just like we do with filemap_free_folio()? Would
>>> that help?
>>>
>>> CCing Willy if that would be a reasonable extension of the free_folio()
>>> callback.
>
> I like this approach too. It would avoid the checks we have to do in the
> invalidate_folio() callback and is cleaner.

It really should be fairly simple:

 Documentation/filesystems/locking.rst | 2 +-
 fs/nfs/dir.c                          | 9 +++++----
 fs/orangefs/inode.c                   | 3 ++-
 include/linux/fs.h                    | 2 +-
 mm/filemap.c                          | 2 +-
 mm/secretmem.c                        | 3 ++-
 virt/kvm/guest_memfd.c                | 3 ++-
 7 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index f5e3676db954b..f1a20ad5edbee 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -258,7 +258,7 @@ prototypes::
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidate_folio) (struct folio *, size_t start, size_t len);
 	bool (*release_folio)(struct folio *, gfp_t);
-	void (*free_folio)(struct folio *);
+	void (*free_folio)(struct address_space *, struct folio *);
 	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	int (*migrate_folio)(struct address_space *, struct folio *dst,
 			struct folio *src, enum migrate_mode);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 492cffd9d3d84..f7da6d7496b06 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -218,7 +218,8 @@ static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie,
 /*
  * we are freeing strings created by nfs_add_to_readdir_array()
  */
-static void nfs_readdir_clear_array(struct folio *folio)
+static void nfs_readdir_clear_array(struct address_space *mapping,
+				    struct folio *folio)
 {
 	struct nfs_cache_array *array;
 	unsigned int i;
@@ -233,7 +234,7 @@ static void nfs_readdir_clear_array(struct folio *folio)
 static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie,
 					   u64 change_attr)
 {
-	nfs_readdir_clear_array(folio);
+	nfs_readdir_clear_array(folio->mapping, folio);
 	nfs_readdir_folio_init_array(folio, last_cookie, change_attr);
 }
@@ -249,7 +250,7 @@ nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags)
 static void nfs_readdir_folio_array_free(struct folio *folio)
 {
 	if (folio) {
-		nfs_readdir_clear_array(folio);
+		nfs_readdir_clear_array(folio->mapping, folio);
 		folio_put(folio);
 	}
 }
@@ -391,7 +392,7 @@ static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie,
 	if (folio_test_uptodate(folio)) {
 		if (nfs_readdir_folio_validate(folio, cookie, change_attr))
 			return;
-		nfs_readdir_clear_array(folio);
+		nfs_readdir_clear_array(folio->mapping, folio);
 	}
 	nfs_readdir_folio_init_array(folio, cookie, change_attr);
 	folio_mark_uptodate(folio);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aae6d2b8767df..d936694b8e91f 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -470,7 +470,8 @@ static bool orangefs_release_folio(struct folio *folio, gfp_t foo)
 	return !folio_test_private(folio);
 }
 
-static void orangefs_free_folio(struct folio *folio)
+static void orangefs_free_folio(struct address_space *mapping,
+				struct folio *folio)
 {
 	kfree(folio_detach_private(folio));
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3559446279c15..4dd4013541c1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -417,7 +417,7 @@ struct address_space_operations {
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
 	bool (*release_folio)(struct folio *, gfp_t);
-	void (*free_folio)(struct folio *folio);
+	void (*free_folio)(struct address_space *, struct folio *folio);
 	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/*
 	 * migrate the contents of a folio to the specified target. If
diff --git a/mm/filemap.c b/mm/filemap.c
index e582a1545d2ae..86f975ba80746 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -239,7 +239,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
 	free_folio = mapping->a_ops->free_folio;
 	if (free_folio)
-		free_folio(folio);
+		free_folio(mapping, folio);
 
 	if (folio_test_large(folio))
 		refs = folio_nr_pages(folio);
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0f..1d2ed3391734d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -152,7 +152,8 @@ static int secretmem_migrate_folio(struct address_space *mapping,
 	return -EBUSY;
 }
 
-static void secretmem_free_folio(struct folio *folio)
+static void secretmem_free_folio(struct address_space *mapping,
+				 struct folio *folio)
 {
 	set_direct_map_default_noflush(&folio->page);
 	folio_zero_segment(folio, 0, folio_size(folio));
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8f079a61a56db..573946c4fff51 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -353,7 +353,8 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 }
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_free_folio(struct folio *folio)
+static void kvm_gmem_free_folio(struct address_space *mapping,
+				struct folio *folio)
 {
 	struct page *page = folio_page(folio, 0);
 	kvm_pfn_t pfn = page_to_pfn(page);
-- 
2.47.0

-- 
Cheers,

David / dhildenb