When guest_memfd becomes a library, a callback will need to be made to
the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
state. This is currently done as part of the .free_folio() address space
op, but that callback shouldn't assume that the mapping still exists:
the guest_memfd library will need the mapping to still exist so it can
look up its operations table.
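For illustration only, a hypothetical library-side dispatch (placeholder
names, not a real API) would want to do something like:

static void gmem_notify_owner(struct folio *folio)
{
	/* Hypothetical lookup of the owner's ops via the mapping. */
	struct gmem_ops *ops = gmem_mapping_ops(folio->mapping);

	/*
	 * folio->mapping has already been cleared by the time
	 * ->free_folio() runs, so this lookup can't be done there.
	 */
	if (ops && ops->invalidate)
		ops->invalidate(folio);
}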
The .release_folio() and .invalidate_folio() address space ops can serve
the same purpose here. The key difference between release_folio() and
free_folio() is whether the mapping is still valid at the time of the
callback. This approach was discussed at the Link below, but was not
taken because free_folio() was easier to implement.
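For context, the ordering on the truncation path is roughly the
following (simplified sketch of mm/truncate.c and mm/filemap.c; IIRC
invalidate_folio() is only reached when the folio has private data or
the mapping is marked release-always):

truncate_inode_folio(mapping, folio)
  truncate_cleanup_folio(folio)
    folio_invalidate(folio, 0, folio_size(folio))
      -> aops->invalidate_folio()     /* folio->mapping still set */
  filemap_remove_folio(folio)
    __filemap_remove_folio(folio)     /* clears folio->mapping */
    filemap_free_folio(mapping, folio)
      -> aops->free_folio()           /* folio->mapping is NULL */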
Link: https://lore.kernel.org/kvm/20231016115028.996656-1-michael.roth@amd.com/
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
virt/kvm/guest_memfd.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 47a9f68f7b247f4cba0c958b4c7cd9458e7c46b4..13f83ad8a4c26ba82aca4f2684f22044abb4bc19 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -358,22 +358,35 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_free_folio(struct folio *folio)
+static bool kvm_gmem_release_folio(struct folio *folio, gfp_t gfp)
{
struct page *page = folio_page(folio, 0);
kvm_pfn_t pfn = page_to_pfn(page);
int order = folio_order(folio);
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+
+ return true;
+}
+
+static void kvm_gmem_invalidate_folio(struct folio *folio, size_t offset,
+ size_t len)
+{
+ WARN_ON_ONCE(offset != 0);
+ WARN_ON_ONCE(len != folio_size(folio));
+
+ if (offset == 0 && len == folio_size(folio))
+ filemap_release_folio(folio, 0);
}
#endif
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
- .migrate_folio = kvm_gmem_migrate_folio,
+ .migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
- .free_folio = kvm_gmem_free_folio,
+ .release_folio = kvm_gmem_release_folio,
+ .invalidate_folio = kvm_gmem_invalidate_folio,
#endif
};
--
2.34.1
On 13.11.24 23:34, Elliot Berman wrote:
> When guest_memfd becomes a library, a callback will need to be made to
> the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
> state. This is currently being done as part of .free_folio() address
> space op, but this callback shouldn't assume that the mapping still
> exists. guest_memfd library will need the mapping to still exist to look
> up its operations table.
I assume you mean, that the mapping is no longer set for the folio (it
sure still exists, because we are getting a callback from it :) )?
Staring at filemap_remove_folio(), this is exactly what happens:
We remember folio->mapping, call __filemap_remove_folio(), and then call
filemap_free_folio() where we zap folio->mapping via page_cache_delete().
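i.e., roughly (simplified from mm/filemap.c, locking elided):

void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	/* ... i_pages lock etc. ... */
	__filemap_remove_folio(folio, NULL);	/* page_cache_delete() clears folio->mapping */
	/* ... */
	filemap_free_folio(mapping, folio);	/* ->free_folio() sees folio->mapping == NULL */
}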
Maybe it's easier+cleaner to also forward the mapping to the
free_folio() callback, just like we do with filemap_free_folio()? Would
that help?
CCing Willy if that would be reasonable extension of the free_folio
callback.
>
> .release_folio() and .invalidate_folio() address space ops can serve the
> same purpose here. The key difference between release_folio() and
> free_folio() is whether the mapping is still valid at time of the
> callback. This approach was discussed in the link in the footer, but not
> taken because free_folio() was easier to implement.
>
> Link: https://lore.kernel.org/kvm/20231016115028.996656-1-michael.roth@amd.com/
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
> virt/kvm/guest_memfd.c | 19 ++++++++++++++++---
> 1 file changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 47a9f68f7b247f4cba0c958b4c7cd9458e7c46b4..13f83ad8a4c26ba82aca4f2684f22044abb4bc19 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -358,22 +358,35 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
> }
>
> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> -static void kvm_gmem_free_folio(struct folio *folio)
> +static bool kvm_gmem_release_folio(struct folio *folio, gfp_t gfp)
> {
> struct page *page = folio_page(folio, 0);
> kvm_pfn_t pfn = page_to_pfn(page);
> int order = folio_order(folio);
>
> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> +
> + return true;
> +}
> +
> +static void kvm_gmem_invalidate_folio(struct folio *folio, size_t offset,
> + size_t len)
> +{
> + WARN_ON_ONCE(offset != 0);
> + WARN_ON_ONCE(len != folio_size(folio));
> +
> + if (offset == 0 && len == folio_size(folio))
> + filemap_release_folio(folio, 0);
> }
> #endif
>
> static const struct address_space_operations kvm_gmem_aops = {
> .dirty_folio = noop_dirty_folio,
> - .migrate_folio = kvm_gmem_migrate_folio,
> + .migrate_folio = kvm_gmem_migrate_folio,
> .error_remove_folio = kvm_gmem_error_folio,
> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> - .free_folio = kvm_gmem_free_folio,
> + .release_folio = kvm_gmem_release_folio,
> + .invalidate_folio = kvm_gmem_invalidate_folio,
> #endif
> };
>
>
--
Cheers,
David / dhildenb
On 15.11.24 11:58, David Hildenbrand wrote:
> On 13.11.24 23:34, Elliot Berman wrote:
>> When guest_memfd becomes a library, a callback will need to be made to
>> the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
>> state. This is currently being done as part of .free_folio() address
>> space op, but this callback shouldn't assume that the mapping still
>> exists. guest_memfd library will need the mapping to still exist to look
>> up its operations table.
>
> I assume you mean, that the mapping is no longer set for the folio (it
> sure still exists, because we are getting a callback from it :) )?
>
> Staring at filemap_remove_folio(), this is exactly what happens:
>
> We remember folio->mapping, call __filemap_remove_folio(), and then call
> filemap_free_folio() where we zap folio->mapping via page_cache_delete().
>
> Maybe it's easier+cleaner to also forward the mapping to the
> free_folio() callback, just like we do with filemap_free_folio()? Would
> that help?
>
> CCing Willy if that would be reasonable extension of the free_folio
> callback.
>
Now really CCing him. :)
>
>>
>> .release_folio() and .invalidate_folio() address space ops can serve the
>> same purpose here. The key difference between release_folio() and
>> free_folio() is whether the mapping is still valid at time of the
>> callback. This approach was discussed in the link in the footer, but not
>> taken because free_folio() was easier to implement.
>>
>> Link: https://lore.kernel.org/kvm/20231016115028.996656-1-michael.roth@amd.com/
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>> virt/kvm/guest_memfd.c | 19 ++++++++++++++++---
>> 1 file changed, 16 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 47a9f68f7b247f4cba0c958b4c7cd9458e7c46b4..13f83ad8a4c26ba82aca4f2684f22044abb4bc19 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -358,22 +358,35 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
>> }
>>
>> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>> -static void kvm_gmem_free_folio(struct folio *folio)
>> +static bool kvm_gmem_release_folio(struct folio *folio, gfp_t gfp)
>> {
>> struct page *page = folio_page(folio, 0);
>> kvm_pfn_t pfn = page_to_pfn(page);
>> int order = folio_order(folio);
>>
>> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>> +
>> + return true;
>> +}
>> +
>> +static void kvm_gmem_invalidate_folio(struct folio *folio, size_t offset,
>> + size_t len)
>> +{
>> + WARN_ON_ONCE(offset != 0);
>> + WARN_ON_ONCE(len != folio_size(folio));
>> +
>> + if (offset == 0 && len == folio_size(folio))
>> + filemap_release_folio(folio, 0);
>> }
>> #endif
>>
>> static const struct address_space_operations kvm_gmem_aops = {
>> .dirty_folio = noop_dirty_folio,
>> - .migrate_folio = kvm_gmem_migrate_folio,
>> + .migrate_folio = kvm_gmem_migrate_folio,
>> .error_remove_folio = kvm_gmem_error_folio,
>> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>> - .free_folio = kvm_gmem_free_folio,
>> + .release_folio = kvm_gmem_release_folio,
>> + .invalidate_folio = kvm_gmem_invalidate_folio,
>> #endif
>> };
>>
>>
>
>
--
Cheers,
David / dhildenb
On Fri, Nov 15, 2024 at 11:58:59AM +0100, David Hildenbrand wrote:
> On 15.11.24 11:58, David Hildenbrand wrote:
> > On 13.11.24 23:34, Elliot Berman wrote:
> > > When guest_memfd becomes a library, a callback will need to be made to
> > > the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
> > > state. This is currently being done as part of .free_folio() address
> > > space op, but this callback shouldn't assume that the mapping still
> > > exists. guest_memfd library will need the mapping to still exist to look
> > > up its operations table.
> >
> > I assume you mean, that the mapping is no longer set for the folio (it
> > sure still exists, because we are getting a callback from it :) )?
> >
> > Staring at filemap_remove_folio(), this is exactly what happens:
> >
> > We remember folio->mapping, call __filemap_remove_folio(), and then call
> > filemap_free_folio() where we zap folio->mapping via page_cache_delete().
> >
> > Maybe it's easier+cleaner to also forward the mapping to the
> > free_folio() callback, just like we do with filemap_free_folio()? Would
> > that help?
> >
> > CCing Willy if that would be reasonable extension of the free_folio
> > callback.
> >
I like this approach too. It would avoid the checks we have to do in the
invalidate_folio() callback and is cleaner.
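With the mapping passed in, the library-side free_folio() wouldn't have
to rely on folio->mapping at all; something like (hypothetical names,
just to sketch the idea):

static void gmem_free_folio(struct address_space *mapping,
			    struct folio *folio)
{
	/* Hypothetical: find the owner's ops via the forwarded mapping. */
	struct gmem_ops *ops = gmem_mapping_ops(mapping);

	if (ops && ops->free_folio)
		ops->free_folio(folio);
}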
- Elliot
> > >
> > > .release_folio() and .invalidate_folio() address space ops can serve the
> > > same purpose here. The key difference between release_folio() and
> > > free_folio() is whether the mapping is still valid at time of the
> > > callback. This approach was discussed in the link in the footer, but not
> > > taken because free_folio() was easier to implement.
> > >
> > > Link: https://lore.kernel.org/kvm/20231016115028.996656-1-michael.roth@amd.com/
> > > Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> > > ---
> > > virt/kvm/guest_memfd.c | 19 ++++++++++++++++---
> > > 1 file changed, 16 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > > index 47a9f68f7b247f4cba0c958b4c7cd9458e7c46b4..13f83ad8a4c26ba82aca4f2684f22044abb4bc19 100644
> > > --- a/virt/kvm/guest_memfd.c
> > > +++ b/virt/kvm/guest_memfd.c
> > > @@ -358,22 +358,35 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
> > > }
> > > #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> > > -static void kvm_gmem_free_folio(struct folio *folio)
> > > +static bool kvm_gmem_release_folio(struct folio *folio, gfp_t gfp)
> > > {
> > > struct page *page = folio_page(folio, 0);
> > > kvm_pfn_t pfn = page_to_pfn(page);
> > > int order = folio_order(folio);
> > > kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> > > +
> > > + return true;
> > > +}
> > > +
> > > +static void kvm_gmem_invalidate_folio(struct folio *folio, size_t offset,
> > > + size_t len)
> > > +{
> > > + WARN_ON_ONCE(offset != 0);
> > > + WARN_ON_ONCE(len != folio_size(folio));
> > > +
> > > + if (offset == 0 && len == folio_size(folio))
> > > + filemap_release_folio(folio, 0);
> > > }
> > > #endif
> > > static const struct address_space_operations kvm_gmem_aops = {
> > > .dirty_folio = noop_dirty_folio,
> > > - .migrate_folio = kvm_gmem_migrate_folio,
> > > + .migrate_folio = kvm_gmem_migrate_folio,
> > > .error_remove_folio = kvm_gmem_error_folio,
> > > #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> > > - .free_folio = kvm_gmem_free_folio,
> > > + .release_folio = kvm_gmem_release_folio,
> > > + .invalidate_folio = kvm_gmem_invalidate_folio,
> > > #endif
> > > };
> > >
> >
> >
>
>
> --
> Cheers,
>
> David / dhildenb
>
On 15.11.24 21:13, Elliot Berman wrote:
> On Fri, Nov 15, 2024 at 11:58:59AM +0100, David Hildenbrand wrote:
>> On 15.11.24 11:58, David Hildenbrand wrote:
>>> On 13.11.24 23:34, Elliot Berman wrote:
>>>> When guest_memfd becomes a library, a callback will need to be made to
>>>> the owner (KVM SEV) to transition pages back to hypervisor-owned/shared
>>>> state. This is currently being done as part of .free_folio() address
>>>> space op, but this callback shouldn't assume that the mapping still
>>>> exists. guest_memfd library will need the mapping to still exist to look
>>>> up its operations table.
>>>
>>> I assume you mean, that the mapping is no longer set for the folio (it
>>> sure still exists, because we are getting a callback from it :) )?
>>>
>>> Staring at filemap_remove_folio(), this is exactly what happens:
>>>
>>> We remember folio->mapping, call __filemap_remove_folio(), and then call
>>> filemap_free_folio() where we zap folio->mapping via page_cache_delete().
>>>
>>> Maybe it's easier+cleaner to also forward the mapping to the
>>> free_folio() callback, just like we do with filemap_free_folio()? Would
>>> that help?
>>>
>>> CCing Willy if that would be reasonable extension of the free_folio
>>> callback.
>>>
>
> I like this approach too. It would avoid the checks we have to do in the
> invalidate_folio() callback and is cleaner.
It really should be fairly simple:
Documentation/filesystems/locking.rst | 2 +-
fs/nfs/dir.c | 9 +++++----
fs/orangefs/inode.c | 3 ++-
include/linux/fs.h | 2 +-
mm/filemap.c | 2 +-
mm/secretmem.c | 3 ++-
virt/kvm/guest_memfd.c | 3 ++-
7 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index f5e3676db954b..f1a20ad5edbee 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -258,7 +258,7 @@ prototypes::
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidate_folio) (struct folio *, size_t start, size_t len);
bool (*release_folio)(struct folio *, gfp_t);
- void (*free_folio)(struct folio *);
+ void (*free_folio)(struct address_space *, struct folio *);
int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
int (*migrate_folio)(struct address_space *, struct folio *dst,
struct folio *src, enum migrate_mode);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 492cffd9d3d84..f7da6d7496b06 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -218,7 +218,8 @@ static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie,
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
-static void nfs_readdir_clear_array(struct folio *folio)
+static void nfs_readdir_clear_array(struct address_space *mapping,
+ struct folio *folio)
{
struct nfs_cache_array *array;
unsigned int i;
@@ -233,7 +234,7 @@ static void nfs_readdir_clear_array(struct folio *folio)
static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie,
u64 change_attr)
{
- nfs_readdir_clear_array(folio);
+ nfs_readdir_clear_array(folio->mapping, folio);
nfs_readdir_folio_init_array(folio, last_cookie, change_attr);
}
@@ -249,7 +250,7 @@ nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags)
static void nfs_readdir_folio_array_free(struct folio *folio)
{
if (folio) {
- nfs_readdir_clear_array(folio);
+ nfs_readdir_clear_array(folio->mapping, folio);
folio_put(folio);
}
}
@@ -391,7 +392,7 @@ static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie,
if (folio_test_uptodate(folio)) {
if (nfs_readdir_folio_validate(folio, cookie, change_attr))
return;
- nfs_readdir_clear_array(folio);
+ nfs_readdir_clear_array(folio->mapping, folio);
}
nfs_readdir_folio_init_array(folio, cookie, change_attr);
folio_mark_uptodate(folio);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aae6d2b8767df..d936694b8e91f 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -470,7 +470,8 @@ static bool orangefs_release_folio(struct folio *folio, gfp_t foo)
return !folio_test_private(folio);
}
-static void orangefs_free_folio(struct folio *folio)
+static void orangefs_free_folio(struct address_space *mapping,
+ struct folio *folio)
{
kfree(folio_detach_private(folio));
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3559446279c15..4dd4013541c1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -417,7 +417,7 @@ struct address_space_operations {
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
bool (*release_folio)(struct folio *, gfp_t);
- void (*free_folio)(struct folio *folio);
+ void (*free_folio)(struct address_space *, struct folio *folio);
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
/*
* migrate the contents of a folio to the specified target. If
diff --git a/mm/filemap.c b/mm/filemap.c
index e582a1545d2ae..86f975ba80746 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -239,7 +239,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
free_folio = mapping->a_ops->free_folio;
if (free_folio)
- free_folio(folio);
+ free_folio(mapping, folio);
if (folio_test_large(folio))
refs = folio_nr_pages(folio);
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0f..1d2ed3391734d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -152,7 +152,8 @@ static int secretmem_migrate_folio(struct address_space *mapping,
return -EBUSY;
}
-static void secretmem_free_folio(struct folio *folio)
+static void secretmem_free_folio(struct address_space *mapping,
+ struct folio *folio)
{
set_direct_map_default_noflush(&folio->page);
folio_zero_segment(folio, 0, folio_size(folio));
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8f079a61a56db..573946c4fff51 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -353,7 +353,8 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_free_folio(struct folio *folio)
+static void kvm_gmem_free_folio(struct address_space *mapping,
+ struct folio *folio)
{
struct page *page = folio_page(folio, 0);
kvm_pfn_t pfn = page_to_pfn(page);
--
2.47.0
--
Cheers,
David / dhildenb