[RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios

Posted by Ackerley Tng 3 months, 3 weeks ago
filemap_add_folio(), called from filemap_grab_folio(), adds folios to
an LRU list. This is unnecessary for guest_memfd, which does not
participate in swapping.

In addition, the LRU list takes a reference count on the folio. With
shared-to-private memory conversions for KVM guests dependent on folio
refcounts, this extra reference can cause conversions to fail due to
unexpected refcounts.

Rework kvm_gmem_get_folio() to manually allocate and insert the folio
into the page cache without placing it on the LRU. This is done by
calling __filemap_add_folio() directly.

The folio is then marked unevictable to avoid participation in
swapping. The ->free_folio() handler is modified to unset the
unevictable flag when the folio is released from guest_memfd.

This change ensures that LRU lists no longer take refcounts on
guest_memfd folios, significantly reducing the chance of elevated
refcounts during conversion.

To facilitate this, __filemap_add_folio(), __mem_cgroup_charge() and
__mem_cgroup_uncharge() are exported for KVM's use.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 mm/filemap.c           |  1 +
 mm/memcontrol.c        |  2 ++
 virt/kvm/guest_memfd.c | 60 +++++++++++++++++++++++++++++++++---------
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 03f223be575ca..60c7c95bbd7e6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -954,6 +954,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 	return xas_error(&xas);
 }
 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
+EXPORT_SYMBOL_FOR_MODULES(__filemap_add_folio, "kvm");
 
 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 				pgoff_t index, gfp_t gfp)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a942..fe8629414d0a9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4721,6 +4721,7 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 
 	return ret;
 }
+EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_charge, "kvm");
 
 /**
  * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
@@ -4893,6 +4894,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
 	uncharge_folio(folio, &ug);
 	uncharge_batch(&ug);
 }
+EXPORT_SYMBOL_FOR_MODULES(__mem_cgroup_uncharge, "kvm");
 
 void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
 {
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 2a9e9220a48aa..dab2b3ce78bc8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -148,6 +148,41 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
 #endif
 }
 
+static struct folio *__kvm_gmem_get_folio(struct address_space *mapping,
+					  pgoff_t index,
+					  struct mempolicy *policy)
+{
+	const gfp_t gfp = mapping_gfp_mask(mapping);
+	struct folio *folio;
+	int err;
+
+	folio = filemap_lock_folio(mapping, index);
+	if (!IS_ERR(folio))
+		return folio;
+
+	folio = filemap_alloc_folio(gfp, 0, policy);
+	if (!folio)
+		return ERR_PTR(-ENOMEM);
+
+	err = mem_cgroup_charge(folio, NULL, gfp);
+	if (err)
+		goto err_put;
+
+	__folio_set_locked(folio);
+
+	err = __filemap_add_folio(mapping, folio, index, gfp, NULL);
+	if (err) {
+		__folio_clear_locked(folio);
+		goto err_put;
+	}
+
+	return folio;
+
+err_put:
+	folio_put(folio);
+	return ERR_PTR(err);
+}
+
 /*
  * Returns a locked folio on success.  The caller is responsible for
  * setting the up-to-date flag before the memory is mapped into the guest.
@@ -160,6 +195,7 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
 	/* TODO: Support huge pages. */
+	struct address_space *mapping = inode->i_mapping;
 	struct mempolicy *policy;
 	struct folio *folio;
 
@@ -167,16 +203,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 	 * Fast-path: See if folio is already present in mapping to avoid
 	 * policy_lookup.
 	 */
-	folio = filemap_lock_folio(inode->i_mapping, index);
+	folio = filemap_lock_folio(mapping, index);
 	if (!IS_ERR(folio))
 		return folio;
 
 	policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
-	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
-					 FGP_LOCK | FGP_CREAT,
-					 mapping_gfp_mask(inode->i_mapping), policy);
-	mpol_cond_put(policy);
 
+	do {
+		folio = __kvm_gmem_get_folio(mapping, index, policy);
+	} while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);
+
+	mpol_cond_put(policy);
 	return folio;
 }
 
@@ -588,24 +625,21 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 	return MF_DELAYED;
 }
 
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
 static void kvm_gmem_free_folio(struct folio *folio)
 {
-	struct page *page = folio_page(folio, 0);
-	kvm_pfn_t pfn = page_to_pfn(page);
-	int order = folio_order(folio);
+	folio_clear_unevictable(folio);
 
-	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
-}
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+	kvm_arch_gmem_invalidate(folio_pfn(folio),
+				 folio_pfn(folio) + folio_nr_pages(folio));
 #endif
+}
 
 static const struct address_space_operations kvm_gmem_aops = {
 	.dirty_folio = noop_dirty_folio,
 	.migrate_folio	= kvm_gmem_migrate_folio,
 	.error_remove_folio = kvm_gmem_error_folio,
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
 	.free_folio = kvm_gmem_free_folio,
-#endif
 };
 
 static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-- 
2.51.0.858.gf9c4a03a3a-goog
Re: [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios
Posted by Yan Zhao 3 weeks, 1 day ago
> +static struct folio *__kvm_gmem_get_folio(struct address_space *mapping,
> +					  pgoff_t index,
> +					  struct mempolicy *policy)
> +{
> +	const gfp_t gfp = mapping_gfp_mask(mapping);
> +	struct folio *folio;
> +	int err;
> +
> +	folio = filemap_lock_folio(mapping, index);
> +	if (!IS_ERR(folio))
> +		return folio;
> +
> +	folio = filemap_alloc_folio(gfp, 0, policy);
> +	if (!folio)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = mem_cgroup_charge(folio, NULL, gfp);
> +	if (err)
> +		goto err_put;
> +
> +	__folio_set_locked(folio);
> +
> +	err = __filemap_add_folio(mapping, folio, index, gfp, NULL);
> +	if (err) {
		 mem_cgroup_uncharge(folio);

> +		__folio_clear_locked(folio);
> +		goto err_put;
> +	}
> +
> +	return folio;
> +
> +err_put:
> +	folio_put(folio);
> +	return ERR_PTR(err);
> +}
> +
>  /*
>   * Returns a locked folio on success.  The caller is responsible for
>   * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -160,6 +195,7 @@ static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  {
>  	/* TODO: Support huge pages. */
> +	struct address_space *mapping = inode->i_mapping;
>  	struct mempolicy *policy;
>  	struct folio *folio;
>  
> @@ -167,16 +203,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  	 * Fast-path: See if folio is already present in mapping to avoid
>  	 * policy_lookup.
>  	 */
> -	folio = filemap_lock_folio(inode->i_mapping, index);
> +	folio = filemap_lock_folio(mapping, index);
>  	if (!IS_ERR(folio))
>  		return folio;
>  
>  	policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
> -	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> -					 FGP_LOCK | FGP_CREAT,
> -					 mapping_gfp_mask(inode->i_mapping), policy);
> -	mpol_cond_put(policy);
>  
> +	do {
> +		folio = __kvm_gmem_get_folio(mapping, index, policy);
> +	} while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);
Why not just return ERR_PTR(-EEXIST) up to kvm_gmem_get_pfn() and have a higher
level retry?
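
Something like this, perhaps (untested sketch, just to illustrate the
idea; the actual flow around kvm_gmem_get_pfn() may differ):

	do {
		folio = kvm_gmem_get_folio(inode, index);
	} while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);

	if (IS_ERR(folio))
		return PTR_ERR(folio);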

> +	mpol_cond_put(policy);
>  	return folio;
>  }
Re: [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios
Posted by Vlastimil Babka 3 months, 2 weeks ago
On 10/17/25 22:11, Ackerley Tng wrote:
> filemap_add_folio(), called from filemap_grab_folio(), adds folios to
> an LRU list. This is unnecessary for guest_memfd, which does not
> participate in swapping.

IIRC guest_memfd mappings are unevictable. That should mean they are not
ultimately added to a list (see lruvec_add_folio()).
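
From memory, the relevant part of lruvec_add_folio() (paraphrased from
include/linux/mm_inline.h, please double-check against the actual tree)
is roughly:

static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	update_lru_size(lruvec, lru, folio_zonenum(folio),
			folio_nr_pages(folio));
	/* Unevictable folios are accounted but never put on a list. */
	if (lru != LRU_UNEVICTABLE)
		list_add(&folio->lru, &lruvec->lists[lru]);
}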

> In addition, the LRU list takes a reference count on the folio. With

IIUC the refcount is temporary while being on the percpu
&cpu_fbatches.lru_add, added by __folio_batch_add_and_move(). When flushed
via folio_batch_move_lru(), the refcount is removed and there's only the LRU
folio flag that remains. The fbatch flushing can be triggered if you see an
unexpected refcount increase. So it might be feasible to do without this
patch (maybe it was already tried and there were substantial issues, in
which case should be mentioned).

> shared-to-private memory conversions for KVM guests dependent on folio
> refcounts, this extra reference can cause conversions to fail due to
> unexpected refcounts.
>
> [...snip...]
Re: [RFC PATCH v1 09/37] KVM: guest_memfd: Skip LRU for guest_memfd folios
Posted by Ackerley Tng 2 weeks ago
Vlastimil Babka <vbabka@suse.cz> writes:

> On 10/17/25 22:11, Ackerley Tng wrote:
>> filemap_add_folio(), called from filemap_grab_folio(), adds folios to
>> an LRU list. This is unnecessary for guest_memfd, which does not
>> participate in swapping.
>
> IIRC guest_memfd mappings are unevictable. That should mean they are not
> ultimately added to a list (see lruvec_add_folio()).
>
>> In addition, the LRU list takes a reference count on the folio. With
>
> IIUC the refcount is temporary while being on the percpu
> &cpu_fbatches.lru_add, added by __folio_batch_add_and_move().

Thanks for pointing this out. You're right about this, I misunderstood
this refcounting earlier.

> When flushed
> via folio_batch_move_lru(), the refcount is removed and there's only the LRU
> folio flag that remains. The fbatch flushing can be triggered if you see an
> unexpected refcount increase.

The new plan is to update kvm_gmem_is_safe_for_conversion() to drain
the fbatch if an elevated refcount is found:

static bool kvm_gmem_is_safe_for_conversion(struct inode *inode,
					    pgoff_t start, size_t nr_pages,
					    pgoff_t *err_index)
{
	struct address_space *mapping = inode->i_mapping;
	const int filemap_get_folios_refcount = 1;
	pgoff_t last = start + nr_pages - 1;
	struct folio_batch fbatch;
	bool lru_drained = false;
	bool safe = true;
	int i;

	folio_batch_init(&fbatch);
	while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {

		for (i = 0; i < folio_batch_count(&fbatch);) {
			struct folio *folio = fbatch.folios[i];

			safe = (folio_ref_count(folio) ==
				folio_nr_pages(folio) +
				filemap_get_folios_refcount);

			if (safe) {
				++i;
			} else if (!lru_drained) {
				lru_add_drain_all();
				lru_drained = true;
			} else {
				*err_index = folio->index;
				break;
			}
		}

		folio_batch_release(&fbatch);
	}

	return safe;
}
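
A hypothetical call site (names invented for illustration) during a
shared-to-private conversion would be something like:

	pgoff_t err_index;

	/*
	 * Bail out of the conversion if any folio still has unexpected
	 * references after draining the LRU batches.
	 */
	if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages,
					     &err_index))
		return -EAGAIN;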

I hope this is what you meant!

> So it might be feasible to do without this
> patch (maybe it was already tried and there were substantial issues, in
> which case should be mentioned).
>

The patch "KVM: guest_memfd: Skip LRU for guest_memfd folios" will be
dropped from the next revision, and "KVM: guest_memfd: Don't set
FGP_ACCESSED when getting folios" is no longer a requirement for this
patch series.

>> shared-to-private memory conversions for KVM guests dependent on folio
>> refcounts, this extra reference can cause conversions to fail due to
>> unexpected refcounts.
>>
>>
>> [...snip...]
>>