[RFC PATCH v2 32/51] KVM: guest_memfd: Support guestmem_hugetlb as custom allocator

Posted by Ackerley Tng 7 months, 1 week ago
Add support for guestmem_hugetlb as the first custom allocator in
guest_memfd.

If requested at guest_memfd creation time, the custom allocator is used
for inode initialization, folio allocation, and cleanup.
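
Userspace requests the allocator at creation time roughly as follows
(illustrative sketch only; allocator-specific page-size flag bits, covered
by SUPPORTED_CUSTOM_ALLOCATOR_MASK, are omitted here):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int create_hugetlb_gmem(int vm_fd, __u64 size)
  {
	struct kvm_create_guest_memfd args = {
		.size  = size,
		.flags = GUEST_MEMFD_FLAG_HUGETLB,
	};

	/* Returns a guest_memfd fd whose folios come from guestmem_hugetlb. */
	return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
  }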

Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Change-Id: I1eb9625dc761ecadcc2aa21480cfdfcf9ab7ce67
---
 include/uapi/linux/kvm.h |   1 +
 virt/kvm/Kconfig         |   5 +
 virt/kvm/guest_memfd.c   | 203 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 199 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 433e184f83ea..af486b2e4862 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1571,6 +1571,7 @@ struct kvm_memory_attributes {
 
 #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
 #define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
+#define GUEST_MEMFD_FLAG_HUGETLB	(1UL << 2)
 
 struct kvm_create_guest_memfd {
 	__u64 size;
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 14ffd9c1d480..ff917bb57371 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -133,3 +133,8 @@ config KVM_GMEM_SHARED_MEM
        select KVM_GMEM
        bool
        prompt "Enables in-place shared memory for guest_memfd"
+
+config KVM_GMEM_HUGETLB
+       select KVM_PRIVATE_MEM
+       depends on GUESTMEM_HUGETLB
+       bool "Enables using a custom allocator with guest_memfd, see CONFIG_GUESTMEM_HUGETLB"
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8c9c9e54616b..c65d93c5a443 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -3,11 +3,14 @@
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
 #include <linux/fs.h>
+#include <linux/guestmem.h>
 #include <linux/kvm_host.h>
 #include <linux/maple_tree.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
 
+#include <uapi/linux/guestmem.h>
+
 #include "kvm_mm.h"
 
 static struct vfsmount *kvm_gmem_mnt;
@@ -22,6 +25,10 @@ struct kvm_gmem_inode_private {
 #ifdef CONFIG_KVM_GMEM_SHARED_MEM
 	struct maple_tree shareability;
 #endif
+#ifdef CONFIG_KVM_GMEM_HUGETLB
+	const struct guestmem_allocator_operations *allocator_ops;
+	void *allocator_private;
+#endif
 };
 
 enum shareability {
@@ -40,6 +47,44 @@ static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
 	return inode->i_mapping->i_private_data;
 }
 
+#ifdef CONFIG_KVM_GMEM_HUGETLB
+
+static const struct guestmem_allocator_operations *
+kvm_gmem_allocator_ops(struct inode *inode)
+{
+	return kvm_gmem_private(inode)->allocator_ops;
+}
+
+static void *kvm_gmem_allocator_private(struct inode *inode)
+{
+	return kvm_gmem_private(inode)->allocator_private;
+}
+
+static bool kvm_gmem_has_custom_allocator(struct inode *inode)
+{
+	return kvm_gmem_allocator_ops(inode) != NULL;
+}
+
+#else
+
+static const struct guestmem_allocator_operations *
+kvm_gmem_allocator_ops(struct inode *inode)
+{
+	return NULL;
+}
+
+static void *kvm_gmem_allocator_private(struct inode *inode)
+{
+	return NULL;
+}
+
+static bool kvm_gmem_has_custom_allocator(struct inode *inode)
+{
+	return false;
+}
+
+#endif
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -510,7 +555,6 @@ static int kvm_gmem_filemap_add_folio(struct address_space *mapping,
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
 	struct folio *folio;
-	gfp_t gfp;
 	int ret;
 
 repeat:
@@ -518,17 +562,24 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 	if (!IS_ERR(folio))
 		return folio;
 
-	gfp = mapping_gfp_mask(inode->i_mapping);
+	if (kvm_gmem_has_custom_allocator(inode)) {
+		void *p = kvm_gmem_allocator_private(inode);
 
-	/* TODO: Support huge pages. */
-	folio = filemap_alloc_folio(gfp, 0);
-	if (!folio)
-		return ERR_PTR(-ENOMEM);
+		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
+		if (IS_ERR(folio))
+			return folio;
+	} else {
+		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 
-	ret = mem_cgroup_charge(folio, NULL, gfp);
-	if (ret) {
-		folio_put(folio);
-		return ERR_PTR(ret);
+		folio = filemap_alloc_folio(gfp, 0);
+		if (!folio)
+			return ERR_PTR(-ENOMEM);
+
+		ret = mem_cgroup_charge(folio, NULL, gfp);
+		if (ret) {
+			folio_put(folio);
+			return ERR_PTR(ret);
+		}
 	}
 
 	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index);
@@ -611,6 +662,80 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
 
+/**
+ * kvm_gmem_truncate_indices() - Truncates all folios beginning @index for
+ * @nr_pages.
+ *
+ * @mapping: filemap to truncate pages from.
+ * @index: the index in the filemap to begin truncation.
+ * @nr_pages: number of PAGE_SIZE pages to truncate.
+ *
+ * Return: the number of PAGE_SIZE pages that were actually truncated.
+ */
+static long kvm_gmem_truncate_indices(struct address_space *mapping,
+				      pgoff_t index, size_t nr_pages)
+{
+	struct folio_batch fbatch;
+	long truncated;
+	pgoff_t last;
+
+	last = index + nr_pages - 1;
+
+	truncated = 0;
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &index, last, &fbatch)) {
+		unsigned int i;
+
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *f = fbatch.folios[i];
+
+			truncated += folio_nr_pages(f);
+			folio_lock(f);
+			truncate_inode_folio(f->mapping, f);
+			folio_unlock(f);
+		}
+
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	return truncated;
+}
+
+/**
+ * kvm_gmem_truncate_inode_aligned_pages() - Removes entire folios from filemap
+ * in @inode.
+ *
+ * @inode: inode to remove folios from.
+ * @index: start of range to be truncated. Must be hugepage aligned.
+ * @nr_pages: number of PAGE_SIZE pages to be iterated over.
+ *
+ * Removes folios beginning @index for @nr_pages from filemap in @inode, updates
+ * inode metadata.
+ */
+static void kvm_gmem_truncate_inode_aligned_pages(struct inode *inode,
+						  pgoff_t index,
+						  size_t nr_pages)
+{
+	size_t nr_per_huge_page;
+	long num_freed;
+	pgoff_t idx;
+	void *priv;
+
+	priv = kvm_gmem_allocator_private(inode);
+	nr_per_huge_page = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+
+	num_freed = 0;
+	for (idx = index; idx < index + nr_pages; idx += nr_per_huge_page) {
+		num_freed += kvm_gmem_truncate_indices(
+			inode->i_mapping, idx, nr_per_huge_page);
+	}
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks -= (num_freed << PAGE_SHIFT) / 512;
+	spin_unlock(&inode->i_lock);
+}
+
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -940,6 +1065,13 @@ static void kvm_gmem_free_inode(struct inode *inode)
 {
 	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
 
+	/* private may be NULL if inode creation process had an error. */
+	if (private && kvm_gmem_has_custom_allocator(inode)) {
+		void *p = kvm_gmem_allocator_private(inode);
+
+		kvm_gmem_allocator_ops(inode)->inode_teardown(p, inode->i_size);
+	}
+
 	kfree(private);
 
 	free_inode_nonrcu(inode);
@@ -959,8 +1091,24 @@ static void kvm_gmem_destroy_inode(struct inode *inode)
 #endif
 }
 
+static void kvm_gmem_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final_prepare(inode->i_mapping);
+
+	if (kvm_gmem_has_custom_allocator(inode)) {
+		size_t nr_pages = inode->i_size >> PAGE_SHIFT;
+
+		kvm_gmem_truncate_inode_aligned_pages(inode, 0, nr_pages);
+	} else {
+		truncate_inode_pages(inode->i_mapping, 0);
+	}
+
+	clear_inode(inode);
+}
+
 static const struct super_operations kvm_gmem_super_operations = {
 	.statfs		= simple_statfs,
+	.evict_inode	= kvm_gmem_evict_inode,
 	.destroy_inode	= kvm_gmem_destroy_inode,
 	.free_inode	= kvm_gmem_free_inode,
 };
@@ -1062,6 +1210,12 @@ static void kvm_gmem_free_folio(struct folio *folio)
 {
 	folio_clear_unevictable(folio);
 
+	/*
+	 * No-op for 4K page since the PG_uptodate is cleared as part of
+	 * freeing, but may be required for other allocators to reset page.
+	 */
+	folio_clear_uptodate(folio);
+
 	kvm_gmem_invalidate(folio);
 }
 
@@ -1115,6 +1269,25 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	if (err)
 		goto out;
 
+#ifdef CONFIG_KVM_GMEM_HUGETLB
+	if (flags & GUEST_MEMFD_FLAG_HUGETLB) {
+		void *allocator_priv;
+		size_t nr_pages;
+
+		allocator_priv = guestmem_hugetlb_ops.inode_setup(size, flags);
+		if (IS_ERR(allocator_priv)) {
+			err = PTR_ERR(allocator_priv);
+			goto out;
+		}
+
+		private->allocator_ops = &guestmem_hugetlb_ops;
+		private->allocator_private = allocator_priv;
+
+		nr_pages = guestmem_hugetlb_ops.nr_pages_in_folio(allocator_priv);
+		inode->i_blkbits = ilog2(nr_pages << PAGE_SHIFT);
+	}
+#endif
+
 	inode->i_private = (void *)(unsigned long)flags;
 	inode->i_op = &kvm_gmem_iops;
 	inode->i_mapping->a_ops = &kvm_gmem_aops;
@@ -1210,6 +1383,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	return err;
 }
 
+/* Mask of bits belonging to allocators and are opaque to guest_memfd. */
+#define SUPPORTED_CUSTOM_ALLOCATOR_MASK \
+	(GUESTMEM_HUGETLB_FLAG_MASK << GUESTMEM_HUGETLB_FLAG_SHIFT)
+
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
 	loff_t size = args->size;
@@ -1222,6 +1399,12 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
 		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
 
+	if (IS_ENABLED(CONFIG_KVM_GMEM_HUGETLB) &&
+	    flags & GUEST_MEMFD_FLAG_HUGETLB) {
+		valid_flags |= GUEST_MEMFD_FLAG_HUGETLB |
+			       SUPPORTED_CUSTOM_ALLOCATOR_MASK;
+	}
+
 	if (flags & ~valid_flags)
 		return -EINVAL;
 
-- 
2.49.0.1045.g170613ef41-goog
Re: [RFC PATCH v2 32/51] KVM: guest_memfd: Support guestmem_hugetlb as custom allocator
Posted by Sean Christopherson 2 months, 2 weeks ago
Trimmed the Cc to KVM/guest_memfd folks.

On Wed, May 14, 2025, Ackerley Tng wrote:
> @@ -22,6 +25,10 @@ struct kvm_gmem_inode_private {
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
>  	struct maple_tree shareability;
>  #endif
> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> +	const struct guestmem_allocator_operations *allocator_ops;
> +	void *allocator_private;
> +#endif

This is beyond silly.  There is _one_ "custom" allocator, and no evidence that
the "generic" logic written for custom allocators would actually be correct for
anything other than a hugetlb allocator.

And to my point about guestmem_hugetlb.c not actually needing access to "private"
mm/ state, I would much rather add e.g. virt/kvm/guest_memfd_hugetlb.{c,h}, and
in the header define:

  struct gmem_hugetlb_inode {
	struct hstate *h;
	struct hugepage_subpool *spool;
	struct hugetlb_cgroup *h_cg_rsvd;
  };

and then in guest_memfd.c have

  struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
	struct maple_tree attributes;

#ifdef CONFIG_KVM_GUEST_MEMFD_HUGETLB
	struct gmem_hugetlb_inode hugetlb;
#endif
  };

or maybe even better, avoid even that "jump" and define "struct gmem_inode" in a
new virt/kvm/guest_memfd.h:

  struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
	struct maple_tree attributes;

#ifdef CONFIG_KVM_GUEST_MEMFD_HUGETLB
	struct hstate *h;
	struct hugepage_subpool *spool;
	struct hugetlb_cgroup *h_cg_rsvd;
#endif
  };


The setup code can then be:

#ifdef CONFIG_KVM_GUEST_MEMFD_HUGETLB
	if (flags & GUEST_MEMFD_FLAG_HUGETLB) {
		err = kvm_gmem_init_hugetlb(inode, size, huge_page_size_log2)
		if (err)
			goto out;
	}
#endif


Actually, if we're at all clever, the #ifdefs can go away completely so long as
kvm_gmem_init_hugetlb() is unconditionally _declared_, because we rely on dead
code elimination to drop the call before linking.
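
E.g. (minimal sketch; the function signature is assumed from the suggestion above):

  /* virt/kvm/guest_memfd.h: declared unconditionally, no #ifdef. */
  int kvm_gmem_init_hugetlb(struct inode *inode, loff_t size,
			    int huge_page_size_log2);

  /*
   * virt/kvm/guest_memfd.c: with the config off, IS_ENABLED() makes the
   * branch statically false, so the call (and the reference to the
   * undefined function) is eliminated before linking.
   */
  if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD_HUGETLB) &&
      (flags & GUEST_MEMFD_FLAG_HUGETLB)) {
	err = kvm_gmem_init_hugetlb(inode, size, huge_page_size_log2);
	if (err)
		goto out;
  }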

>  };
> +/**
> + * kvm_gmem_truncate_indices() - Truncates all folios beginning @index for
> + * @nr_pages.
> + *
> + * @mapping: filemap to truncate pages from.
> + * @index: the index in the filemap to begin truncation.
> + * @nr_pages: number of PAGE_SIZE pages to truncate.
> + *
> + * Return: the number of PAGE_SIZE pages that were actually truncated.
> + */

Do not add kerneldoc comments for internal helpers.  They inevitably become stale
and are a source of friction for developers.  The _only_ non-obvious thing here
is the return value.  
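
E.g. a plain one-liner above the helper would suffice (sketch):

	/* Returns the number of PAGE_SIZE pages actually truncated. */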

> +static long kvm_gmem_truncate_indices(struct address_space *mapping,
> +				      pgoff_t index, size_t nr_pages)
> +{
> +	struct folio_batch fbatch;
> +	long truncated;
> +	pgoff_t last;
> +
> +	last = index + nr_pages - 1;
> +
> +	truncated = 0;
> +	folio_batch_init(&fbatch);
> +	while (filemap_get_folios(mapping, &index, last, &fbatch)) {
> +		unsigned int i;
> +
> +		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +			struct folio *f = fbatch.folios[i];
> +
> +			truncated += folio_nr_pages(f);
> +			folio_lock(f);
> +			truncate_inode_folio(f->mapping, f);
> +			folio_unlock(f);
> +		}
> +
> +		folio_batch_release(&fbatch);
> +		cond_resched();
> +	}
> +
> +	return truncated;
> +}
Re: [RFC PATCH v2 32/51] KVM: guest_memfd: Support guestmem_hugetlb as custom allocator
Posted by Michael Roth 2 months, 2 weeks ago
Taking Sean's cue on trimming Cc list:

On Wed, May 14, 2025 at 04:42:11PM -0700, Ackerley Tng wrote:
> This patch adds support for guestmem_hugetlb as the first custom
> allocator in guest_memfd.
> 
> If requested at guest_memfd creation time, the custom allocator will
> be used in initialization and cleanup.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Change-Id: I1eb9625dc761ecadcc2aa21480cfdfcf9ab7ce67
> ---
>  include/uapi/linux/kvm.h |   1 +
>  virt/kvm/Kconfig         |   5 +
>  virt/kvm/guest_memfd.c   | 203 +++++++++++++++++++++++++++++++++++++--
>  3 files changed, 199 insertions(+), 10 deletions(-)
> 

<snip>

> @@ -518,17 +562,24 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  	if (!IS_ERR(folio))
>  		return folio;
>  
> -	gfp = mapping_gfp_mask(inode->i_mapping);
> +	if (kvm_gmem_has_custom_allocator(inode)) {
> +		void *p = kvm_gmem_allocator_private(inode);
>  
> -	/* TODO: Support huge pages. */
> -	folio = filemap_alloc_folio(gfp, 0);
> -	if (!folio)
> -		return ERR_PTR(-ENOMEM);
> +		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
> +		if (IS_ERR(folio))
> +			return folio;

One issue with the current guestmem_hugetlb implementation of ->alloc_folio()
is that if you have 2 vCPUs faulting in the same GPA range, they might both
attempt to reserve the allocation, and if they happen to be fighting over the
last folio available in the subpool, then one of them might get -ENOMEM and
the guest will crash (this is much more likely when using 1GB pages).

It seems like we need to allow the allocator to return EAGAIN to signal
that it's still worth retrying, but at the same time I'm not sure
guestmem_hugetlb will be able to distinguish between *actually* being
out of memory vs. a temporary situation like this. We can make
guestmem_hugetlb smarter so that it could do this but it would need to
track stuff like whether 2 allocations are in-flight for the same GPA
range, and that's sounding a lot more like something guest_memfd proper
should be doing.
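
If the allocator could return -EAGAIN for that case, the caller could then
retry rather than failing the fault, e.g. (sketch only, not tested):

	folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
	if (folio == ERR_PTR(-EAGAIN)) {
		/*
		 * Lost the race for the last reservation; the winner may
		 * have installed a folio in the filemap by now, so retry
		 * the lookup instead of returning an error.
		 */
		cond_resched();
		goto repeat;
	}
	if (IS_ERR(folio))
		return folio;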

There's also another potential/related issue below that I haven't run
into but seems possible...

> +	} else {
> +		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
>  
> -	ret = mem_cgroup_charge(folio, NULL, gfp);
> -	if (ret) {
> -		folio_put(folio);
> -		return ERR_PTR(ret);
> +		folio = filemap_alloc_folio(gfp, 0);
> +		if (!folio)
> +			return ERR_PTR(-ENOMEM);
> +
> +		ret = mem_cgroup_charge(folio, NULL, gfp);
> +		if (ret) {
> +			folio_put(folio);
> +			return ERR_PTR(ret);
> +		}
>  	}
>  
>  	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index);

I think the relevant bits are in another patch, but it's closely related
to the above scenario so I'll just paste it here for context:

    ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
    if (ret) {
            folio_put(folio);

            /*
             * There was a race, two threads tried to get a folio indexing
             * to the same location in the filemap. The losing thread should
             * free the allocated folio, then lock the folio added to the
             * filemap by the winning thread.
             */
            if (ret == -EEXIST)
                    goto repeat;

            WARN_ON_ONCE(ret);
            return ERR_PTR(ret);
    }

    /* Leave just filemap's refcounts on folio. */
    folio_put(folio);

    ret = kvm_gmem_try_split_folio_in_filemap(inode, folio);
    if (ret)
            goto err;

    spin_lock(&inode->i_lock);
    inode->i_blocks += allocated_size / 512;
    spin_unlock(&inode->i_lock);

    /*
     * folio is the one that is allocated, this gets the folio at the
     * requested index.
     */
    folio = filemap_lock_folio(inode->i_mapping, index); 

Here you check for a race similar to the above (except the reservation
doesn't fail, so you get 2+ vCPUs with allocated folios they are trying to
map), and the losing vCPUs are expected to free their un-filemapped folios
and retry to get the winning/mapped one. However, later in this function you
might end up splitting the winning page, which involves unmapping it
beforehand, so with really bad timing it seems like a "losing" vCPU can still
insert its folio into the filemap and trip up the splitting path, which I'm not
sure is handled gracefully.

This is another case where I wish we had something like a range lock, so we
could still parallelize allocations even with vCPUs interleaving
allocation/splitting work within a particular range. Lacking that, maybe some
other locking scheme will work, but with allocation/splitting/HVO in the mix
it would be pretty easy to kill performance if we're not careful.
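
One option might be something like hugetlb's fault mutex table (hypothetical
names below, nothing from this series): hash the huge-page-aligned index to
one of a fixed number of mutexes, so that allocation/splitting/HVO for a
given huge page is serialized while different huge pages still proceed in
parallel:

	u32 hash = hash_long((unsigned long)inode->i_mapping ^ index_floor,
			     ilog2(GMEM_FAULT_MUTEX_NR));

	mutex_lock(&gmem_fault_mutex[hash]);
	/* allocate, insert into the filemap, split ... */
	mutex_unlock(&gmem_fault_mutex[hash]);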

-Mike

Re: [RFC PATCH v2 32/51] KVM: guest_memfd: Support guestmem_hugetlb as custom allocator
Posted by Tony Lindgren 4 months, 1 week ago
Hi,

On Wed, May 14, 2025 at 04:42:11PM -0700, Ackerley Tng wrote:
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -133,3 +133,8 @@ config KVM_GMEM_SHARED_MEM
>         select KVM_GMEM
>         bool
>         prompt "Enables in-place shared memory for guest_memfd"
> +
> +config KVM_GMEM_HUGETLB
> +       select KVM_PRIVATE_MEM
> +       depends on GUESTMEM_HUGETLB
> +       bool "Enables using a custom allocator with guest_memfd, see CONFIG_GUESTMEM_HUGETLB"

For v3, this needs s/KVM_PRIVATE_MEM/KVM_GMEM/ assuming the patches are
based on "KVM: Rename CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GMEM".

It's also probably a good idea to run some make randconfig builds on the series
to check the various Kconfig changes and inline functions for build errors.

Regards,

Tony
Re: [RFC PATCH v2 32/51] KVM: guest_memfd: Support guestmem_hugetlb as custom allocator
Posted by Yan Zhao 6 months, 4 weeks ago
On Wed, May 14, 2025 at 04:42:11PM -0700, Ackerley Tng wrote:
>  enum shareability {
> @@ -40,6 +47,44 @@ static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>  	return inode->i_mapping->i_private_data;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> +
> +static const struct guestmem_allocator_operations *
> +kvm_gmem_allocator_ops(struct inode *inode)
> +{
> +	return kvm_gmem_private(inode)->allocator_ops;
> +}
> +
> +static void *kvm_gmem_allocator_private(struct inode *inode)
> +{
> +	return kvm_gmem_private(inode)->allocator_private;
> +}
> +
> +static bool kvm_gmem_has_custom_allocator(struct inode *inode)
> +{

+       if (!kvm_gmem_private(inode))
+               return false;

> +	return kvm_gmem_allocator_ops(inode) != NULL;
> +}
> +
...

> +static void kvm_gmem_evict_inode(struct inode *inode)
> +{
> +	truncate_inode_pages_final_prepare(inode->i_mapping);
> +
> +	if (kvm_gmem_has_custom_allocator(inode)) {

The i_private_data of the root inode of the pseudo filesystem is NULL.
Without the two lines added above, evicting the root inode during unmount
will cause a NULL pointer dereference.
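
With that check folded in, the helper would read (sketch):

	static bool kvm_gmem_has_custom_allocator(struct inode *inode)
	{
		/* The pseudo-fs root inode has no kvm_gmem_inode_private. */
		if (!kvm_gmem_private(inode))
			return false;

		return kvm_gmem_allocator_ops(inode) != NULL;
	}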

> +		size_t nr_pages = inode->i_size >> PAGE_SHIFT;
> +
> +		kvm_gmem_truncate_inode_aligned_pages(inode, 0, nr_pages);
> +	} else {
> +		truncate_inode_pages(inode->i_mapping, 0);
> +	}
> +
> +	clear_inode(inode);
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>  	.statfs		= simple_statfs,
> +	.evict_inode	= kvm_gmem_evict_inode,
>  	.destroy_inode	= kvm_gmem_destroy_inode,
>  	.free_inode	= kvm_gmem_free_inode,
>  };