[RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd

Ackerley Tng posted 51 patches 7 months, 1 week ago
[RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Ackerley Tng 7 months, 1 week ago
guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
provide huge folios for guest_memfd.

This patch also introduces guestmem_allocator_operations as a set of
operations that allocators for guest_memfd can provide. In a later
patch, guest_memfd will use these operations to manage pages from an
allocator.

The allocator operations are memory-management specific and are placed
in mm/ so key mm-specific functions do not have to be exposed
unnecessarily.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
---
 include/linux/guestmem.h      |  20 +++++
 include/uapi/linux/guestmem.h |  29 +++++++
 mm/Kconfig                    |   5 +-
 mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
 4 files changed, 212 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/guestmem.h
 create mode 100644 include/uapi/linux/guestmem.h

diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
new file mode 100644
index 000000000000..4b2d820274d9
--- /dev/null
+++ b/include/linux/guestmem.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_GUESTMEM_H
+#define _LINUX_GUESTMEM_H
+
+#include <linux/fs.h>
+
+struct guestmem_allocator_operations {
+	void *(*inode_setup)(size_t size, u64 flags);
+	void (*inode_teardown)(void *private, size_t inode_size);
+	struct folio *(*alloc_folio)(void *private);
+	/*
+	 * Returns the number of PAGE_SIZE pages in a folio that this guestmem
+	 * allocator provides.
+	 */
+	size_t (*nr_pages_in_folio)(void *priv);
+};
+
+extern const struct guestmem_allocator_operations guestmem_hugetlb_ops;
+
+#endif
diff --git a/include/uapi/linux/guestmem.h b/include/uapi/linux/guestmem.h
new file mode 100644
index 000000000000..2e518682edd5
--- /dev/null
+++ b/include/uapi/linux/guestmem.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_GUESTMEM_H
+#define _UAPI_LINUX_GUESTMEM_H
+
+/*
+ * Huge page size must be explicitly defined when using the guestmem_hugetlb
+ * allocator for guest_memfd.  It is the responsibility of the application to
+ * know which sizes are supported on the running system.  See mmap(2) man page
+ * for details.
+ */
+
+#define GUESTMEM_HUGETLB_FLAG_SHIFT	58
+#define GUESTMEM_HUGETLB_FLAG_MASK	0x3fUL
+
+#define GUESTMEM_HUGETLB_FLAG_16KB	(14UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_64KB	(16UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_512KB	(19UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_1MB	(20UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_2MB	(21UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_8MB	(23UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_16MB	(24UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_32MB	(25UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_256MB	(28UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_512MB	(29UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_1GB	(30UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_2GB	(31UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+#define GUESTMEM_HUGETLB_FLAG_16GB	(34UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
+
+#endif /* _UAPI_LINUX_GUESTMEM_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 131adc49f58d..bb6e39e37245 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1218,7 +1218,10 @@ config SECRETMEM
 
 config GUESTMEM_HUGETLB
 	bool "Enable guestmem_hugetlb allocator for guest_memfd"
-	depends on HUGETLBFS
+	select GUESTMEM
+	select HUGETLBFS
+	select HUGETLB_PAGE
+	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
 	  Enable this to make HugeTLB folios available to guest_memfd
 	  (KVM virtualization) as backing memory.
diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
index 51a724ebcc50..5459ef7eb329 100644
--- a/mm/guestmem_hugetlb.c
+++ b/mm/guestmem_hugetlb.c
@@ -5,6 +5,14 @@
  */
 
 #include <linux/mm_types.h>
+#include <linux/guestmem.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#include <uapi/linux/guestmem.h>
 
 #include "guestmem_hugetlb.h"
 
@@ -12,3 +20,154 @@ void guestmem_hugetlb_handle_folio_put(struct folio *folio)
 {
 	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
 }
+
+struct guestmem_hugetlb_private {
+	struct hstate *h;
+	struct hugepage_subpool *spool;
+	struct hugetlb_cgroup *h_cg_rsvd;
+};
+
+static size_t guestmem_hugetlb_nr_pages_in_folio(void *priv)
+{
+	struct guestmem_hugetlb_private *private = priv;
+
+	return pages_per_huge_page(private->h);
+}
+
+static void *guestmem_hugetlb_setup(size_t size, u64 flags)
+
+{
+	struct guestmem_hugetlb_private *private;
+	struct hugetlb_cgroup *h_cg_rsvd = NULL;
+	struct hugepage_subpool *spool;
+	unsigned long nr_pages;
+	int page_size_log;
+	struct hstate *h;
+	long hpages;
+	int idx;
+	int ret;
+
+	page_size_log = (flags >> GUESTMEM_HUGETLB_FLAG_SHIFT) &
+			GUESTMEM_HUGETLB_FLAG_MASK;
+	h = hstate_sizelog(page_size_log);
+	if (!h)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Check against h because page_size_log could be 0 to request default
+	 * HugeTLB page size.
+	 */
+	if (!IS_ALIGNED(size, huge_page_size(h)))
+		return ERR_PTR(-EINVAL);
+
+	private = kzalloc(sizeof(*private), GFP_KERNEL);
+	if (!private)
+		return ERR_PTR(-ENOMEM);
+
+	/* Creating a subpool makes reservations, hence charge for them now. */
+	idx = hstate_index(h);
+	nr_pages = size >> PAGE_SHIFT;
+	ret = hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg_rsvd);
+	if (ret)
+		goto err_free;
+
+	hpages = size >> huge_page_shift(h);
+	spool = hugepage_new_subpool(h, hpages, hpages, false);
+	if (!spool)
+		goto err_uncharge;
+
+	private->h = h;
+	private->spool = spool;
+	private->h_cg_rsvd = h_cg_rsvd;
+
+	return private;
+
+err_uncharge:
+	ret = -ENOMEM;
+	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg_rsvd);
+err_free:
+	kfree(private);
+	return ERR_PTR(ret);
+}
+
+static void guestmem_hugetlb_teardown(void *priv, size_t inode_size)
+{
+	struct guestmem_hugetlb_private *private = priv;
+	unsigned long nr_pages;
+	int idx;
+
+	hugepage_put_subpool(private->spool);
+
+	idx = hstate_index(private->h);
+	nr_pages = inode_size >> PAGE_SHIFT;
+	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, private->h_cg_rsvd);
+
+	kfree(private);
+}
+
+static struct folio *guestmem_hugetlb_alloc_folio(void *priv)
+{
+	struct guestmem_hugetlb_private *private = priv;
+	struct mempolicy *mpol;
+	struct folio *folio;
+	pgoff_t ilx;
+	int ret;
+
+	ret = hugepage_subpool_get_pages(private->spool, 1);
+	if (ret == -ENOMEM) {
+		return ERR_PTR(-ENOMEM);
+	} else if (ret > 0) {
+		/* guest_memfd will not use surplus pages. */
+		goto err_put_pages;
+	}
+
+	/*
+	 * TODO: mempolicy would probably have to be stored on the inode, use
+	 * task policy for now.
+	 */
+	mpol = get_task_policy(current);
+
+	/* TODO: ignore interleaving for now. */
+	ilx = NO_INTERLEAVE_INDEX;
+
+	/*
+	 * charge_cgroup_rsvd is false because we already charged reservations
+	 * when creating the subpool for this
+	 * guest_memfd. use_existing_reservation is true - we're using a
+	 * reservation from the guest_memfd's subpool.
+	 */
+	folio = hugetlb_alloc_folio(private->h, mpol, ilx, false, true);
+	mpol_cond_put(mpol);
+
+	if (IS_ERR_OR_NULL(folio))
+		goto err_put_pages;
+
+	/*
+	 * Clear restore_reserve here so that when this folio is freed,
+	 * free_huge_folio() will always attempt to return the reservation to
+	 * the subpool.  guest_memfd, unlike regular hugetlb, has no resv_map,
+	 * and hence when freeing, the folio needs to be returned to the
+	 * subpool.  guest_memfd does not use surplus hugetlb pages, so in
+	 * free_huge_folio(), returning to subpool will always succeed and the
+	 * hstate reservation will then get restored.
+	 *
+	 * hugetlbfs does this in hugetlb_add_to_page_cache().
+	 */
+	folio_clear_hugetlb_restore_reserve(folio);
+
+	hugetlb_set_folio_subpool(folio, private->spool);
+
+	return folio;
+
+err_put_pages:
+	hugepage_subpool_put_pages(private->spool, 1);
+	return ERR_PTR(-ENOMEM);
+}
+
+const struct guestmem_allocator_operations guestmem_hugetlb_ops = {
+	.inode_setup = guestmem_hugetlb_setup,
+	.inode_teardown = guestmem_hugetlb_teardown,
+	.alloc_folio = guestmem_hugetlb_alloc_folio,
+	.nr_pages_in_folio = guestmem_hugetlb_nr_pages_in_folio,
+};
+EXPORT_SYMBOL_GPL(guestmem_hugetlb_ops);
-- 
2.49.0.1045.g170613ef41-goog
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Sean Christopherson 2 months, 2 weeks ago
On Wed, May 14, 2025, Ackerley Tng wrote:
> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> provide huge folios for guest_memfd.
> 
> This patch also introduces guestmem_allocator_operations as a set of
> operations that allocators for guest_memfd can provide. In a later
> patch, guest_memfd will use these operations to manage pages from an
> allocator.
> 
> The allocator operations are memory-management specific and are placed
> in mm/ so key mm-specific functions do not have to be exposed
> unnecessarily.

This code doesn't have to be put in mm/, all of the #includes are to <linux/xxx.h>.
Unless I'm missing something, what you actually want to avoid is _exporting_ mm/
APIs, and for that all that is needed is ensure the code is built-in to the kernel
binary, not to kvm.ko.

diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index d047d4cf58c9..c18c77e8a638 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -13,3 +13,5 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
 kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
+
+obj-$(subst m,y,$(CONFIG_KVM_GUEST_MEMFD)) += $(KVM)/guest_memfd_hugepages.o
\ No newline at end of file

People may want the code to live in mm/ for maintenance and ownership reasons
(or not, I haven't followed the discussions on hugepage support), but that's a
very different justification than what's described in the changelog.

And if the _only_ user is guest_memfd, putting this in mm/ feels quite weird.
And if we anticipate other users, the name guestmem_hugetlb is weird, because
AFAICT there's nothing in here that is in any way guest specific, it's just a
few APIs for allocating and accounting hugepages.

Personally, I don't see much point in trying to make this a "generic" library,
in quotes because the whole guestmem_xxx namespace makes it anything but generic.
I don't see anything in mm/guestmem_hugetlb.c that makes me go "ooh, that's nasty,
I'm glad this is handled by a library".  But if we want to go straight to a
library, it should be something that is really truly generic, i.e. not "guest"
specific in any way.

> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
> ---
>  include/linux/guestmem.h      |  20 +++++
>  include/uapi/linux/guestmem.h |  29 +++++++
>  mm/Kconfig                    |   5 +-
>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
>  4 files changed, 212 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/guestmem.h
>  create mode 100644 include/uapi/linux/guestmem.h


..

> diff --git a/include/uapi/linux/guestmem.h b/include/uapi/linux/guestmem.h
> new file mode 100644
> index 000000000000..2e518682edd5
> --- /dev/null
> +++ b/include/uapi/linux/guestmem.h

With my KVM hat on, NAK to defining uAPI in a library like this.  This subtly
defines uAPI for KVM, and effectively any other userspace-facing entity that
utilizes the library/allocator.  KVM's uAPI needs to be defined by KVM, period.

There's absolutely zero reason to have guestmem_hugetlb_setup() take in flags.
Explicitly pass the page size, or if preferred, the page_size_log, and let the
caller figure out how to communicate the size to the kernel.

IMO, the whole MAP_HUGE_xxx approach is a (clever) hack to squeeze the desired
size into mmap() flags.  I don't see any reason to carry that forward to guest_memfd.
For once, we had the foresight to reserve some space in KVM's uAPI structure, so
there's no need to squeeze things into flags.

E.g. we could do something like this:

diff --git include/uapi/linux/kvm.h include/uapi/linux/kvm.h
index 42053036d38d..b79914472d27 100644
--- include/uapi/linux/kvm.h
+++ include/uapi/linux/kvm.h
@@ -1605,11 +1605,16 @@ struct kvm_memory_attributes {
 #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
 #define GUEST_MEMFD_FLAG_MMAP          (1ULL << 0)
 #define GUEST_MEMFD_FLAG_INIT_SHARED   (1ULL << 1)
+#define GUEST_MEMFD_FLAG_HUGE_PAGES    (1ULL << 2)
 
 struct kvm_create_guest_memfd {
        __u64 size;
        __u64 flags;
-       __u64 reserved[6];
+       __u8 huge_page_size_log2;
+       __u8 reserve8;
+       __u16 reserve16;
+       __u32 reserve32;
+       __u64 reserved[5];
 };
 
 #define KVM_PRE_FAULT_MEMORY   _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)

And not have to burn 6 bits of flags to encode the size in a weird location.

But that's a detail for KVM to sort out, which is exactly my point; how this is
presented to userspace for guest_memfd is question for KVM.

> @@ -0,0 +1,29 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_GUESTMEM_H
> +#define _UAPI_LINUX_GUESTMEM_H
> +
> +/*
> + * Huge page size must be explicitly defined when using the guestmem_hugetlb
> + * allocator for guest_memfd.  It is the responsibility of the application to
> + * know which sizes are supported on the running system.  See mmap(2) man page
> + * for details.
> + */
> +
> +#define GUESTMEM_HUGETLB_FLAG_SHIFT	58
> +#define GUESTMEM_HUGETLB_FLAG_MASK	0x3fUL
> +
> +#define GUESTMEM_HUGETLB_FLAG_16KB	(14UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_64KB	(16UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512KB	(19UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1MB	(20UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2MB	(21UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_8MB	(23UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16MB	(24UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_32MB	(25UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_256MB	(28UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512MB	(29UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1GB	(30UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2GB	(31UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16GB	(34UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +
> +#endif /* _UAPI_LINUX_GUESTMEM_H */

...

> +const struct guestmem_allocator_operations guestmem_hugetlb_ops = {
> +	.inode_setup = guestmem_hugetlb_setup,
> +	.inode_teardown = guestmem_hugetlb_teardown,
> +	.alloc_folio = guestmem_hugetlb_alloc_folio,
> +	.nr_pages_in_folio = guestmem_hugetlb_nr_pages_in_folio,
> +};
> +EXPORT_SYMBOL_GPL(guestmem_hugetlb_ops);

Why are these bundled into a structure?  AFAICT, that adds layers of indirection
for absolutely no reason.  And especially on the KVM guest_memfd side, implementing
a pile of infrastructure to support "custom" allocators is very premature.  Without
a second "custom" allocator, it's impossible to determine if the indirection
provided is actually a good design.  I.e. all of the kvm_gmem_has_custom_allocator()
logic in guest_memfd.c is just HugeTLB logic buried behind a layer of unnecessary
indirection.
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Sean Christopherson 2 months, 2 weeks ago
On Fri, Oct 03, 2025, Sean Christopherson wrote:
> On Wed, May 14, 2025, Ackerley Tng wrote:
> > guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> > provide huge folios for guest_memfd.
> > 
> > This patch also introduces guestmem_allocator_operations as a set of
> > operations that allocators for guest_memfd can provide. In a later
> > patch, guest_memfd will use these operations to manage pages from an
> > allocator.
> > 
> > The allocator operations are memory-management specific and are placed
> > in mm/ so key mm-specific functions do not have to be exposed
> > unnecessarily.
> 
> This code doesn't have to be put in mm/, all of the #includes are to <linux/xxx.h>.
> Unless I'm missing something, what you actually want to avoid is _exporting_ mm/
> APIs, and for that all that is needed is ensure the code is built-in to the kernel
> binary, not to kvm.ko.
> 
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index d047d4cf58c9..c18c77e8a638 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -13,3 +13,5 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
>  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
>  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
>  kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
> +
> +obj-$(subst m,y,$(CONFIG_KVM_GUEST_MEMFD)) += $(KVM)/guest_memfd_hugepages.o
> \ No newline at end of file
> 
> People may want the code to live in mm/ for maintenance and ownership reasons
> (or not, I haven't followed the discussions on hugepage support), but that's a
> very different justification than what's described in the changelog.
> 
> And if the _only_ user is guest_memfd, putting this in mm/ feels quite weird.
> And if we anticipate other users, the name guestmem_hugetlb is weird, because
> AFAICT there's nothing in here that is in any way guest specific, it's just a
> few APIs for allocating and accounting hugepages.
> 
> Personally, I don't see much point in trying to make this a "generic" library,
> in quotes because the whole guestmem_xxx namespace makes it anything but generic.
> I don't see anything in mm/guestmem_hugetlb.c that makes me go "ooh, that's nasty,
> I'm glad this is handled by a library".  But if we want to go straight to a
> library, it should be something that is really truly generic, i.e. not "guest"
> specific in any way.

Ah, the complexity and the mm-internal dependencies come along in the splitting
and merging patch.  Putting that code in mm/ makes perfect sense, but I'm still
not convinced that putting _all_ of this code in mm/ is the correct split.

As proposed, this is a weird combination of being an extension of guest_memfd, a
somewhat generic library, _and_ a subsystem (e.g. the global workqueue and stash).

_If_ we need a library, then IMO it should be a truly generic library.  Any pieces
that are guest_memfd specific belong in KVM.  And any subsystem-like things
should probably be implemented as an extension to HugeTLB itself, which is
already its own subsystem.  Emphasis on "if", because it's not clear to me that
a library is warranted.

AFAICT, the novelty here is the splitting and re-merging of hugetlb folios, and
that seems like it should be explicitly an extension of the hugetlb subsystem.
E.g. that behavior needs to take hugetlb_lock, interact with global vmemmap state
like hugetlb_optimize_vmemmap_key, etc.  If that's implemented as something like
hugetlb_splittable.c or whatever, and wired up to be explicitly configured via
hugetlb_init(), then there may not be much left for a library.
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Michael Roth 3 months ago
On Wed, May 14, 2025 at 04:42:08PM -0700, Ackerley Tng wrote:
> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> provide huge folios for guest_memfd.
> 
> This patch also introduces guestmem_allocator_operations as a set of
> operations that allocators for guest_memfd can provide. In a later
> patch, guest_memfd will use these operations to manage pages from an
> allocator.
> 
> The allocator operations are memory-management specific and are placed
> in mm/ so key mm-specific functions do not have to be exposed
> unnecessarily.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
> ---
>  include/linux/guestmem.h      |  20 +++++
>  include/uapi/linux/guestmem.h |  29 +++++++
>  mm/Kconfig                    |   5 +-
>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
>  4 files changed, 212 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/guestmem.h
>  create mode 100644 include/uapi/linux/guestmem.h
> 
> diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
> new file mode 100644
> index 000000000000..4b2d820274d9
> --- /dev/null
> +++ b/include/linux/guestmem.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_GUESTMEM_H
> +#define _LINUX_GUESTMEM_H
> +
> +#include <linux/fs.h>
> +
> +struct guestmem_allocator_operations {
> +	void *(*inode_setup)(size_t size, u64 flags);
> +	void (*inode_teardown)(void *private, size_t inode_size);
> +	struct folio *(*alloc_folio)(void *private);
> +	/*
> +	 * Returns the number of PAGE_SIZE pages in a page that this guestmem
> +	 * allocator provides.
> +	 */
> +	size_t (*nr_pages_in_folio)(void *priv);
> +};
> +
> +extern const struct guestmem_allocator_operations guestmem_hugetlb_ops;
> +
> +#endif
> diff --git a/include/uapi/linux/guestmem.h b/include/uapi/linux/guestmem.h
> new file mode 100644
> index 000000000000..2e518682edd5
> --- /dev/null
> +++ b/include/uapi/linux/guestmem.h
> @@ -0,0 +1,29 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_GUESTMEM_H
> +#define _UAPI_LINUX_GUESTMEM_H
> +
> +/*
> + * Huge page size must be explicitly defined when using the guestmem_hugetlb
> + * allocator for guest_memfd.  It is the responsibility of the application to
> + * know which sizes are supported on the running system.  See mmap(2) man page
> + * for details.
> + */
> +
> +#define GUESTMEM_HUGETLB_FLAG_SHIFT	58
> +#define GUESTMEM_HUGETLB_FLAG_MASK	0x3fUL
> +
> +#define GUESTMEM_HUGETLB_FLAG_16KB	(14UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_64KB	(16UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512KB	(19UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1MB	(20UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2MB	(21UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_8MB	(23UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16MB	(24UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_32MB	(25UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_256MB	(28UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512MB	(29UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1GB	(30UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2GB	(31UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16GB	(34UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +
> +#endif /* _UAPI_LINUX_GUESTMEM_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 131adc49f58d..bb6e39e37245 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1218,7 +1218,10 @@ config SECRETMEM
>  
>  config GUESTMEM_HUGETLB
>  	bool "Enable guestmem_hugetlb allocator for guest_memfd"
> -	depends on HUGETLBFS
> +	select GUESTMEM
> +	select HUGETLBFS
> +	select HUGETLB_PAGE
> +	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>  	help
>  	  Enable this to make HugeTLB folios available to guest_memfd
>  	  (KVM virtualization) as backing memory.
> diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
> index 51a724ebcc50..5459ef7eb329 100644
> --- a/mm/guestmem_hugetlb.c
> +++ b/mm/guestmem_hugetlb.c
> @@ -5,6 +5,14 @@
>   */
>  
>  #include <linux/mm_types.h>
> +#include <linux/guestmem.h>
> +#include <linux/hugetlb.h>
> +#include <linux/hugetlb_cgroup.h>
> +#include <linux/mempolicy.h>
> +#include <linux/mm.h>
> +#include <linux/pagemap.h>
> +
> +#include <uapi/linux/guestmem.h>
>  
>  #include "guestmem_hugetlb.h"
>  
> @@ -12,3 +20,154 @@ void guestmem_hugetlb_handle_folio_put(struct folio *folio)
>  {
>  	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
>  }
> +
> +struct guestmem_hugetlb_private {
> +	struct hstate *h;
> +	struct hugepage_subpool *spool;
> +	struct hugetlb_cgroup *h_cg_rsvd;
> +};
> +
> +static size_t guestmem_hugetlb_nr_pages_in_folio(void *priv)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +
> +	return pages_per_huge_page(private->h);
> +}
> +
> +static void *guestmem_hugetlb_setup(size_t size, u64 flags)
> +
> +{
> +	struct guestmem_hugetlb_private *private;
> +	struct hugetlb_cgroup *h_cg_rsvd = NULL;
> +	struct hugepage_subpool *spool;
> +	unsigned long nr_pages;
> +	int page_size_log;
> +	struct hstate *h;
> +	long hpages;
> +	int idx;
> +	int ret;
> +
> +	page_size_log = (flags >> GUESTMEM_HUGETLB_FLAG_SHIFT) &
> +			GUESTMEM_HUGETLB_FLAG_MASK;
> +	h = hstate_sizelog(page_size_log);
> +	if (!h)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * Check against h because page_size_log could be 0 to request default
> +	 * HugeTLB page size.
> +	 */
> +	if (!IS_ALIGNED(size, huge_page_size(h)))
> +		return ERR_PTR(-EINVAL);

For SNP testing we ended up needing to relax this to play along a little
easier with QEMU/etc. and instead just round the size up via:

  size = round_up(size, huge_page_size(h));

The thinking is that since, presumably, the size would span beyond what
we actually bind to any memslots, that KVM will simply map them as 4K
in nested page table, and userspace already causes 4K split and inode
size doesn't change as part of this adjustment so the extra pages would
remain inaccessible.

The accounting might get a little weird but it's probably fair to
document that non-hugepage-aligned gmemfd sizes can result in wasted memory
if userspace wants to fine tune around that.

-Mike

> +
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* Creating a subpool makes reservations, hence charge for them now. */
> +	idx = hstate_index(h);
> +	nr_pages = size >> PAGE_SHIFT;
> +	ret = hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg_rsvd);
> +	if (ret)
> +		goto err_free;
> +
> +	hpages = size >> huge_page_shift(h);
> +	spool = hugepage_new_subpool(h, hpages, hpages, false);
> +	if (!spool)
> +		goto err_uncharge;
> +
> +	private->h = h;
> +	private->spool = spool;
> +	private->h_cg_rsvd = h_cg_rsvd;
> +
> +	return private;
> +
> +err_uncharge:
> +	ret = -ENOMEM;
> +	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg_rsvd);
> +err_free:
> +	kfree(private);
> +	return ERR_PTR(ret);
> +}
> +
> +static void guestmem_hugetlb_teardown(void *priv, size_t inode_size)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +	unsigned long nr_pages;
> +	int idx;
> +
> +	hugepage_put_subpool(private->spool);
> +
> +	idx = hstate_index(private->h);
> +	nr_pages = inode_size >> PAGE_SHIFT;
> +	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, private->h_cg_rsvd);
> +
> +	kfree(private);
> +}
> +
> +static struct folio *guestmem_hugetlb_alloc_folio(void *priv)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +	struct mempolicy *mpol;
> +	struct folio *folio;
> +	pgoff_t ilx;
> +	int ret;
> +
> +	ret = hugepage_subpool_get_pages(private->spool, 1);
> +	if (ret == -ENOMEM) {
> +		return ERR_PTR(-ENOMEM);
> +	} else if (ret > 0) {
> +		/* guest_memfd will not use surplus pages. */
> +		goto err_put_pages;
> +	}
> +
> +	/*
> +	 * TODO: mempolicy would probably have to be stored on the inode, use
> +	 * task policy for now.
> +	 */
> +	mpol = get_task_policy(current);
> +
> +	/* TODO: ignore interleaving for now. */
> +	ilx = NO_INTERLEAVE_INDEX;
> +
> +	/*
> +	 * charge_cgroup_rsvd is false because we already charged reservations
> +	 * when creating the subpool for this
> +	 * guest_memfd. use_existing_reservation is true - we're using a
> +	 * reservation from the guest_memfd's subpool.
> +	 */
> +	folio = hugetlb_alloc_folio(private->h, mpol, ilx, false, true);
> +	mpol_cond_put(mpol);
> +
> +	if (IS_ERR_OR_NULL(folio))
> +		goto err_put_pages;
> +
> +	/*
> +	 * Clear restore_reserve here so that when this folio is freed,
> +	 * free_huge_folio() will always attempt to return the reservation to
> +	 * the subpool.  guest_memfd, unlike regular hugetlb, has no resv_map,
> +	 * and hence when freeing, the folio needs to be returned to the
> +	 * subpool.  guest_memfd does not use surplus hugetlb pages, so in
> +	 * free_huge_folio(), returning to subpool will always succeed and the
> +	 * hstate reservation will then get restored.
> +	 *
> +	 * hugetlbfs does this in hugetlb_add_to_page_cache().
> +	 */
> +	folio_clear_hugetlb_restore_reserve(folio);
> +
> +	hugetlb_set_folio_subpool(folio, private->spool);
> +
> +	return folio;
> +
> +err_put_pages:
> +	hugepage_subpool_put_pages(private->spool, 1);
> +	return ERR_PTR(-ENOMEM);
> +}
> +
> +const struct guestmem_allocator_operations guestmem_hugetlb_ops = {
> +	.inode_setup = guestmem_hugetlb_setup,
> +	.inode_teardown = guestmem_hugetlb_teardown,
> +	.alloc_folio = guestmem_hugetlb_alloc_folio,
> +	.nr_pages_in_folio = guestmem_hugetlb_nr_pages_in_folio,
> +};
> +EXPORT_SYMBOL_GPL(guestmem_hugetlb_ops);
> -- 
> 2.49.0.1045.g170613ef41-goog
> 
>
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Ackerley Tng 3 months ago
Michael Roth <michael.roth@amd.com> writes:

> On Wed, May 14, 2025 at 04:42:08PM -0700, Ackerley Tng wrote:
>> 
>> [...snip...]
>> 
>> +static void *guestmem_hugetlb_setup(size_t size, u64 flags)
>> +
>> +{
>> +	struct guestmem_hugetlb_private *private;
>> +	struct hugetlb_cgroup *h_cg_rsvd = NULL;
>> +	struct hugepage_subpool *spool;
>> +	unsigned long nr_pages;
>> +	int page_size_log;
>> +	struct hstate *h;
>> +	long hpages;
>> +	int idx;
>> +	int ret;
>> +
>> +	page_size_log = (flags >> GUESTMEM_HUGETLB_FLAG_SHIFT) &
>> +			GUESTMEM_HUGETLB_FLAG_MASK;
>> +	h = hstate_sizelog(page_size_log);
>> +	if (!h)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	/*
>> +	 * Check against h because page_size_log could be 0 to request default
>> +	 * HugeTLB page size.
>> +	 */
>> +	if (!IS_ALIGNED(size, huge_page_size(h)))
>> +		return ERR_PTR(-EINVAL);
>
> For SNP testing we ended up needing to relax this to play along a little
> easier with QEMU/etc. and instead just round the size up via:
>
>   size = round_up(size, huge_page_size(h));
>
> The thinking is that since, presumably, the size would span beyond what
> we actually bind to any memslots, that KVM will simply map them as 4K
> in nested page table, and userspace already causes 4K split and inode
> size doesn't change as part of this adjustment so the extra pages would
> remain inaccessible.
>
> The accounting might get a little weird but it's probably fair to
> document that non-hugepage-aligned gmemfd sizes can result in wasted memory
> if userspace wants to fine tune around that.

Is there a specific use case where the userspace VMM must allocate some
guest_memfd file size that isn't huge_page_size(h) aligned?

Rounding up silently feels like it would be hiding errors, and feels a
little wasteful, especially when we're doing so much work to save memory,
retaining HVO, removing double allocation and all.

>
> -Mike
>
>> +
>> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
>> +	if (!private)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	/* Creating a subpool makes reservations, hence charge for them now. */
>> +	idx = hstate_index(h);
>> +	nr_pages = size >> PAGE_SHIFT;
>> +	ret = hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg_rsvd);
>> +	if (ret)
>> +		goto err_free;
>> +
>> +	hpages = size >> huge_page_shift(h);
>> +	spool = hugepage_new_subpool(h, hpages, hpages, false);
>> +	if (!spool)
>> +		goto err_uncharge;
>> +
>> +	private->h = h;
>> +	private->spool = spool;
>> +	private->h_cg_rsvd = h_cg_rsvd;
>> +
>> +	return private;
>> +
>> +err_uncharge:
>> +	ret = -ENOMEM;
>> +	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg_rsvd);
>> +err_free:
>> +	kfree(private);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> 
>> [...snip...]
>>
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Ackerley Tng 7 months ago
Ackerley Tng <ackerleytng@google.com> writes:

> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> provide huge folios for guest_memfd.
>
> This patch also introduces guestmem_allocator_operations as a set of
> operations that allocators for guest_memfd can provide. In a later
> patch, guest_memfd will use these operations to manage pages from an
> allocator.
>
> The allocator operations are memory-management specific and are placed
> in mm/ so key mm-specific functions do not have to be exposed
> unnecessarily.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>
> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
> ---
>  include/linux/guestmem.h      |  20 +++++
>  include/uapi/linux/guestmem.h |  29 +++++++
>  mm/Kconfig                    |   5 +-
>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
>  4 files changed, 212 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/guestmem.h
>  create mode 100644 include/uapi/linux/guestmem.h
>
> <snip>
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 131adc49f58d..bb6e39e37245 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1218,7 +1218,10 @@ config SECRETMEM
>  
>  config GUESTMEM_HUGETLB
>  	bool "Enable guestmem_hugetlb allocator for guest_memfd"
> -	depends on HUGETLBFS
> +	select GUESTMEM
> +	select HUGETLBFS
> +	select HUGETLB_PAGE
> +	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP

My bad. I left out CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y in
my testing and just found that when it is set, I hit

  BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

with the basic guest_memfd_test on splitting pages on allocation.

I'll follow up with the fix soon.

Another note about testing: I've been testing in a nested VM for the
development process:

1. Host
2. VM for development
3. Nested VM running kernel being developed
4. Nested nested VMs created during selftests

This series has not yet been tested on a physical host.

>  	help
>  	  Enable this to make HugeTLB folios available to guest_memfd
>  	  (KVM virtualization) as backing memory.
>
> <snip>
>
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Ackerley Tng 7 months ago
Ackerley Tng <ackerleytng@google.com> writes:

> Ackerley Tng <ackerleytng@google.com> writes:
>
>> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
>> provide huge folios for guest_memfd.
>>
>> This patch also introduces guestmem_allocator_operations as a set of
>> operations that allocators for guest_memfd can provide. In a later
>> patch, guest_memfd will use these operations to manage pages from an
>> allocator.
>>
>> The allocator operations are memory-management specific and are placed
>> in mm/ so key mm-specific functions do not have to be exposed
>> unnecessarily.
>>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>>
>> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
>> ---
>>  include/linux/guestmem.h      |  20 +++++
>>  include/uapi/linux/guestmem.h |  29 +++++++
>>  mm/Kconfig                    |   5 +-
>>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
>>  4 files changed, 212 insertions(+), 1 deletion(-)
>>  create mode 100644 include/linux/guestmem.h
>>  create mode 100644 include/uapi/linux/guestmem.h
>>
>> <snip>
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 131adc49f58d..bb6e39e37245 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -1218,7 +1218,10 @@ config SECRETMEM
>>  
>>  config GUESTMEM_HUGETLB
>>  	bool "Enable guestmem_hugetlb allocator for guest_memfd"
>> -	depends on HUGETLBFS
>> +	select GUESTMEM
>> +	select HUGETLBFS
>> +	select HUGETLB_PAGE
>> +	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>
> My bad. I left out CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y in
> my testing and just found that when it is set, I hit
>
>   BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
>
> with the basic guest_memfd_test on splitting pages on allocation.
>
> I'll follow up with the fix soon.
>
> Another note about testing: I've been testing in a nested VM for the
> development process:
>
> 1. Host
> 2. VM for development
> 3. Nested VM running kernel being developed
> 4. Nested nested VMs created during selftests
>
> This series has not yet been tested on a physical host.
>
>>  	help
>>  	  Enable this to make HugeTLB folios available to guest_memfd
>>  	  (KVM virtualization) as backing memory.
>>
>> <snip>
>>

Here's the fix for this issue

From 998af6404d4e39920ba42764e7f3815cb9bb9e3d Mon Sep 17 00:00:00 2001
Message-ID: <998af6404d4e39920ba42764e7f3815cb9bb9e3d.1747427489.git.ackerleytng@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Date: Fri, 16 May 2025 13:14:55 -0700
Subject: [RFC PATCH v2 1/1] KVM: guest_memfd: Reorder undoing vmemmap
 optimization and stashing hugetlb folio metadata

Without this patch, when HugeTLB folio metadata is stashed, the
vmemmap_optimized flag, stored in a HugeTLB folio's folio->private was
stashed as set.

The first splitting works, but on merging, when the folio metadata was
unstashed, vmemmap_optimized is unstashed as set, making the call to
hugetlb_vmemmap_optimize_folio() skip actually applying optimizations.

On a second split, hugetlb_vmemmap_restore_folio() attempts to reapply
optimizations when it was already applied, hence hitting the BUG().

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 mm/guestmem_hugetlb.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
index 8727598cf18e..2c0192543676 100644
--- a/mm/guestmem_hugetlb.c
+++ b/mm/guestmem_hugetlb.c
@@ -200,16 +200,21 @@ static int guestmem_hugetlb_split_folio(struct folio *folio)
 		return 0;
 
 	orig_nr_pages = folio_nr_pages(folio);
-	ret = guestmem_hugetlb_stash_metadata(folio);
+
+	/*
+	 * hugetlb_vmemmap_restore_folio() has to be called ahead of the rest
+	 * because it checks page type. This doesn't actually split the folio,
+	 * so the first few struct pages are still intact.
+	 */
+	ret = hugetlb_vmemmap_restore_folio(folio_hstate(folio), folio);
 	if (ret)
 		return ret;
 
 	/*
-	 * hugetlb_vmemmap_restore_folio() has to be called ahead of the rest
-	 * because it checks and page type. This doesn't actually split the
-	 * folio, so the first few struct pages are still intact.
+	 * Stash metadata after vmemmap stuff so the outcome of the vmemmap
+	 * restoration is stashed.
 	 */
-	ret = hugetlb_vmemmap_restore_folio(folio_hstate(folio), folio);
+	ret = guestmem_hugetlb_stash_metadata(folio);
 	if (ret)
 		goto err;
 
@@ -254,8 +259,7 @@ static int guestmem_hugetlb_split_folio(struct folio *folio)
 	return 0;
 
 err:
-	guestmem_hugetlb_unstash_free_metadata(folio);
-
+	hugetlb_vmemmap_optimize_folio(folio_hstate(folio), folio);
 	return ret;
 }
 
-- 
2.49.0.1101.gccaa498523-goog
Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an allocator for guest_memfd
Posted by Michael Roth 3 months ago
On Fri, May 16, 2025 at 01:33:39PM -0700, Ackerley Tng wrote:
> Ackerley Tng <ackerleytng@google.com> writes:
> 
> > Ackerley Tng <ackerleytng@google.com> writes:
> >
> >> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> >> provide huge folios for guest_memfd.
> >>
> >> This patch also introduces guestmem_allocator_operations as a set of
> >> operations that allocators for guest_memfd can provide. In a later
> >> patch, guest_memfd will use these operations to manage pages from an
> >> allocator.
> >>
> >> The allocator operations are memory-management specific and are placed
> >> in mm/ so key mm-specific functions do not have to be exposed
> >> unnecessarily.
> >>
> >> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> >>
> >> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
> >> ---
> >>  include/linux/guestmem.h      |  20 +++++
> >>  include/uapi/linux/guestmem.h |  29 +++++++
> >>  mm/Kconfig                    |   5 +-
> >>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
> >>  4 files changed, 212 insertions(+), 1 deletion(-)
> >>  create mode 100644 include/linux/guestmem.h
> >>  create mode 100644 include/uapi/linux/guestmem.h
> >>
> >> <snip>
> >>
> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> index 131adc49f58d..bb6e39e37245 100644
> >> --- a/mm/Kconfig
> >> +++ b/mm/Kconfig
> >> @@ -1218,7 +1218,10 @@ config SECRETMEM
> >>  
> >>  config GUESTMEM_HUGETLB
> >>  	bool "Enable guestmem_hugetlb allocator for guest_memfd"
> >> -	depends on HUGETLBFS
> >> +	select GUESTMEM
> >> +	select HUGETLBFS
> >> +	select HUGETLB_PAGE
> >> +	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> >
> > My bad. I left out CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y in
> > my testing and just found that when it is set, I hit
> >
> >   BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
> >
> > with the basic guest_memfd_test on splitting pages on allocation.
> >
> > I'll follow up with the fix soon.
> >
> > Another note about testing: I've been testing in a nested VM for the
> > development process:
> >
> > 1. Host
> > 2. VM for development
> > 3. Nested VM running kernel being developed
> > 4. Nested nested VMs created during selftests
> >
> > This series has not yet been tested on a physical host.
> >
> >>  	help
> >>  	  Enable this to make HugeTLB folios available to guest_memfd
> >>  	  (KVM virtualization) as backing memory.
> >>
> >> <snip>
> >>
> 
> Here's the fix for this issue
> 
> From 998af6404d4e39920ba42764e7f3815cb9bb9e3d Mon Sep 17 00:00:00 2001
> Message-ID: <998af6404d4e39920ba42764e7f3815cb9bb9e3d.1747427489.git.ackerleytng@google.com>
> From: Ackerley Tng <ackerleytng@google.com>
> Date: Fri, 16 May 2025 13:14:55 -0700
> Subject: [RFC PATCH v2 1/1] KVM: guest_memfd: Reorder undoing vmemmap
>  optimization and stashing hugetlb folio metadata
> 
> Without this patch, when HugeTLB folio metadata is stashed, the
> vmemmap_optimized flag, stored in a HugeTLB folio's folio->private was
> stashed as set.
> 
> The first splitting works, but on merging, when the folio metadata was
> unstashed, vmemmap_optimized is unstashed as set, making the call to
> hugetlb_vmemmap_optimize_folio() skip actually applying optimizations.
> 
> On a second split, hugetlb_vmemmap_restore_folio() attempts to reapply
> optimizations when it was already applied, hence hitting the BUG().
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  mm/guestmem_hugetlb.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
> index 8727598cf18e..2c0192543676 100644
> --- a/mm/guestmem_hugetlb.c
> +++ b/mm/guestmem_hugetlb.c
> @@ -200,16 +200,21 @@ static int guestmem_hugetlb_split_folio(struct folio *folio)
>  		return 0;
>  
>  	orig_nr_pages = folio_nr_pages(folio);
> -	ret = guestmem_hugetlb_stash_metadata(folio);
> +
> +	/*
> +	 * hugetlb_vmemmap_restore_folio() has to be called ahead of the rest
> +	 * because it checks page type. This doesn't actually split the folio,
> +	 * so the first few struct pages are still intact.
> +	 */
> +	ret = hugetlb_vmemmap_restore_folio(folio_hstate(folio), folio);
>  	if (ret)
>  		return ret;
>  
>  	/*
> -	 * hugetlb_vmemmap_restore_folio() has to be called ahead of the rest
> -	 * because it checks and page type. This doesn't actually split the
> -	 * folio, so the first few struct pages are still intact.
> +	 * Stash metadata after vmemmap stuff so the outcome of the vmemmap
> +	 * restoration is stashed.
>  	 */
> -	ret = hugetlb_vmemmap_restore_folio(folio_hstate(folio), folio);
> +	ret = guestmem_hugetlb_stash_metadata(folio);
>  	if (ret)
>  		goto err;

Doh, I missed this before replying earlier. This definitely seems like
the cleaner fix as it pertains to other flags/state that potentially
becomes stale after calling hugetlb_vmemmap_restore_folio().

-Mike