[RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use

Posted by Ackerley Tng 7 months, 1 week ago
In this patch, newly allocated pages are split into regular 4K pages
before being provided to the requester (fallocate() or KVM).

During a private-to-shared conversion, folios are split if not already
split.

During a shared-to-private conversion, folios are merged if not
already merged.

When the folios are removed from the filemap on truncation, the
allocator is given a chance to do any necessary prep for when the
folio is freed.

When a conversion is requested on a subfolio within a hugepage range,
faulting must be prevented on the whole hugepage range for
correctness.

See related discussion at
https://lore.kernel.org/all/Z__AAB_EFxGFEjDR@google.com/T/
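
For illustration only (not part of the diff below), the restructuring policy
described above can be modeled as a small standalone function; all names in
this sketch are hypothetical:

#include <stdbool.h>
#include <stddef.h>

enum action { RESTRUCTURE_NONE, RESTRUCTURE_SPLIT, RESTRUCTURE_MERGE };

/* shared[i] is true when 4K subpage i of the huge folio is shared. */
static enum action restructure_policy(const bool *shared, size_t nr_pages,
                                      bool folio_is_split)
{
        bool any_shared = false;
        size_t i;

        for (i = 0; i < nr_pages; i++)
                any_shared |= shared[i];

        if (any_shared)
                return folio_is_split ? RESTRUCTURE_NONE : RESTRUCTURE_SPLIT;

        return folio_is_split ? RESTRUCTURE_MERGE : RESTRUCTURE_NONE;
}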

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Change-Id: Ib5ee22e3dae034c529773048a626ad98d4b10af3
---
 mm/filemap.c           |   2 +
 virt/kvm/guest_memfd.c | 501 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 483 insertions(+), 20 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index a02c3d8e00e8..a052f8e0c41e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -223,6 +223,7 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
 	filemap_unaccount_folio(mapping, folio);
 	page_cache_delete(mapping, folio, shadow);
 }
+EXPORT_SYMBOL_GPL(__filemap_remove_folio);
 
 void filemap_free_folio(struct address_space *mapping, struct folio *folio)
 {
@@ -258,6 +259,7 @@ void filemap_remove_folio(struct folio *folio)
 
 	filemap_free_folio(mapping, folio);
 }
+EXPORT_SYMBOL_GPL(filemap_remove_folio);
 
 /*
  * page_cache_delete_batch - delete several folios from page cache
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index c578d0ebe314..cb426c1dfef8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -41,6 +41,11 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
 				      pgoff_t end);
 static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 				    pgoff_t end);
+static int __kvm_gmem_filemap_add_folio(struct address_space *mapping,
+					struct folio *folio, pgoff_t index);
+static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
+						pgoff_t start, size_t nr_pages,
+						bool is_split_operation);
 
 static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
 {
@@ -126,6 +131,31 @@ static enum shareability kvm_gmem_shareability_get(struct inode *inode,
 	return xa_to_value(entry);
 }
 
+static bool kvm_gmem_shareability_in_range(struct inode *inode, pgoff_t start,
+					    size_t nr_pages, enum shareability m)
+{
+	struct maple_tree *mt;
+	pgoff_t last;
+	void *entry;
+
+	mt = &kvm_gmem_private(inode)->shareability;
+
+	last = start + nr_pages - 1;
+	mt_for_each(mt, entry, start, last) {
+		if (xa_to_value(entry) == m)
+			return true;
+	}
+
+	return false;
+}
+
+static inline bool kvm_gmem_has_some_shared(struct inode *inode, pgoff_t start,
+					    size_t nr_pages)
+{
+	return kvm_gmem_shareability_in_range(inode, start, nr_pages,
+					     SHAREABILITY_ALL);
+}
+
 static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
 {
 	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
@@ -241,6 +271,105 @@ static bool kvm_gmem_has_safe_refcount(struct address_space *mapping, pgoff_t st
 	return refcount_safe;
 }
 
+static void kvm_gmem_unmap_private(struct kvm_gmem *gmem, pgoff_t start,
+				   pgoff_t end)
+{
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+	bool locked = false;
+	bool flush = false;
+
+	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+		pgoff_t pgoff = slot->gmem.pgoff;
+
+		struct kvm_gfn_range gfn_range = {
+			.start = slot->base_gfn + max(pgoff, start) - pgoff,
+			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+			.slot = slot,
+			.may_block = true,
+			/* This function is only concerned with private mappings. */
+			.attr_filter = KVM_FILTER_PRIVATE,
+		};
+
+		if (!locked) {
+			KVM_MMU_LOCK(kvm);
+			locked = true;
+		}
+
+		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	if (locked)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+				      pgoff_t end)
+{
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+	bool found_memslot;
+
+	found_memslot = false;
+	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+		gfn_t gfn_start;
+		gfn_t gfn_end;
+		pgoff_t pgoff;
+
+		pgoff = slot->gmem.pgoff;
+
+		gfn_start = slot->base_gfn + max(pgoff, start) - pgoff;
+		gfn_end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff;
+
+		if (!found_memslot) {
+			found_memslot = true;
+
+			KVM_MMU_LOCK(kvm);
+			kvm_mmu_invalidate_begin(kvm);
+		}
+
+		kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
+	}
+
+	if (found_memslot)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static pgoff_t kvm_gmem_compute_invalidate_bound(struct inode *inode,
+						 pgoff_t bound, bool start)
+{
+	size_t nr_pages;
+	void *priv;
+
+	if (!kvm_gmem_has_custom_allocator(inode))
+		return bound;
+
+	priv = kvm_gmem_allocator_private(inode);
+	nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+
+	if (start)
+		return round_down(bound, nr_pages);
+	else
+		return round_up(bound, nr_pages);
+}
+
+static pgoff_t kvm_gmem_compute_invalidate_start(struct inode *inode,
+						 pgoff_t bound)
+{
+	return kvm_gmem_compute_invalidate_bound(inode, bound, true);
+}
+
+static pgoff_t kvm_gmem_compute_invalidate_end(struct inode *inode,
+					       pgoff_t bound)
+{
+	return kvm_gmem_compute_invalidate_bound(inode, bound, false);
+}
+
 static int kvm_gmem_shareability_apply(struct inode *inode,
 				       struct conversion_work *work,
 				       enum shareability m)
@@ -299,35 +428,53 @@ static void kvm_gmem_convert_invalidate_begin(struct inode *inode,
 					      struct conversion_work *work)
 {
 	struct list_head *gmem_list;
+	pgoff_t invalidate_start;
+	pgoff_t invalidate_end;
 	struct kvm_gmem *gmem;
-	pgoff_t end;
+	pgoff_t work_end;
 
-	end = work->start + work->nr_pages;
+	work_end = work->start + work->nr_pages;
+	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
+	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);
 
 	gmem_list = &inode->i_mapping->i_private_list;
 	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, work->start, end);
+		kvm_gmem_invalidate_begin(gmem, invalidate_start, invalidate_end);
 }
 
 static void kvm_gmem_convert_invalidate_end(struct inode *inode,
 					    struct conversion_work *work)
 {
 	struct list_head *gmem_list;
+	pgoff_t invalidate_start;
+	pgoff_t invalidate_end;
 	struct kvm_gmem *gmem;
-	pgoff_t end;
+	pgoff_t work_end;
 
-	end = work->start + work->nr_pages;
+	work_end = work->start + work->nr_pages;
+	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
+	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);
 
 	gmem_list = &inode->i_mapping->i_private_list;
 	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_end(gmem, work->start, end);
+		kvm_gmem_invalidate_end(gmem, invalidate_start, invalidate_end);
 }
 
 static int kvm_gmem_convert_should_proceed(struct inode *inode,
 					   struct conversion_work *work,
 					   bool to_shared, pgoff_t *error_index)
 {
-	if (!to_shared) {
+	if (to_shared) {
+		struct list_head *gmem_list;
+		struct kvm_gmem *gmem;
+		pgoff_t work_end;
+
+		work_end = work->start + work->nr_pages;
+
+		gmem_list = &inode->i_mapping->i_private_list;
+		list_for_each_entry(gmem, gmem_list, entry)
+			kvm_gmem_unmap_private(gmem, work->start, work_end);
+	} else {
 		unmap_mapping_pages(inode->i_mapping, work->start,
 				    work->nr_pages, false);
 
@@ -340,6 +487,27 @@ static int kvm_gmem_convert_should_proceed(struct inode *inode,
 	return 0;
 }
 
+static int kvm_gmem_convert_execute_work(struct inode *inode,
+					 struct conversion_work *work,
+					 bool to_shared)
+{
+	enum shareability m;
+	int ret;
+
+	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
+	ret = kvm_gmem_shareability_apply(inode, work, m);
+	if (ret)
+		return ret;
+	/*
+	 * Apply shareability first so split/merge can operate on new
+	 * shareability state.
+	 */
+	ret = kvm_gmem_restructure_folios_in_range(
+		inode, work->start, work->nr_pages, to_shared);
+
+	return ret;
+}
+
 static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
 				  size_t nr_pages, bool shared,
 				  pgoff_t *error_index)
@@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
 
 	list_for_each_entry(work, &work_list, list) {
 		rollback_stop_item = work;
-		ret = kvm_gmem_shareability_apply(inode, work, m);
+
+		ret = kvm_gmem_convert_execute_work(inode, work, shared);
 		if (ret)
 			break;
 	}
 
 	if (ret) {
-		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
 		list_for_each_entry(work, &work_list, list) {
+			int r;
+
+			r = kvm_gmem_convert_execute_work(inode, work, !shared);
+			WARN_ON(r);
+
 			if (work == rollback_stop_item)
 				break;
-
-			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
 		}
 	}
 
@@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
 	return ret;
 }
 
+#ifdef CONFIG_KVM_GMEM_HUGETLB
+
+static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
+{
+	struct address_space *mapping = folio->mapping;
+
+	spin_lock(&mapping->host->i_lock);
+	xa_lock_irq(&mapping->i_pages);
+
+	__filemap_remove_folio(folio, NULL);
+
+	xa_unlock_irq(&mapping->i_pages);
+	spin_unlock(&mapping->host->i_lock);
+}
+
+/**
+ * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
+ * split/merge.
+ *
+ * @folio: the folio to be removed.
+ *
+ * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
+ * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
+ *
+ * Context: Expects only the filemap's refcounts to be left on the folio. Will
+ *          freeze these refcounts away so that no other users will interfere
+ *          with restructuring.
+ */
+static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
+{
+	int filemap_refcount;
+
+	filemap_refcount = folio_nr_pages(folio);
+	while (!folio_ref_freeze(folio, filemap_refcount)) {
+		/*
+		 * At this point only filemap refcounts are expected, hence okay
+		 * to spin until speculative refcounts go away.
+		 */
+		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
+	}
+
+	folio_lock(folio);
+	__filemap_remove_folio_for_restructuring(folio);
+	folio_unlock(folio);
+}
+
+/**
+ * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
+ *
+ * @inode: inode containing the folio.
+ * @folio: folio to be split.
+ *
+ * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
+ * and add back the split folios.
+ *
+ * Context: Expects that before this call, folio's refcount is just the
+ *          filemap's refcounts. After this function returns, the split folios'
+ *          refcounts will also be filemap's refcounts.
+ * Return: 0 on success or negative error otherwise.
+ */
+static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
+{
+	size_t orig_nr_pages;
+	pgoff_t orig_index;
+	size_t i, j;
+	int ret;
+
+	orig_nr_pages = folio_nr_pages(folio);
+	if (orig_nr_pages == 1)
+		return 0;
+
+	orig_index = folio->index;
+
+	filemap_remove_folio_for_restructuring(folio);
+
+	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < orig_nr_pages; ++i) {
+		struct folio *f = page_folio(folio_page(folio, i));
+
+		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
+						   orig_index + i);
+		if (ret)
+			goto rollback;
+	}
+
+	return ret;
+
+rollback:
+	for (j = 0; j < i; ++j) {
+		struct folio *f = page_folio(folio_page(folio, j));
+
+		filemap_remove_folio_for_restructuring(f);
+	}
+
+	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
+err:
+	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
+
+	return ret;
+}
+
+static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
+						      struct folio *folio)
+{
+	size_t to_nr_pages;
+	void *priv;
+
+	if (!kvm_gmem_has_custom_allocator(inode))
+		return 0;
+
+	priv = kvm_gmem_allocator_private(inode);
+	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
+
+	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
+		return kvm_gmem_split_folio_in_filemap(inode, folio);
+
+	return 0;
+}
+
+/**
+ * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
+ * @inode.
+ *
+ * @inode: inode containing the folio.
+ * @first_folio: first folio among folios to be merged.
+ *
+ * Will clean up subfolios from filemap and add back the merged folio.
+ *
+ * Context: Expects that before this call, all subfolios only have filemap
+ *          refcounts. After this function returns, the merged folio will only
+ *          have filemap refcounts.
+ * Return: 0 on success or negative error otherwise.
+ */
+static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
+					   struct folio *first_folio)
+{
+	size_t to_nr_pages;
+	pgoff_t index;
+	void *priv;
+	size_t i;
+	int ret;
+
+	index = first_folio->index;
+
+	priv = kvm_gmem_allocator_private(inode);
+	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+	if (folio_nr_pages(first_folio) == to_nr_pages)
+		return 0;
+
+	for (i = 0; i < to_nr_pages; ++i) {
+		struct folio *f = page_folio(folio_page(first_folio, i));
+
+		filemap_remove_folio_for_restructuring(f);
+	}
+
+	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
+
+	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
+	if (ret)
+		goto err_split;
+
+	return ret;
+
+err_split:
+	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
+	for (i = 0; i < to_nr_pages; ++i) {
+		struct folio *f = page_folio(folio_page(first_folio, i));
+
+		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
+	}
+
+	return ret;
+}
+
+static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
+						      struct folio *first_folio)
+{
+	size_t to_nr_pages;
+	void *priv;
+
+	priv = kvm_gmem_allocator_private(inode);
+	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+
+	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
+		return 0;
+
+	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
+}
+
+static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
+						pgoff_t start, size_t nr_pages,
+						bool is_split_operation)
+{
+	size_t to_nr_pages;
+	pgoff_t index;
+	pgoff_t end;
+	void *priv;
+	int ret;
+
+	if (!kvm_gmem_has_custom_allocator(inode))
+		return 0;
+
+	end = start + nr_pages;
+
+	/* Round to allocator page size, to check all (huge) pages in range. */
+	priv = kvm_gmem_allocator_private(inode);
+	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+
+	start = round_down(start, to_nr_pages);
+	end = round_up(end, to_nr_pages);
+
+	for (index = start; index < end; index += to_nr_pages) {
+		struct folio *f;
+
+		f = filemap_get_folio(inode->i_mapping, index);
+		if (IS_ERR(f))
+			continue;
+
+		/* Leave just filemap's refcounts on the folio. */
+		folio_put(f);
+
+		if (is_split_operation)
+			ret = kvm_gmem_split_folio_in_filemap(inode, f);
+		else
+			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
+
+		if (ret)
+			goto rollback;
+	}
+	return ret;
+
+rollback:
+	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
+		struct folio *f;
+
+		f = filemap_get_folio(inode->i_mapping, index);
+		if (IS_ERR(f))
+			continue;
+
+		/* Leave just filemap's refcounts on the folio. */
+		folio_put(f);
+
+		if (is_split_operation)
+			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
+		else
+			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
+	}
+
+	return ret;
+}
+
+#else
+
+static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
+						      struct folio *folio)
+{
+	return 0;
+}
+
+static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
+						pgoff_t start, size_t nr_pages,
+						bool is_split_operation)
+{
+	return 0;
+}
+
+#endif
+
 #else
 
 static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
@@ -563,11 +1005,16 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 		return folio;
 
 	if (kvm_gmem_has_custom_allocator(inode)) {
-		void *p = kvm_gmem_allocator_private(inode);
+		size_t nr_pages;
+		void *p;
 
+		p = kvm_gmem_allocator_private(inode);
 		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
 		if (IS_ERR(folio))
 			return folio;
+
+		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
+		index_floor = round_down(index, nr_pages);
 	} else {
 		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 
@@ -580,10 +1027,11 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 			folio_put(folio);
 			return ERR_PTR(ret);
 		}
+
+		index_floor = index;
 	}
 	allocated_size = folio_size(folio);
 
-	index_floor = round_down(index, folio_nr_pages(folio));
 	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
 	if (ret) {
 		folio_put(folio);
@@ -600,6 +1048,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 		return ERR_PTR(ret);
 	}
 
+	/* Leave just filemap's refcounts on folio. */
+	folio_put(folio);
+
+	ret = kvm_gmem_try_split_folio_in_filemap(inode, folio);
+	if (ret)
+		goto err;
+
 	spin_lock(&inode->i_lock);
 	inode->i_blocks += allocated_size / 512;
 	spin_unlock(&inode->i_lock);
@@ -608,14 +1063,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 	 * folio is the one that is allocated, this gets the folio at the
 	 * requested index.
 	 */
-	folio = page_folio(folio_file_page(folio, index));
-	folio_lock(folio);
+	folio = filemap_lock_folio(inode->i_mapping, index);
 
 	return folio;
+
+err:
+	filemap_remove_folio(folio);
+	return ERR_PTR(ret);
 }
 
-static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
-				      pgoff_t end)
+static void kvm_gmem_invalidate_begin_and_zap(struct kvm_gmem *gmem,
+					      pgoff_t start, pgoff_t end)
 {
 	bool flush = false, found_memslot = false;
 	struct kvm_memory_slot *slot;
@@ -848,7 +1306,7 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	filemap_invalidate_lock(inode->i_mapping);
 
 	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
+		kvm_gmem_invalidate_begin_and_zap(gmem, start, end);
 
 	if (kvm_gmem_has_custom_allocator(inode)) {
 		kvm_gmem_truncate_inode_range(inode, offset, offset + len);
@@ -978,7 +1436,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
 	 * memory, as its lifetime is associated with the inode, not the file.
 	 */
-	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+	kvm_gmem_invalidate_begin_and_zap(gmem, 0, -1ul);
 	kvm_gmem_invalidate_end(gmem, 0, -1ul);
 
 	list_del(&gmem->entry);
@@ -1289,7 +1747,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 	end = start + folio_nr_pages(folio);
 
 	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
+		kvm_gmem_invalidate_begin_and_zap(gmem, start, end);
 
 	/*
 	 * Do not truncate the range, what action is taken in response to the
@@ -1330,6 +1788,9 @@ static void kvm_gmem_free_folio(struct address_space *mapping,
 	 */
 	folio_clear_uptodate(folio);
 
+	if (kvm_gmem_has_custom_allocator(mapping->host))
+		kvm_gmem_allocator_ops(mapping->host)->free_folio(folio);
+
 	kvm_gmem_invalidate(folio);
 }
 
-- 
2.49.0.1045.g170613ef41-goog
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Binbin Wu 6 months, 2 weeks ago

On 5/15/2025 7:42 AM, Ackerley Tng wrote:
[...]
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						      struct folio *folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))

What if the attribute of the whole huge page is shared?

> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
> +
> +	return 0;
> +}
> +
[...]
>   
>   static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> @@ -563,11 +1005,16 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>   		return folio;
>   
>   	if (kvm_gmem_has_custom_allocator(inode)) {
> -		void *p = kvm_gmem_allocator_private(inode);
> +		size_t nr_pages;
> +		void *p;
>   
> +		p = kvm_gmem_allocator_private(inode);
>   		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
>   		if (IS_ERR(folio))
>   			return folio;
> +
> +		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
> +		index_floor = round_down(index, nr_pages);
>   	} else {
>   		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
>   
> @@ -580,10 +1027,11 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>   			folio_put(folio);
>   			return ERR_PTR(ret);
>   		}
> +
> +		index_floor = index;
>   	}
>   	allocated_size = folio_size(folio);
>   
> -	index_floor = round_down(index, folio_nr_pages(folio));
>   	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
>   	if (ret) {
>   		folio_put(folio);
> @@ -600,6 +1048,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>   		return ERR_PTR(ret);
>   	}
>   
> +	/* Leave just filemap's refcounts on folio. */
> +	folio_put(folio);
> +
> +	ret = kvm_gmem_try_split_folio_in_filemap(inode, folio);

When !CONFIG_KVM_GMEM_SHARED_MEM, kvm_gmem_try_split_folio_in_filemap() is
undefined.

> +	if (ret)
> +		goto err;
> +
>   	spin_lock(&inode->i_lock);
>   	inode->i_blocks += allocated_size / 512;
>   	spin_unlock(&inode->i_lock);
>
[...]
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Ackerley Tng 6 months, 2 weeks ago
Binbin Wu <binbin.wu@linux.intel.com> writes:

> On 5/15/2025 7:42 AM, Ackerley Tng wrote:
> [...]
>> +
>> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
>> +						      struct folio *folio)
>> +{
>> +	size_t to_nr_pages;
>> +	void *priv;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return 0;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
>> +
>> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
>
> What if the attribute of the whole huge page is shared?
>

This checks if there are any shared pages in the range [folio->index,
folio->index + to_nr_pages), so if the entire huge page is shared this
function should also return true.

folio->index is the start of the merged huge page, and to_nr_pages is
the number of pages in the merged huge page, so this should be querying
exactly the entire huge page.

Note to self: rename kvm_gmem_has_some_shared() to
kvm_gmem_has_any_shared() in the next revision.

Hope I answered your question! Let me know if I misunderstood your question.

>> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
>> +
>> +	return 0;
>> +}
>> +
> [...]
>>   
>>   static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
>> @@ -563,11 +1005,16 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>>   		return folio;
>>   
>>   	if (kvm_gmem_has_custom_allocator(inode)) {
>> -		void *p = kvm_gmem_allocator_private(inode);
>> +		size_t nr_pages;
>> +		void *p;
>>   
>> +		p = kvm_gmem_allocator_private(inode);
>>   		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
>>   		if (IS_ERR(folio))
>>   			return folio;
>> +
>> +		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
>> +		index_floor = round_down(index, nr_pages);
>>   	} else {
>>   		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
>>   
>> @@ -580,10 +1027,11 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>>   			folio_put(folio);
>>   			return ERR_PTR(ret);
>>   		}
>> +
>> +		index_floor = index;
>>   	}
>>   	allocated_size = folio_size(folio);
>>   
>> -	index_floor = round_down(index, folio_nr_pages(folio));
>>   	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
>>   	if (ret) {
>>   		folio_put(folio);
>> @@ -600,6 +1048,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>>   		return ERR_PTR(ret);
>>   	}
>>   
>> +	/* Leave just filemap's refcounts on folio. */
>> +	folio_put(folio);
>> +
>> +	ret = kvm_gmem_try_split_folio_in_filemap(inode, folio);
>
> When !CONFIG_KVM_GMEM_SHARED_MEM, kvm_gmem_try_split_folio_in_filemap() is
> undefined.
>

Will fix this in the next revision. Thanks!

>> +	if (ret)
>> +		goto err;
>> +
>>   	spin_lock(&inode->i_lock);
>>   	inode->i_blocks += allocated_size / 512;
>>   	spin_unlock(&inode->i_lock);
>>
> [...]
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Yan Zhao 6 months, 3 weeks ago
On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:
> +static int kvm_gmem_convert_execute_work(struct inode *inode,
> +					 struct conversion_work *work,
> +					 bool to_shared)
> +{
> +	enum shareability m;
> +	int ret;
> +
> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
> +	ret = kvm_gmem_shareability_apply(inode, work, m);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * Apply shareability first so split/merge can operate on new
> +	 * shareability state.
> +	 */
> +	ret = kvm_gmem_restructure_folios_in_range(
> +		inode, work->start, work->nr_pages, to_shared);
> +
> +	return ret;
> +}
> +
>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>  				  size_t nr_pages, bool shared,
>  				  pgoff_t *error_index)
> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>  
>  	list_for_each_entry(work, &work_list, list) {
>  		rollback_stop_item = work;
> -		ret = kvm_gmem_shareability_apply(inode, work, m);
> +
> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
>  		if (ret)
>  			break;
>  	}
>  
>  	if (ret) {
> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
>  		list_for_each_entry(work, &work_list, list) {
> +			int r;
> +
> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
> +			WARN_ON(r);
> +
>  			if (work == rollback_stop_item)
>  				break;
> -
> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
Could kvm_gmem_shareability_apply() fail here?

>  		}
>  	}
>  
> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> +
> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	struct address_space *mapping = folio->mapping;
> +
> +	spin_lock(&mapping->host->i_lock);
> +	xa_lock_irq(&mapping->i_pages);
> +
> +	__filemap_remove_folio(folio, NULL);
> +
> +	xa_unlock_irq(&mapping->i_pages);
> +	spin_unlock(&mapping->host->i_lock);
> +}
> +
> +/**
> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
> + * split/merge.
> + *
> + * @folio: the folio to be removed.
> + *
> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
> + *
> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
> + *          freeze these refcounts away so that no other users will interfere
> + *          with restructuring.
> + */
> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	int filemap_refcount;
> +
> +	filemap_refcount = folio_nr_pages(folio);
> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
> +		/*
> +		 * At this point only filemap refcounts are expected, hence okay
> +		 * to spin until speculative refcounts go away.
> +		 */
> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
> +	}
> +
> +	folio_lock(folio);
> +	__filemap_remove_folio_for_restructuring(folio);
> +	folio_unlock(folio);
> +}
> +
> +/**
> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
> + *
> + * @inode: inode containing the folio.
> + * @folio: folio to be split.
> + *
> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
> + * and add back the split folios.
> + *
> + * Context: Expects that before this call, folio's refcount is just the
> + *          filemap's refcounts. After this function returns, the split folios'
> + *          refcounts will also be filemap's refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
> +{
> +	size_t orig_nr_pages;
> +	pgoff_t orig_index;
> +	size_t i, j;
> +	int ret;
> +
> +	orig_nr_pages = folio_nr_pages(folio);
> +	if (orig_nr_pages == 1)
> +		return 0;
> +
> +	orig_index = folio->index;
> +
> +	filemap_remove_folio_for_restructuring(folio);
> +
> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
> +	if (ret)
> +		goto err;
> +
> +	for (i = 0; i < orig_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(folio, i));
> +
> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
> +						   orig_index + i);
Why does the failure of __kvm_gmem_filemap_add_folio() here lead to rollback,    
while the failure of the one under rollback only triggers WARN_ON()?

> +		if (ret)
> +			goto rollback;
> +	}
> +
> +	return ret;
> +
> +rollback:
> +	for (j = 0; j < i; ++j) {
> +		struct folio *f = page_folio(folio_page(folio, j));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
> +err:
> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						      struct folio *folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
> +
> +	return 0;
> +}
> +
> +/**
> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
> + * @inode.
> + *
> + * @inode: inode containing the folio.
> + * @first_folio: first folio among folios to be merged.
> + *
> + * Will clean up subfolios from filemap and add back the merged folio.
> + *
> + * Context: Expects that before this call, all subfolios only have filemap
> + *          refcounts. After this function returns, the merged folio will only
> + *          have filemap refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
> +					   struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	void *priv;
> +	size_t i;
> +	int ret;
> +
> +	index = first_folio->index;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +	if (folio_nr_pages(first_folio) == to_nr_pages)
> +		return 0;
> +
> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
> +
> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
> +	if (ret)
> +		goto err_split;
> +
> +	return ret;
> +
> +err_split:
> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
guestmem_hugetlb_split_folio() is possible to fail. e.g.
After the stash is freed by guestmem_hugetlb_unstash_free_metadata() in
guestmem_hugetlb_merge_folio(), it's possible to get -ENOMEM for the stash
allocation in guestmem_hugetlb_stash_metadata() in
guestmem_hugetlb_split_folio().


> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
> +	}
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
> +						      struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
> +		return 0;
> +
> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	pgoff_t end;
> +	void *priv;
> +	int ret;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	end = start + nr_pages;
> +
> +	/* Round to allocator page size, to check all (huge) pages in range. */
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	start = round_down(start, to_nr_pages);
> +	end = round_up(end, to_nr_pages);
> +
> +	for (index = start; index < end; index += to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
kvm_gmem_try_split_folio_in_filemap()?

> +		else
> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
> +
> +		if (ret)
> +			goto rollback;
> +	}
> +	return ret;
> +
> +rollback:
> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
> +		else
> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
Is it safe to just leave WARN_ON()s in the rollback case?

Besides, are the kvm_gmem_merge_folio_in_filemap() and
kvm_gmem_split_folio_in_filemap() here duplicated with the
kvm_gmem_split_folio_in_filemap() and kvm_gmem_try_merge_folio_in_filemap() in
the following "r = kvm_gmem_convert_execute_work(inode, work, !shared)"?

> +	}
> +
> +	return ret;
> +}
> +
> +#else
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						      struct folio *folio)
> +{
> +	return 0;
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Ackerley Tng 6 months, 2 weeks ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:
>> +static int kvm_gmem_convert_execute_work(struct inode *inode,
>> +					 struct conversion_work *work,
>> +					 bool to_shared)
>> +{
>> +	enum shareability m;
>> +	int ret;
>> +
>> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
>> +	ret = kvm_gmem_shareability_apply(inode, work, m);
>> +	if (ret)
>> +		return ret;
>> +	/*
>> +	 * Apply shareability first so split/merge can operate on new
>> +	 * shareability state.
>> +	 */
>> +	ret = kvm_gmem_restructure_folios_in_range(
>> +		inode, work->start, work->nr_pages, to_shared);
>> +
>> +	return ret;
>> +}
>> +

Hi Yan,

Thanks for your thorough reviews and your alternative suggestion in the
other discussion at [1]! I'll try to bring the conversion-related parts
of that discussion over here.

>>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>>  				  size_t nr_pages, bool shared,
>>  				  pgoff_t *error_index)

The guiding principle I was using for the conversion ioctls is

* Have the shareability updates and any necessary page restructuring
  (aka splitting/merging) either fully complete or not at all by the
  time the conversion ioctl returns.
* Any unmapping (from host or guest page tables) will not be re-mapped
  on errors.
* Rollback undoes changes if conversion failed, and in those cases any
  errors are turned into WARNings.

The rationale is that we want page sizes to be in sync with shareability
so that any faults after the (successful or failed) conversion will not
wrongly map in a larger page than allowed and cause any host crashes.

We considered 3 places where the memory can be mapped for conversions:

1. Host page tables
2. Guest page tables
3. IOMMU page tables

Unmapping from host page tables is the simplest case. We unmap any
shared ranges from the host page tables. Any accesses after the failed
conversion would just fault the memory back in and proceed as usual.

guest_memfd memory is not unmapped from IOMMUs in conversions. This case
is handled because IOMMU mappings hold refcounts. After unmapping from
the host, we check for unexpected refcounts and fail if there are
unexpected refcounts.

We also unmap from guest page tables. Considering failed conversions, if
the pages are shared, we're good since the next time the guest accesses
the page, the page will be faulted in as before.

If the pages are private, on the next guest access, the pages will be
faulted in again as well. This is fine for software-protected VMs IIUC.

For TDX (and SNP) IIUC the memory would have been cleared, and the
memory would also need to be re-accepted. I was thinking that this is by
design, since when a TDX guest requests a conversion it knows that the
contents is not to be used again.

The userspace VMM is obligated to keep trying convert and if it gives
up, userspace VMM should inform the guest that the conversion
failed. The guest should handle conversion failures too and not assume
that conversion always succeeds.

Putting TDX aside for a moment, so far, there are a few ways this
conversion could fail:

a. Unexpected refcounts. Userspace should clear up the unexpected
   refcounts and report failure to the guest if it can't for whatever
   reason.
b. ENOMEM because (i) we ran out of memory updating the shareability
   maple_tree or (ii) since splitting involves allocating more memory
   for struct pages and we ran out of memory there. In this case the
   userspace VMM gets -ENOMEM and can make more memory available and
   then retry, or if it can't, also report failure to the guest.
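
One hypothetical userspace-side shape for this retry policy (sketch only:
convert_range() and the other helpers are assumed wrappers, and the actual
ioctl name and error codes used by this series may differ):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helpers, not from this series. */
int convert_range(int gmem_fd, uint64_t offset, uint64_t len, bool to_shared);
bool try_free_up_memory(void);
bool drop_stray_refcounts(int gmem_fd, uint64_t offset, uint64_t len);

static int convert_with_retries(int gmem_fd, uint64_t offset, uint64_t len,
                                bool to_shared, int max_retries)
{
        int i, ret = -EINVAL;

        for (i = 0; i < max_retries; i++) {
                ret = convert_range(gmem_fd, offset, len, to_shared);
                if (!ret)
                        return 0;               /* converted */
                if (ret == -ENOMEM && try_free_up_memory())
                        continue;               /* case (b): retry */
                if (ret == -EAGAIN && drop_stray_refcounts(gmem_fd, offset, len))
                        continue;               /* case (a): retry */
                break;
        }

        /* Give up: report the failed conversion back to the guest. */
        return ret;
}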

TDX introduces TDX-specific conversion failures (see discussion at
[1]), which this series doesn't handle, but I think we still have a line
of sight to handle new errors.

In the other thread [1], I was proposing to have guest_memfd decide what
to do on errors, but I think that might be baking more TDX-specific
details into guest_memfd/KVM, and perhaps this is better:

We could return the errors to userspace and let userspace determine what
to do. For retryable errors (as determined by userspace), it should do
what it needs to do, and retry. For errors like TDX being unable to
reclaim the memory, it could tell guest_memfd to leak that memory.

If userspace gives up, it should report conversion failure to the guest
if userspace thinks the guest can continue (to a clean shutdown or
otherwise). If something terrible happened during conversion, then
userspace might have to exit itself or shutdown the host.

In [2], for TDX-specific conversion failures, you proposed prepping to
eliminate errors and exiting early on failure, then actually
unmapping. I think that could work too.

I'm a little concerned that prepping could be complicated, since the
nature of conversion depends on the current state of shareability, and
there's a lot to prepare, everything from counting memory required for
maple_tree allocation (and merging ranges in the maple_tree), and
counting the number of pages required for undoing vmemmap optimization
in the case of splitting...

And even after doing all the prep to eliminate errors, the unmapping
could fail in TDX-specific cases anyway, which still needs to be
handled.

Hence I'm hoping you'll consider letting TDX-specific failures be
built-in and handled alongside other failures by getting help from the
userspace VMM, and in the worst case letting the guest know the
conversion failed.

I also appreciate comments or suggestions from anyone else!

[1] https://lore.kernel.org/all/diqzfrgfp95d.fsf@ackerleytng-ctop.c.googlers.com/
[2] https://lore.kernel.org/all/aEEEJbTzlncbRaRA@yzhao56-desk.sh.intel.com/

>> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>>  
>>  	list_for_each_entry(work, &work_list, list) {
>>  		rollback_stop_item = work;
>> -		ret = kvm_gmem_shareability_apply(inode, work, m);
>> +
>> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
>>  		if (ret)
>>  			break;
>>  	}
>>  
>>  	if (ret) {
>> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
>>  		list_for_each_entry(work, &work_list, list) {
>> +			int r;
>> +
>> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
>> +			WARN_ON(r);
>> +
>>  			if (work == rollback_stop_item)
>>  				break;
>> -
>> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
> Could kvm_gmem_shareability_apply() fail here?
>

Yes, it could. If shareability cannot be updated, then we probably ran
out of memory. Userspace VMM will probably get -ENOMEM set on some
earlier ret and should handle that accordingly.

On -ENOMEM in a rollback, the host is in a very tough spot anyway, and a
clean guest shutdown may be the only way out, hence this is a WARN and
not returned to userspace.

>>  		}
>>  	}
>>  
>> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
>>  	return ret;
>>  }
>>  
>> +#ifdef CONFIG_KVM_GMEM_HUGETLB
>> +
>> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
>> +{
>> +	struct address_space *mapping = folio->mapping;
>> +
>> +	spin_lock(&mapping->host->i_lock);
>> +	xa_lock_irq(&mapping->i_pages);
>> +
>> +	__filemap_remove_folio(folio, NULL);
>> +
>> +	xa_unlock_irq(&mapping->i_pages);
>> +	spin_unlock(&mapping->host->i_lock);
>> +}
>> +
>> +/**
>> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
>> + * split/merge.
>> + *
>> + * @folio: the folio to be removed.
>> + *
>> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
>> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
>> + *
>> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
>> + *          freeze these refcounts away so that no other users will interfere
>> + *          with restructuring.
>> + */
>> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
>> +{
>> +	int filemap_refcount;
>> +
>> +	filemap_refcount = folio_nr_pages(folio);
>> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
>> +		/*
>> +		 * At this point only filemap refcounts are expected, hence okay
>> +		 * to spin until speculative refcounts go away.
>> +		 */
>> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
>> +	}
>> +
>> +	folio_lock(folio);
>> +	__filemap_remove_folio_for_restructuring(folio);
>> +	folio_unlock(folio);
>> +}
>> +
>> +/**
>> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
>> + *
>> + * @inode: inode containing the folio.
>> + * @folio: folio to be split.
>> + *
>> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
>> + * and add back the split folios.
>> + *
>> + * Context: Expects that before this call, folio's refcount is just the
>> + *          filemap's refcounts. After this function returns, the split folios'
>> + *          refcounts will also be filemap's refcounts.
>> + * Return: 0 on success or negative error otherwise.
>> + */
>> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
>> +{
>> +	size_t orig_nr_pages;
>> +	pgoff_t orig_index;
>> +	size_t i, j;
>> +	int ret;
>> +
>> +	orig_nr_pages = folio_nr_pages(folio);
>> +	if (orig_nr_pages == 1)
>> +		return 0;
>> +
>> +	orig_index = folio->index;
>> +
>> +	filemap_remove_folio_for_restructuring(folio);
>> +
>> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
>> +	if (ret)
>> +		goto err;
>> +
>> +	for (i = 0; i < orig_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(folio, i));
>> +
>> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
>> +						   orig_index + i);
> Why does the failure of __kvm_gmem_filemap_add_folio() here lead to rollback,    
> while the failure of the one under rollback only triggers WARN_ON()?
>

Mostly because I don't really have a choice on rollback. On rollback we
try to restore the merged folio back into the filemap, and if we
couldn't, the host is probably in rather bad shape in terms of memory
availability and there may not be many options for the userspace VMM.

Perhaps the different possible errors from
__kvm_gmem_filemap_add_folio() in both should be handled differently. Do
you have any suggestions on that?

>> +		if (ret)
>> +			goto rollback;
>> +	}
>> +
>> +	return ret;
>> +
>> +rollback:
>> +	for (j = 0; j < i; ++j) {
>> +		struct folio *f = page_folio(folio_page(folio, j));
>> +
>> +		filemap_remove_folio_for_restructuring(f);
>> +	}
>> +
>> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
>> +err:
>> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
>> +
>> +	return ret;
>> +}
>> +
>> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
>> +						      struct folio *folio)
>> +{
>> +	size_t to_nr_pages;
>> +	void *priv;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return 0;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
>> +
>> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
>> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
>> + * @inode.
>> + *
>> + * @inode: inode containing the folio.
>> + * @first_folio: first folio among folios to be merged.
>> + *
>> + * Will clean up subfolios from filemap and add back the merged folio.
>> + *
>> + * Context: Expects that before this call, all subfolios only have filemap
>> + *          refcounts. After this function returns, the merged folio will only
>> + *          have filemap refcounts.
>> + * Return: 0 on success or negative error otherwise.
>> + */
>> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
>> +					   struct folio *first_folio)
>> +{
>> +	size_t to_nr_pages;
>> +	pgoff_t index;
>> +	void *priv;
>> +	size_t i;
>> +	int ret;
>> +
>> +	index = first_folio->index;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +	if (folio_nr_pages(first_folio) == to_nr_pages)
>> +		return 0;
>> +
>> +	for (i = 0; i < to_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(first_folio, i));
>> +
>> +		filemap_remove_folio_for_restructuring(f);
>> +	}
>> +
>> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
>> +
>> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
>> +	if (ret)
>> +		goto err_split;
>> +
>> +	return ret;
>> +
>> +err_split:
>> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
> guestmem_hugetlb_split_folio() is possible to fail. e.g.
> After the stash is freed by guestmem_hugetlb_unstash_free_metadata() in
> guestmem_hugetlb_merge_folio(), it's possible to get -ENOMEM for the stash
> allocation in guestmem_hugetlb_stash_metadata() in
> guestmem_hugetlb_split_folio().
>
>

Yes. This is also on the error path. In line with all the other error
and rollback paths, I don't really have other options at this point,
since on error, I probably ran out of memory, so I try my best to
restore the original state but give up with a WARN otherwise.

>> +	for (i = 0; i < to_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(first_folio, i));
>> +
>> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
>> +						      struct folio *first_folio)
>> +{
>> +	size_t to_nr_pages;
>> +	void *priv;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
>> +		return 0;
>> +
>> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
>> +}
>> +
>> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
>> +						pgoff_t start, size_t nr_pages,
>> +						bool is_split_operation)
>> +{
>> +	size_t to_nr_pages;
>> +	pgoff_t index;
>> +	pgoff_t end;
>> +	void *priv;
>> +	int ret;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return 0;
>> +
>> +	end = start + nr_pages;
>> +
>> +	/* Round to allocator page size, to check all (huge) pages in range. */
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	start = round_down(start, to_nr_pages);
>> +	end = round_up(end, to_nr_pages);
>> +
>> +	for (index = start; index < end; index += to_nr_pages) {
>> +		struct folio *f;
>> +
>> +		f = filemap_get_folio(inode->i_mapping, index);
>> +		if (IS_ERR(f))
>> +			continue;
>> +
>> +		/* Leave just filemap's refcounts on the folio. */
>> +		folio_put(f);
>> +
>> +		if (is_split_operation)
>> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
> kvm_gmem_try_split_folio_in_filemap()?
>

Here we know for sure that this was a private-to-shared
conversion. Hence, we know that there are at least some shared parts in
this huge page and we can skip checking that. 

>> +		else
>> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
>> +

For merge, we don't know if the entire huge page might perhaps contain
some other shared subpages, hence we "try" to merge by first checking
against shareability to find shared subpages.

>> +		if (ret)
>> +			goto rollback;
>> +	}
>> +	return ret;
>> +
>> +rollback:
>> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {

Note to self: the first index -= to_nr_pages was meant to skip the index
that caused the failure, but this could cause an underflow if index = 0
when entering rollback. Need to fix this in the next revision.
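
One underflow-safe shape for that walk (sketch only, reusing the names
above, since pgoff_t is unsigned):

        while (index > start) {
                index -= to_nr_pages;
                /* filemap_get_folio(), folio_put(), then undo split/merge */
        }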

>> +		struct folio *f;
>> +
>> +		f = filemap_get_folio(inode->i_mapping, index);
>> +		if (IS_ERR(f))
>> +			continue;
>> +
>> +		/* Leave just filemap's refcounts on the folio. */
>> +		folio_put(f);
>> +
>> +		if (is_split_operation)
>> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
>> +		else
>> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
> Is it safe to just leave WARN_ON()s in the rollback case?
>

Same as above. I don't think we have much of a choice.

> Besides, are the kvm_gmem_merge_folio_in_filemap() and
> kvm_gmem_split_folio_in_filemap() here duplicated with the
> kvm_gmem_split_folio_in_filemap() and kvm_gmem_try_merge_folio_in_filemap() in
> the following "r = kvm_gmem_convert_execute_work(inode, work, !shared)"?
>

This handles the case where some pages in the range [start, start +
nr_pages) were split and the failure was halfway through. I could call
kvm_gmem_convert_execute_work() with !shared but that would go over all
the folios again from the start.

>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +#else
>> +
>> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
>> +						      struct folio *folio)
>> +{
>> +	return 0;
>> +}
>> +
>> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
>> +						pgoff_t start, size_t nr_pages,
>> +						bool is_split_operation)
>> +{
>> +	return 0;
>> +}
>> +
>> +#endif
>> +
>
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Yan Zhao 6 months ago
On Thu, Jun 05, 2025 at 12:10:08PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:
> >> +static int kvm_gmem_convert_execute_work(struct inode *inode,
> >> +					 struct conversion_work *work,
> >> +					 bool to_shared)
> >> +{
> >> +	enum shareability m;
> >> +	int ret;
> >> +
> >> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
> >> +	ret = kvm_gmem_shareability_apply(inode, work, m);
> >> +	if (ret)
> >> +		return ret;
> >> +	/*
> >> +	 * Apply shareability first so split/merge can operate on new
> >> +	 * shareability state.
> >> +	 */
> >> +	ret = kvm_gmem_restructure_folios_in_range(
> >> +		inode, work->start, work->nr_pages, to_shared);
> >> +
> >> +	return ret;
> >> +}
> >> +
> 
> Hi Yan,
> 
> Thanks for your thorough reviews and your alternative suggestion in the
> other discussion at [1]! I'll try to bring the conversion-related parts
> of that discussion over here.
> 
> >>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
> >>  				  size_t nr_pages, bool shared,
> >>  				  pgoff_t *error_index)
> 
> The guiding principle I was using for the conversion ioctls is
> 
> * Have the shareability updates and any necessary page restructuring
>   (aka splitting/merging) either fully complete or not at all by the
>   time the conversion ioctl returns.
> * Any unmapping (from host or guest page tables) will not be re-mapped
>   on errors.
> * Rollback undoes changes if conversion failed, and in those cases any
>   errors are turned into WARNings.
> 
> The rationale is that we want page sizes to be in sync with shareability
> so that any faults after the (successful or failed) conversion will not
> wrongly map in a larger page than allowed and cause any host crashes.
> 
> We considered 3 places where the memory can be mapped for conversions:
> 
> 1. Host page tables
> 2. Guest page tables
> 3. IOMMU page tables
> 
> Unmapping from host page tables is the simplest case. We unmap any
> shared ranges from the host page tables. Any accesses after the failed
> conversion would just fault the memory back in and proceed as usual.
> 
> guest_memfd memory is not unmapped from IOMMUs in conversions. This case
> is handled because IOMMU mappings hold refcounts. After unmapping from
> the host, we check for unexpected refcounts and fail if there are
> unexpected refcounts.
> 
> We also unmap from guest page tables. Considering failed conversions, if
> the pages are shared, we're good since the next time the guest accesses
> the page, the page will be faulted in as before.
> 
> If the pages are private, on the next guest access, the pages will be
> faulted in again as well. This is fine for software-protected VMs IIUC.
> 
> For TDX (and SNP) IIUC the memory would have been cleared, and the
> memory would also need to be re-accepted. I was thinking that this is by
> design, since when a TDX guest requests a conversion it knows that the
> contents is not to be used again.
This is not guaranteed.

On private-to-shared conversion failure, the guest may either leak the page or
release it. If the guest chooses the latter (e.g. in kvmclock_init_mem(),
kvm_arch_ptp_init()), the page is still regarded as private by the guest OS,
so re-acceptance will not happen before the guest accesses it.

So, it's better for the host to keep the original SEPT if a private-to-shared
conversion fails.


 
> The userspace VMM is obligated to keep trying convert and if it gives
> up, userspace VMM should inform the guest that the conversion
> failed. The guest should handle conversion failures too and not assume
> that conversion always succeeds.
I don't think relying on userspace to keep retrying the conversion endlessly
is a good design.

> 
> Putting TDX aside for a moment, so far, there are a few ways this
> conversion could fail:
> 
> a. Unexpected refcounts. Userspace should clear up the unexpected
>    refcounts and report failure to the guest if it can't for whatever
>    reason.
This is acceptable. Unmapping shared mappings in the primary MMU or shared EPT
is harmless.

> b. ENOMEM because (i) we ran out of memory updating the shareability
>    maple_tree or (ii) since splitting involves allocating more memory
>    for struct pages and we ran out of memory there. In this case the
>    userspace VMM gets -ENOMEM and can make more memory available and
>    then retry, or if it can't, also report failure to the guest.
This is unacceptable. Why not reserve the memory before deciding to start the
real conversion? If -ENOMEM is returned before the conversion is executed, we
don't need to handle a restore error, which is impossible to handle
gracefully.
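
As a rough illustration of that suggestion (not code from this series; the
kvm_gmem_reserve_*/kvm_gmem_commit_* helpers below are hypothetical), the
conversion could be structured as reserve-then-commit, so that every
allocation that can fail happens before anything is changed:

        /*
         * Hypothetical sketch only. The reserve step would preallocate
         * maple_tree nodes for the shareability update and any metadata
         * needed for folio splitting, so the commit loop below cannot
         * hit -ENOMEM and never needs a rollback.
         */
        ret = kvm_gmem_reserve_for_conversion(inode, &work_list);
        if (ret)
                return ret;     /* nothing has been modified yet */

        list_for_each_entry(work, &work_list, list) {
                /* Only operations that cannot fail (short of kernel bugs). */
                kvm_gmem_commit_shareability(inode, work, m);
                kvm_gmem_commit_restructure(inode, work);
        }

Whether all of the restructuring (e.g. re-stashing HugeTLB vmemmap metadata
for a split) can really be made infallible after a single reservation step is
exactly what is being debated in this thread.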


> TDX introduces TDX-specific conversion failures (see discussion at
> [1]), which this series doesn't handle, but I think we still have a line
> of sight to handle new errors.
The errors can be divided into two categories:
(1) errors due to kernel bugs
(2) errors that could occur in normal or bad conditions (e.g. the -ENOMEM)

We can't handle (1), so a BUG_ON or leaking memory is allowed.
However, we should try to avoid (2), especially in the rollback path.


> In the other thread [1], I was proposing to have guest_memfd decide what
> to do on errors, but I think that might be baking more TDX-specific
> details into guest_memfd/KVM, and perhaps this is better:
The TDX-specific errors in the unmapping path are of category (1).
So, we hope to resolve them with a BUG_ON and by leaking the memory.

The other conversion error for TDX comes from splitting memory. We hope to
do the splitting before executing the real conversion.

Please check the proposal details at
https://lore.kernel.org/all/aE%2Fq9VKkmaCcuwpU@yzhao56-desk.sh.intel.com.

> We could return the errors to userspace and let userspace determine what
> to do. For retryable errors (as determined by userspace), it should do
> what it needs to do, and retry. For errors like TDX being unable to
> reclaim the memory, it could tell guest_memfd to leak that memory.
> 
> If userspace gives up, it should report conversion failure to the guest
> if userspace thinks the guest can continue (to a clean shutdown or
> otherwise). If something terrible happened during conversion, then
> userspace might have to exit itself or shutdown the host.
> 
> In [2], for TDX-specific conversion failures, you proposed prepping to
> eliminate errors and exiting early on failure, then actually
> unmapping. I think that could work too.
> 
> I'm a little concerned that prepping could be complicated, since the
> nature of conversion depends on the current state of shareability, and
> there's a lot to prepare, everything from counting memory required for
> maple_tree allocation (and merging ranges in the maple_tree), and
> counting the number of pages required for undoing vmemmap optimization
> in the case of splitting...
> 
> And even after doing all the prep to eliminate errors, the unmapping
> could fail in TDX-specific cases anyway, which still needs to be
> handled.
> 
> Hence I'm hoping you'll consider to let TDX-specific failures be
> built-in and handled alongside other failures by getting help from the
> userspace VMM, and in the worst case letting the guest know the
> conversion failed.
> 
> I also appreciate comments or suggestions from anyone else!
> 
> [1] https://lore.kernel.org/all/diqzfrgfp95d.fsf@ackerleytng-ctop.c.googlers.com/
> [2] https://lore.kernel.org/all/aEEEJbTzlncbRaRA@yzhao56-desk.sh.intel.com/
> 
> >> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
> >>  
> >>  	list_for_each_entry(work, &work_list, list) {
> >>  		rollback_stop_item = work;
> >> -		ret = kvm_gmem_shareability_apply(inode, work, m);
> >> +
> >> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
> >>  		if (ret)
> >>  			break;
> >>  	}
> >>  
> >>  	if (ret) {
> >> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
> >>  		list_for_each_entry(work, &work_list, list) {
> >> +			int r;
> >> +
> >> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
> >> +			WARN_ON(r);
> >> +
> >>  			if (work == rollback_stop_item)
> >>  				break;
> >> -
> >> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
> > Could kvm_gmem_shareability_apply() fail here?
> >
> 
> Yes, it could. If shareability cannot be updated, then we probably ran
> out of memory. Userspace VMM will probably get -ENOMEM set on some
> earlier ret and should handle that accordingly.
> 
> On -ENOMEM in a rollback, the host is in a very tough spot anyway, and a
> clean guest shutdown may be the only way out, hence this is a WARN and
> not returned to userspace.
> 
> >>  		}
> >>  	}
> >>  
> >> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
> >>  	return ret;
> >>  }
> >>  
> >> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> >> +
> >> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
> >> +{
> >> +	struct address_space *mapping = folio->mapping;
> >> +
> >> +	spin_lock(&mapping->host->i_lock);
> >> +	xa_lock_irq(&mapping->i_pages);
> >> +
> >> +	__filemap_remove_folio(folio, NULL);
> >> +
> >> +	xa_unlock_irq(&mapping->i_pages);
> >> +	spin_unlock(&mapping->host->i_lock);
> >> +}
> >> +
> >> +/**
> >> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
> >> + * split/merge.
> >> + *
> >> + * @folio: the folio to be removed.
> >> + *
> >> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
> >> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
> >> + *
> >> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
> >> + *          freeze these refcounts away so that no other users will interfere
> >> + *          with restructuring.
> >> + */
> >> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
> >> +{
> >> +	int filemap_refcount;
> >> +
> >> +	filemap_refcount = folio_nr_pages(folio);
> >> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
> >> +		/*
> >> +		 * At this point only filemap refcounts are expected, hence okay
> >> +		 * to spin until speculative refcounts go away.
> >> +		 */
> >> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
> >> +	}
> >> +
> >> +	folio_lock(folio);
> >> +	__filemap_remove_folio_for_restructuring(folio);
> >> +	folio_unlock(folio);
> >> +}
> >> +
> >> +/**
> >> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
> >> + *
> >> + * @inode: inode containing the folio.
> >> + * @folio: folio to be split.
> >> + *
> >> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
> >> + * and add back the split folios.
> >> + *
> >> + * Context: Expects that before this call, folio's refcount is just the
> >> + *          filemap's refcounts. After this function returns, the split folios'
> >> + *          refcounts will also be filemap's refcounts.
> >> + * Return: 0 on success or negative error otherwise.
> >> + */
> >> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
> >> +{
> >> +	size_t orig_nr_pages;
> >> +	pgoff_t orig_index;
> >> +	size_t i, j;
> >> +	int ret;
> >> +
> >> +	orig_nr_pages = folio_nr_pages(folio);
> >> +	if (orig_nr_pages == 1)
> >> +		return 0;
> >> +
> >> +	orig_index = folio->index;
> >> +
> >> +	filemap_remove_folio_for_restructuring(folio);
> >> +
> >> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
> >> +	if (ret)
> >> +		goto err;
> >> +
> >> +	for (i = 0; i < orig_nr_pages; ++i) {
> >> +		struct folio *f = page_folio(folio_page(folio, i));
> >> +
> >> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
> >> +						   orig_index + i);
> > Why does the failure of __kvm_gmem_filemap_add_folio() here lead to rollback,    
> > while the failure of the one under rollback only triggers WARN_ON()?
> >
> 
> Mostly because I don't really have a choice on rollback. On rollback we
> try to restore the merged folio back into the filemap, and if we
> couldn't, the host is probably in rather bad shape in terms of memory
> availability and there may not be many options for the userspace VMM.
Out of memory is not a good excuse for a rollback error.

> Perhaps the different possible errors from
> __kvm_gmem_filemap_add_folio() in both should be handled differently. Do
> you have any suggestions on that?
Maybe reserve the memory, or rule out other factors that could lead to
conversion failure, before executing the conversion?
Then BUG_ON if the failure is caused by a bug.

> >> +		if (ret)
> >> +			goto rollback;
> >> +	}
> >> +
> >> +	return ret;
> >> +
> >> +rollback:
> >> +	for (j = 0; j < i; ++j) {
> >> +		struct folio *f = page_folio(folio_page(folio, j));
> >> +
> >> +		filemap_remove_folio_for_restructuring(f);
> >> +	}
> >> +
> >> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
> >> +err:
> >> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> >> +						      struct folio *folio)
> >> +{
> >> +	size_t to_nr_pages;
> >> +	void *priv;
> >> +
> >> +	if (!kvm_gmem_has_custom_allocator(inode))
> >> +		return 0;
> >> +
> >> +	priv = kvm_gmem_allocator_private(inode);
> >> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
> >> +
> >> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
> >> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +/**
> >> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
> >> + * @inode.
> >> + *
> >> + * @inode: inode containing the folio.
> >> + * @first_folio: first folio among folios to be merged.
> >> + *
> >> + * Will clean up subfolios from filemap and add back the merged folio.
> >> + *
> >> + * Context: Expects that before this call, all subfolios only have filemap
> >> + *          refcounts. After this function returns, the merged folio will only
> >> + *          have filemap refcounts.
> >> + * Return: 0 on success or negative error otherwise.
> >> + */
> >> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
> >> +					   struct folio *first_folio)
> >> +{
> >> +	size_t to_nr_pages;
> >> +	pgoff_t index;
> >> +	void *priv;
> >> +	size_t i;
> >> +	int ret;
> >> +
> >> +	index = first_folio->index;
> >> +
> >> +	priv = kvm_gmem_allocator_private(inode);
> >> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> >> +	if (folio_nr_pages(first_folio) == to_nr_pages)
> >> +		return 0;
> >> +
> >> +	for (i = 0; i < to_nr_pages; ++i) {
> >> +		struct folio *f = page_folio(folio_page(first_folio, i));
> >> +
> >> +		filemap_remove_folio_for_restructuring(f);
> >> +	}
> >> +
> >> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
> >> +
> >> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
> >> +	if (ret)
> >> +		goto err_split;
> >> +
> >> +	return ret;
> >> +
> >> +err_split:
> >> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
> > guestmem_hugetlb_split_folio() is possible to fail. e.g.
> > After the stash is freed by guestmem_hugetlb_unstash_free_metadata() in
> > guestmem_hugetlb_merge_folio(), it's possible to get -ENOMEM for the stash
> > allocation in guestmem_hugetlb_stash_metadata() in
> > guestmem_hugetlb_split_folio().
> >
> >
> 
> Yes. This is also on the error path. In line with all the other error
> and rollback paths, I don't really have other options at this point,
> since on error, I probably ran out of memory, so I try my best to
> restore the original state but give up with a WARN otherwise.
> 
> >> +	for (i = 0; i < to_nr_pages; ++i) {
> >> +		struct folio *f = page_folio(folio_page(first_folio, i));
> >> +
> >> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
> >> +						      struct folio *first_folio)
> >> +{
> >> +	size_t to_nr_pages;
> >> +	void *priv;
> >> +
> >> +	priv = kvm_gmem_allocator_private(inode);
> >> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> >> +
> >> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
> >> +		return 0;
> >> +
> >> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
> >> +}
> >> +
> >> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> >> +						pgoff_t start, size_t nr_pages,
> >> +						bool is_split_operation)
> >> +{
> >> +	size_t to_nr_pages;
> >> +	pgoff_t index;
> >> +	pgoff_t end;
> >> +	void *priv;
> >> +	int ret;
> >> +
> >> +	if (!kvm_gmem_has_custom_allocator(inode))
> >> +		return 0;
> >> +
> >> +	end = start + nr_pages;
> >> +
> >> +	/* Round to allocator page size, to check all (huge) pages in range. */
> >> +	priv = kvm_gmem_allocator_private(inode);
> >> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> >> +
> >> +	start = round_down(start, to_nr_pages);
> >> +	end = round_up(end, to_nr_pages);
> >> +
> >> +	for (index = start; index < end; index += to_nr_pages) {
> >> +		struct folio *f;
> >> +
> >> +		f = filemap_get_folio(inode->i_mapping, index);
> >> +		if (IS_ERR(f))
> >> +			continue;
> >> +
> >> +		/* Leave just filemap's refcounts on the folio. */
> >> +		folio_put(f);
> >> +
> >> +		if (is_split_operation)
> >> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
> > kvm_gmem_try_split_folio_in_filemap()?
> >
> 
> Here we know for sure that this was a private-to-shared
> conversion. Hence, we know that there are at least some shared parts in
> this huge page and we can skip checking that. 
Ok.

> >> +		else
> >> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
> >> +
> 
> For merge, we don't know if the entire huge page might perhaps contain
> some other shared subpages, hence we "try" to merge by first checking
> against shareability to find shared subpages.
Makes sense.


> >> +		if (ret)
> >> +			goto rollback;
> >> +	}
> >> +	return ret;
> >> +
> >> +rollback:
> >> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
> 
> Note to self: the first index -= to_nr_pages was meant to skip the index
> that caused the failure, but this could cause an underflow if index = 0
> when entering rollback. Need to fix this in the next revision.
Yes :)
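
One way to avoid the underflow (a sketch against the loop quoted above, not
necessarily the fix that will land in the next revision) is to decrement
before the bounds check, so the failing index is still skipped but index can
never wrap below start:

        /* Roll back every huge page before the one that failed. */
        while (index > start) {
                struct folio *f;

                index -= to_nr_pages;

                f = filemap_get_folio(inode->i_mapping, index);
                if (IS_ERR(f))
                        continue;

                /* Leave just filemap's refcounts on the folio. */
                folio_put(f);

                if (is_split_operation)
                        WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
                else
                        WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
        }

Since pgoff_t is unsigned, this also avoids the endless loop that the
"index >= start" condition would allow when start == 0.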

> >> +		struct folio *f;
> >> +
> >> +		f = filemap_get_folio(inode->i_mapping, index);
> >> +		if (IS_ERR(f))
> >> +			continue;
> >> +
> >> +		/* Leave just filemap's refcounts on the folio. */
> >> +		folio_put(f);
> >> +
> >> +		if (is_split_operation)
> >> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
> >> +		else
> >> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
> > Is it safe to just leave WARN_ON()s in the rollback case?
> >
> 
> Same as above. I don't think we have much of a choice.
> 
> > Besides, are the kvm_gmem_merge_folio_in_filemap() and
> > kvm_gmem_split_folio_in_filemap() here duplicated with the
> > kvm_gmem_split_folio_in_filemap() and kvm_gmem_try_merge_folio_in_filemap() in
> > the following "r = kvm_gmem_convert_execute_work(inode, work, !shared)"?
> >
> 
> This handles the case where some pages in the range [start, start +
> nr_pages) were split and the failure was halfway through. I could call
> kvm_gmem_convert_execute_work() with !shared but that would go over all
> the folios again from the start.
> 
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +#else
> >> +
> >> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> >> +						      struct folio *folio)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> >> +						pgoff_t start, size_t nr_pages,
> >> +						bool is_split_operation)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +#endif
> >> +
> >
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Yan Zhao 6 months, 3 weeks ago
On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:
> In this patch, newly allocated pages are split to 4K regular pages
> before providing them to the requester (fallocate() or KVM).
> 
> During a private to shared conversion, folios are split if not already
> split.
> 
> During a shared to private conversion, folios are merged if not
> already merged.
> 
> When the folios are removed from the filemap on truncation, the
> allocator is given a chance to do any necessary prep for when the
> folio is freed.
> 
> When a conversion is requested on a subfolio within a hugepage range,
> faulting must be prevented on the whole hugepage range for
> correctness.
> 
> See related discussion at
> https://lore.kernel.org/all/Z__AAB_EFxGFEjDR@google.com/T/
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Change-Id: Ib5ee22e3dae034c529773048a626ad98d4b10af3
> ---
>  mm/filemap.c           |   2 +
>  virt/kvm/guest_memfd.c | 501 +++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 483 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a02c3d8e00e8..a052f8e0c41e 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -223,6 +223,7 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
>  	filemap_unaccount_folio(mapping, folio);
>  	page_cache_delete(mapping, folio, shadow);
>  }
> +EXPORT_SYMBOL_GPL(__filemap_remove_folio);
>  
>  void filemap_free_folio(struct address_space *mapping, struct folio *folio)
>  {
> @@ -258,6 +259,7 @@ void filemap_remove_folio(struct folio *folio)
>  
>  	filemap_free_folio(mapping, folio);
>  }
> +EXPORT_SYMBOL_GPL(filemap_remove_folio);
>  
>  /*
>   * page_cache_delete_batch - delete several folios from page cache
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index c578d0ebe314..cb426c1dfef8 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -41,6 +41,11 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>  				      pgoff_t end);
>  static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
>  				    pgoff_t end);
> +static int __kvm_gmem_filemap_add_folio(struct address_space *mapping,
> +					struct folio *folio, pgoff_t index);
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation);
>  
>  static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>  {
> @@ -126,6 +131,31 @@ static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>  	return xa_to_value(entry);
>  }
>  
> +static bool kvm_gmem_shareability_in_range(struct inode *inode, pgoff_t start,
> +					    size_t nr_pages, enum shareability m)
> +{
> +	struct maple_tree *mt;
> +	pgoff_t last;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +
> +	last = start + nr_pages - 1;
> +	mt_for_each(mt, entry, start, last) {
> +		if (xa_to_value(entry) == m)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static inline bool kvm_gmem_has_some_shared(struct inode *inode, pgoff_t start,
> +					    size_t nr_pages)
> +{
> +	return kvm_gmem_shareability_in_range(inode, start, nr_pages,
> +					     SHAREABILITY_ALL);
> +}
> +
>  static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>  {
>  	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> @@ -241,6 +271,105 @@ static bool kvm_gmem_has_safe_refcount(struct address_space *mapping, pgoff_t st
>  	return refcount_safe;
>  }
>  
> +static void kvm_gmem_unmap_private(struct kvm_gmem *gmem, pgoff_t start,
> +				   pgoff_t end)
> +{
> +	struct kvm_memory_slot *slot;
> +	struct kvm *kvm = gmem->kvm;
> +	unsigned long index;
> +	bool locked = false;
> +	bool flush = false;
> +
> +	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +		pgoff_t pgoff = slot->gmem.pgoff;
> +
> +		struct kvm_gfn_range gfn_range = {
> +			.start = slot->base_gfn + max(pgoff, start) - pgoff,
> +			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> +			.slot = slot,
> +			.may_block = true,
> +			/* This function is only concerned with private mappings. */
> +			.attr_filter = KVM_FILTER_PRIVATE,
> +		};
> +
> +		if (!locked) {
> +			KVM_MMU_LOCK(kvm);
> +			locked = true;
> +		}
> +
> +		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> +	}
> +
> +	if (flush)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	if (locked)
> +		KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> +				      pgoff_t end)
> +{
> +	struct kvm_memory_slot *slot;
> +	struct kvm *kvm = gmem->kvm;
> +	unsigned long index;
> +	bool found_memslot;
> +
> +	found_memslot = false;
> +	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +		gfn_t gfn_start;
> +		gfn_t gfn_end;
> +		pgoff_t pgoff;
> +
> +		pgoff = slot->gmem.pgoff;
> +
> +		gfn_start = slot->base_gfn + max(pgoff, start) - pgoff;
> +		gfn_end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff;
> +		if (!found_memslot) {
> +			found_memslot = true;
> +
> +			KVM_MMU_LOCK(kvm);
> +			kvm_mmu_invalidate_begin(kvm);
> +		}
> +
> +		kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
> +	}
> +
> +	if (found_memslot)
> +		KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static pgoff_t kvm_gmem_compute_invalidate_bound(struct inode *inode,
> +						 pgoff_t bound, bool start)
> +{
> +	size_t nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return bound;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	if (start)
> +		return round_down(bound, nr_pages);
> +	else
> +		return round_up(bound, nr_pages);
> +}
> +
> +static pgoff_t kvm_gmem_compute_invalidate_start(struct inode *inode,
> +						 pgoff_t bound)
> +{
> +	return kvm_gmem_compute_invalidate_bound(inode, bound, true);
> +}
> +
> +static pgoff_t kvm_gmem_compute_invalidate_end(struct inode *inode,
> +					       pgoff_t bound)
> +{
> +	return kvm_gmem_compute_invalidate_bound(inode, bound, false);
> +}
> +
>  static int kvm_gmem_shareability_apply(struct inode *inode,
>  				       struct conversion_work *work,
>  				       enum shareability m)
> @@ -299,35 +428,53 @@ static void kvm_gmem_convert_invalidate_begin(struct inode *inode,
>  					      struct conversion_work *work)
>  {
>  	struct list_head *gmem_list;
> +	pgoff_t invalidate_start;
> +	pgoff_t invalidate_end;
>  	struct kvm_gmem *gmem;
> -	pgoff_t end;
> +	pgoff_t work_end;
>  
> -	end = work->start + work->nr_pages;
> +	work_end = work->start + work->nr_pages;
> +	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
> +	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);
Could we just notify the exact gfn range and let KVM adjust the invalidate
range?

Then kvm_gmem_invalidate_begin() can ask KVM to do EPT splitting before any
kvm_mmu_unmap_gfn_range() is performed.


>  	gmem_list = &inode->i_mapping->i_private_list;
>  	list_for_each_entry(gmem, gmem_list, entry)
> -		kvm_gmem_invalidate_begin(gmem, work->start, end);
> +		kvm_gmem_invalidate_begin(gmem, invalidate_start, invalidate_end);
>  }
>  
>  static void kvm_gmem_convert_invalidate_end(struct inode *inode,
>  					    struct conversion_work *work)
>  {
>  	struct list_head *gmem_list;
> +	pgoff_t invalidate_start;
> +	pgoff_t invalidate_end;
>  	struct kvm_gmem *gmem;
> -	pgoff_t end;
> +	pgoff_t work_end;
>  
> -	end = work->start + work->nr_pages;
> +	work_end = work->start + work->nr_pages;
> +	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
> +	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);
>  
>  	gmem_list = &inode->i_mapping->i_private_list;
>  	list_for_each_entry(gmem, gmem_list, entry)
> -		kvm_gmem_invalidate_end(gmem, work->start, end);
> +		kvm_gmem_invalidate_end(gmem, invalidate_start, invalidate_end);
>  }
>  
>  static int kvm_gmem_convert_should_proceed(struct inode *inode,
>  					   struct conversion_work *work,
>  					   bool to_shared, pgoff_t *error_index)
>  {
> -	if (!to_shared) {
> +	if (to_shared) {
> +		struct list_head *gmem_list;
> +		struct kvm_gmem *gmem;
> +		pgoff_t work_end;
> +
> +		work_end = work->start + work->nr_pages;
> +
> +		gmem_list = &inode->i_mapping->i_private_list;
> +		list_for_each_entry(gmem, gmem_list, entry)
> +			kvm_gmem_unmap_private(gmem, work->start, work_end);
> +	} else {
>  		unmap_mapping_pages(inode->i_mapping, work->start,
>  				    work->nr_pages, false);
>  
> @@ -340,6 +487,27 @@ static int kvm_gmem_convert_should_proceed(struct inode *inode,
>  	return 0;
>  }
>  
> +static int kvm_gmem_convert_execute_work(struct inode *inode,
> +					 struct conversion_work *work,
> +					 bool to_shared)
> +{
> +	enum shareability m;
> +	int ret;
> +
> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
> +	ret = kvm_gmem_shareability_apply(inode, work, m);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * Apply shareability first so split/merge can operate on new
> +	 * shareability state.
> +	 */
> +	ret = kvm_gmem_restructure_folios_in_range(
> +		inode, work->start, work->nr_pages, to_shared);
> +
> +	return ret;
> +}
> +
>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>  				  size_t nr_pages, bool shared,
>  				  pgoff_t *error_index)
> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>  
>  	list_for_each_entry(work, &work_list, list) {
>  		rollback_stop_item = work;
> -		ret = kvm_gmem_shareability_apply(inode, work, m);
> +
> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
>  		if (ret)
>  			break;
>  	}
>  
>  	if (ret) {
> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
>  		list_for_each_entry(work, &work_list, list) {
> +			int r;
> +
> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
> +			WARN_ON(r);
> +
>  			if (work == rollback_stop_item)
>  				break;
> -
> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
>  		}
>  	}
>  
> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> +
> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	struct address_space *mapping = folio->mapping;
> +
> +	spin_lock(&mapping->host->i_lock);
> +	xa_lock_irq(&mapping->i_pages);
> +
> +	__filemap_remove_folio(folio, NULL);
> +
> +	xa_unlock_irq(&mapping->i_pages);
> +	spin_unlock(&mapping->host->i_lock);
> +}
> +
> +/**
> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
> + * split/merge.
> + *
> + * @folio: the folio to be removed.
> + *
> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
> + *
> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
> + *          freeze these refcounts away so that no other users will interfere
> + *          with restructuring.
> + */
> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	int filemap_refcount;
> +
> +	filemap_refcount = folio_nr_pages(folio);
> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
> +		/*
> +		 * At this point only filemap refcounts are expected, hence okay
> +		 * to spin until speculative refcounts go away.
> +		 */
> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
> +	}
> +
> +	folio_lock(folio);
> +	__filemap_remove_folio_for_restructuring(folio);
> +	folio_unlock(folio);
> +}
> +
> +/**
> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
> + *
> + * @inode: inode containing the folio.
> + * @folio: folio to be split.
> + *
> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
> + * and add back the split folios.
> + *
> + * Context: Expects that before this call, folio's refcount is just the
> + *          filemap's refcounts. After this function returns, the split folios'
> + *          refcounts will also be filemap's refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
> +{
> +	size_t orig_nr_pages;
> +	pgoff_t orig_index;
> +	size_t i, j;
> +	int ret;
> +
> +	orig_nr_pages = folio_nr_pages(folio);
> +	if (orig_nr_pages == 1)
> +		return 0;
> +
> +	orig_index = folio->index;
> +
> +	filemap_remove_folio_for_restructuring(folio);
> +
> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
> +	if (ret)
> +		goto err;
> +
> +	for (i = 0; i < orig_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(folio, i));
> +
> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
> +						   orig_index + i);
> +		if (ret)
> +			goto rollback;
> +	}
> +
> +	return ret;
> +
> +rollback:
> +	for (j = 0; j < i; ++j) {
> +		struct folio *f = page_folio(folio_page(folio, j));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
> +err:
> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						      struct folio *folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
If the guest_memfd is configured with GUESTMEM_HUGETLB_FLAG_1GB, it seems that
whenever there's a shared page within a 1GB range, the folio will always be
split into 4KB folios. Is it good?

> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
> +
> +	return 0;
> +}
> +
> +/**
> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
> + * @inode.
> + *
> + * @inode: inode containing the folio.
> + * @first_folio: first folio among folios to be merged.
> + *
> + * Will clean up subfolios from filemap and add back the merged folio.
> + *
> + * Context: Expects that before this call, all subfolios only have filemap
> + *          refcounts. After this function returns, the merged folio will only
> + *          have filemap refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
> +					   struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	void *priv;
> +	size_t i;
> +	int ret;
> +
> +	index = first_folio->index;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +	if (folio_nr_pages(first_folio) == to_nr_pages)
> +		return 0;
> +
> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
> +
> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
> +	if (ret)
> +		goto err_split;
> +
> +	return ret;
> +
> +err_split:
> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
> +	}
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
> +						      struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
> +		return 0;
> +
> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	pgoff_t end;
> +	void *priv;
> +	int ret;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	end = start + nr_pages;
> +
> +	/* Round to allocator page size, to check all (huge) pages in range. */
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	start = round_down(start, to_nr_pages);
> +	end = round_up(end, to_nr_pages);
> +
> +	for (index = start; index < end; index += to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
The split operation is performed after kvm_gmem_unmap_private() within
kvm_gmem_convert_should_proceed(), right?

So, it seems that it's not necessary for TDX to avoid holding private page
references, as TDX must have released the page refs after
kvm_gmem_unmap_private() (except when there's a TDX module or KVM bug).

> +		else
> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
> +
> +		if (ret)
> +			goto rollback;
> +	}
> +	return ret;
> +
> +rollback:
> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
> +		else
> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
> +	}
> +
> +	return ret;
> +}
> +
> +#else
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						      struct folio *folio)
> +{
> +	return 0;
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
>  #else
>  
>  static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> @@ -563,11 +1005,16 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  		return folio;
>  
>  	if (kvm_gmem_has_custom_allocator(inode)) {
> -		void *p = kvm_gmem_allocator_private(inode);
> +		size_t nr_pages;
> +		void *p;
>  
> +		p = kvm_gmem_allocator_private(inode);
>  		folio = kvm_gmem_allocator_ops(inode)->alloc_folio(p);
>  		if (IS_ERR(folio))
>  			return folio;
> +
> +		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
> +		index_floor = round_down(index, nr_pages);
>  	} else {
>  		gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
>  
> @@ -580,10 +1027,11 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  			folio_put(folio);
>  			return ERR_PTR(ret);
>  		}
> +
> +		index_floor = index;
>  	}
>  	allocated_size = folio_size(folio);
>  
> -	index_floor = round_down(index, folio_nr_pages(folio));
>  	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
>  	if (ret) {
>  		folio_put(folio);
> @@ -600,6 +1048,13 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  		return ERR_PTR(ret);
>  	}
>  
> +	/* Leave just filemap's refcounts on folio. */
> +	folio_put(folio);
> +
> +	ret = kvm_gmem_try_split_folio_in_filemap(inode, folio);
> +	if (ret)
> +		goto err;
> +
>  	spin_lock(&inode->i_lock);
>  	inode->i_blocks += allocated_size / 512;
>  	spin_unlock(&inode->i_lock);
> @@ -608,14 +1063,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  	 * folio is the one that is allocated, this gets the folio at the
>  	 * requested index.
>  	 */
> -	folio = page_folio(folio_file_page(folio, index));
> -	folio_lock(folio);
> +	folio = filemap_lock_folio(inode->i_mapping, index);
>  
>  	return folio;
> +
> +err:
> +	filemap_remove_folio(folio);
> +	return ERR_PTR(ret);
>  }
>  
> -static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> -				      pgoff_t end)
> +static void kvm_gmem_invalidate_begin_and_zap(struct kvm_gmem *gmem,
> +					      pgoff_t start, pgoff_t end)
>  {
>  	bool flush = false, found_memslot = false;
>  	struct kvm_memory_slot *slot;
> @@ -848,7 +1306,7 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	filemap_invalidate_lock(inode->i_mapping);
>  
>  	list_for_each_entry(gmem, gmem_list, entry)
> -		kvm_gmem_invalidate_begin(gmem, start, end);
> +		kvm_gmem_invalidate_begin_and_zap(gmem, start, end);
>  
>  	if (kvm_gmem_has_custom_allocator(inode)) {
>  		kvm_gmem_truncate_inode_range(inode, offset, offset + len);
> @@ -978,7 +1436,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>  	 * Zap all SPTEs pointed at by this file.  Do not free the backing
>  	 * memory, as its lifetime is associated with the inode, not the file.
>  	 */
> -	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> +	kvm_gmem_invalidate_begin_and_zap(gmem, 0, -1ul);
>  	kvm_gmem_invalidate_end(gmem, 0, -1ul);
>  
>  	list_del(&gmem->entry);
> @@ -1289,7 +1747,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
>  	end = start + folio_nr_pages(folio);
>  
>  	list_for_each_entry(gmem, gmem_list, entry)
> -		kvm_gmem_invalidate_begin(gmem, start, end);
> +		kvm_gmem_invalidate_begin_and_zap(gmem, start, end);
>  
>  	/*
>  	 * Do not truncate the range, what action is taken in response to the
> @@ -1330,6 +1788,9 @@ static void kvm_gmem_free_folio(struct address_space *mapping,
>  	 */
>  	folio_clear_uptodate(folio);
>  
> +	if (kvm_gmem_has_custom_allocator(mapping->host))
> +		kvm_gmem_allocator_ops(mapping->host)->free_folio(folio);
> +
>  	kvm_gmem_invalidate(folio);
>  }
>  
> -- 
> 2.49.0.1045.g170613ef41-goog
>
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Ackerley Tng 6 months, 2 weeks ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:

[...]

>> +static pgoff_t kvm_gmem_compute_invalidate_bound(struct inode *inode,
>> +						 pgoff_t bound, bool start)
>> +{
>> +	size_t nr_pages;
>> +	void *priv;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return bound;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	if (start)
>> +		return round_down(bound, nr_pages);
>> +	else
>> +		return round_up(bound, nr_pages);
>> +}
>> +
>> +static pgoff_t kvm_gmem_compute_invalidate_start(struct inode *inode,
>> +						 pgoff_t bound)
>> +{
>> +	return kvm_gmem_compute_invalidate_bound(inode, bound, true);
>> +}
>> +
>> +static pgoff_t kvm_gmem_compute_invalidate_end(struct inode *inode,
>> +					       pgoff_t bound)
>> +{
>> +	return kvm_gmem_compute_invalidate_bound(inode, bound, false);
>> +}
>> +
>>  static int kvm_gmem_shareability_apply(struct inode *inode,
>>  				       struct conversion_work *work,
>>  				       enum shareability m)
>> @@ -299,35 +428,53 @@ static void kvm_gmem_convert_invalidate_begin(struct inode *inode,
>>  					      struct conversion_work *work)
>>  {
>>  	struct list_head *gmem_list;
>> +	pgoff_t invalidate_start;
>> +	pgoff_t invalidate_end;
>>  	struct kvm_gmem *gmem;
>> -	pgoff_t end;
>> +	pgoff_t work_end;
>>  
>> -	end = work->start + work->nr_pages;
>> +	work_end = work->start + work->nr_pages;
>> +	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
>> +	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);

The invalidation range is broadened to cover full hugepage (allocator folio)
ranges to take care of this race [1] reported for the conversion flow that
uses the KVM_SET_MEMORY_ATTRIBUTES ioctl, so I repeated the broadening for
this guest_memfd conversion ioctl as well.

> Could we just notify the exact gfn range and let KVM adjust the invalidate
> range?
>

How do we get KVM to adjust the invalidate range?

> Then kvm_gmem_invalidate_begin() can ask KVM to do EPT splitting before any
> kvm_mmu_unmap_gfn_range() is performed.
>
>

In this snapshot of my WIP combining this HugeTLB support with TDX huge
page EPT support [2], I was thinking of doing the EPT splitting together
with the unmap, and leaving the invalidate as a separate part. (See
kvm_gmem_unmap_private().) I did it this way so that the EPT splitting
range is the unmapping range, and only the invalidation range is
broadened.

What do you think of that?

>>  	gmem_list = &inode->i_mapping->i_private_list;
>>  	list_for_each_entry(gmem, gmem_list, entry)
>> -		kvm_gmem_invalidate_begin(gmem, work->start, end);
>> +		kvm_gmem_invalidate_begin(gmem, invalidate_start, invalidate_end);
>>  }
>>  
>>  static void kvm_gmem_convert_invalidate_end(struct inode *inode,
>>  					    struct conversion_work *work)
>>  {
>>  	struct list_head *gmem_list;
>> +	pgoff_t invalidate_start;
>> +	pgoff_t invalidate_end;
>>  	struct kvm_gmem *gmem;
>> -	pgoff_t end;
>> +	pgoff_t work_end;
>>  
>> -	end = work->start + work->nr_pages;
>> +	work_end = work->start + work->nr_pages;
>> +	invalidate_start = kvm_gmem_compute_invalidate_start(inode, work->start);
>> +	invalidate_end = kvm_gmem_compute_invalidate_end(inode, work_end);
>>  
>>  	gmem_list = &inode->i_mapping->i_private_list;
>>  	list_for_each_entry(gmem, gmem_list, entry)
>> -		kvm_gmem_invalidate_end(gmem, work->start, end);
>> +		kvm_gmem_invalidate_end(gmem, invalidate_start, invalidate_end);
>>  }
>>  
>>  static int kvm_gmem_convert_should_proceed(struct inode *inode,
>>  					   struct conversion_work *work,
>>  					   bool to_shared, pgoff_t *error_index)
>>  {
>> -	if (!to_shared) {
>> +	if (to_shared) {
>> +		struct list_head *gmem_list;
>> +		struct kvm_gmem *gmem;
>> +		pgoff_t work_end;
>> +
>> +		work_end = work->start + work->nr_pages;
>> +
>> +		gmem_list = &inode->i_mapping->i_private_list;
>> +		list_for_each_entry(gmem, gmem_list, entry)
>> +			kvm_gmem_unmap_private(gmem, work->start, work_end);
>> +	} else {
>>  		unmap_mapping_pages(inode->i_mapping, work->start,
>>  				    work->nr_pages, false);
>>  
>> @@ -340,6 +487,27 @@ static int kvm_gmem_convert_should_proceed(struct inode *inode,
>>  	return 0;
>>  }
>>  
>> +static int kvm_gmem_convert_execute_work(struct inode *inode,
>> +					 struct conversion_work *work,
>> +					 bool to_shared)
>> +{
>> +	enum shareability m;
>> +	int ret;
>> +
>> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
>> +	ret = kvm_gmem_shareability_apply(inode, work, m);
>> +	if (ret)
>> +		return ret;
>> +	/*
>> +	 * Apply shareability first so split/merge can operate on new
>> +	 * shareability state.
>> +	 */
>> +	ret = kvm_gmem_restructure_folios_in_range(
>> +		inode, work->start, work->nr_pages, to_shared);
>> +
>> +	return ret;
>> +}
>> +
>>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>>  				  size_t nr_pages, bool shared,
>>  				  pgoff_t *error_index)
>> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>>  
>>  	list_for_each_entry(work, &work_list, list) {
>>  		rollback_stop_item = work;
>> -		ret = kvm_gmem_shareability_apply(inode, work, m);
>> +
>> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
>>  		if (ret)
>>  			break;
>>  	}
>>  
>>  	if (ret) {
>> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
>>  		list_for_each_entry(work, &work_list, list) {
>> +			int r;
>> +
>> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
>> +			WARN_ON(r);
>> +
>>  			if (work == rollback_stop_item)
>>  				break;
>> -
>> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
>>  		}
>>  	}
>>  
>> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
>>  	return ret;
>>  }
>>  
>> +#ifdef CONFIG_KVM_GMEM_HUGETLB
>> +
>> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
>> +{
>> +	struct address_space *mapping = folio->mapping;
>> +
>> +	spin_lock(&mapping->host->i_lock);
>> +	xa_lock_irq(&mapping->i_pages);
>> +
>> +	__filemap_remove_folio(folio, NULL);
>> +
>> +	xa_unlock_irq(&mapping->i_pages);
>> +	spin_unlock(&mapping->host->i_lock);
>> +}
>> +
>> +/**
>> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
>> + * split/merge.
>> + *
>> + * @folio: the folio to be removed.
>> + *
>> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
>> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
>> + *
>> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
>> + *          freeze these refcounts away so that no other users will interfere
>> + *          with restructuring.
>> + */
>> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
>> +{
>> +	int filemap_refcount;
>> +
>> +	filemap_refcount = folio_nr_pages(folio);
>> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
>> +		/*
>> +		 * At this point only filemap refcounts are expected, hence okay
>> +		 * to spin until speculative refcounts go away.
>> +		 */
>> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
>> +	}
>> +
>> +	folio_lock(folio);
>> +	__filemap_remove_folio_for_restructuring(folio);
>> +	folio_unlock(folio);
>> +}
>> +
>> +/**
>> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
>> + *
>> + * @inode: inode containing the folio.
>> + * @folio: folio to be split.
>> + *
>> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
>> + * and add back the split folios.
>> + *
>> + * Context: Expects that before this call, folio's refcount is just the
>> + *          filemap's refcounts. After this function returns, the split folios'
>> + *          refcounts will also be filemap's refcounts.
>> + * Return: 0 on success or negative error otherwise.
>> + */
>> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
>> +{
>> +	size_t orig_nr_pages;
>> +	pgoff_t orig_index;
>> +	size_t i, j;
>> +	int ret;
>> +
>> +	orig_nr_pages = folio_nr_pages(folio);
>> +	if (orig_nr_pages == 1)
>> +		return 0;
>> +
>> +	orig_index = folio->index;
>> +
>> +	filemap_remove_folio_for_restructuring(folio);
>> +
>> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
>> +	if (ret)
>> +		goto err;
>> +
>> +	for (i = 0; i < orig_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(folio, i));
>> +
>> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
>> +						   orig_index + i);
>> +		if (ret)
>> +			goto rollback;
>> +	}
>> +
>> +	return ret;
>> +
>> +rollback:
>> +	for (j = 0; j < i; ++j) {
>> +		struct folio *f = page_folio(folio_page(folio, j));
>> +
>> +		filemap_remove_folio_for_restructuring(f);
>> +	}
>> +
>> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
>> +err:
>> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
>> +
>> +	return ret;
>> +}
>> +
>> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
>> +						      struct folio *folio)
>> +{
>> +	size_t to_nr_pages;
>> +	void *priv;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return 0;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
>> +
>> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
> If the guest_memfd is configured with GUESTMEM_HUGETLB_FLAG_1GB, it seems that
> whenever there's a shared page within a 1GB range, the folio will always be
> split into 4KB folios. Is it good?
>

It is not the best, but okay as an initial step.

We want to work on splitting 1G to 2M (as many 2M pages as possible)
then to 4K. I believe the agreement with the community is that the
1G->2M->4K splitting is an optimization for the patch series after this
one.
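
As a very rough sketch of that follow-up (not part of this series, and
assuming the allocator has already split the 1G folio into 2M folios;
nr_pages_in_1g and nr_pages_in_2m are placeholders), only the 2M pieces that
actually contain shared pages would be split further:

        /* Hypothetical second-stage split of a former 1G folio. */
        for (i = 0; i < nr_pages_in_1g; i += nr_pages_in_2m) {
                struct folio *f;

                f = filemap_get_folio(inode->i_mapping, index + i);
                if (IS_ERR(f))
                        continue;

                /* Leave just filemap's refcounts on the folio. */
                folio_put(f);

                if (kvm_gmem_has_some_shared(inode, index + i, nr_pages_in_2m)) {
                        ret = kvm_gmem_split_folio_in_filemap(inode, f);
                        if (ret)
                                break;  /* roll back as in the quoted code */
                }
        }

kvm_gmem_has_some_shared() and kvm_gmem_split_folio_in_filemap() are the
helpers from this patch; everything else above is illustrative.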

>> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
>> + * @inode.
>> + *
>> + * @inode: inode containing the folio.
>> + * @first_folio: first folio among folios to be merged.
>> + *
>> + * Will clean up subfolios from filemap and add back the merged folio.
>> + *
>> + * Context: Expects that before this call, all subfolios only have filemap
>> + *          refcounts. After this function returns, the merged folio will only
>> + *          have filemap refcounts.
>> + * Return: 0 on success or negative error otherwise.
>> + */
>> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
>> +					   struct folio *first_folio)
>> +{
>> +	size_t to_nr_pages;
>> +	pgoff_t index;
>> +	void *priv;
>> +	size_t i;
>> +	int ret;
>> +
>> +	index = first_folio->index;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +	if (folio_nr_pages(first_folio) == to_nr_pages)
>> +		return 0;
>> +
>> +	for (i = 0; i < to_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(first_folio, i));
>> +
>> +		filemap_remove_folio_for_restructuring(f);
>> +	}
>> +
>> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
>> +
>> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
>> +	if (ret)
>> +		goto err_split;
>> +
>> +	return ret;
>> +
>> +err_split:
>> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));
>> +	for (i = 0; i < to_nr_pages; ++i) {
>> +		struct folio *f = page_folio(folio_page(first_folio, i));
>> +
>> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
>> +						      struct folio *first_folio)
>> +{
>> +	size_t to_nr_pages;
>> +	void *priv;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
>> +		return 0;
>> +
>> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
>> +}
>> +
>> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
>> +						pgoff_t start, size_t nr_pages,
>> +						bool is_split_operation)
>> +{
>> +	size_t to_nr_pages;
>> +	pgoff_t index;
>> +	pgoff_t end;
>> +	void *priv;
>> +	int ret;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>> +		return 0;
>> +
>> +	end = start + nr_pages;
>> +
>> +	/* Round to allocator page size, to check all (huge) pages in range. */
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	start = round_down(start, to_nr_pages);
>> +	end = round_up(end, to_nr_pages);
>> +
>> +	for (index = start; index < end; index += to_nr_pages) {
>> +		struct folio *f;
>> +
>> +		f = filemap_get_folio(inode->i_mapping, index);
>> +		if (IS_ERR(f))
>> +			continue;
>> +
>> +		/* Leave just filemap's refcounts on the folio. */
>> +		folio_put(f);
>> +
>> +		if (is_split_operation)
>> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
> The split operation is performed after kvm_gmem_unmap_private() within
> kvm_gmem_convert_should_proceed(), right?
>
> So, it seems that it's not necessary for TDX to avoid holding private page
> references, as TDX must have released the page refs after
> kvm_gmem_unmap_private() (except when there's a TDX module or KVM bug).
>

I agree with your assessment in the follow up email.

We don't want to unmap more than the requested conversion range to avoid
extra churn. If TDX holds refcounts on mapped pages, the subpages that
are still mapped will contribute to the refcount of the huge page, and
we can't split a page that has refcounts because we don't know how the
refcounts are distributed over the subpages.

I guess technically if the refcounts were evenly divisible across nr_pages we
could still split, but if only some of a 1G page's subpages are mapped into
TDX EPTs, then we would have a refcount that we don't know how to divide out.
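
In other words, the only state in which a split is known to be safe is when
the folio's refcount is exactly the filemap's refcounts (one per constituent
4K page), which is what filemap_remove_folio_for_restructuring() freezes
against. As a minimal illustration of that precondition (using existing folio
helpers, not code from the patch):

        /*
         * Any extra reference could belong to any subset of the subpages,
         * so it cannot be redistributed after a split.
         */
        if (folio_ref_count(folio) != folio_nr_pages(folio))
                return -EAGAIN; /* unknown holders (e.g. TDX) still present */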

>> +		else
>> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
>> +
>> +		if (ret)
>> +			goto rollback;
>> +	}
>> +	return ret;
>> +
>> +rollback:
>> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
>> +		struct folio *f;
>> +
>> +		f = filemap_get_folio(inode->i_mapping, index);
>> +		if (IS_ERR(f))
>> +			continue;
>> +
>> +		/* Leave just filemap's refcounts on the folio. */
>> +		folio_put(f);
>> +
>> +		if (is_split_operation)
>> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
>> +		else
>> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +#else
>> +

[...]

[1] https://lore.kernel.org/all/Z__AAB_EFxGFEjDR@google.com/
[2] https://github.com/googleprodkernel/linux-cc/commits/wip-tdx-gmem-conversions-hugetlb-2mept/
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Yan Zhao 6 months, 3 weeks ago
> > +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> > +						pgoff_t start, size_t nr_pages,
> > +						bool is_split_operation)
> > +{
> > +	size_t to_nr_pages;
> > +	pgoff_t index;
> > +	pgoff_t end;
> > +	void *priv;
> > +	int ret;
> > +
> > +	if (!kvm_gmem_has_custom_allocator(inode))
> > +		return 0;
> > +
> > +	end = start + nr_pages;
> > +
> > +	/* Round to allocator page size, to check all (huge) pages in range. */
> > +	priv = kvm_gmem_allocator_private(inode);
> > +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> > +
> > +	start = round_down(start, to_nr_pages);
> > +	end = round_up(end, to_nr_pages);
> > +
> > +	for (index = start; index < end; index += to_nr_pages) {
> > +		struct folio *f;
> > +
> > +		f = filemap_get_folio(inode->i_mapping, index);
> > +		if (IS_ERR(f))
> > +			continue;
> > +
> > +		/* Leave just filemap's refcounts on the folio. */
> > +		folio_put(f);
> > +
> > +		if (is_split_operation)
> > +			ret = kvm_gmem_split_folio_in_filemap(inode, f);
> The split operation is performed after kvm_gmem_unmap_private() within
> kvm_gmem_convert_should_proceed(), right?
> 
> So, it seems that it's not necessary for TDX to avoid holding private page
> references, as TDX must have released the page refs after
> kvm_gmem_unmap_private() (except when there's a TDX module or KVM bug).
Oops. Please ignore this one.
The unmap does not necessarily cover the entire folio range, so splitting
still requires TDX not to hold refcounts.
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Edgecombe, Rick P 6 months, 4 weeks ago
On Wed, 2025-05-14 at 16:42 -0700, Ackerley Tng wrote:
> +
> +static pgoff_t kvm_gmem_compute_invalidate_bound(struct inode *inode,
> +						 pgoff_t bound, bool start)
> +{
> +	size_t nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))

General comment - It's a bit unfortunate how kvm_gmem_has_custom_allocator() is
checked all over the place across this series. There are only two allocators
after this, right? So one is implemented with callbacks presumably designed to
fit other allocators, and one has special case logic in guest_memfd.c.

Did you consider designing struct guestmem_allocator_operations so that it could
encapsulate the special logic for both the existing and new allocators? If it
didn't work well, could we expect that a next allocator would actually fit
struct guestmem_allocator_operations?

> +		return bound;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	if (start)
> +		return round_down(bound, nr_pages);
> +	else
> +		return round_up(bound, nr_pages);
> +}
> +
> +static pgoff_t kvm_gmem_compute_invalidate_start(struct inode *inode,
> +						 pgoff_t bound)
> +{
> +	return kvm_gmem_compute_invalidate_bound(inode, bound, true);
> +}
> +
> +static pgoff_t kvm_gmem_compute_invalidate_end(struct inode *inode,
> +					       pgoff_t bound)
> +{
> +	return kvm_gmem_compute_invalidate_bound(inode, bound, false);
> +}
> +
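
For context on what these two helpers compute, here is a quick standalone
illustration (plain userspace arithmetic, assuming a 1G allocator whose
nr_pages_in_folio() returns 262144, i.e. 4K pages per 1G folio):

#include <stdio.h>

/*
 * Same result as the kernel's round_down()/round_up() for power-of-two
 * alignments, written portably for illustration.
 */
#define ROUND_DOWN(x, a) ((x) / (a) * (a))
#define ROUND_UP(x, a)   ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
	unsigned long nr_pages = 262144;        /* 4K pages per 1G folio */
	unsigned long start = 1000, end = 2000; /* requested invalidation */

	/* Prints "invalidate [0, 262144)": the whole 1G folio is covered. */
	printf("invalidate [%lu, %lu)\n",
	       ROUND_DOWN(start, nr_pages), ROUND_UP(end, nr_pages));
	return 0;
}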
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Ackerley Tng 6 months, 2 weeks ago
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> writes:

> On Wed, 2025-05-14 at 16:42 -0700, Ackerley Tng wrote:
>> +
>> +static pgoff_t kvm_gmem_compute_invalidate_bound(struct inode *inode,
>> +						 pgoff_t bound, bool start)
>> +{
>> +	size_t nr_pages;
>> +	void *priv;
>> +
>> +	if (!kvm_gmem_has_custom_allocator(inode))
>
> General comment - It's a bit unfortunate how kvm_gmem_has_custom_allocator() is
> checked all over the place across this series. There are only two allocators
> after this, right? So one is implemented with callbacks presumably designed to
> fit other allocators, and one has special case logic in guest_memfd.c.
>
> Did you consider designing struct guestmem_allocator_operations so that it could
> encapsulate the special logic for both the existing and new
> allocators?

I did, yes. I believe it is definitely possible to turn standard 4K
pages into another allocator too.

I would love to clean this up; I'm just not sure whether that will be a new
series after this one or part of this one.

> If it
> didn't work well, could we expect that a next allocator would actually fit
> struct guestmem_allocator_operations?
>

This was definitely designed to support allocators beyond
guestmem_hugetlb, though I won't promise that it will be a perfect fit
for future allocators. Since the interface is internal to the kernel, it can
be updated for future allocators as needed.
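
To make that concrete, here is a purely hypothetical sketch of a plain-4K
allocator behind the same ops table (nr_pages_in_folio() is the only callback
taken from the series' quoted code; the alloc_folio() callback and the
initializer layout are made up for illustration):

static size_t gmem_4k_nr_pages_in_folio(void *priv)
{
	return 1;	/* every folio from this allocator is a single 4K page */
}

static struct folio *gmem_4k_alloc_folio(void *priv, gfp_t gfp)
{
	return folio_alloc(gfp, 0);	/* order-0, i.e. one 4K page */
}

static const struct guestmem_allocator_operations gmem_4k_allocator_ops = {
	.nr_pages_in_folio	= gmem_4k_nr_pages_in_folio,
	.alloc_folio		= gmem_4k_alloc_folio,
};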

>> +		return bound;
>> +
>> +	priv = kvm_gmem_allocator_private(inode);
>> +	nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
>> +
>> +	if (start)
>> +		return round_down(bound, nr_pages);
>> +	else
>> +		return round_up(bound, nr_pages);
>> +}
>> +
>> +static pgoff_t kvm_gmem_compute_invalidate_start(struct inode *inode,
>> +						 pgoff_t bound)
>> +{
>> +	return kvm_gmem_compute_invalidate_bound(inode, bound, true);
>> +}
>> +
>> +static pgoff_t kvm_gmem_compute_invalidate_end(struct inode *inode,
>> +					       pgoff_t bound)
>> +{
>> +	return kvm_gmem_compute_invalidate_bound(inode, bound, false);
>> +}
>> +
Re: [RFC PATCH v2 38/51] KVM: guest_memfd: Split allocator pages for guest_memfd use
Posted by Edgecombe, Rick P 6 months, 2 weeks ago
On Thu, 2025-06-05 at 10:15 -0700, Ackerley Tng wrote:
> > > > > > > > Did you consider designing struct guestmem_allocator_operations
> > > > > > > > so that it could
> > > > > > > > encapsulate the special logic for both the existing and new
> > > > > > > > allocators?
> > > > 
> > > > I did, yes. I believe it is definitely possible to make standard 4K
> > > > pages become another allocator too.
> > > > 
> > > > I would love to clean this up. Not sure if that will be a new series
> > > > after this one, or part of this one though.

Usually new work should handle the refactor first, then build on top of it. The
code today bolts the new thing on in a messy way and leaves the cleanup for later.

Also, toward easier reviewability, a better order could be:
1. Add allocator callbacks one at a time (or at whatever granularity is
possible), moving the 4K allocator to callbacks at the same time. Basically a
code move. Don't factor out common code between the planned allocators. It will
be dirt simple to review.
2. Introduce the hugetlb changes, explaining why each will be used by
guest_memfd.
3. Add the hugetlb/1GB custom allocator to the guest_memfd code, a callback at
a time. Have any necessary factoring of 4K page allocator bits out of the
callback implementations come in a separate preceding patch. Explain the
commonality.

What do you think?

Also, for (2), do you think you could move some of these pure cleanup patches out
of the series to go up ahead of time? And for any hugetlb changes that 1GB
guest_memfd depends on, could you explain why in the log? I'm not clear on what
is required and what is opportunistic cleanup.


> > > > 
> > > > > > > > If it
> > > > > > > > didn't work well, could we expect that a next allocator would
> > > > > > > > actually fit
> > > > > > > > struct guestmem_allocator_operations?
> > > > > > > > 
> > > > 
> > > > This was definitely designed to support allocators beyond
> > > > guestmem_hugetlb, though I won't promise that it will be a perfect fit
> > > > for future allocators. This is internal to the kernel and this interface
> > > > can be updated for future allocators though.

Makes sense. This was probing whether the interface didn't fit the 4K
allocator. It makes sense to have the interface target the two existing
allocators, and not future ideas.