[RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page

Posted by Ackerley Tng 1 year, 5 months ago
From: Vishal Annapurve <vannapurve@google.com>

The faultability of a page is used to determine whether to split or
reconstruct a HugeTLB page.

If any page in a folio is faultable, split the folio. If none of the
pages in a folio are faultable, reconstruct the folio.

On truncation, always reconstruct and free regardless of
faultability (as long as a HugeTLB page's worth of pages is
truncated).
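
A minimal sketch of the resulting policy for a faultability update over the
file range [start, end), condensed from kvm_gmem_try_set_faultable_slot()
below (illustration only, error handling and locking omitted):

	if (faultable) {
		/* Any faultable page in the range forces huge folios to be split. */
		ret = kvm_gmem_try_split_folios_range(inode, start, end);
	} else if (kvm_gmem_no_mappings_range(inode, start, end)) {
		/* Only fully non-faultable, unmapped ranges are reconstructed. */
		ret = kvm_gmem_try_reconstruct_folios_range(inode, start, end);
	} else {
		ret = -EINVAL;
	}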

Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>

---
 virt/kvm/guest_memfd.c | 678 +++++++++++++++++++++++++++--------------
 1 file changed, 456 insertions(+), 222 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fb292e542381..0afc111099c0 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -99,6 +99,23 @@ static bool kvm_gmem_is_faultable(struct inode *inode, pgoff_t index)
 	return xa_to_value(xa_load(faultability, index)) == KVM_GMEM_FAULTABILITY_VALUE;
 }
 
+/**
+ * Return true if any of the @nr_pages pages beginning at @index is allowed to be
+ * faulted in.
+ */
+static bool kvm_gmem_is_any_faultable(struct inode *inode, pgoff_t index,
+				      int nr_pages)
+{
+	pgoff_t i;
+
+	for (i = index; i < index + nr_pages; ++i) {
+		if (kvm_gmem_is_faultable(inode, i))
+		    return true;
+	}
+
+	return false;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -312,6 +329,40 @@ static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping,
 	return 0;
 }
 
+static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
+{
+	folio_lock(folio);
+
+	folio_clear_dirty(folio);
+	folio_clear_uptodate(folio);
+	filemap_remove_folio(folio);
+
+	folio_unlock(folio);
+}
+
+/*
+ * Locks the block of 1 << huge_page_order(h) pages in @mapping that contains
+ * @index. Take either this or filemap_invalidate_lock() whenever
+ * the filemap is accessed.
+ */
+static u32 hugetlb_fault_mutex_lock(struct address_space *mapping, pgoff_t index)
+{
+	pgoff_t hindex;
+	u32 hash;
+
+	hindex = index >> huge_page_order(kvm_gmem_hgmem(mapping->host)->h);
+	hash = hugetlb_fault_mutex_hash(mapping, hindex);
+
+	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+	return hash;
+}
+
+static void hugetlb_fault_mutex_unlock(u32 hash)
+{
+	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+}
+
 struct kvm_gmem_split_stash {
 	struct {
 		unsigned long _flags_2;
@@ -394,15 +445,136 @@ static int kvm_gmem_hugetlb_reconstruct_folio(struct hstate *h, struct folio *fo
 	}
 
 	__folio_set_hugetlb(folio);
-
-	folio_set_count(folio, 1);
+	hugetlb_folio_list_add(folio, &h->hugepage_activelist);
 
 	hugetlb_vmemmap_optimize_folio(h, folio);
 
+	folio_set_count(folio, 1);
+
 	return 0;
 }
 
-/* Basically folio_set_order(folio, 1) without the checks. */
+/**
+ * Reconstruct a HugeTLB folio out of folio_nr_pages(@first_folio) pages. Will
+ * clean up subfolios from filemap and add back the reconstructed folio. Folios
+ * to be reconstructed must not be locked, and reconstructed folio will not be
+ * locked. Return 0 on success or negative error otherwise.
+ *
+ * hugetlb_fault_mutex_lock() has to be held when calling this function.
+ *
+ * Expects that before this call, the filemap's refcounts are the only refcounts
+ * for the folios in the filemap. After this function returns, the filemap's
+ * refcount will be the only refcount on the reconstructed folio.
+ */
+static int kvm_gmem_reconstruct_folio_in_filemap(struct hstate *h,
+						 struct folio *first_folio)
+{
+	struct address_space *mapping;
+	struct folio_batch fbatch;
+	unsigned long end;
+	pgoff_t index;
+	pgoff_t next;
+	int ret;
+	int i;
+
+	if (folio_order(first_folio) == huge_page_order(h))
+		return 0;
+
+	index = first_folio->index;
+	mapping = first_folio->mapping;
+
+	next = index;
+	end = index + (1UL << huge_page_order(h));
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *folio;
+
+			folio = fbatch.folios[i];
+
+			/*
+			 * Before removing from filemap, take a reference so
+			 * sub-folios don't get freed when removing from
+			 * filemap.
+			 */
+			folio_get(folio);
+
+			kvm_gmem_hugetlb_filemap_remove_folio(folio);
+		}
+		folio_batch_release(&fbatch);
+	}
+
+	ret = kvm_gmem_hugetlb_reconstruct_folio(h, first_folio);
+	if (ret) {
+		/* TODO: handle cleanup properly. */
+		WARN_ON(ret);
+		return ret;
+	}
+
+	kvm_gmem_hugetlb_filemap_add_folio(mapping, first_folio, index,
+					   htlb_alloc_mask(h));
+
+	folio_unlock(first_folio);
+	folio_put(first_folio);
+
+	return ret;
+}
+
+/**
+ * Reconstruct any HugeTLB folios in range [@start, @end), if all the subfolios
+ * are not faultable. Return 0 on success or negative error otherwise.
+ *
+ * Will skip any folios that are already reconstructed.
+ */
+static int kvm_gmem_try_reconstruct_folios_range(struct inode *inode,
+						 pgoff_t start, pgoff_t end)
+{
+	unsigned int nr_pages;
+	pgoff_t aligned_start;
+	pgoff_t aligned_end;
+	struct hstate *h;
+	pgoff_t index;
+	int ret;
+
+	if (!is_kvm_gmem_hugetlb(inode))
+		return 0;
+
+	h = kvm_gmem_hgmem(inode)->h;
+	nr_pages = 1UL << huge_page_order(h);
+
+	aligned_start = round_up(start, nr_pages);
+	aligned_end = round_down(end, nr_pages);
+
+	ret = 0;
+	for (index = aligned_start; !ret && index < aligned_end; index += nr_pages) {
+		struct folio *folio;
+		u32 hash;
+
+		hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);
+
+		folio = filemap_get_folio(inode->i_mapping, index);
+		if (!IS_ERR(folio)) {
+			/*
+			 * Drop refcount because reconstruction expects an equal number
+			 * of refcounts for all subfolios - just keep the refcount taken
+			 * by the filemap.
+			 */
+			folio_put(folio);
+
+			/* Merge only when the entire block of nr_pages is not faultable. */
+			if (!kvm_gmem_is_any_faultable(inode, index, nr_pages)) {
+				ret = kvm_gmem_reconstruct_folio_in_filemap(h, folio);
+				WARN_ON(ret);
+			}
+		}
+
+		hugetlb_fault_mutex_unlock(hash);
+	}
+
+	return ret;
+}
+
+/* Basically folio_set_order() without the checks. */
 static inline void kvm_gmem_folio_set_order(struct folio *folio, unsigned int order)
 {
 	folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
@@ -414,8 +586,8 @@ static inline void kvm_gmem_folio_set_order(struct folio *folio, unsigned int or
 /**
  * Split a HugeTLB @folio of size huge_page_size(@h).
  *
- * After splitting, each split folio has a refcount of 1. There are no checks on
- * refcounts before splitting.
+ * Folio must have refcount of 1 when this function is called. After splitting,
+ * each split folio has a refcount of 1.
  *
  * Return 0 on success and negative error otherwise.
  */
@@ -423,14 +595,18 @@ static int kvm_gmem_hugetlb_split_folio(struct hstate *h, struct folio *folio)
 {
 	int ret;
 
+	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio) != 1, folio);
+
+	folio_set_count(folio, 0);
+
 	ret = hugetlb_vmemmap_restore_folio(h, folio);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = kvm_gmem_hugetlb_stash_metadata(folio);
 	if (ret) {
 		hugetlb_vmemmap_optimize_folio(h, folio);
-		return ret;
+		goto out;
 	}
 
 	kvm_gmem_folio_set_order(folio, 0);
@@ -439,109 +615,183 @@ static int kvm_gmem_hugetlb_split_folio(struct hstate *h, struct folio *folio)
 	__folio_clear_hugetlb(folio);
 
 	/*
-	 * Remove the first folio from h->hugepage_activelist since it is no
+	 * Remove the original folio from h->hugepage_activelist since it is no
 	 * longer a HugeTLB page. The other split pages should not be on any
 	 * lists.
 	 */
 	hugetlb_folio_list_del(folio);
 
-	return 0;
+	ret = 0;
+out:
+	folio_set_count(folio, 1);
+	return ret;
 }
 
-static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
-							    pgoff_t index)
+/**
+ * Split a HugeTLB folio into folio_nr_pages(@folio) pages. Will clean up folio
+ * from filemap and add back the split folios. @folio must not be locked, and
+ * all split folios will not be locked. Return 0 on success or negative error
+ * otherwise.
+ *
+ * hugetlb_fault_mutex_lock() has to be held when calling this function.
+ *
+ * Expects that before this call, the filemap's refcounts are the only refcounts
+ * for the folio. After this function returns, the filemap's refcounts will be
+ * the only refcounts on the split folios.
+ */
+static int kvm_gmem_split_folio_in_filemap(struct hstate *h, struct folio *folio)
 {
-	struct folio *allocated_hugetlb_folio;
-	pgoff_t hugetlb_first_subpage_index;
-	struct page *hugetlb_first_subpage;
-	struct kvm_gmem_hugetlb *hgmem;
-	struct page *requested_page;
+	struct address_space *mapping;
+	struct page *first_subpage;
+	pgoff_t index;
 	int ret;
 	int i;
 
-	hgmem = kvm_gmem_hgmem(inode);
-	allocated_hugetlb_folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
-	if (IS_ERR(allocated_hugetlb_folio))
-		return allocated_hugetlb_folio;
+	if (folio_order(folio) == 0)
+		return 0;
 
-	requested_page = folio_file_page(allocated_hugetlb_folio, index);
-	hugetlb_first_subpage = folio_file_page(allocated_hugetlb_folio, 0);
-	hugetlb_first_subpage_index = index & (huge_page_mask(hgmem->h) >> PAGE_SHIFT);
+	index = folio->index;
+	mapping = folio->mapping;
 
-	ret = kvm_gmem_hugetlb_split_folio(hgmem->h, allocated_hugetlb_folio);
+	first_subpage = folio_page(folio, 0);
+
+	/*
+	 * Take reference so that folio will not be released when removed from
+	 * filemap.
+	 */
+	folio_get(folio);
+
+	kvm_gmem_hugetlb_filemap_remove_folio(folio);
+
+	ret = kvm_gmem_hugetlb_split_folio(h, folio);
 	if (ret) {
-		folio_put(allocated_hugetlb_folio);
-		return ERR_PTR(ret);
+		WARN_ON(ret);
+		kvm_gmem_hugetlb_filemap_add_folio(mapping, folio, index,
+						   htlb_alloc_mask(h));
+		folio_put(folio);
+		return ret;
 	}
 
-	for (i = 0; i < pages_per_huge_page(hgmem->h); ++i) {
-		struct folio *folio = page_folio(nth_page(hugetlb_first_subpage, i));
+	for (i = 0; i < pages_per_huge_page(h); ++i) {
+		struct folio *folio = page_folio(nth_page(first_subpage, i));
 
-		ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping,
-							 folio,
-							 hugetlb_first_subpage_index + i,
-							 htlb_alloc_mask(hgmem->h));
+		ret = kvm_gmem_hugetlb_filemap_add_folio(mapping, folio,
+							 index + i,
+							 htlb_alloc_mask(h));
 		if (ret) {
 			/* TODO: handle cleanup properly. */
-			pr_err("Handle cleanup properly index=%lx, ret=%d\n",
-			       hugetlb_first_subpage_index + i, ret);
-			dump_page(nth_page(hugetlb_first_subpage, i), "check");
-			return ERR_PTR(ret);
+			WARN_ON(ret);
+			return ret;
 		}
 
+		folio_unlock(folio);
+
 		/*
-		 * Skip unlocking for the requested index since
-		 * kvm_gmem_get_folio() returns a locked folio.
-		 *
-		 * Do folio_put() to drop the refcount that came with the folio,
-		 * from splitting the folio. Splitting the folio has a refcount
-		 * to be in line with hugetlb_alloc_folio(), which returns a
-		 * folio with refcount 1.
-		 *
-		 * Skip folio_put() for requested index since
-		 * kvm_gmem_get_folio() returns a folio with refcount 1.
+		 * Drop reference so that the only remaining reference is the
+		 * one held by the filemap.
 		 */
-		if (hugetlb_first_subpage_index + i != index) {
-			folio_unlock(folio);
-			folio_put(folio);
-		}
+		folio_put(folio);
 	}
 
+	return ret;
+}
+
+/*
+ * Allocates and then caches a folio in the filemap. Returns a folio with
+ * refcount of 2: 1 after allocation, and 1 taken by the filemap.
+ */
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
+							    pgoff_t index)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	pgoff_t aligned_index;
+	struct folio *folio;
+	int nr_pages;
+	int ret;
+
+	hgmem = kvm_gmem_hgmem(inode);
+	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
+	if (IS_ERR(folio))
+		return folio;
+
+	nr_pages = 1UL << huge_page_order(hgmem->h);
+	aligned_index = round_down(index, nr_pages);
+
+	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
+						 aligned_index,
+						 htlb_alloc_mask(hgmem->h));
+	WARN_ON(ret);
+
 	spin_lock(&inode->i_lock);
 	inode->i_blocks += blocks_per_huge_page(hgmem->h);
 	spin_unlock(&inode->i_lock);
 
-	return page_folio(requested_page);
+	return folio;
+}
+
+/**
+ * Split @folio if any of the subfolios are faultable. Returns the split
+ * (locked, refcount=2) folio at @index.
+ *
+ * Expects a locked folio with 1 refcount in addition to filemap's refcounts.
+ *
+ * After splitting, the subfolios in the filemap will be unlocked and have
+ * refcount 1 (other than the returned folio, which will be locked and have
+ * refcount 2).
+ */
+static struct folio *kvm_gmem_maybe_split_folio(struct folio *folio, pgoff_t index)
+{
+	pgoff_t aligned_index;
+	struct inode *inode;
+	struct hstate *h;
+	int nr_pages;
+	int ret;
+
+	inode = folio->mapping->host;
+	h = kvm_gmem_hgmem(inode)->h;
+	nr_pages = 1UL << huge_page_order(h);
+	aligned_index = round_down(index, nr_pages);
+
+	if (!kvm_gmem_is_any_faultable(inode, aligned_index, nr_pages))
+		return folio;
+
+	/* Drop lock and refcount in preparation for splitting. */
+	folio_unlock(folio);
+	folio_put(folio);
+
+	ret = kvm_gmem_split_folio_in_filemap(h, folio);
+	if (ret) {
+		kvm_gmem_hugetlb_filemap_remove_folio(folio);
+		return ERR_PTR(ret);
+	}
+
+	/*
+	 * At this point, the filemap has the only reference on the folio. Take
+	 * lock and refcount on folio to align with kvm_gmem_get_folio().
+	 */
+	return filemap_lock_folio(inode->i_mapping, index);
 }
 
 static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode,
 						pgoff_t index)
 {
-	struct address_space *mapping;
 	struct folio *folio;
-	struct hstate *h;
-	pgoff_t hindex;
 	u32 hash;
 
-	h = kvm_gmem_hgmem(inode)->h;
-	hindex = index >> huge_page_order(h);
-	mapping = inode->i_mapping;
-
-	/* To lock, we calculate the hash using the hindex and not index. */
-	hash = hugetlb_fault_mutex_hash(mapping, hindex);
-	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+	hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);
 
 	/*
-	 * The filemap is indexed with index and not hindex. Taking lock on
-	 * folio to align with kvm_gmem_get_regular_folio()
+	 * The filemap is indexed with index and not hindex. Take lock on folio
+	 * to align with kvm_gmem_get_regular_folio()
 	 */
-	folio = filemap_lock_folio(mapping, index);
+	folio = filemap_lock_folio(inode->i_mapping, index);
+	if (IS_ERR(folio))
+		folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
+
 	if (!IS_ERR(folio))
-		goto out;
+		folio = kvm_gmem_maybe_split_folio(folio, index);
 
-	folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
-out:
-	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+	hugetlb_fault_mutex_unlock(hash);
 
 	return folio;
 }
@@ -610,17 +860,6 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
 
-static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
-{
-	folio_lock(folio);
-
-	folio_clear_dirty(folio);
-	folio_clear_uptodate(folio);
-	filemap_remove_folio(folio);
-
-	folio_unlock(folio);
-}
-
 /**
  * Removes folios in range [@lstart, @lend) from page cache/filemap (@mapping),
  * returning the number of HugeTLB pages freed.
@@ -631,61 +870,30 @@ static int kvm_gmem_hugetlb_filemap_remove_folios(struct address_space *mapping,
 						  struct hstate *h,
 						  loff_t lstart, loff_t lend)
 {
-	const pgoff_t end = lend >> PAGE_SHIFT;
-	pgoff_t next = lstart >> PAGE_SHIFT;
-	LIST_HEAD(folios_to_reconstruct);
-	struct folio_batch fbatch;
-	struct folio *folio, *tmp;
-	int num_freed = 0;
-	int i;
-
-	/*
-	 * TODO: Iterate over huge_page_size(h) blocks to avoid taking and
-	 * releasing hugetlb_fault_mutex_table[hash] lock so often. When
-	 * truncating, lstart and lend should be clipped to the size of this
-	 * guest_memfd file, otherwise there would be too many iterations.
-	 */
-	folio_batch_init(&fbatch);
-	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
-		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
-			struct folio *folio;
-			pgoff_t hindex;
-			u32 hash;
-
-			folio = fbatch.folios[i];
+	loff_t offset;
+	int num_freed;
 
-			hindex = folio->index >> huge_page_order(h);
-			hash = hugetlb_fault_mutex_hash(mapping, hindex);
-			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+	num_freed = 0;
+	for (offset = lstart; offset < lend; offset += huge_page_size(h)) {
+		struct folio *folio;
+		pgoff_t index;
+		u32 hash;
 
-			/*
-			 * Collect first pages of HugeTLB folios for
-			 * reconstruction later.
-			 */
-			if ((folio->index & ~(huge_page_mask(h) >> PAGE_SHIFT)) == 0)
-				list_add(&folio->lru, &folios_to_reconstruct);
+		index = offset >> PAGE_SHIFT;
+		hash = hugetlb_fault_mutex_lock(mapping, index);
 
-			/*
-			 * Before removing from filemap, take a reference so
-			 * sub-folios don't get freed. Don't free the sub-folios
-			 * until after reconstruction.
-			 */
-			folio_get(folio);
+		folio = filemap_get_folio(mapping, index);
+		if (!IS_ERR(folio)) {
+			/* Drop refcount so that filemap holds only reference. */
+			folio_put(folio);
 
+			kvm_gmem_reconstruct_folio_in_filemap(h, folio);
 			kvm_gmem_hugetlb_filemap_remove_folio(folio);
 
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			num_freed++;
 		}
-		folio_batch_release(&fbatch);
-		cond_resched();
-	}
-
-	list_for_each_entry_safe(folio, tmp, &folios_to_reconstruct, lru) {
-		kvm_gmem_hugetlb_reconstruct_folio(h, folio);
-		hugetlb_folio_list_move(folio, &h->hugepage_activelist);
 
-		folio_put(folio);
-		num_freed++;
+		hugetlb_fault_mutex_unlock(hash);
 	}
 
 	return num_freed;
@@ -705,6 +913,10 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
 	int gbl_reserve;
 	int num_freed;
 
+	/* No point truncating more than inode size. */
+	lstart = min(lstart, inode->i_size);
+	lend = min(lend, inode->i_size);
+
 	hgmem = kvm_gmem_hgmem(inode);
 	h = hgmem->h;
 
@@ -1042,13 +1254,27 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
 	bool is_prepared;
 
 	inode = file_inode(vmf->vma->vm_file);
-	if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
+
+	/*
+	 * Use filemap_invalidate_lock_shared() to make sure
+	 * kvm_gmem_get_folio() doesn't race with faultability updates.
+	 */
+	filemap_invalidate_lock_shared(inode->i_mapping);
+
+	if (!kvm_gmem_is_faultable(inode, vmf->pgoff)) {
+		filemap_invalidate_unlock_shared(inode->i_mapping);
 		return VM_FAULT_SIGBUS;
+	}
 
 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+
+	filemap_invalidate_unlock_shared(inode->i_mapping);
+
 	if (!folio)
 		return VM_FAULT_SIGBUS;
 
+	WARN(folio_test_hugetlb(folio), "should not be faulting in hugetlb folio=%p\n", folio);
+
 	is_prepared = folio_test_uptodate(folio);
 	if (!is_prepared) {
 		unsigned long nr_pages;
@@ -1731,8 +1957,6 @@ static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, pgoff
 	pgoff_t index;
 	bool checked_indices_unmapped;
 
-	filemap_invalidate_lock_shared(inode->i_mapping);
-
 	/* TODO: replace iteration with filemap_get_folios() for efficiency. */
 	checked_indices_unmapped = true;
 	for (index = start; checked_indices_unmapped && index < end;) {
@@ -1754,98 +1978,130 @@ static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, pgoff
 		folio_put(folio);
 	}
 
-	filemap_invalidate_unlock_shared(inode->i_mapping);
 	return checked_indices_unmapped;
 }
 
 /**
- * Returns true if pages in range [@start, @end) in memslot @slot have no
- * userspace mappings.
+ * Split any HugeTLB folios in range [@start, @end), if any of the offsets in
+ * the folio are faultable. Return 0 on success or negative error otherwise.
+ *
+ * Will skip any folios that are already split.
  */
-static bool kvm_gmem_no_mappings_slot(struct kvm_memory_slot *slot,
-				      gfn_t start, gfn_t end)
+static int kvm_gmem_try_split_folios_range(struct inode *inode,
+					   pgoff_t start, pgoff_t end)
 {
-	pgoff_t offset_start;
-	pgoff_t offset_end;
-	struct file *file;
-	bool ret;
-
-	offset_start = start - slot->base_gfn + slot->gmem.pgoff;
-	offset_end = end - slot->base_gfn + slot->gmem.pgoff;
-
-	file = kvm_gmem_get_file(slot);
-	if (!file)
-		return false;
-
-	ret = kvm_gmem_no_mappings_range(file_inode(file), offset_start, offset_end);
+	unsigned int nr_pages;
+	pgoff_t aligned_start;
+	pgoff_t aligned_end;
+	struct hstate *h;
+	pgoff_t index;
+	int ret;
 
-	fput(file);
+	if (!is_kvm_gmem_hugetlb(inode))
+		return 0;
 
-	return ret;
-}
+	h = kvm_gmem_hgmem(inode)->h;
+	nr_pages = 1UL << huge_page_order(h);
 
-/**
- * Returns true if pages in range [@start, @end) have no host userspace mappings.
- */
-static bool kvm_gmem_no_mappings(struct kvm *kvm, gfn_t start, gfn_t end)
-{
-	int i;
+	aligned_start = round_down(start, nr_pages);
+	aligned_end = round_up(end, nr_pages);
 
-	lockdep_assert_held(&kvm->slots_lock);
+	ret = 0;
+	for (index = aligned_start; !ret && index < aligned_end; index += nr_pages) {
+		struct folio *folio;
+		u32 hash;
 
-	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
-		struct kvm_memslot_iter iter;
-		struct kvm_memslots *slots;
+		hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);
 
-		slots = __kvm_memslots(kvm, i);
-		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
-			struct kvm_memory_slot *slot;
-			gfn_t gfn_start;
-			gfn_t gfn_end;
-
-			slot = iter.slot;
-			gfn_start = max(start, slot->base_gfn);
-			gfn_end = min(end, slot->base_gfn + slot->npages);
+		folio = filemap_get_folio(inode->i_mapping, index);
+		if (!IS_ERR(folio)) {
+			/*
+			 * Drop refcount so that the only references held are refcounts
+			 * from the filemap.
+			 */
+			folio_put(folio);
 
-			if (iter.slot->flags & KVM_MEM_GUEST_MEMFD &&
-			    !kvm_gmem_no_mappings_slot(iter.slot, gfn_start, gfn_end))
-				return false;
+			if (kvm_gmem_is_any_faultable(inode, index, nr_pages)) {
+				ret = kvm_gmem_split_folio_in_filemap(h, folio);
+				if (ret) {
+					/* TODO cleanup properly. */
+					WARN_ON(ret);
+				}
+			}
 		}
+
+		hugetlb_fault_mutex_unlock(hash);
 	}
 
-	return true;
+	return ret;
 }
 
 /**
- * Set faultability of given range of gfns [@start, @end) in memslot @slot to
- * @faultable.
+ * Returns 0 if guest_memfd permits setting range [@start, @end) with
+ * faultability @faultable within memslot @slot, or negative error otherwise.
+ *
+ * If a request was made to set the memory to PRIVATE (not faultable), the pages
+ * in the range must not be pinned or mapped for the request to be permitted.
+ *
+ * Because this may allow pages to be faulted in to userspace when requested to
+ * set attributes to shared, this must only be called after the pages have been
+ * invalidated from guest page tables.
  */
-static void kvm_gmem_set_faultable_slot(struct kvm_memory_slot *slot, gfn_t start,
-					gfn_t end, bool faultable)
+static int kvm_gmem_try_set_faultable_slot(struct kvm_memory_slot *slot,
+					   gfn_t start, gfn_t end,
+					   bool faultable)
 {
 	pgoff_t start_offset;
+	struct inode *inode;
 	pgoff_t end_offset;
 	struct file *file;
+	int ret;
 
 	file = kvm_gmem_get_file(slot);
 	if (!file)
-		return;
+		return 0;
 
 	start_offset = start - slot->base_gfn + slot->gmem.pgoff;
 	end_offset = end - slot->base_gfn + slot->gmem.pgoff;
 
-	WARN_ON(kvm_gmem_set_faultable(file_inode(file), start_offset, end_offset,
-				       faultable));
+	inode = file_inode(file);
+
+	/*
+	 * Use filemap_invalidate_lock() to make sure
+	 * splitting/reconstruction doesn't race with faultability updates.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+
+	kvm_gmem_set_faultable(inode, start_offset, end_offset, faultable);
+
+	if (faultable) {
+		ret = kvm_gmem_try_split_folios_range(inode, start_offset,
+						      end_offset);
+	} else {
+		if (kvm_gmem_no_mappings_range(inode, start_offset, end_offset)) {
+			ret = kvm_gmem_try_reconstruct_folios_range(inode,
+								    start_offset,
+								    end_offset);
+		} else {
+			ret = -EINVAL;
+		}
+	}
+
+	filemap_invalidate_unlock(inode->i_mapping);
 
 	fput(file);
+
+	return ret;
 }
 
 /**
- * Set faultability of given range of gfns [@start, @end) in memslot @slot to
- * @faultable.
+ * Returns 0 if guest_memfd permits setting range [@start, @end) with
+ * faultability @faultable within VM @kvm, or negative error otherwise.
+ *
+ * See kvm_gmem_try_set_faultable_slot() for details.
  */
-static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
-				      bool faultable)
+static int kvm_gmem_try_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
+					 bool faultable)
 {
 	int i;
 
@@ -1866,43 +2122,15 @@ static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
 			gfn_end = min(end, slot->base_gfn + slot->npages);
 
 			if (iter.slot->flags & KVM_MEM_GUEST_MEMFD) {
-				kvm_gmem_set_faultable_slot(slot, gfn_start,
-							    gfn_end, faultable);
+				int ret;
+
+				ret = kvm_gmem_try_set_faultable_slot(slot, gfn_start,
+								      gfn_end, faultable);
+				if (ret)
+					return ret;
 			}
 		}
 	}
-}
-
-/**
- * Returns true if guest_memfd permits setting range [@start, @end) to PRIVATE.
- *
- * If memory is faulted in to host userspace and a request was made to set the
- * memory to PRIVATE, the faulted in pages must not be pinned for the request to
- * be permitted.
- */
-static int kvm_gmem_should_set_attributes_private(struct kvm *kvm, gfn_t start,
-						  gfn_t end)
-{
-	kvm_gmem_set_faultable_vm(kvm, start, end, false);
-
-	if (kvm_gmem_no_mappings(kvm, start, end))
-		return 0;
-
-	kvm_gmem_set_faultable_vm(kvm, start, end, true);
-	return -EINVAL;
-}
-
-/**
- * Returns true if guest_memfd permits setting range [@start, @end) to SHARED.
- *
- * Because this allows pages to be faulted in to userspace, this must only be
- * called after the pages have been invalidated from guest page tables.
- */
-static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
-						 gfn_t end)
-{
-	/* Always okay to set shared, hence set range faultable here. */
-	kvm_gmem_set_faultable_vm(kvm, start, end, true);
 
 	return 0;
 }
@@ -1922,10 +2150,16 @@ static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
 int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 				   unsigned long attrs)
 {
-	if (attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE)
-		return kvm_gmem_should_set_attributes_private(kvm, start, end);
-	else
-		return kvm_gmem_should_set_attributes_shared(kvm, start, end);
+	bool faultable;
+	int ret;
+
+	faultable = !(attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+	ret = kvm_gmem_try_set_faultable_vm(kvm, start, end, faultable);
+	if (ret)
+		WARN_ON(kvm_gmem_try_set_faultable_vm(kvm, start, end, !faultable));
+
+	return ret;
 }
 
 #endif
-- 
2.46.0.598.g6f2099f65c-goog
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 10 months, 1 week ago
On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> +/*
> + * Allocates and then caches a folio in the filemap. Returns a folio with
> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> + */
> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> +							    pgoff_t index)
> +{
> +	struct kvm_gmem_hugetlb *hgmem;
> +	pgoff_t aligned_index;
> +	struct folio *folio;
> +	int nr_pages;
> +	int ret;
> +
> +	hgmem = kvm_gmem_hgmem(inode);
> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> +	if (IS_ERR(folio))
> +		return folio;
> +
> +	nr_pages = 1UL << huge_page_order(hgmem->h);
> +	aligned_index = round_down(index, nr_pages);
Maybe a gap here.

When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
corresponding GFN is not 2M/1G aligned.

However, TDX requires that private huge pages be 2M aligned in GFN.
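
The index/GFN relation at play here, using the same arithmetic as
kvm_gmem_try_set_faultable_slot() in this patch (sketch only):

	index = gfn - slot->base_gfn + slot->gmem.pgoff;

so a hugepage-aligned index only implies a hugepage-aligned GFN when
slot->base_gfn and slot->gmem.pgoff are misaligned from the hugepage
boundary by the same amount.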

> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
> +						 aligned_index,
> +						 htlb_alloc_mask(hgmem->h));
> +	WARN_ON(ret);
> +
>  	spin_lock(&inode->i_lock);
>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
>  	spin_unlock(&inode->i_lock);
>  
> -	return page_folio(requested_page);
> +	return folio;
> +}
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months, 2 weeks ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
>> +/*
>> + * Allocates and then caches a folio in the filemap. Returns a folio with
>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
>> + */
>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
>> +							    pgoff_t index)
>> +{
>> +	struct kvm_gmem_hugetlb *hgmem;
>> +	pgoff_t aligned_index;
>> +	struct folio *folio;
>> +	int nr_pages;
>> +	int ret;
>> +
>> +	hgmem = kvm_gmem_hgmem(inode);
>> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
>> +	if (IS_ERR(folio))
>> +		return folio;
>> +
>> +	nr_pages = 1UL << huge_page_order(hgmem->h);
>> +	aligned_index = round_down(index, nr_pages);
> Maybe a gap here.
>
> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> corresponding GFN is not 2M/1G aligned.

Thanks for looking into this.

In 1G page support for guest_memfd, the offset and size are always
hugepage aligned to the hugepage size requested at guest_memfd creation
time, and it is true that when binding to a memslot, slot->base_gfn and
slot->npages may not be hugepage aligned.

>
> However, TDX requires that private huge pages be 2M aligned in GFN.
>

IIUC other factors also contribute to determining the mapping level in
the guest page tables, like lpage_info and .private_max_mapping_level()
in kvm_x86_ops.

If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
will track that and not allow faulting into guest page tables at higher
granularity.

Hence I think it is okay to leave it to KVM to fault pages into the
guest correctly. For guest_memfd will just maintain the invariant that
offset and size are hugepage aligned, but not require that
slot->base_gfn and slot->npages are hugepage aligned. This behavior will
be consistent with other backing memory for guests like regular shmem or
HugeTLB.

>> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
>> +						 aligned_index,
>> +						 htlb_alloc_mask(hgmem->h));
>> +	WARN_ON(ret);
>> +
>>  	spin_lock(&inode->i_lock);
>>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
>>  	spin_unlock(&inode->i_lock);
>>  
>> -	return page_folio(requested_page);
>> +	return folio;
>> +}
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 2 weeks ago
On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> >> +/*
> >> + * Allocates and then caches a folio in the filemap. Returns a folio with
> >> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> >> + */
> >> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> >> +							    pgoff_t index)
> >> +{
> >> +	struct kvm_gmem_hugetlb *hgmem;
> >> +	pgoff_t aligned_index;
> >> +	struct folio *folio;
> >> +	int nr_pages;
> >> +	int ret;
> >> +
> >> +	hgmem = kvm_gmem_hgmem(inode);
> >> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> >> +	if (IS_ERR(folio))
> >> +		return folio;
> >> +
> >> +	nr_pages = 1UL << huge_page_order(hgmem->h);
> >> +	aligned_index = round_down(index, nr_pages);
> > Maybe a gap here.
> >
> > When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> > 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> > corresponding GFN is not 2M/1G aligned.
> 
> Thanks for looking into this.
> 
> In 1G page support for guest_memfd, the offset and size are always
> hugepage aligned to the hugepage size requested at guest_memfd creation
> time, and it is true that when binding to a memslot, slot->base_gfn and
> slot->npages may not be hugepage aligned.
> 
> >
> > However, TDX requires that private huge pages be 2M aligned in GFN.
> >
> 
> IIUC other factors also contribute to determining the mapping level in
> the guest page tables, like lpage_info and .private_max_mapping_level()
> in kvm_x86_ops.
>
> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> will track that and not allow faulting into guest page tables at higher
> granularity.
 
lpage_info only checks the alignments of slot->base_gfn and
slot->base_gfn + npages. e.g.,

if slot->base_gfn is 8K, npages is 8M, then for this slot,
lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);

  ---------------------------------------------------------
  |          |  |          |  |          |  |          |  |
  8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K

For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
So, guest_memfd allocates the same huge folio of 2M order for them.

However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
It's also weird for a 2M mapping in KVM to stride across 2 huge folios.

> Hence I think it is okay to leave it to KVM to fault pages into the
> guest correctly. For guest_memfd will just maintain the invariant that
> offset and size are hugepage aligned, but not require that
> slot->base_gfn and slot->npages are hugepage aligned. This behavior will
> be consistent with other backing memory for guests like regular shmem or
> HugeTLB.
> 
> >> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
> >> +						 aligned_index,
> >> +						 htlb_alloc_mask(hgmem->h));
> >> +	WARN_ON(ret);
> >> +
> >>  	spin_lock(&inode->i_lock);
> >>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
> >>  	spin_unlock(&inode->i_lock);
> >>  
> >> -	return page_folio(requested_page);
> >> +	return folio;
> >> +}
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 2 weeks ago
On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> > Yan Zhao <yan.y.zhao@intel.com> writes:
> > 
> > > On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> > >> +/*
> > >> + * Allocates and then caches a folio in the filemap. Returns a folio with
> > >> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> > >> + */
> > >> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> > >> +							    pgoff_t index)
> > >> +{
> > >> +	struct kvm_gmem_hugetlb *hgmem;
> > >> +	pgoff_t aligned_index;
> > >> +	struct folio *folio;
> > >> +	int nr_pages;
> > >> +	int ret;
> > >> +
> > >> +	hgmem = kvm_gmem_hgmem(inode);
> > >> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> > >> +	if (IS_ERR(folio))
> > >> +		return folio;
> > >> +
> > >> +	nr_pages = 1UL << huge_page_order(hgmem->h);
> > >> +	aligned_index = round_down(index, nr_pages);
> > > Maybe a gap here.
> > >
> > > When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> > > 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> > > corresponding GFN is not 2M/1G aligned.
> > 
> > Thanks for looking into this.
> > 
> > In 1G page support for guest_memfd, the offset and size are always
> > hugepage aligned to the hugepage size requested at guest_memfd creation
> > time, and it is true that when binding to a memslot, slot->base_gfn and
> > slot->npages may not be hugepage aligned.
> > 
> > >
> > > However, TDX requires that private huge pages be 2M aligned in GFN.
> > >
> > 
> > IIUC other factors also contribute to determining the mapping level in
> > the guest page tables, like lpage_info and .private_max_mapping_level()
> > in kvm_x86_ops.
> >
> > If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> > will track that and not allow faulting into guest page tables at higher
> > granularity.
>  
> lpage_info only checks the alignments of slot->base_gfn and
> slot->base_gfn + npages. e.g.,
> 
> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> 
>   ---------------------------------------------------------
>   |          |  |          |  |          |  |          |  |
>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> 
> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> So, guest_memfd allocates the same huge folio of 2M order for them.
Sorry, sent too fast this morning. The example is not right. The correct
one is:

For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
KVM will create a 2M mapping for them.

However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
same 2M folio and physical addresses may not be contiguous.


> However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
> It's also weird for a 2M mapping in KVM to stride across 2 huge folios.
> 
> > Hence I think it is okay to leave it to KVM to fault pages into the
> > guest correctly. For guest_memfd will just maintain the invariant that
> > offset and size are hugepage aligned, but not require that
> > slot->base_gfn and slot->npages are hugepage aligned. This behavior will
> > be consistent with other backing memory for guests like regular shmem or
> > HugeTLB.
> > 
> > >> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
> > >> +						 aligned_index,
> > >> +						 htlb_alloc_mask(hgmem->h));
> > >> +	WARN_ON(ret);
> > >> +
> > >>  	spin_lock(&inode->i_lock);
> > >>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
> > >>  	spin_unlock(&inode->i_lock);
> > >>  
> > >> -	return page_folio(requested_page);
> > >> +	return folio;
> > >> +}
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Chenyi Qiang 9 months, 2 weeks ago

On 4/24/2025 12:25 PM, Yan Zhao wrote:
> On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
>> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
>>> Yan Zhao <yan.y.zhao@intel.com> writes:
>>>
>>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
>>>>> +/*
>>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
>>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
>>>>> + */
>>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
>>>>> +							    pgoff_t index)
>>>>> +{
>>>>> +	struct kvm_gmem_hugetlb *hgmem;
>>>>> +	pgoff_t aligned_index;
>>>>> +	struct folio *folio;
>>>>> +	int nr_pages;
>>>>> +	int ret;
>>>>> +
>>>>> +	hgmem = kvm_gmem_hgmem(inode);
>>>>> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
>>>>> +	if (IS_ERR(folio))
>>>>> +		return folio;
>>>>> +
>>>>> +	nr_pages = 1UL << huge_page_order(hgmem->h);
>>>>> +	aligned_index = round_down(index, nr_pages);
>>>> Maybe a gap here.
>>>>
>>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
>>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
>>>> corresponding GFN is not 2M/1G aligned.
>>>
>>> Thanks for looking into this.
>>>
>>> In 1G page support for guest_memfd, the offset and size are always
>>> hugepage aligned to the hugepage size requested at guest_memfd creation
>>> time, and it is true that when binding to a memslot, slot->base_gfn and
>>> slot->npages may not be hugepage aligned.
>>>
>>>>
>>>> However, TDX requires that private huge pages be 2M aligned in GFN.
>>>>
>>>
>>> IIUC other factors also contribute to determining the mapping level in
>>> the guest page tables, like lpage_info and .private_max_mapping_level()
>>> in kvm_x86_ops.
>>>
>>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
>>> will track that and not allow faulting into guest page tables at higher
>>> granularity.
>>  
>> lpage_info only checks the alignments of slot->base_gfn and
>> slot->base_gfn + npages. e.g.,
>>
>> if slot->base_gfn is 8K, npages is 8M, then for this slot,
>> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
>> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
>> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
>> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);

Should it be?
lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);

>>
>>   ---------------------------------------------------------
>>   |          |  |          |  |          |  |          |  |
>>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
>>
>> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
>> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
>> So, guest_memfd allocates the same huge folio of 2M order for them.
> Sorry, sent too fast this morning. The example is not right. The correct
> one is:
> 
> For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> KVM will create a 2M mapping for them.
> 
> However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> same 2M folio and physical addresses may not be contiguous.
> 
> 
>> However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
>> It's also weird for a 2M mapping in KVM to stride across 2 huge folios.
>>
>>> Hence I think it is okay to leave it to KVM to fault pages into the
>>> guest correctly. For guest_memfd will just maintain the invariant that
>>> offset and size are hugepage aligned, but not require that
>>> slot->base_gfn and slot->npages are hugepage aligned. This behavior will
>>> be consistent with other backing memory for guests like regular shmem or
>>> HugeTLB.
>>>
>>>>> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
>>>>> +						 aligned_index,
>>>>> +						 htlb_alloc_mask(hgmem->h));
>>>>> +	WARN_ON(ret);
>>>>> +
>>>>>  	spin_lock(&inode->i_lock);
>>>>>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
>>>>>  	spin_unlock(&inode->i_lock);
>>>>>  
>>>>> -	return page_folio(requested_page);
>>>>> +	return folio;
>>>>> +}
>
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 2 weeks ago
On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
> 
> 
> On 4/24/2025 12:25 PM, Yan Zhao wrote:
> > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> >>> Yan Zhao <yan.y.zhao@intel.com> writes:
> >>>
> >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> >>>>> +/*
> >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
> >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> >>>>> + */
> >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> >>>>> +							    pgoff_t index)
> >>>>> +{
> >>>>> +	struct kvm_gmem_hugetlb *hgmem;
> >>>>> +	pgoff_t aligned_index;
> >>>>> +	struct folio *folio;
> >>>>> +	int nr_pages;
> >>>>> +	int ret;
> >>>>> +
> >>>>> +	hgmem = kvm_gmem_hgmem(inode);
> >>>>> +	folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> >>>>> +	if (IS_ERR(folio))
> >>>>> +		return folio;
> >>>>> +
> >>>>> +	nr_pages = 1UL << huge_page_order(hgmem->h);
> >>>>> +	aligned_index = round_down(index, nr_pages);
> >>>> Maybe a gap here.
> >>>>
> >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> >>>> corresponding GFN is not 2M/1G aligned.
> >>>
> >>> Thanks for looking into this.
> >>>
> >>> In 1G page support for guest_memfd, the offset and size are always
> >>> hugepage aligned to the hugepage size requested at guest_memfd creation
> >>> time, and it is true that when binding to a memslot, slot->base_gfn and
> >>> slot->npages may not be hugepage aligned.
> >>>
> >>>>
> >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
> >>>>
> >>>
> >>> IIUC other factors also contribute to determining the mapping level in
> >>> the guest page tables, like lpage_info and .private_max_mapping_level()
> >>> in kvm_x86_ops.
> >>>
> >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> >>> will track that and not allow faulting into guest page tables at higher
> >>> granularity.
> >>  
> >> lpage_info only checks the alignments of slot->base_gfn and
> >> slot->base_gfn + npages. e.g.,
> >>
> >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> 
> Should it be?
> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
> lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
> lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
Right. Good catch. Thanks!

Let me update the example as below:
slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)

lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);

lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
2MB folios, whose physical addresses may not be contiguous.

Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
However, guest_memfd just allocates the same 2MB folio for both faults.
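
A short sketch of the arithmetic in this example (illustration only, with
slot->base_gfn = 2, slot->gmem.pgoff = 0 and 2MB huge pages, i.e.
nr_pages = 512):

	index = gfn - slot->base_gfn + slot->gmem.pgoff;
	aligned_index = round_down(index, nr_pages);

	/* GPA 4MB:      gfn 1024 -> index 1022 -> aligned_index 512  */
	/* GPA 4MB+16KB: gfn 1028 -> index 1026 -> aligned_index 1024 */

so the single 2MB mapping that lpage_info permits over GPA [4MB, 6MB) would
straddle the two guest_memfd folios at indices 512 and 1024.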


> 
> >>
> >>   ---------------------------------------------------------
> >>   |          |  |          |  |          |  |          |  |
> >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> >>
> >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> >> So, guest_memfd allocates the same huge folio of 2M order for them.
> > Sorry, sent too fast this morning. The example is not right. The correct
> > one is:
> > 
> > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> > KVM will create a 2M mapping for them.
> > 
> > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> > same 2M folio and physical addresses may not be contiguous.
> > 
> > 
> >> However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
> >> It's also weird for a 2M mapping in KVM to stride across 2 huge folios.
> >>
> >>> Hence I think it is okay to leave it to KVM to fault pages into the
> >>> guest correctly. For guest_memfd will just maintain the invariant that
> >>> offset and size are hugepage aligned, but not require that
> >>> slot->base_gfn and slot->npages are hugepage aligned. This behavior will
> >>> be consistent with other backing memory for guests like regular shmem or
> >>> HugeTLB.
> >>>
> >>>>> +	ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
> >>>>> +						 aligned_index,
> >>>>> +						 htlb_alloc_mask(hgmem->h));
> >>>>> +	WARN_ON(ret);
> >>>>> +
> >>>>>  	spin_lock(&inode->i_lock);
> >>>>>  	inode->i_blocks += blocks_per_huge_page(hgmem->h);
> >>>>>  	spin_unlock(&inode->i_lock);
> >>>>>  
> >>>>> -	return page_folio(requested_page);
> >>>>> +	return folio;
> >>>>> +}
> > 
>
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Vishal Annapurve 9 months, 2 weeks ago
On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
> >
> >
> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
> > >>>
> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> > >>>>> +/*
> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> > >>>>> + */
> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> > >>>>> +                                                           pgoff_t index)
> > >>>>> +{
> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
> > >>>>> +       pgoff_t aligned_index;
> > >>>>> +       struct folio *folio;
> > >>>>> +       int nr_pages;
> > >>>>> +       int ret;
> > >>>>> +
> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> > >>>>> +       if (IS_ERR(folio))
> > >>>>> +               return folio;
> > >>>>> +
> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
> > >>>>> +       aligned_index = round_down(index, nr_pages);
> > >>>> Maybe a gap here.
> > >>>>
> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> > >>>> corresponding GFN is not 2M/1G aligned.
> > >>>
> > >>> Thanks for looking into this.
> > >>>
> > >>> In 1G page support for guest_memfd, the offset and size are always
> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
> > >>> slot->npages may not be hugepage aligned.
> > >>>
> > >>>>
> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
> > >>>>
> > >>>
> > >>> IIUC other factors also contribute to determining the mapping level in
> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
> > >>> in kvm_x86_ops.
> > >>>
> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> > >>> will track that and not allow faulting into guest page tables at higher
> > >>> granularity.
> > >>
> > >> lpage_info only checks the alignments of slot->base_gfn and
> > >> slot->base_gfn + npages. e.g.,
> > >>
> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> >
> > Should it be?
> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
> Right. Good catch. Thanks!
>
> Let me update the example as below:
> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
>
> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
>
> lpage_info indicates that a 2MB mapping is alllowed to cover GPA 4MB and GPA
> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
> 2MB folios, whose physical addresses may not be contiguous.
>
> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
> However, guest_memfd just allocates the same 2MB folio for both faults.
>
>
> >
> > >>
> > >>   ---------------------------------------------------------
> > >>   |          |  |          |  |          |  |          |  |
> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> > >>
> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
> > > Sorry, sent too fast this morning. The example is not right. The correct
> > > one is:
> > >
> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> > > KVM will create a 2M mapping for them.
> > >
> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> > > same 2M folio and physical addresses may not be contiguous.

Then during binding, the guest_memfd offset misalignment with respect to the
hugepage size should be the same as the GFN misalignment, i.e.

(offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
~huge_page_mask(h));

For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
are not hugepage aligned, so guest_memfd should also be able to
support non-hugepage aligned memslots.
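
A minimal sketch of such a binding-time check (hypothetical helper name, only
restating the expression above with huge_page_mask() from the hstate used by
the guest_memfd):

	/* Hypothetical check at memslot binding time (not part of this series). */
	static bool kvm_gmem_misalignment_matches(struct kvm_memory_slot *slot,
						  struct hstate *h)
	{
		loff_t offset = (loff_t)slot->gmem.pgoff << PAGE_SHIFT;
		gpa_t gpa = (gpa_t)slot->base_gfn << PAGE_SHIFT;

		return (offset & ~huge_page_mask(h)) == (gpa & ~huge_page_mask(h));
	}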

> > >
> > >
> > >> However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
> > >> It's also weird for a 2M mapping in KVM to stride across 2 huge folios.
> > >>
> > >>> Hence I think it is okay to leave it to KVM to fault pages into the
> > >>> guest correctly. For guest_memfd will just maintain the invariant that
> > >>> offset and size are hugepage aligned, but not require that
> > >>> slot->base_gfn and slot->npages are hugepage aligned. This behavior will
> > >>> be consistent with other backing memory for guests like regular shmem or
> > >>> HugeTLB.
> > >>>
> > >>>>> +       ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
> > >>>>> +                                                aligned_index,
> > >>>>> +                                                htlb_alloc_mask(hgmem->h));
> > >>>>> +       WARN_ON(ret);
> > >>>>> +
> > >>>>>         spin_lock(&inode->i_lock);
> > >>>>>         inode->i_blocks += blocks_per_huge_page(hgmem->h);
> > >>>>>         spin_unlock(&inode->i_lock);
> > >>>>>
> > >>>>> -       return page_folio(requested_page);
> > >>>>> +       return folio;
> > >>>>> +}
> > >
> >
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months, 2 weeks ago
Vishal Annapurve <vannapurve@google.com> writes:

> On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
>>
>> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
>> >
>> >
>> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
>> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
>> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
>> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> > >>>
>> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
>> > >>>>> +/*
>> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
>> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
>> > >>>>> + */
>> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
>> > >>>>> +                                                           pgoff_t index)
>> > >>>>> +{
>> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
>> > >>>>> +       pgoff_t aligned_index;
>> > >>>>> +       struct folio *folio;
>> > >>>>> +       int nr_pages;
>> > >>>>> +       int ret;
>> > >>>>> +
>> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
>> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
>> > >>>>> +       if (IS_ERR(folio))
>> > >>>>> +               return folio;
>> > >>>>> +
>> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
>> > >>>>> +       aligned_index = round_down(index, nr_pages);
>> > >>>> Maybe a gap here.
>> > >>>>
>> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
>> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
>> > >>>> corresponding GFN is not 2M/1G aligned.
>> > >>>
>> > >>> Thanks for looking into this.
>> > >>>
>> > >>> In 1G page support for guest_memfd, the offset and size are always
>> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
>> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
>> > >>> slot->npages may not be hugepage aligned.
>> > >>>
>> > >>>>
>> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
>> > >>>>
>> > >>>
>> > >>> IIUC other factors also contribute to determining the mapping level in
>> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
>> > >>> in kvm_x86_ops.
>> > >>>
>> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
>> > >>> will track that and not allow faulting into guest page tables at higher
>> > >>> granularity.
>> > >>
>> > >> lpage_info only checks the alignments of slot->base_gfn and
>> > >> slot->base_gfn + npages. e.g.,
>> > >>
>> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
>> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
>> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
>> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
>> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
>> >
>> > Should it be?
>> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
>> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
>> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
>> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
>> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
>> Right. Good catch. Thanks!
>>
>> Let me update the example as below:
>> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
>>
>> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
>> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
>> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
>> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
>> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
>>
>> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
>> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
>> 2MB folios, whose physical addresses may not be contiguous.
>>
>> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
>> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
>> However, guest_memfd just allocates the same 2MB folio for both faults.
>>
>>
>> >
>> > >>
>> > >>   ---------------------------------------------------------
>> > >>   |          |  |          |  |          |  |          |  |
>> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
>> > >>
>> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
>> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
>> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
>> > > Sorry, sent too fast this morning. The example is not right. The correct
>> > > one is:
>> > >
>> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
>> > > KVM will create a 2M mapping for them.
>> > >
>> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
>> > > same 2M folio and physical addresses may not be contiguous.
>
> Then during binding, guest memfd offset misalignment with hugepage
> should be same as gfn misalignment. i.e.
>
> (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
> ~huge_page_mask(h));
>
> For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
> are not hugepage aligned, so guest_memfd should also be able to
> support non-hugepage aligned memslots.
>

I drew up a picture [1] which hopefully clarifies this.

Thanks for pointing this out. I understand better now, and we will add an
extra constraint during memslot binding of guest_memfd to check that gfn
offsets within a hugepage match the guest_memfd offsets within a hugepage.

Adding checks at binding time will still allow hugepage-unaligned offsets
(for parity with non-guest_memfd backing memory) while fixing this issue.
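
As a concrete instance of that check (reusing the 8KB-offset slot from the
example earlier in this thread, with 2MB hugepages; the arithmetic is only
illustrative):

  gpa_offset_in_2M  = (slot->base_gfn << PAGE_SHIFT) & (SZ_2M - 1) = 8KB
  gmem_offset_in_2M = (slot->gmem.pgoff << PAGE_SHIFT) & (SZ_2M - 1)

  gmem.pgoff = 0 -> gmem_offset_in_2M = 0   != 8KB -> binding rejected
  gmem.pgoff = 2 -> gmem_offset_in_2M = 8KB == 8KB -> binding allowed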

lpage_info will make sure that ranges near the bounds will be
fragmented, but the hugepages in the middle will still be mappable as
hugepages.

[1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg

>> > >
>> > >
>> > >> However, for TDX, GFN 6M and GFN 6M+4K should not belong to the same folio.
>> > >> It's also weird for a 2M mapping in KVM to stride across 2 huge folios.
>> > >>
>> > >>> Hence I think it is okay to leave it to KVM to fault pages into the
>> > >>> guest correctly. For guest_memfd will just maintain the invariant that
>> > >>> offset and size are hugepage aligned, but not require that
>> > >>> slot->base_gfn and slot->npages are hugepage aligned. This behavior will
>> > >>> be consistent with other backing memory for guests like regular shmem or
>> > >>> HugeTLB.
>> > >>>
>> > >>>>> +       ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
>> > >>>>> +                                                aligned_index,
>> > >>>>> +                                                htlb_alloc_mask(hgmem->h));
>> > >>>>> +       WARN_ON(ret);
>> > >>>>> +
>> > >>>>>         spin_lock(&inode->i_lock);
>> > >>>>>         inode->i_blocks += blocks_per_huge_page(hgmem->h);
>> > >>>>>         spin_unlock(&inode->i_lock);
>> > >>>>>
>> > >>>>> -       return page_folio(requested_page);
>> > >>>>> +       return folio;
>> > >>>>> +}
>> > >
>> >
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 2 weeks ago
On Thu, Apr 24, 2025 at 11:15:11AM -0700, Ackerley Tng wrote:
> Vishal Annapurve <vannapurve@google.com> writes:
> 
> > On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>
> >> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
> >> >
> >> >
> >> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
> >> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> >> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> >> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
> >> > >>>
> >> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> >> > >>>>> +/*
> >> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
> >> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> >> > >>>>> + */
> >> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> >> > >>>>> +                                                           pgoff_t index)
> >> > >>>>> +{
> >> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
> >> > >>>>> +       pgoff_t aligned_index;
> >> > >>>>> +       struct folio *folio;
> >> > >>>>> +       int nr_pages;
> >> > >>>>> +       int ret;
> >> > >>>>> +
> >> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
> >> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> >> > >>>>> +       if (IS_ERR(folio))
> >> > >>>>> +               return folio;
> >> > >>>>> +
> >> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
> >> > >>>>> +       aligned_index = round_down(index, nr_pages);
> >> > >>>> Maybe a gap here.
> >> > >>>>
> >> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> >> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> >> > >>>> corresponding GFN is not 2M/1G aligned.
> >> > >>>
> >> > >>> Thanks for looking into this.
> >> > >>>
> >> > >>> In 1G page support for guest_memfd, the offset and size are always
> >> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
> >> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
> >> > >>> slot->npages may not be hugepage aligned.
> >> > >>>
> >> > >>>>
> >> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
> >> > >>>>
> >> > >>>
> >> > >>> IIUC other factors also contribute to determining the mapping level in
> >> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
> >> > >>> in kvm_x86_ops.
> >> > >>>
> >> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> >> > >>> will track that and not allow faulting into guest page tables at higher
> >> > >>> granularity.
> >> > >>
> >> > >> lpage_info only checks the alignments of slot->base_gfn and
> >> > >> slot->base_gfn + npages. e.g.,
> >> > >>
> >> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> >> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> >> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> >> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> >> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> >> >
> >> > Should it be?
> >> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
> >> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
> >> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
> >> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
> >> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
> >> Right. Good catch. Thanks!
> >>
> >> Let me update the example as below:
> >> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
> >>
> >> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
> >> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
> >> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
> >> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
> >> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
> >>
> >> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
> >> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
> >> 2MB folios, whose physical addresses may not be contiguous.
> >>
> >> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
> >> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
> >> However, guest_memfd just allocates the same 2MB folio for both faults.
> >>
> >>
> >> >
> >> > >>
> >> > >>   ---------------------------------------------------------
> >> > >>   |          |  |          |  |          |  |          |  |
> >> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> >> > >>
> >> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> >> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> >> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
> >> > > Sorry, sent too fast this morning. The example is not right. The correct
> >> > > one is:
> >> > >
> >> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> >> > > KVM will create a 2M mapping for them.
> >> > >
> >> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> >> > > same 2M folio and physical addresses may not be contiguous.
> >
> > Then during binding, guest memfd offset misalignment with hugepage
> > should be same as gfn misalignment. i.e.
> >
> > (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
> > ~huge_page_mask(h));
> >
> > For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
> > are not hugepage aligned, so guest_memfd should also be able to
> > support non-hugepage aligned memslots.
> >
> 
> I drew up a picture [1] which hopefully clarifies this.
> 
> Thanks for pointing this out, I understand better now and we will add an
> extra constraint during memslot binding of guest_memfd to check that gfn
> offsets within a hugepage must be guest_memfd offsets.
I'm a bit confused.

As "index = gfn - slot->base_gfn + slot->gmem.pgoff", do you mean you are going
to force "slot->base_gfn == slot->gmem.pgoff" ?

Some memory regions, e.g. "pc.ram", are divided into 2 parts:
- one with offset 0, size 0x80000000(2G),
  positioned at GPA 0, which is below GPA 4G;
- one with offset 0x80000000(2G), size 0x80000000(2G),
  positioned at GPA 0x100000000(4G), which is above GPA 4G.

For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
is 0x80000000.

> Adding checks at binding time will allow hugepage-unaligned offsets (to
> be at parity with non-guest_memfd backing memory) but still fix this
> issue.
> 
> lpage_info will make sure that ranges near the bounds will be
> fragmented, but the hugepages in the middle will still be mappable as
> hugepages.
> 
> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg


Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months, 2 weeks ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Thu, Apr 24, 2025 at 11:15:11AM -0700, Ackerley Tng wrote:
>> Vishal Annapurve <vannapurve@google.com> writes:
>> 
>> > On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
>> >>
>> >> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
>> >> >
>> >> >
>> >> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
>> >> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
>> >> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
>> >> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> >> > >>>
>> >> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
>> >> > >>>>> +/*
>> >> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
>> >> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
>> >> > >>>>> + */
>> >> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
>> >> > >>>>> +                                                           pgoff_t index)
>> >> > >>>>> +{
>> >> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
>> >> > >>>>> +       pgoff_t aligned_index;
>> >> > >>>>> +       struct folio *folio;
>> >> > >>>>> +       int nr_pages;
>> >> > >>>>> +       int ret;
>> >> > >>>>> +
>> >> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
>> >> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
>> >> > >>>>> +       if (IS_ERR(folio))
>> >> > >>>>> +               return folio;
>> >> > >>>>> +
>> >> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
>> >> > >>>>> +       aligned_index = round_down(index, nr_pages);
>> >> > >>>> Maybe a gap here.
>> >> > >>>>
>> >> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
>> >> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
>> >> > >>>> corresponding GFN is not 2M/1G aligned.
>> >> > >>>
>> >> > >>> Thanks for looking into this.
>> >> > >>>
>> >> > >>> In 1G page support for guest_memfd, the offset and size are always
>> >> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
>> >> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
>> >> > >>> slot->npages may not be hugepage aligned.
>> >> > >>>
>> >> > >>>>
>> >> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
>> >> > >>>>
>> >> > >>>
>> >> > >>> IIUC other factors also contribute to determining the mapping level in
>> >> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
>> >> > >>> in kvm_x86_ops.
>> >> > >>>
>> >> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
>> >> > >>> will track that and not allow faulting into guest page tables at higher
>> >> > >>> granularity.
>> >> > >>
>> >> > >> lpage_info only checks the alignments of slot->base_gfn and
>> >> > >> slot->base_gfn + npages. e.g.,
>> >> > >>
>> >> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
>> >> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
>> >> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
>> >> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
>> >> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
>> >> >
>> >> > Should it be?
>> >> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
>> >> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
>> >> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
>> >> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
>> >> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
>> >> Right. Good catch. Thanks!
>> >>
>> >> Let me update the example as below:
>> >> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
>> >>
>> >> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
>> >> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
>> >> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
>> >> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
>> >> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
>> >>
>> >> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
>> >> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
>> >> 2MB folios, whose physical addresses may not be contiguous.
>> >>
>> >> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
>> >> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
>> >> However, guest_memfd just allocates the same 2MB folio for both faults.
>> >>
>> >>
>> >> >
>> >> > >>
>> >> > >>   ---------------------------------------------------------
>> >> > >>   |          |  |          |  |          |  |          |  |
>> >> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
>> >> > >>
>> >> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
>> >> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
>> >> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
>> >> > > Sorry, sent too fast this morning. The example is not right. The correct
>> >> > > one is:
>> >> > >
>> >> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
>> >> > > KVM will create a 2M mapping for them.
>> >> > >
>> >> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
>> >> > > same 2M folio and physical addresses may not be contiguous.
>> >
>> > Then during binding, guest memfd offset misalignment with hugepage
>> > should be same as gfn misalignment. i.e.
>> >
>> > (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
>> > ~huge_page_mask(h));
>> >
>> > For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
>> > are not hugepage aligned, so guest_memfd should also be able to
>> > support non-hugepage aligned memslots.
>> >
>> 
>> I drew up a picture [1] which hopefully clarifies this.
>> 
>> Thanks for pointing this out, I understand better now and we will add an
>> extra constraint during memslot binding of guest_memfd to check that gfn
>> offsets within a hugepage must be guest_memfd offsets.
> I'm a bit confused.
>
> As "index = gfn - slot->base_gfn + slot->gmem.pgoff", do you mean you are going
> to force "slot->base_gfn == slot->gmem.pgoff" ?
>
> For some memory region, e.g., "pc.ram", it's divided into 2 parts:
> - one with offset 0, size 0x80000000(2G),
>   positioned at GPA 0, which is below GPA 4G;
> - one with offset 0x80000000(2G), size 0x80000000(2G),
>   positioned at GPA 0x100000000(4G), which is above GPA 4G.
>
> For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
> is 0x80000000.
>

Nope, I don't mean to enforce that they are equal; we just need the
offsets within the page to be equal.

I edited Vishal's code snippet; perhaps it will help explain better:

page_size is the size of the hugepage, so in our example,

  page_size = SZ_2M;
  page_mask = ~(page_size - 1);
  offset_within_page = slot->gmem.pgoff & page_mask;
  gfn_within_page = (slot->base_gfn << PAGE_SHIFT) & page_mask;

We will enforce that

  offset_within_page == gfn_within_page;

>> Adding checks at binding time will allow hugepage-unaligned offsets (to
>> be at parity with non-guest_memfd backing memory) but still fix this
>> issue.
>> 
>> lpage_info will make sure that ranges near the bounds will be
>> fragmented, but the hugepages in the middle will still be mappable as
>> hugepages.
>> 
>> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 2 weeks ago
On Fri, Apr 25, 2025 at 03:45:20PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > On Thu, Apr 24, 2025 at 11:15:11AM -0700, Ackerley Tng wrote:
> >> Vishal Annapurve <vannapurve@google.com> writes:
> >> 
> >> > On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >> >>
> >> >> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
> >> >> >
> >> >> >
> >> >> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
> >> >> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> >> >> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> >> >> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
> >> >> > >>>
> >> >> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> >> >> > >>>>> +/*
> >> >> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
> >> >> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> >> >> > >>>>> + */
> >> >> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> >> >> > >>>>> +                                                           pgoff_t index)
> >> >> > >>>>> +{
> >> >> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
> >> >> > >>>>> +       pgoff_t aligned_index;
> >> >> > >>>>> +       struct folio *folio;
> >> >> > >>>>> +       int nr_pages;
> >> >> > >>>>> +       int ret;
> >> >> > >>>>> +
> >> >> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
> >> >> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> >> >> > >>>>> +       if (IS_ERR(folio))
> >> >> > >>>>> +               return folio;
> >> >> > >>>>> +
> >> >> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
> >> >> > >>>>> +       aligned_index = round_down(index, nr_pages);
> >> >> > >>>> Maybe a gap here.
> >> >> > >>>>
> >> >> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> >> >> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> >> >> > >>>> corresponding GFN is not 2M/1G aligned.
> >> >> > >>>
> >> >> > >>> Thanks for looking into this.
> >> >> > >>>
> >> >> > >>> In 1G page support for guest_memfd, the offset and size are always
> >> >> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
> >> >> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
> >> >> > >>> slot->npages may not be hugepage aligned.
> >> >> > >>>
> >> >> > >>>>
> >> >> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
> >> >> > >>>>
> >> >> > >>>
> >> >> > >>> IIUC other factors also contribute to determining the mapping level in
> >> >> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
> >> >> > >>> in kvm_x86_ops.
> >> >> > >>>
> >> >> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> >> >> > >>> will track that and not allow faulting into guest page tables at higher
> >> >> > >>> granularity.
> >> >> > >>
> >> >> > >> lpage_info only checks the alignments of slot->base_gfn and
> >> >> > >> slot->base_gfn + npages. e.g.,
> >> >> > >>
> >> >> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> >> >> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> >> >> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> >> >> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> >> >> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> >> >> >
> >> >> > Should it be?
> >> >> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
> >> >> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
> >> >> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
> >> >> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
> >> >> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
> >> >> Right. Good catch. Thanks!
> >> >>
> >> >> Let me update the example as below:
> >> >> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
> >> >>
> >> >> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
> >> >> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
> >> >> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
> >> >> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
> >> >> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
> >> >>
> >> >> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
> >> >> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
> >> >> 2MB folios, whose physical addresses may not be contiguous.
> >> >>
> >> >> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
> >> >> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
> >> >> However, guest_memfd just allocates the same 2MB folio for both faults.
> >> >>
> >> >>
> >> >> >
> >> >> > >>
> >> >> > >>   ---------------------------------------------------------
> >> >> > >>   |          |  |          |  |          |  |          |  |
> >> >> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> >> >> > >>
> >> >> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> >> >> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> >> >> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
> >> >> > > Sorry, sent too fast this morning. The example is not right. The correct
> >> >> > > one is:
> >> >> > >
> >> >> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> >> >> > > KVM will create a 2M mapping for them.
> >> >> > >
> >> >> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> >> >> > > same 2M folio and physical addresses may not be contiguous.
> >> >
> >> > Then during binding, guest memfd offset misalignment with hugepage
> >> > should be same as gfn misalignment. i.e.
> >> >
> >> > (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
> >> > ~huge_page_mask(h));
> >> >
> >> > For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
> >> > are not hugepage aligned, so guest_memfd should also be able to
> >> > support non-hugepage aligned memslots.
> >> >
> >> 
> >> I drew up a picture [1] which hopefully clarifies this.
> >> 
> >> Thanks for pointing this out, I understand better now and we will add an
> >> extra constraint during memslot binding of guest_memfd to check that gfn
> >> offsets within a hugepage must be guest_memfd offsets.
> > I'm a bit confused.
> >
> > As "index = gfn - slot->base_gfn + slot->gmem.pgoff", do you mean you are going
> > to force "slot->base_gfn == slot->gmem.pgoff" ?
> >
> > For some memory region, e.g., "pc.ram", it's divided into 2 parts:
> > - one with offset 0, size 0x80000000(2G),
> >   positioned at GPA 0, which is below GPA 4G;
> > - one with offset 0x80000000(2G), size 0x80000000(2G),
> >   positioned at GPA 0x100000000(4G), which is above GPA 4G.
> >
> > For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
> > is 0x80000000.
> >
> 
> Nope I don't mean to enforce that they are equal, we just need the
> offsets within the page to be equal.
> 
> I edited Vishal's code snippet, perhaps it would help explain better:
> 
> page_size is the size of the hugepage, so in our example,
> 
>   page_size = SZ_2M;
>   page_mask = ~(page_size - 1);
page_mask = page_size - 1  ?

>   offset_within_page = slot->gmem.pgoff & page_mask;
>   gfn_within_page = (slot->base_gfn << PAGE_SHIFT) & page_mask;
> 
> We will enforce that
> 
>   offset_within_page == gfn_within_page;
For "pc.ram", if it has 2.5G below 4G, it would be configured as follows
- slot 1: slot->gmem.pgoff=0, base GPA 0, size=2.5G
- slot 2: slot->gmem.pgoff=2.5G, base GPA 4G, size=1.5G

When binding these two slots to the same guest_memfd created with flag
KVM_GUEST_MEMFD_HUGE_1GB: 
- binding the 1st slot will succeed;
- binding the 2nd slot will fail.
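
Spelling out the 1GB alignment arithmetic (for illustration only):

  slot 1: gmem.pgoff offset within 1G = 0,    GPA offset within 1G = 0 -> match
  slot 2: gmem.pgoff offset within 1G = 0.5G, GPA offset within 1G = 0 -> mismatch

so slot 2 is the one the proposed bind-time check rejects.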

What options does userspace have in this scenario?
It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
isn't ideal either.

What about something like the diff below?

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index d2feacd14786..87c33704a748 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
        }

        *pfn = folio_file_pfn(folio, index);
-       if (max_order)
-               *max_order = folio_order(folio);
+       if (max_order) {
+               int order;
+
+               order = folio_order(folio);
+
+               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
+                       order--;
+
+               *max_order = order;
+       }

        *is_prepared = folio_test_uptodate(folio);
        return folio;
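
To illustrate the loop with the slot 2 numbers above (my arithmetic, for
illustration only):

  slot->base_gfn   = 4G   >> PAGE_SHIFT = 0x100000
  slot->gmem.pgoff = 2.5G >> PAGE_SHIFT = 0xa0000
  base_gfn ^ pgoff = 0x1a0000

  order 18 (1G): 0x1a0000 & 0x3ffff = 0x20000 -> misaligned, reduce order
  order 17:      0x1a0000 & 0x1ffff = 0       -> stop

so max_order is capped at 17: a 1G mapping is ruled out, but a 2M mapping is
still possible.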


> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
> >> be at parity with non-guest_memfd backing memory) but still fix this
> >> issue.
> >> 
> >> lpage_info will make sure that ranges near the bounds will be
> >> fragmented, but the hugepages in the middle will still be mappable as
> >> hugepages.
> >> 
> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months, 1 week ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Fri, Apr 25, 2025 at 03:45:20PM -0700, Ackerley Tng wrote:
>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> 
>> > On Thu, Apr 24, 2025 at 11:15:11AM -0700, Ackerley Tng wrote:
>> >> Vishal Annapurve <vannapurve@google.com> writes:
>> >> 
>> >> > On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
>> >> >>
>> >> >> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
>> >> >> >
>> >> >> >
>> >> >> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
>> >> >> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
>> >> >> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
>> >> >> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> >> >> > >>>
>> >> >> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
>> >> >> > >>>>> +/*
>> >> >> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
>> >> >> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
>> >> >> > >>>>> + */
>> >> >> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
>> >> >> > >>>>> +                                                           pgoff_t index)
>> >> >> > >>>>> +{
>> >> >> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
>> >> >> > >>>>> +       pgoff_t aligned_index;
>> >> >> > >>>>> +       struct folio *folio;
>> >> >> > >>>>> +       int nr_pages;
>> >> >> > >>>>> +       int ret;
>> >> >> > >>>>> +
>> >> >> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
>> >> >> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
>> >> >> > >>>>> +       if (IS_ERR(folio))
>> >> >> > >>>>> +               return folio;
>> >> >> > >>>>> +
>> >> >> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
>> >> >> > >>>>> +       aligned_index = round_down(index, nr_pages);
>> >> >> > >>>> Maybe a gap here.
>> >> >> > >>>>
>> >> >> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
>> >> >> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
>> >> >> > >>>> corresponding GFN is not 2M/1G aligned.
>> >> >> > >>>
>> >> >> > >>> Thanks for looking into this.
>> >> >> > >>>
>> >> >> > >>> In 1G page support for guest_memfd, the offset and size are always
>> >> >> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
>> >> >> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
>> >> >> > >>> slot->npages may not be hugepage aligned.
>> >> >> > >>>
>> >> >> > >>>>
>> >> >> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
>> >> >> > >>>>
>> >> >> > >>>
>> >> >> > >>> IIUC other factors also contribute to determining the mapping level in
>> >> >> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
>> >> >> > >>> in kvm_x86_ops.
>> >> >> > >>>
>> >> >> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
>> >> >> > >>> will track that and not allow faulting into guest page tables at higher
>> >> >> > >>> granularity.
>> >> >> > >>
>> >> >> > >> lpage_info only checks the alignments of slot->base_gfn and
>> >> >> > >> slot->base_gfn + npages. e.g.,
>> >> >> > >>
>> >> >> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
>> >> >> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
>> >> >> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
>> >> >> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
>> >> >> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
>> >> >> >
>> >> >> > Should it be?
>> >> >> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
>> >> >> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
>> >> >> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
>> >> >> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
>> >> >> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
>> >> >> Right. Good catch. Thanks!
>> >> >>
>> >> >> Let me update the example as below:
>> >> >> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
>> >> >>
>> >> >> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
>> >> >> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
>> >> >> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
>> >> >> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
>> >> >> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
>> >> >>
>> >> >> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
>> >> >> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
>> >> >> 2MB folios, whose physical addresses may not be contiguous.
>> >> >>
>> >> >> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
>> >> >> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
>> >> >> However, guest_memfd just allocates the same 2MB folio for both faults.
>> >> >>
>> >> >>
>> >> >> >
>> >> >> > >>
>> >> >> > >>   ---------------------------------------------------------
>> >> >> > >>   |          |  |          |  |          |  |          |  |
>> >> >> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
>> >> >> > >>
>> >> >> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
>> >> >> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
>> >> >> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
>> >> >> > > Sorry, sent too fast this morning. The example is not right. The correct
>> >> >> > > one is:
>> >> >> > >
>> >> >> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
>> >> >> > > KVM will create a 2M mapping for them.
>> >> >> > >
>> >> >> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
>> >> >> > > same 2M folio and physical addresses may not be contiguous.
>> >> >
>> >> > Then during binding, guest memfd offset misalignment with hugepage
>> >> > should be same as gfn misalignment. i.e.
>> >> >
>> >> > (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
>> >> > ~huge_page_mask(h));
>> >> >
>> >> > For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
>> >> > are not hugepage aligned, so guest_memfd should also be able to
>> >> > support non-hugepage aligned memslots.
>> >> >
>> >> 
>> >> I drew up a picture [1] which hopefully clarifies this.
>> >> 
>> >> Thanks for pointing this out, I understand better now and we will add an
>> >> extra constraint during memslot binding of guest_memfd to check that gfn
>> >> offsets within a hugepage must be guest_memfd offsets.
>> > I'm a bit confused.
>> >
>> > As "index = gfn - slot->base_gfn + slot->gmem.pgoff", do you mean you are going
>> > to force "slot->base_gfn == slot->gmem.pgoff" ?
>> >
>> > For some memory region, e.g., "pc.ram", it's divided into 2 parts:
>> > - one with offset 0, size 0x80000000(2G),
>> >   positioned at GPA 0, which is below GPA 4G;
>> > - one with offset 0x80000000(2G), size 0x80000000(2G),
>> >   positioned at GPA 0x100000000(4G), which is above GPA 4G.
>> >
>> > For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
>> > is 0x80000000.
>> >
>> 
>> Nope I don't mean to enforce that they are equal, we just need the
>> offsets within the page to be equal.
>> 
>> I edited Vishal's code snippet, perhaps it would help explain better:
>> 
>> page_size is the size of the hugepage, so in our example,
>> 
>>   page_size = SZ_2M;
>>   page_mask = ~(page_size - 1);
> page_mask = page_size - 1  ?
>

Yes, thank you!

>>   offset_within_page = slot->gmem.pgoff & page_mask;
>>   gfn_within_page = (slot->base_gfn << PAGE_SHIFT) & page_mask;
>> 
>> We will enforce that
>> 
>>   offset_within_page == gfn_within_page;
> For "pc.ram", if it has 2.5G below 4G, it would be configured as follows
> - slot 1: slot->gmem.pgoff=0, base GPA 0, size=2.5G
> - slot 2: slot->gmem.pgoff=2.5G, base GPA 4G, size=1.5G
>
> When binding these two slots to the same guest_memfd created with flag
> KVM_GUEST_MEMFD_HUGE_1GB: 
> - binding the 1st slot will succeed;
> - binding the 2nd slot will fail.
>
> What options does userspace have in this scenario?
> It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
> isn't ideal either.
>
> What about something similar as below?
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index d2feacd14786..87c33704a748 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
>         }
>
>         *pfn = folio_file_pfn(folio, index);
> -       if (max_order)
> -               *max_order = folio_order(folio);
> +       if (max_order) {
> +               int order;
> +
> +               order = folio_order(folio);
> +
> +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
> +                       order--;
> +
> +               *max_order = order;
> +       }
>
>         *is_prepared = folio_test_uptodate(folio);
>         return folio;
>

Vishal was wondering how this worked before guest_memfd was introduced,
for other backing memory like HugeTLB.

I then poked around and found this [1]. I will be adding a similar check
for any slot where kvm_slot_can_be_private(slot).

Yan, that should work, right?

[1] https://github.com/torvalds/linux/blob/b6ea1680d0ac0e45157a819c41b46565f4616186/arch/x86/kvm/x86.c#L12996

>> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
>> >> be at parity with non-guest_memfd backing memory) but still fix this
>> >> issue.
>> >> 
>> >> lpage_info will make sure that ranges near the bounds will be
>> >> fragmented, but the hugepages in the middle will still be mappable as
>> >> hugepages.
>> >> 
>> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 1 week ago
On Wed, Apr 30, 2025 at 01:09:33PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > On Fri, Apr 25, 2025 at 03:45:20PM -0700, Ackerley Tng wrote:
> >> Yan Zhao <yan.y.zhao@intel.com> writes:
> >> 
> >> > On Thu, Apr 24, 2025 at 11:15:11AM -0700, Ackerley Tng wrote:
> >> >> Vishal Annapurve <vannapurve@google.com> writes:
> >> >> 
> >> >> > On Thu, Apr 24, 2025 at 1:15 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >> >> >>
> >> >> >> On Thu, Apr 24, 2025 at 01:55:51PM +0800, Chenyi Qiang wrote:
> >> >> >> >
> >> >> >> >
> >> >> >> > On 4/24/2025 12:25 PM, Yan Zhao wrote:
> >> >> >> > > On Thu, Apr 24, 2025 at 09:09:22AM +0800, Yan Zhao wrote:
> >> >> >> > >> On Wed, Apr 23, 2025 at 03:02:02PM -0700, Ackerley Tng wrote:
> >> >> >> > >>> Yan Zhao <yan.y.zhao@intel.com> writes:
> >> >> >> > >>>
> >> >> >> > >>>> On Tue, Sep 10, 2024 at 11:44:10PM +0000, Ackerley Tng wrote:
> >> >> >> > >>>>> +/*
> >> >> >> > >>>>> + * Allocates and then caches a folio in the filemap. Returns a folio with
> >> >> >> > >>>>> + * refcount of 2: 1 after allocation, and 1 taken by the filemap.
> >> >> >> > >>>>> + */
> >> >> >> > >>>>> +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
> >> >> >> > >>>>> +                                                           pgoff_t index)
> >> >> >> > >>>>> +{
> >> >> >> > >>>>> +       struct kvm_gmem_hugetlb *hgmem;
> >> >> >> > >>>>> +       pgoff_t aligned_index;
> >> >> >> > >>>>> +       struct folio *folio;
> >> >> >> > >>>>> +       int nr_pages;
> >> >> >> > >>>>> +       int ret;
> >> >> >> > >>>>> +
> >> >> >> > >>>>> +       hgmem = kvm_gmem_hgmem(inode);
> >> >> >> > >>>>> +       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
> >> >> >> > >>>>> +       if (IS_ERR(folio))
> >> >> >> > >>>>> +               return folio;
> >> >> >> > >>>>> +
> >> >> >> > >>>>> +       nr_pages = 1UL << huge_page_order(hgmem->h);
> >> >> >> > >>>>> +       aligned_index = round_down(index, nr_pages);
> >> >> >> > >>>> Maybe a gap here.
> >> >> >> > >>>>
> >> >> >> > >>>> When a guest_memfd is bound to a slot where slot->base_gfn is not aligned to
> >> >> >> > >>>> 2M/1G and slot->gmem.pgoff is 0, even if an index is 2M/1G aligned, the
> >> >> >> > >>>> corresponding GFN is not 2M/1G aligned.
> >> >> >> > >>>
> >> >> >> > >>> Thanks for looking into this.
> >> >> >> > >>>
> >> >> >> > >>> In 1G page support for guest_memfd, the offset and size are always
> >> >> >> > >>> hugepage aligned to the hugepage size requested at guest_memfd creation
> >> >> >> > >>> time, and it is true that when binding to a memslot, slot->base_gfn and
> >> >> >> > >>> slot->npages may not be hugepage aligned.
> >> >> >> > >>>
> >> >> >> > >>>>
> >> >> >> > >>>> However, TDX requires that private huge pages be 2M aligned in GFN.
> >> >> >> > >>>>
> >> >> >> > >>>
> >> >> >> > >>> IIUC other factors also contribute to determining the mapping level in
> >> >> >> > >>> the guest page tables, like lpage_info and .private_max_mapping_level()
> >> >> >> > >>> in kvm_x86_ops.
> >> >> >> > >>>
> >> >> >> > >>> If slot->base_gfn and slot->npages are not hugepage aligned, lpage_info
> >> >> >> > >>> will track that and not allow faulting into guest page tables at higher
> >> >> >> > >>> granularity.
> >> >> >> > >>
> >> >> >> > >> lpage_info only checks the alignments of slot->base_gfn and
> >> >> >> > >> slot->base_gfn + npages. e.g.,
> >> >> >> > >>
> >> >> >> > >> if slot->base_gfn is 8K, npages is 8M, then for this slot,
> >> >> >> > >> lpage_info[2M][0].disallow_lpage = 1, which is for GFN [4K, 2M+8K);
> >> >> >> > >> lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M+8K, 4M+8K);
> >> >> >> > >> lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M+8K, 6M+8K);
> >> >> >> > >> lpage_info[2M][3].disallow_lpage = 1, which is for GFN [6M+8K, 8M+8K);
> >> >> >> >
> >> >> >> > Should it be?
> >> >> >> > lpage_info[2M][0].disallow_lpage = 1, which is for GFN [8K, 2M);
> >> >> >> > lpage_info[2M][1].disallow_lpage = 0, which is for GFN [2M, 4M);
> >> >> >> > lpage_info[2M][2].disallow_lpage = 0, which is for GFN [4M, 6M);
> >> >> >> > lpage_info[2M][3].disallow_lpage = 0, which is for GFN [6M, 8M);
> >> >> >> > lpage_info[2M][4].disallow_lpage = 1, which is for GFN [8M, 8M+8K);
> >> >> >> Right. Good catch. Thanks!
> >> >> >>
> >> >> >> Let me update the example as below:
> >> >> >> slot->base_gfn is 2 (for GPA 8KB), npages 2000 (for a 8MB range)
> >> >> >>
> >> >> >> lpage_info[2M][0].disallow_lpage = 1, which is for GPA [8KB, 2MB);
> >> >> >> lpage_info[2M][1].disallow_lpage = 0, which is for GPA [2MB, 4MB);
> >> >> >> lpage_info[2M][2].disallow_lpage = 0, which is for GPA [4MB, 6MB);
> >> >> >> lpage_info[2M][3].disallow_lpage = 0, which is for GPA [6MB, 8MB);
> >> >> >> lpage_info[2M][4].disallow_lpage = 1, which is for GPA [8MB, 8MB+8KB);
> >> >> >>
> >> >> >> lpage_info indicates that a 2MB mapping is allowed to cover GPA 4MB and GPA
> >> >> >> 4MB+16KB. However, their aligned_index values lead guest_memfd to allocate two
> >> >> >> 2MB folios, whose physical addresses may not be contiguous.
> >> >> >>
> >> >> >> Additionally, if the guest accesses two GPAs, e.g., GPA 2MB+8KB and GPA 4MB,
> >> >> >> KVM could create two 2MB mappings to cover GPA ranges [2MB, 4MB), [4MB, 6MB).
> >> >> >> However, guest_memfd just allocates the same 2MB folio for both faults.
> >> >> >>
> >> >> >>
> >> >> >> >
> >> >> >> > >>
> >> >> >> > >>   ---------------------------------------------------------
> >> >> >> > >>   |          |  |          |  |          |  |          |  |
> >> >> >> > >>   8K        2M 2M+8K      4M  4M+8K     6M  6M+8K     8M  8M+8K
> >> >> >> > >>
> >> >> >> > >> For GFN 6M and GFN 6M+4K, as they both belong to lpage_info[2M][2], huge
> >> >> >> > >> page is allowed. Also, they have the same aligned_index 2 in guest_memfd.
> >> >> >> > >> So, guest_memfd allocates the same huge folio of 2M order for them.
> >> >> >> > > Sorry, sent too fast this morning. The example is not right. The correct
> >> >> >> > > one is:
> >> >> >> > >
> >> >> >> > > For GFN 4M and GFN 4M+16K, lpage_info indicates that 2M is allowed. So,
> >> >> >> > > KVM will create a 2M mapping for them.
> >> >> >> > >
> >> >> >> > > However, in guest_memfd, GFN 4M and GFN 4M+16K do not correspond to the
> >> >> >> > > same 2M folio and physical addresses may not be contiguous.
> >> >> >
> >> >> > Then during binding, guest memfd offset misalignment with hugepage
> >> >> > should be same as gfn misalignment. i.e.
> >> >> >
> >> >> > (offset & ~huge_page_mask(h)) == ((slot->base_gfn << PAGE_SHIFT) &
> >> >> > ~huge_page_mask(h));
> >> >> >
> >> >> > For non guest_memfd backed scenarios, KVM allows slot gfn ranges that
> >> >> > are not hugepage aligned, so guest_memfd should also be able to
> >> >> > support non-hugepage aligned memslots.
> >> >> >
> >> >> 
> >> >> I drew up a picture [1] which hopefully clarifies this.
> >> >> 
> >> >> Thanks for pointing this out, I understand better now and we will add an
> >> >> extra constraint during memslot binding of guest_memfd to check that gfn
> >> >> offsets within a hugepage must be guest_memfd offsets.
> >> > I'm a bit confused.
> >> >
> >> > As "index = gfn - slot->base_gfn + slot->gmem.pgoff", do you mean you are going
> >> > to force "slot->base_gfn == slot->gmem.pgoff" ?
> >> >
> >> > For some memory region, e.g., "pc.ram", it's divided into 2 parts:
> >> > - one with offset 0, size 0x80000000(2G),
> >> >   positioned at GPA 0, which is below GPA 4G;
> >> > - one with offset 0x80000000(2G), size 0x80000000(2G),
> >> >   positioned at GPA 0x100000000(4G), which is above GPA 4G.
> >> >
> >> > For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
> >> > is 0x80000000.
> >> >
> >> 
> >> Nope I don't mean to enforce that they are equal, we just need the
> >> offsets within the page to be equal.
> >> 
> >> I edited Vishal's code snippet, perhaps it would help explain better:
> >> 
> >> page_size is the size of the hugepage, so in our example,
> >> 
> >>   page_size = SZ_2M;
> >>   page_mask = ~(page_size - 1);
> > page_mask = page_size - 1  ?
> >
> 
> Yes, thank you!
> 
> >>   offset_within_page = slot->gmem.pgoff & page_mask;
> >>   gfn_within_page = (slot->base_gfn << PAGE_SHIFT) & page_mask;
> >> 
> >> We will enforce that
> >> 
> >>   offset_within_page == gfn_within_page;
> > For "pc.ram", if it has 2.5G below 4G, it would be configured as follows
> > - slot 1: slot->gmem.pgoff=0, base GPA 0, size=2.5G
> > - slot 2: slot->gmem.pgoff=2.5G, base GPA 4G, size=1.5G
> >
> > When binding these two slots to the same guest_memfd created with flag
> > KVM_GUEST_MEMFD_HUGE_1GB: 
> > - binding the 1st slot will succeed;
> > - binding the 2nd slot will fail.
> >
> > What options does userspace have in this scenario?
> > It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
> > isn't ideal either.
> >
> > What about something similar as below?
> >
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index d2feacd14786..87c33704a748 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> >         }
> >
> >         *pfn = folio_file_pfn(folio, index);
> > -       if (max_order)
> > -               *max_order = folio_order(folio);
> > +       if (max_order) {
> > +               int order;
> > +
> > +               order = folio_order(folio);
> > +
> > +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
> > +                       order--;
> > +
> > +               *max_order = order;
> > +       }
> >
> >         *is_prepared = folio_test_uptodate(folio);
> >         return folio;
> >
> 
> Vishal was wondering how this is working before guest_memfd was
> introduced, for other backing memory like HugeTLB.
> 
> I then poked around and found this [1]. I will be adding a similar check
> for any slot where kvm_slot_can_be_private(slot).
>
> Yan, that should work, right?
No, I don't think the ugfn check [1] will work.

1. Even for slots bound to an in-place-conversion guest_memfd (i.e. shared memory
is allocated from guest_memfd), slot->userspace_addr does not necessarily
have the same offset as slot->gmem.pgoff. Even if we audit the offset in
kvm_gmem_bind(), userspace could invoke munmap() and mmap() afterwards, causing
slot->userspace_addr to point to a different offset.

2. For slots bound to a guest_memfd that does not support in-place conversion,
shared memory is allocated from a different backend. Therefore, checking
"slot->base_gfn ^ slot->gmem.pgoff" is required for private memory. The check is
currently absent because guest_memfd supports only 4K pages.

 
> [1] https://github.com/torvalds/linux/blob/b6ea1680d0ac0e45157a819c41b46565f4616186/arch/x86/kvm/x86.c#L12996
> 
> >> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
> >> >> be at parity with non-guest_memfd backing memory) but still fix this
> >> >> issue.
> >> >> 
> >> >> lpage_info will make sure that ranges near the bounds will be
> >> >> fragmented, but the hugepages in the middle will still be mappable as
> >> >> hugepages.
> >> >> 
> >> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months, 1 week ago
Yan Zhao <yan.y.zhao@intel.com> writes:

>> > <snip>
>> >
>> > What options does userspace have in this scenario?
>> > It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
>> > isn't ideal either.
>> >
>> > What about something similar as below?
>> >
>> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> > index d2feacd14786..87c33704a748 100644
>> > --- a/virt/kvm/guest_memfd.c
>> > +++ b/virt/kvm/guest_memfd.c
>> > @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
>> >         }
>> >
>> >         *pfn = folio_file_pfn(folio, index);
>> > -       if (max_order)
>> > -               *max_order = folio_order(folio);
>> > +       if (max_order) {
>> > +               int order;
>> > +
>> > +               order = folio_order(folio);
>> > +
>> > +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
>> > +                       order--;
>> > +
>> > +               *max_order = order;
>> > +       }
>> >
>> >         *is_prepared = folio_test_uptodate(folio);
>> >         return folio;
>> >
>> 
>> Vishal was wondering how this is working before guest_memfd was
>> introduced, for other backing memory like HugeTLB.
>> 
>> I then poked around and found this [1]. I will be adding a similar check
>> for any slot where kvm_slot_can_be_private(slot).
>>
>> Yan, that should work, right?
> No, I don't think the checking of ugfn [1] should work.
>
> 1. Even for slots bound to in-place-conversion guest_memfd (i.e. shared memory
> are allocated from guest_memfd), the slot->userspace_addr does not necessarily
> have the same offset as slot->gmem.pgoff. Even if we audit the offset in
> kvm_gmem_bind(), userspace could invoke munmap() and mmap() afterwards, causing
> slot->userspace_addr to point to a different offset.
>
> 2. for slots bound to guest_memfd that do not support in-place-conversion,
> shared memory is allocated from a different backend. Therefore, checking
> "slot->base_gfn ^ slot->gmem.pgoff" is required for private memory. The check is
> currently absent because guest_memfd supports 4K only.
>
>

Let me clarify: I meant these changes:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b64ab3..d0dccf1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12938,6 +12938,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
        return 0;
 }
 
+static inline bool kvm_is_level_aligned(u64 value, int level)
+{
+       return IS_ALIGNED(value, KVM_PAGES_PER_HPAGE(level));
+}
+
 static int kvm_alloc_memslot_metadata(struct kvm *kvm,
                                      struct kvm_memory_slot *slot)
 {
@@ -12971,16 +12976,20 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 
                slot->arch.lpage_info[i - 1] = linfo;
 
-               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+               if (!kvm_is_level_aligned(slot->base_gfn, level))
                        linfo[0].disallow_lpage = 1;
-               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+               if (!kvm_is_level_aligned(slot->base_gfn + npages, level))
                        linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
-                * If the gfn and userspace address are not aligned wrt each
-                * other, disable large page support for this slot.
+                * If the gfn and userspace address are not aligned or if gfn
+                * and guest_memfd offset are not aligned wrt each other,
+                * disable large page support for this slot.
                 */
-               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
+               if (!kvm_is_level_aligned(slot->base_gfn ^ ugfn, level) ||
+                   (kvm_slot_can_be_private(slot) &&
+                    !kvm_is_level_aligned(slot->base_gfn ^ slot->gmem.pgoff,
+                                          level))) {
                        unsigned long j;
 
                        for (j = 0; j < lpages; ++j)

This does not rely on the ugfn check, but adds a similar check for gmem.pgoff.
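
To make the effect of the check concrete, here is a minimal standalone sketch
that mirrors the gfn/offset alignment test; the constants and helper below are
illustrative only, not actual KVM code:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
/* Pages per hugepage for 2M and 1G mappings on x86. */
#define PAGES_PER_2M	(1ULL << (21 - PAGE_SHIFT))
#define PAGES_PER_1G	(1ULL << (30 - PAGE_SHIFT))

/*
 * Mirrors the proposed check: base_gfn and gmem.pgoff must share the
 * same offset within a hugepage-sized block of pages.
 */
static bool lpage_allowed(unsigned long long base_gfn,
			  unsigned long long pgoff,
			  unsigned long long pages_per_hpage)
{
	return ((base_gfn ^ pgoff) & (pages_per_hpage - 1)) == 0;
}

int main(void)
{
	/* "pc.ram"-style high slot: GPA 4G backed at file offset 2G. */
	unsigned long long base_gfn = 0x100000000ULL >> PAGE_SHIFT;
	unsigned long long pgoff = 0x80000000ULL >> PAGE_SHIFT;

	printf("offset 2G:   2M ok: %d, 1G ok: %d\n",
	       lpage_allowed(base_gfn, pgoff, PAGES_PER_2M),
	       lpage_allowed(base_gfn, pgoff, PAGES_PER_1G));

	/* GPA 4G backed at file offset 2.5G: 2M is still ok, 1G is not. */
	pgoff = 0xA0000000ULL >> PAGE_SHIFT;
	printf("offset 2.5G: 2M ok: %d, 1G ok: %d\n",
	       lpage_allowed(base_gfn, pgoff, PAGES_PER_2M),
	       lpage_allowed(base_gfn, pgoff, PAGES_PER_1G));

	return 0;
}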

I think this should take care of case (1.), for guest_memfds going to be
used for both shared and private memory. Userspace can't update
slot->userspace_addr, since guest_memfd memslots cannot be updated and
can only be deleted.

If userspace re-uses slot->userspace_addr for some other memory address
without deleting and re-adding a memslot,

+ KVM's access to memory should still be fine, since after the recent
  discussion at guest_memfd upstream call, KVM's guest faults will
  always go via fd+offset and KVM's access won't be disrupted
  there. Whatever checking done at memslot binding time will still be
  valid.
+ Host's access and other accesses (e.g. instruction emulation, which
  uses slot->userspace_addr) to guest memory will be broken, but I think
  there's nothing protecting against that. The same breakage would
  happen for non-guest_memfd memslot.

p.s. I will be adding the validation as you suggested [1], though that
shouldn't make a difference here, since the above check directly
validates against gmem.pgoff.

Regarding 2., the check above compares against gmem.pgoff, so it should handle
that case as well.

[1] https://lore.kernel.org/all/aBnMp26iWWhUrsVf@yzhao56-desk.sh.intel.com/

I prefer checking at binding time because it aligns with the ugfn check
that is already there, and avoids having to check at every fault.

>> [1] https://github.com/torvalds/linux/blob/b6ea1680d0ac0e45157a819c41b46565f4616186/arch/x86/kvm/x86.c#L12996
>> 
>> >> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
>> >> >> be at parity with non-guest_memfd backing memory) but still fix this
>> >> >> issue.
>> >> >> 
>> >> >> lpage_info will make sure that ranges near the bounds will be
>> >> >> fragmented, but the hugepages in the middle will still be mappable as
>> >> >> hugepages.
>> >> >> 
>> >> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Yan Zhao 9 months, 1 week ago
On Tue, May 06, 2025 at 12:22:47PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> >> > <snip>
> >> >
> >> > What options does userspace have in this scenario?
> >> > It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
> >> > isn't ideal either.
> >> >
> >> > What about something similar as below?
> >> >
> >> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> >> > index d2feacd14786..87c33704a748 100644
> >> > --- a/virt/kvm/guest_memfd.c
> >> > +++ b/virt/kvm/guest_memfd.c
> >> > @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> >> >         }
> >> >
> >> >         *pfn = folio_file_pfn(folio, index);
> >> > -       if (max_order)
> >> > -               *max_order = folio_order(folio);
> >> > +       if (max_order) {
> >> > +               int order;
> >> > +
> >> > +               order = folio_order(folio);
> >> > +
> >> > +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
> >> > +                       order--;
> >> > +
> >> > +               *max_order = order;
> >> > +       }
> >> >
> >> >         *is_prepared = folio_test_uptodate(folio);
> >> >         return folio;
> >> >
> >> 
> >> Vishal was wondering how this is working before guest_memfd was
> >> introduced, for other backing memory like HugeTLB.
> >> 
> >> I then poked around and found this [1]. I will be adding a similar check
> >> for any slot where kvm_slot_can_be_private(slot).
> >>
> >> Yan, that should work, right?
> > No, I don't think the checking of ugfn [1] should work.
> >
> > 1. Even for slots bound to in-place-conversion guest_memfd (i.e. shared memory
> > are allocated from guest_memfd), the slot->userspace_addr does not necessarily
> > have the same offset as slot->gmem.pgoff. Even if we audit the offset in
> > kvm_gmem_bind(), userspace could invoke munmap() and mmap() afterwards, causing
> > slot->userspace_addr to point to a different offset.
> >
> > 2. for slots bound to guest_memfd that do not support in-place-conversion,
> > shared memory is allocated from a different backend. Therefore, checking
> > "slot->base_gfn ^ slot->gmem.pgoff" is required for private memory. The check is
> > currently absent because guest_memfd supports 4K only.
> >
> >
> 
> Let me clarify, I meant these changes:
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4b64ab3..d0dccf1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12938,6 +12938,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
>         return 0;
>  }
>  
> +static inline bool kvm_is_level_aligned(u64 value, int level)
> +{
> +       return IS_ALIGNED(value, KVM_PAGES_PER_HPAGE(level));
> +}
> +
>  static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>                                       struct kvm_memory_slot *slot)
>  {
> @@ -12971,16 +12976,20 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>  
>                 slot->arch.lpage_info[i - 1] = linfo;
>  
> -               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
> +               if (!kvm_is_level_aligned(slot->base_gfn, level))
>                         linfo[0].disallow_lpage = 1;
> -               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> +               if (!kvm_is_level_aligned(slot->base_gfn + npages, level))
>                         linfo[lpages - 1].disallow_lpage = 1;
>                 ugfn = slot->userspace_addr >> PAGE_SHIFT;
>                 /*
> -                * If the gfn and userspace address are not aligned wrt each
> -                * other, disable large page support for this slot.
> +                * If the gfn and userspace address are not aligned or if gfn
> +                * and guest_memfd offset are not aligned wrt each other,
> +                * disable large page support for this slot.
>                  */
> -               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
> +               if (!kvm_is_level_aligned(slot->base_gfn ^ ugfn, level) ||
> +                   (kvm_slot_can_be_private(slot) &&
> +                    !kvm_is_level_aligned(slot->base_gfn ^ slot->gmem.pgoff,
> +                                          level))) {
>                         unsigned long j;
>  
>                         for (j = 0; j < lpages; ++j)
> 
> This does not rely on the ugfn check, but adds a similar check for gmem.pgoff.
In the case where shared memory is not allocated from guest_memfd (e.g. with the
current upstream code), the check of gmem.pgoff here will disallow huge pages
for shared memory even if "slot->base_gfn ^ ugfn" is aligned.

> I think this should take care of case (1.), for guest_memfds going to be
> used for both shared and private memory. Userspace can't update
> slot->userspace_addr, since guest_memfd memslots cannot be updated and
> can only be deleted.
> 
> If userspace re-uses slot->userspace_addr for some other memory address
> without deleting and re-adding a memslot,
> 
> + KVM's access to memory should still be fine, since after the recent
>   discussion at guest_memfd upstream call, KVM's guest faults will
>   always go via fd+offset and KVM's access won't be disrupted
>   there. Whatever checking done at memslot binding time will still be
>   valid.
Could the offset of shared memory and offset of private memory be different if
userspace re-uses slot->userspace_addr without deleting and re-adding a memslot?

Then even though the two offsets are validated as equal in kvm_gmem_bind(), they
may differ later on.

> + Host's access and other accesses (e.g. instruction emulation, which
>   uses slot->userspace_addr) to guest memory will be broken, but I think
>   there's nothing protecting against that. The same breakage would
>   happen for non-guest_memfd memslot.
Why is host access broken in the non-guest_memfd case?
The HVA is still a valid one in QEMU's mmap-ed address space.

> p.s. I will be adding the validation as you suggested [1], though that
> shouldn't make a difference here, since the above check directly
> validates against gmem.pgoff.
> 
> Regarding 2., the check above compares against gmem.pgoff, so it should handle
> that case as well.
> 
> [1] https://lore.kernel.org/all/aBnMp26iWWhUrsVf@yzhao56-desk.sh.intel.com/
> 
> I prefer checking at binding time because it aligns with the ugfn check
> that is already there, and avoids having to check at every fault.
> 
> >> [1] https://github.com/torvalds/linux/blob/b6ea1680d0ac0e45157a819c41b46565f4616186/arch/x86/kvm/x86.c#L12996
> >> 
> >> >> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
> >> >> >> be at parity with non-guest_memfd backing memory) but still fix this
> >> >> >> issue.
> >> >> >> 
> >> >> >> lpage_info will make sure that ranges near the bounds will be
> >> >> >> fragmented, but the hugepages in the middle will still be mappable as
> >> >> >> hugepages.
> >> >> >> 
> >> >> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Ackerley Tng 9 months ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Tue, May 06, 2025 at 12:22:47PM -0700, Ackerley Tng wrote:
>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> 
>> >> > <snip>
>> >> >
>> >> > What options does userspace have in this scenario?
>> >> > It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
>> >> > isn't ideal either.
>> >> >
>> >> > What about something similar as below?
>> >> >
>> >> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> >> > index d2feacd14786..87c33704a748 100644
>> >> > --- a/virt/kvm/guest_memfd.c
>> >> > +++ b/virt/kvm/guest_memfd.c
>> >> > @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
>> >> >         }
>> >> >
>> >> >         *pfn = folio_file_pfn(folio, index);
>> >> > -       if (max_order)
>> >> > -               *max_order = folio_order(folio);
>> >> > +       if (max_order) {
>> >> > +               int order;
>> >> > +
>> >> > +               order = folio_order(folio);
>> >> > +
>> >> > +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))
>> >> > +                       order--;
>> >> > +
>> >> > +               *max_order = order;
>> >> > +       }
>> >> >
>> >> >         *is_prepared = folio_test_uptodate(folio);
>> >> >         return folio;
>> >> >
>> >> 
>> >> Vishal was wondering how this is working before guest_memfd was
>> >> introduced, for other backing memory like HugeTLB.
>> >> 
>> >> I then poked around and found this [1]. I will be adding a similar check
>> >> for any slot where kvm_slot_can_be_private(slot).
>> >>
>> >> Yan, that should work, right?
>> > No, I don't think the checking of ugfn [1] should work.
>> >
>> > 1. Even for slots bound to in-place-conversion guest_memfd (i.e. shared memory
>> > are allocated from guest_memfd), the slot->userspace_addr does not necessarily
>> > have the same offset as slot->gmem.pgoff. Even if we audit the offset in
>> > kvm_gmem_bind(), userspace could invoke munmap() and mmap() afterwards, causing
>> > slot->userspace_addr to point to a different offset.
>> >
>> > 2. for slots bound to guest_memfd that do not support in-place-conversion,
>> > shared memory is allocated from a different backend. Therefore, checking
>> > "slot->base_gfn ^ slot->gmem.pgoff" is required for private memory. The check is
>> > currently absent because guest_memfd supports 4K only.
>> >
>> >
>> 
>> Let me clarify, I meant these changes:
>> 
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 4b64ab3..d0dccf1 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -12938,6 +12938,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
>>         return 0;
>>  }
>>  
>> +static inline bool kvm_is_level_aligned(u64 value, int level)
>> +{
>> +       return IS_ALIGNED(value, KVM_PAGES_PER_HPAGE(level));
>> +}
>> +
>>  static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>>                                       struct kvm_memory_slot *slot)
>>  {
>> @@ -12971,16 +12976,20 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>>  
>>                 slot->arch.lpage_info[i - 1] = linfo;
>>  
>> -               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
>> +               if (!kvm_is_level_aligned(slot->base_gfn, level))
>>                         linfo[0].disallow_lpage = 1;
>> -               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
>> +               if (!kvm_is_level_aligned(slot->base_gfn + npages, level))
>>                         linfo[lpages - 1].disallow_lpage = 1;
>>                 ugfn = slot->userspace_addr >> PAGE_SHIFT;
>>                 /*
>> -                * If the gfn and userspace address are not aligned wrt each
>> -                * other, disable large page support for this slot.
>> +                * If the gfn and userspace address are not aligned or if gfn
>> +                * and guest_memfd offset are not aligned wrt each other,
>> +                * disable large page support for this slot.
>>                  */
>> -               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
>> +               if (!kvm_is_level_aligned(slot->base_gfn ^ ugfn, level) ||
>> +                   (kvm_slot_can_be_private(slot) &&
>> +                    !kvm_is_level_aligned(slot->base_gfn ^ slot->gmem.pgoff,
>> +                                          level))) {
>>                         unsigned long j;
>>  
>>                         for (j = 0; j < lpages; ++j)
>> 
>> This does not rely on the ugfn check, but adds a similar check for gmem.pgoff.
> In the case where shared memory is not allocated from guest_memfd (e.g. with the
> current upstream code), the check of gmem.pgoff here will disallow huge pages
> for shared memory even if "slot->base_gfn ^ ugfn" is aligned.
>

Thanks, I get it now. What you mean is that the memslot could have been
set up such that

+ slot->userspace_addr is aligned with slot->base_gfn, to be used for
  shared memory, and 
+ slot->gmem.pgoff is not aligned with slot->base_gfn, to be used for
  private memory

and this check would disallow huge page mappings even though this
memslot was going to only be used for shared memory.

The only way to fix this would indeed be a runtime check, since the
shared/private status can change at runtime.
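
For reference, here is the per-fault clamp you proposed, restated as a
standalone helper; just a sketch, not the actual patch:

/*
 * Reduce the mapping order until base_gfn and gmem.pgoff agree on
 * alignment at that order, so a fault never maps beyond what the
 * binding allows.
 */
static int clamp_order_to_binding(int order, unsigned long base_gfn,
				  unsigned long pgoff)
{
	while (order > 0 && ((base_gfn ^ pgoff) & ((1UL << order) - 1)))
		order--;

	return order;
}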

I think it is okay that this check is stricter than necessary, since it
just results in mapping without huge pages.

What do you think?

>> I think this should take care of case (1.), for guest_memfds going to be
>> used for both shared and private memory. Userspace can't update
>> slot->userspace_addr, since guest_memfd memslots cannot be updated and
>> can only be deleted.
>> 
>> If userspace re-uses slot->userspace_addr for some other memory address
>> without deleting and re-adding a memslot,
>> 
>> + KVM's access to memory should still be fine, since after the recent
>>   discussion at guest_memfd upstream call, KVM's guest faults will
>>   always go via fd+offset and KVM's access won't be disrupted
>>   there. Whatever checking done at memslot binding time will still be
>>   valid.
> Could the offset of shared memory and offset of private memory be different if
> userspace re-uses slot->userspace_addr without deleting and re-adding a memslot?
>

They could be different, yes. I think what you mean is if userspace does
something like

addr = mmap(guest_memfd);
ioctl(KVM_SET_USER_MEMORY_REGION, addr, guest_memfd);
munmap(addr);
addr = mmap(addr, other_fd);
(with no second call to KVM_SET_USER_MEMORY_REGION)
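
Spelled out with the uAPI, the sequence would look roughly like the sketch
below. It assumes the guest_memfd mmap support proposed in this series; the
memslot fields follow the upstream KVM_SET_USER_MEMORY_REGION2 API, and error
handling is omitted:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void rebind_userspace_addr(int vm_fd, int gmem_fd, int other_fd,
				  size_t size)
{
	struct kvm_userspace_memory_region2 region = {
		.slot = 0,
		.flags = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr = 0,
		.memory_size = size,
		.guest_memfd = gmem_fd,
		.guest_memfd_offset = 0,
	};
	void *addr;

	/* Shared mapping of guest_memfd, used as slot->userspace_addr. */
	addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    gmem_fd, 0);
	region.userspace_addr = (unsigned long)addr;
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);

	/* Later, userspace swaps the backing at the same address without
	 * deleting and re-adding the memslot. */
	munmap(addr, size);
	mmap(addr, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
	     other_fd, 0);
}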

Without guest_memfd, when munmap() happens, KVM should get a
notification via mmu_notifiers. That will unmap the pages from guest
page tables. At the next fault, host page tables will be consulted to
determine max_mapping_level, and at that time the mapping level would be
the new mapping level in host page tables.

> Then even though the two offsets are validated as equal in kvm_gmem_bind(), they
> may differ later on.
>

This is true.

Continuing from above, with guest_memfd there are no issues if guest_memfd
is only used for private memory, since shared memory uses the same
mechanism as before guest_memfd.

If guest_memfd is used for both private and shared memory, on unmapping,
KVM will also get notified via mmu_notifiers. On the next fault, the
mapping level is determined as follows (I have a patch coming up that
will illustrate this better)

1. guest_memfd will return 4K since this is a shared folio and shared
   folios are always split to 4K. But suppose in future guest_memfd
   supports shared folios at higher levels, say 1G, we continue...
2. lpage info (not updated since userspace swapped out addr) will say
   map at 1G
3. Since this is a shared fault, we check host page tables, which would
   say 4K since there was a munmap() and mmap().

I think it should still work as expected.
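
Roughly, the resulting level is the minimum of those three inputs. An
illustrative helper (names and structure are mine, not the actual KVM code):

#include <stdbool.h>

/*
 * The effective mapping level for a fault is capped by what guest_memfd
 * can provide, by lpage_info, and, for shared faults, by the host page
 * tables.
 */
static int effective_mapping_level(int gmem_level, int lpage_info_level,
				   int host_pt_level, bool shared_fault)
{
	int level = gmem_level < lpage_info_level ? gmem_level
						  : lpage_info_level;

	if (shared_fault && host_pt_level < level)
		level = host_pt_level;

	return level;
}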

>> + Host's access and other accesses (e.g. instruction emulation, which
>>   uses slot->userspace_addr) to guest memory will be broken, but I think
>>   there's nothing protecting against that. The same breakage would
>>   happen for non-guest_memfd memslot.
> Why is host access broken in the non-guest_memfd case?
> The HVA is still a valid one in QEMU's mmap-ed address space.
>

I was thinking that if a guest was executing code and the code gets
swapped out from under its feet by replacing the memory pointed to by
addr, the guest would be broken.

Now that I think about it again, it could be a valid use case. You're
right, thanks for pointing this out!

>> p.s. I will be adding the validation as you suggested [1], though that
>> shouldn't make a difference here, since the above check directly
>> validates against gmem.pgoff.
>> 
>> Regarding 2., the check above compares against gmem.pgoff, so it should handle
>> that case as well.
>> 
>> [1] https://lore.kernel.org/all/aBnMp26iWWhUrsVf@yzhao56-desk.sh.intel.com/
>> 
>> I prefer checking at binding time because it aligns with the ugfn check
>> that is already there, and avoids having to check at every fault.
>> 
>> >> [1] https://github.com/torvalds/linux/blob/b6ea1680d0ac0e45157a819c41b46565f4616186/arch/x86/kvm/x86.c#L12996
>> >> 
>> >> >> >> Adding checks at binding time will allow hugepage-unaligned offsets (to
>> >> >> >> be at parity with non-guest_memfd backing memory) but still fix this
>> >> >> >> issue.
>> >> >> >> 
>> >> >> >> lpage_info will make sure that ranges near the bounds will be
>> >> >> >> fragmented, but the hugepages in the middle will still be mappable as
>> >> >> >> hugepages.
>> >> >> >> 
>> >> >> >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg
Re: [RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
Posted by Vishal Annapurve 9 months, 2 weeks ago
On Sun, Apr 27, 2025 at 6:08 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Fri, Apr 25, 2025 at 03:45:20PM -0700, Ackerley Tng wrote:
> > Yan Zhao <yan.y.zhao@intel.com> writes:
> > ...
> > >
> > > For some memory region, e.g., "pc.ram", it's divided into 2 parts:
> > > - one with offset 0, size 0x80000000(2G),
> > >   positioned at GPA 0, which is below GPA 4G;
> > > - one with offset 0x80000000(2G), size 0x80000000(2G),
> > >   positioned at GPA 0x100000000(4G), which is above GPA 4G.
> > >
> > > For the second part, its slot->base_gfn is 0x100000000, while slot->gmem.pgoff
> > > is 0x80000000.
> > >
> >
> > Nope I don't mean to enforce that they are equal, we just need the
> > offsets within the page to be equal.
> >
> > I edited Vishal's code snippet, perhaps it would help explain better:
> >
> > page_size is the size of the hugepage, so in our example,
> >
> >   page_size = SZ_2M;
> >   page_mask = ~(page_size - 1);
> page_mask = page_size - 1  ?
>
> >   offset_within_page = slot->gmem.pgoff & page_mask;
> >   gfn_within_page = (slot->base_gfn << PAGE_SHIFT) & page_mask;
> >
> > We will enforce that
> >
> >   offset_within_page == gfn_within_page;
> For "pc.ram", if it has 2.5G below 4G, it would be configured as follows
> - slot 1: slot->gmem.pgoff=0, base GPA 0, size=2.5G
> - slot 2: slot->gmem.pgoff=2.5G, base GPA 4G, size=1.5G
>
> When binding these two slots to the same guest_memfd created with flag
> KVM_GUEST_MEMFD_HUGE_1GB:
> - binding the 1st slot will succeed;
> - binding the 2nd slot will fail.
>
> What options does userspace have in this scenario?

Userspace can create new gmem files that have aligned offsets. But I
see your point: enforcing alignment at binding time will lead to
wasted memory, i.e. your example above could be reworked to have:
- slot 1: slot->gmem.pgoff=0, base GPA 0, size=2.5G, gmem_fd = x, gmem_size = 3G
- slot 2: slot->gmem.pgoff=0, base GPA 4G, size=1.5G, gmem_fd = y, gmem_size = 2G

This will waste 1G of memory as gmem files will have to be hugepage aligned.
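
As a sketch of that rework (KVM_GUEST_MEMFD_HUGE_1GB being the flag proposed by
this series, and file sizes rounded up to 1G):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* One guest_memfd per slot, each with a 1G-aligned size. */
static int create_1g_gmem(int vm_fd, unsigned long long size)
{
	struct kvm_create_guest_memfd gmem = {
		.size = size,
		.flags = KVM_GUEST_MEMFD_HUGE_1GB,
	};

	return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
}

/*
 * slot 1: gmem_fd = x (3G file), pgoff 0, GPA 0,  size 2.5G -> 0.5G unused
 * slot 2: gmem_fd = y (2G file), pgoff 0, GPA 4G, size 1.5G -> 0.5G unused
 */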

> It can't reduce the flag to KVM_GUEST_MEMFD_HUGE_2MB. Adjusting the gmem.pgoff
> isn't ideal either.
>
> What about something similar as below?
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index d2feacd14786..87c33704a748 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -1842,8 +1842,16 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
>         }
>
>         *pfn = folio_file_pfn(folio, index);
> -       if (max_order)
> -               *max_order = folio_order(folio);
> +       if (max_order) {
> +               int order;
> +
> +               order = folio_order(folio);
> +
> +               while (order > 0 && ((slot->base_gfn ^ slot->gmem.pgoff) & ((1 << order) - 1)))

This sounds better. Userspace will need to avoid this in general, or
keep such ranges short, so that most of the guest memory ranges can be
mapped at hugepage granularity. So maybe a pr_warn could be emitted
during binding to flag that the alignment is not optimal.
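
A minimal sketch of what such a warning could look like; the helper and its
placement in kvm_gmem_bind() are assumptions, not part of this series:

/*
 * Hypothetical: called at binding time with the hugepage order of the
 * backing guest_memfd; only warns, does not reject the binding.
 */
static void kvm_gmem_warn_unaligned_bind(unsigned long base_gfn,
					 unsigned long pgoff,
					 unsigned int huge_order)
{
	if ((base_gfn ^ pgoff) & ((1UL << huge_order) - 1))
		pr_warn("guest_memfd: gfn 0x%lx and offset page 0x%lx are not aligned for order %u; hugepage mappings will be limited\n",
			base_gfn, pgoff, huge_order);
}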

> +                       order--;
> +
> +               *max_order = order;
> +       }
>
>         *is_prepared = folio_test_uptodate(folio);
>         return folio;
>
>
> > >> Adding checks at binding time will allow hugepage-unaligned offsets (to
> > >> be at parity with non-guest_memfd backing memory) but still fix this
> > >> issue.
> > >>
> > >> lpage_info will make sure that ranges near the bounds will be
> > >> fragmented, but the hugepages in the middle will still be mappable as
> > >> hugepages.
> > >>
> > >> [1] https://lpc.events/event/18/contributions/1764/attachments/1409/3706/binding-must-have-same-alignment.svg