[PATCH v10 06/13] khugepaged: add mTHP support

Nico Pache posted 13 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH v10 06/13] khugepaged: add mTHP support
Posted by Nico Pache 1 month, 2 weeks ago
Introduce the ability for khugepaged to collapse to different mTHP sizes.
While scanning PMD ranges for potential collapse candidates, keep track
of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
mTHPs are enabled we remove the restriction of max_ptes_none during the
scan phase so we don't bailout early and miss potential mTHP candidates.

A new function collapse_scan_bitmap is used to perform binary recursion on
the bitmap and determine the best eligible order for the collapse.
A stack struct is used instead of traditional recursion. max_ptes_none
will be scaled by the attempted collapse order to determine how "full" an
order must be before being considered for collapse.

Once we determine what mTHP sizes fits best in that PMD range a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory.

For orders configured with "always", we perform greedy collapsing
to that order without considering bit density.

If a mTHP collapse is attempted, but contains swapped out, or shared
pages, we don't perform the collapse. This is because adding new entries
can lead to new none pages, and these may lead to constant promotion into
a higher order (m)THP. A similar issue can occur with "max_ptes_none >
HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
the number of pages, and on a future scan will satisfy the promotion
condition once again.

For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the non-PMD case this is not true, and we must keep the lock to prevent
changes to the VMA from occurring.

Currently madv_collapse is not supported and will only attempt PMD
collapse.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/khugepaged.h |   4 +
 mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
 2 files changed, 188 insertions(+), 52 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..d12cdb9ef3ba 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,6 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_KHUGEPAGED_H
 #define _LINUX_KHUGEPAGED_H
+#define KHUGEPAGED_MIN_MTHP_ORDER	2
+#define KHUGEPAGED_MIN_MTHP_NR	(1 << KHUGEPAGED_MIN_MTHP_ORDER)
+#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
+#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))
 
 #include <linux/mm.h>
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 074101d03c9d..1ad7e00d3fd6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __ro_after_init;
 
+struct scan_bit_state {
+	u8 order;
+	u16 offset;
+};
+
 struct collapse_control {
 	bool is_khugepaged;
 
@@ -102,6 +107,18 @@ struct collapse_control {
 
 	/* nodemask for allocation fallback */
 	nodemask_t alloc_nmask;
+
+	/*
+	 * bitmap used to collapse mTHP sizes.
+	 * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
+	 */
+	DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
+	DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
+	struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
+};
+
+struct collapse_control khugepaged_collapse_control = {
+	.is_khugepaged = true,
 };
 
 /**
@@ -854,10 +871,6 @@ static void khugepaged_alloc_sleep(void)
 	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
-struct collapse_control khugepaged_collapse_control = {
-	.is_khugepaged = true,
-};
-
 static bool collapse_scan_abort(int nid, struct collapse_control *cc)
 {
 	int i;
@@ -1136,17 +1149,19 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
 
 static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 			      int referenced, int unmapped,
-			      struct collapse_control *cc)
+			      struct collapse_control *cc, bool *mmap_locked,
+			      unsigned int order, unsigned long offset)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
-	pte_t *pte;
+	pte_t *pte = NULL, mthp_pte;
 	pgtable_t pgtable;
 	struct folio *folio;
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int result = SCAN_FAIL;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
+	unsigned long _address = address + offset * PAGE_SIZE;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * The allocation can take potentially a long time if it involves
 	 * sync compaction, and we do not need to hold the mmap_lock during
 	 * that. We will recheck the vma after taking it again in write mode.
+	 * If collapsing mTHPs we may have already released the read_lock.
 	 */
-	mmap_read_unlock(mm);
+	if (*mmap_locked) {
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+	}
 
-	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+	result = alloc_charge_folio(&folio, mm, cc, order);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 BIT(HPAGE_PMD_ORDER));
+	*mmap_locked = true;
+	result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1182,13 +1201,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 		 * released when it fails. So we jump out_nolock directly in
 		 * that case.  Continuing to collapse causes inconsistency.
 		 */
-		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-						     referenced, HPAGE_PMD_ORDER);
+		result = __collapse_huge_page_swapin(mm, vma, _address, pmd,
+						     referenced, order);
 		if (result != SCAN_SUCCEED)
 			goto out_nolock;
 	}
 
 	mmap_read_unlock(mm);
+	*mmap_locked = false;
 	/*
 	 * Prevent all access to pagetables with the exception of
 	 * gup_fast later handled by the ptep_clear_flush and the VM
@@ -1198,8 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 BIT(HPAGE_PMD_ORDER));
+	result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
@@ -1210,11 +1229,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 
 	anon_vma_lock_write(vma->anon_vma);
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
-				address + HPAGE_PMD_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, _address,
+				_address + (PAGE_SIZE << order));
 	mmu_notifier_invalidate_range_start(&range);
 
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+
 	/*
 	 * This removes any huge TLB entry from the CPU so we won't allow
 	 * huge and small TLB entries for the same virtual address to
@@ -1228,19 +1248,16 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_remove_table_sync_one();
 
-	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+	pte = pte_offset_map_lock(mm, &_pmd, _address, &pte_ptl);
 	if (pte) {
-		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      &compound_pagelist,
-						      HPAGE_PMD_ORDER);
+		result = __collapse_huge_page_isolate(vma, _address, pte, cc,
+						      &compound_pagelist, order);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_PMD_NULL;
 	}
 
 	if (unlikely(result != SCAN_SUCCEED)) {
-		if (pte)
-			pte_unmap(pte);
 		spin_lock(pmd_ptl);
 		BUG_ON(!pmd_none(*pmd));
 		/*
@@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	}
 
 	/*
-	 * All pages are isolated and locked so anon_vma rmap
-	 * can't run anymore.
+	 * For PMD collapse all pages are isolated and locked so anon_vma
+	 * rmap can't run anymore
 	 */
-	anon_vma_unlock_write(vma->anon_vma);
+	if (order == HPAGE_PMD_ORDER)
+		anon_vma_unlock_write(vma->anon_vma);
 
 	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
-					   vma, address, pte_ptl,
-					   &compound_pagelist, HPAGE_PMD_ORDER);
-	pte_unmap(pte);
+					   vma, _address, pte_ptl,
+					   &compound_pagelist, order);
 	if (unlikely(result != SCAN_SUCCEED))
-		goto out_up_write;
+		goto out_unlock_anon_vma;
 
 	/*
 	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
@@ -1273,33 +1290,115 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * write.
 	 */
 	__folio_mark_uptodate(folio);
-	pgtable = pmd_pgtable(_pmd);
-
-	_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
-	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
-	spin_lock(pmd_ptl);
-	BUG_ON(!pmd_none(*pmd));
-	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
-	folio_add_lru_vma(folio, vma);
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
-	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache_pmd(vma, address, pmd);
-	deferred_split_folio(folio, false);
-	spin_unlock(pmd_ptl);
+	if (order == HPAGE_PMD_ORDER) {
+		pgtable = pmd_pgtable(_pmd);
+		_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
+		_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+
+		spin_lock(pmd_ptl);
+		BUG_ON(!pmd_none(*pmd));
+		folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
+		folio_add_lru_vma(folio, vma);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
+		set_pmd_at(mm, address, pmd, _pmd);
+		update_mmu_cache_pmd(vma, address, pmd);
+		deferred_split_folio(folio, false);
+		spin_unlock(pmd_ptl);
+	} else { /* mTHP collapse */
+		mthp_pte = mk_pte(&folio->page, vma->vm_page_prot);
+		mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
+
+		spin_lock(pmd_ptl);
+		BUG_ON(!pmd_none(*pmd));
+		folio_ref_add(folio, (1 << order) - 1);
+		folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
+		folio_add_lru_vma(folio, vma);
+		set_ptes(vma->vm_mm, _address, pte, mthp_pte, (1 << order));
+		update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
+
+		smp_wmb(); /* make pte visible before pmd */
+		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+		spin_unlock(pmd_ptl);
+	}
 
 	folio = NULL;
 
 	result = SCAN_SUCCEED;
+out_unlock_anon_vma:
+	if (order != HPAGE_PMD_ORDER)
+		anon_vma_unlock_write(vma->anon_vma);
 out_up_write:
+	if (pte)
+		pte_unmap(pte);
 	mmap_write_unlock(mm);
 out_nolock:
+	*mmap_locked = false;
 	if (folio)
 		folio_put(folio);
 	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
 	return result;
 }
 
+/* Recursive function to consume the bitmap */
+static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
+			int referenced, int unmapped, struct collapse_control *cc,
+			bool *mmap_locked, unsigned long enabled_orders)
+{
+	u8 order, next_order;
+	u16 offset, mid_offset;
+	int num_chunks;
+	int bits_set, threshold_bits;
+	int top = -1;
+	int collapsed = 0;
+	int ret;
+	struct scan_bit_state state;
+	bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
+
+	cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+		{ HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };
+
+	while (top >= 0) {
+		state = cc->mthp_bitmap_stack[top--];
+		order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
+		offset = state.offset;
+		num_chunks = 1 << (state.order);
+		/* Skip mTHP orders that are not enabled */
+		if (!test_bit(order, &enabled_orders))
+			goto next_order;
+
+		/* copy the relavant section to a new bitmap */
+		bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
+				  MTHP_BITMAP_SIZE);
+
+		bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
+		threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
+				>> (HPAGE_PMD_ORDER - state.order);
+
+		/* Check if the region is "almost full" based on the threshold */
+		if (bits_set > threshold_bits || is_pmd_only
+			|| test_bit(order, &huge_anon_orders_always)) {
+			ret = collapse_huge_page(mm, address, referenced, unmapped,
+						 cc, mmap_locked, order,
+						 offset * KHUGEPAGED_MIN_MTHP_NR);
+			if (ret == SCAN_SUCCEED) {
+				collapsed += (1 << order);
+				continue;
+			}
+		}
+
+next_order:
+		if (state.order > 0) {
+			next_order = state.order - 1;
+			mid_offset = offset + (num_chunks / 2);
+			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+				{ next_order, mid_offset };
+			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+				{ next_order, offset };
+		}
+	}
+	return collapsed;
+}
+
 static int collapse_scan_pmd(struct mm_struct *mm,
 			     struct vm_area_struct *vma,
 			     unsigned long address, bool *mmap_locked,
@@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
+	int i;
 	int result = SCAN_FAIL, referenced = 0;
 	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	struct folio *folio = NULL;
 	unsigned long _address;
+	unsigned long enabled_orders;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
+	bool is_pmd_only;
 	bool writable = false;
-
+	int chunk_none_count = 0;
+	int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
+	unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
 	result = find_pmd_or_thp_or_none(mm, address, &pmd);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
+	bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
+	bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
+
+	if (cc->is_khugepaged)
+		enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+			tva_flags, THP_ORDERS_ALL_ANON);
+	else
+		enabled_orders = BIT(HPAGE_PMD_ORDER);
+
+	is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
+
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!pte) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
 
-	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
-	     _pte++, _address += PAGE_SIZE) {
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		/*
+		 * we are reading in KHUGEPAGED_MIN_MTHP_NR page chunks. if
+		 * there are pages in this chunk keep track of it in the bitmap
+		 * for mTHP collapsing.
+		 */
+		if (i % KHUGEPAGED_MIN_MTHP_NR == 0) {
+			if (i > 0 && chunk_none_count <= scaled_none)
+				bitmap_set(cc->mthp_bitmap,
+					   (i - 1) / KHUGEPAGED_MIN_MTHP_NR, 1);
+			chunk_none_count = 0;
+		}
+
+		_pte = pte + i;
+		_address = address + i * PAGE_SIZE;
 		pte_t pteval = ptep_get(_pte);
 		if (is_swap_pte(pteval)) {
 			++unmapped;
@@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+			++chunk_none_count;
 			++none_or_zero;
 			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
+			    (!cc->is_khugepaged || !is_pmd_only ||
+				none_or_zero <= khugepaged_max_ptes_none)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
 								     address)))
 			referenced++;
 	}
+
 	if (!writable) {
 		result = SCAN_PAGE_RO;
 	} else if (cc->is_khugepaged &&
@@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
-		result = collapse_huge_page(mm, address, referenced,
-					    unmapped, cc);
-		/* collapse_huge_page will return with the mmap_lock released */
-		*mmap_locked = false;
+		result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
+					      mmap_locked, enabled_orders);
+		if (result > 0)
+			result = SCAN_SUCCEED;
+		else
+			result = SCAN_FAIL;
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
-- 
2.50.1
Re: [PATCH v10 06/13] khugepaged: add mTHP support
Posted by Lorenzo Stoakes 1 month, 2 weeks ago
On Tue, Aug 19, 2025 at 07:41:58AM -0600, Nico Pache wrote:
> Introduce the ability for khugepaged to collapse to different mTHP sizes.
> While scanning PMD ranges for potential collapse candidates, keep track
> of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
> represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
> mTHPs are enabled we remove the restriction of max_ptes_none during the
> scan phase so we don't bailout early and miss potential mTHP candidates.
>
> A new function collapse_scan_bitmap is used to perform binary recursion on
> the bitmap and determine the best eligible order for the collapse.
> A stack struct is used instead of traditional recursion. max_ptes_none
> will be scaled by the attempted collapse order to determine how "full" an
> order must be before being considered for collapse.
>
> Once we determine what mTHP sizes fits best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory.
>
> For orders configured with "always", we perform greedy collapsing
> to that order without considering bit density.
>
> If a mTHP collapse is attempted, but contains swapped out, or shared
> pages, we don't perform the collapse. This is because adding new entries
> can lead to new none pages, and these may lead to constant promotion into
> a higher order (m)THP. A similar issue can occur with "max_ptes_none >
> HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
> the number of pages, and on a future scan will satisfy the promotion
> condition once again.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the non-PMD case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.

Yes I think this has to remain the case unfortunately as we override
sysfs-specified orders for MADV_COLLAPSE and there's no sensible way to
determine what order we ought to be using.

>
> Signed-off-by: Nico Pache <npache@redhat.com>

You've gone from small incremental changes to a huge one here... for the
sake of reviewer sanity at least, any chance of breaking this up?

> ---
>  include/linux/khugepaged.h |   4 +
>  mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
>  2 files changed, 188 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index eb1946a70cff..d12cdb9ef3ba 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -1,6 +1,10 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>  #ifndef _LINUX_KHUGEPAGED_H
>  #define _LINUX_KHUGEPAGED_H
> +#define KHUGEPAGED_MIN_MTHP_ORDER	2

I guess this makes sense as by definition 2 pages is least it could
possibly be.

> +#define KHUGEPAGED_MIN_MTHP_NR	(1 << KHUGEPAGED_MIN_MTHP_ORDER)

Surely KHUGEPAGED_MIN_NR_MTHP_PTES would be more meaningful?

> +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))

This is confusing - size of what?

If it's number of bits surely this should be ilog2(MAX_PTRS_PER_PTE) -
KHUGEPAGED_MIN_MTHP_ORDER?

This seems to be more so 'the maximum value that could contain the bits right?

I think this is just wrong though, see below at DECLARE_BITMAP() stuff.

> +#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))

Hard to know how this relates to MAX_MTHP_BITMAP_SIZE?

I guess this is the current bitmap size indicating all that is possible,
but if these are all #define's what is this accomplishing?

For all - please do not do (1 << xxx)! This can lead to sign-extension bugs at least
in theory, use _BITUL(...), it's neater too.

NIT but the whitespace is all screwed up here.

KHUGEPAGED_MIN_MTHP_ORDER and KHUGEPAGED_MIN_MTHP_NR

>
>  #include <linux/mm.h>
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 074101d03c9d..1ad7e00d3fd6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
>  static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +struct scan_bit_state {
> +	u8 order;
> +	u16 offset;
> +};
> +
>  struct collapse_control {
>  	bool is_khugepaged;
>
> @@ -102,6 +107,18 @@ struct collapse_control {
>
>  	/* nodemask for allocation fallback */
>  	nodemask_t alloc_nmask;
> +
> +	/*
> +	 * bitmap used to collapse mTHP sizes.
> +	 * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP

I'm not sure what this '1bit = xxx' comment means?

> +	 */
> +	DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);

Hmm this seems wrong.

DECLARE_BITMAP(..., val) is expessed as:

#define DECLARE_BITMAP(name,bits) \
	unsigned long name[BITS_TO_LONGS(bits)]

So the 2nd param should be number of bits.

But MAX_MTHP_BITMAP_SIZE is:

(1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))

So typically:

(1 << (9 - 2)) = 128

And BITS_TO_LONGS is defined as:

__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))

So essentially this will be 128 / 8 on a 64-bit system so 16 bytes to
store... 7 bits?

Unless I'm missing something here?

> +	DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);

Same comment as above obviously. But also this is kind of horrible, why are
we putting a copy of this entire bitmap on the stack every time we declare
a cc?

> +	struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
> +};
> +
> +struct collapse_control khugepaged_collapse_control = {
> +	.is_khugepaged = true,
>  };

Why are we moving this here?

>
>  /**
> @@ -854,10 +871,6 @@ static void khugepaged_alloc_sleep(void)
>  	remove_wait_queue(&khugepaged_wait, &wait);
>  }
>
> -struct collapse_control khugepaged_collapse_control = {
> -	.is_khugepaged = true,
> -};
> -
>  static bool collapse_scan_abort(int nid, struct collapse_control *cc)
>  {
>  	int i;
> @@ -1136,17 +1149,19 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>
>  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  			      int referenced, int unmapped,
> -			      struct collapse_control *cc)
> +			      struct collapse_control *cc, bool *mmap_locked,
> +			      unsigned int order, unsigned long offset)
>  {
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> -	pte_t *pte;
> +	pte_t *pte = NULL, mthp_pte;
>  	pgtable_t pgtable;
>  	struct folio *folio;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	int result = SCAN_FAIL;
>  	struct vm_area_struct *vma;
>  	struct mmu_notifier_range range;
> +	unsigned long _address = address + offset * PAGE_SIZE;

This name is really horrible. please name it sensibly.

It feels like address ought to be consistently the base of the THP or mTHP
we wish to collapse, and if we need something PMD aligned for some reason
we should rename _that_ to e.g. pmd_address.

Orrr it could be mthp_address...

Perhaps we could just figure that out here and pass only the
address... aligning to PMD boundary shouldn't be hard/costly.

But it may indicate we need further refactorisation so we don't need to
paper over cracks + pass around a PMD address to do things when that may
not be where the (m)THP range begins.

>
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>
> @@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	 * The allocation can take potentially a long time if it involves
>  	 * sync compaction, and we do not need to hold the mmap_lock during
>  	 * that. We will recheck the vma after taking it again in write mode.
> +	 * If collapsing mTHPs we may have already released the read_lock.
>  	 */
> -	mmap_read_unlock(mm);
> +	if (*mmap_locked) {
> +		mmap_read_unlock(mm);
> +		*mmap_locked = false;
> +	}
>
> -	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> +	result = alloc_charge_folio(&folio, mm, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>
>  	mmap_read_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 BIT(HPAGE_PMD_ORDER));
> +	*mmap_locked = true;
> +	result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));

I mean this is kind of going back to previous commits, but it's really ugly
to pass a BIT(xxx) here, is that really necessary? Can't we just pass in
the order?

It's also inconsistent with other calls like
e.g. __collapse_huge_page_swapin() below which passes the order.

Same goes obv. for all such invocations.

>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
> @@ -1182,13 +1201,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  		 * released when it fails. So we jump out_nolock directly in
>  		 * that case.  Continuing to collapse causes inconsistency.
>  		 */
> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced, HPAGE_PMD_ORDER);
> +		result = __collapse_huge_page_swapin(mm, vma, _address, pmd,
> +						     referenced, order);
>  		if (result != SCAN_SUCCEED)
>  			goto out_nolock;
>  	}
>
>  	mmap_read_unlock(mm);
> +	*mmap_locked = false;
>  	/*
>  	 * Prevent all access to pagetables with the exception of
>  	 * gup_fast later handled by the ptep_clear_flush and the VM
> @@ -1198,8 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	 * mmap_lock.
>  	 */
>  	mmap_write_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 BIT(HPAGE_PMD_ORDER));
> +	result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  	/* check if the pmd is still valid */
> @@ -1210,11 +1229,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
>  	anon_vma_lock_write(vma->anon_vma);
>
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> -				address + HPAGE_PMD_SIZE);
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, _address,
> +				_address + (PAGE_SIZE << order));

This _address is horrible. That really does have to change.

>  	mmu_notifier_invalidate_range_start(&range);
>
>  	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> +

Odd whitespace...

>  	/*
>  	 * This removes any huge TLB entry from the CPU so we won't allow
>  	 * huge and small TLB entries for the same virtual address to
> @@ -1228,19 +1248,16 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	mmu_notifier_invalidate_range_end(&range);
>  	tlb_remove_table_sync_one();
>
> -	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> +	pte = pte_offset_map_lock(mm, &_pmd, _address, &pte_ptl);

I see we already have a 'convention' of _ prefix on the pmd param, but two
wrongs don't make a right...

>  	if (pte) {
> -		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      &compound_pagelist,
> -						      HPAGE_PMD_ORDER);
> +		result = __collapse_huge_page_isolate(vma, _address, pte, cc,
> +						      &compound_pagelist, order);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_PMD_NULL;
>  	}
>
>  	if (unlikely(result != SCAN_SUCCEED)) {
> -		if (pte)
> -			pte_unmap(pte);

Why are we removing this?

>  		spin_lock(pmd_ptl);
>  		BUG_ON(!pmd_none(*pmd));
>  		/*
> @@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	}
>
>  	/*
> -	 * All pages are isolated and locked so anon_vma rmap
> -	 * can't run anymore.
> +	 * For PMD collapse all pages are isolated and locked so anon_vma
> +	 * rmap can't run anymore
>  	 */
> -	anon_vma_unlock_write(vma->anon_vma);
> +	if (order == HPAGE_PMD_ORDER)
> +		anon_vma_unlock_write(vma->anon_vma);

Hmm this is introducing a horrible new way for things to go wrong. And
there's now a whole host of terrible error paths that can go wrong very
easily around rmap locks and yeah, no way we cannot do it this way.

rmap locks are VERY sensitive and the ordering of the locking matters a
great deal (see top of mm/rmap.c). So we have to be SO careful here.

I suggest you simply have a boolean 'anon_vma_locked' or something like
this, and get rid of these horrible additional code paths and the second
order == HPAGE_PMD_ORDER check.

We'll track whether or not the lock is held and thereby needs releasing
that way instead.

Also, and very importantly - are you 100% sure you can't possibly have a
deadlock or issue beyond this point if you don't release the rmap lock?

This is veeeery important, as there can be implicit assumptions around
whether or not one can acquire these locks and you basically have to audit
ALL code over which this lock is held.

I'm speaking from hard experience here having bumped into this in various
attempts at work relating to this stuff...

>
>  	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> -					   vma, address, pte_ptl,
> -					   &compound_pagelist, HPAGE_PMD_ORDER);
> -	pte_unmap(pte);
> +					   vma, _address, pte_ptl,
> +					   &compound_pagelist, order);
>  	if (unlikely(result != SCAN_SUCCEED))
> -		goto out_up_write;
> +		goto out_unlock_anon_vma;

See above...

>
>  	/*
>  	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
> @@ -1273,33 +1290,115 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
> -	_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> -	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> -
> -	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> -	folio_add_lru_vma(folio, vma);
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	set_pmd_at(mm, address, pmd, _pmd);
> -	update_mmu_cache_pmd(vma, address, pmd);
> -	deferred_split_folio(folio, false);
> -	spin_unlock(pmd_ptl);
> +	if (order == HPAGE_PMD_ORDER) {
> +		pgtable = pmd_pgtable(_pmd);
> +		_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> +		_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> +
> +		spin_lock(pmd_ptl);
> +		BUG_ON(!pmd_none(*pmd));

I know you're refactoring this, but be good to change this to a
WARN_ON_ONCE(), BUG_ON() is verboten unless it's absolutely definitely
going to be a kernel nuclear event, so worth changing things up as we go.

> +		folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> +		folio_add_lru_vma(folio, vma);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		set_pmd_at(mm, address, pmd, _pmd);
> +		update_mmu_cache_pmd(vma, address, pmd);
> +		deferred_split_folio(folio, false);
> +		spin_unlock(pmd_ptl);
> +	} else { /* mTHP collapse */
> +		mthp_pte = mk_pte(&folio->page, vma->vm_page_prot);

I guess it's a rule that each THP or mTHP range spanned must span one and
only one folio.

Not sure &folio->page has a future though.

Maybe better to use folio_page(folio, 0)?

> +		mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
> +
> +		spin_lock(pmd_ptl);
> +		BUG_ON(!pmd_none(*pmd));

having said the above, this is trictly introducing a new BUG_ON() which is
a no-no, please make it a WARN_ON_ONCE().

> +		folio_ref_add(folio, (1 << order) - 1);

Again no 1 << x please.

Do we do something similar somewhere else for mthp ref counting? Can we
share code somehow?

> +		folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> +		folio_add_lru_vma(folio, vma);
> +		set_ptes(vma->vm_mm, _address, pte, mthp_pte, (1 << order));

Please avoid 1 << order, and I think at this point since you reference it a
bunch of times, just store a local var like nr_pages or sth?

> +		update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
> +
> +		smp_wmb(); /* make pte visible before pmd */

Can you give some detail as to why this will work here and why it is
necessary?

> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));

If we're updating PTE entriess why do we need to assign the PMD entry?

> +		spin_unlock(pmd_ptl);
> +	}

This deeply, badly needs to be refactored into something that both shares
code and separates out these two operations.

This function is disgustingly long as it is, and that's not your fault, but
let's try to make things better as we go.

>
>  	folio = NULL;
>
>  	result = SCAN_SUCCEED;
> +out_unlock_anon_vma:
> +	if (order != HPAGE_PMD_ORDER)
> +		anon_vma_unlock_write(vma->anon_vma);

Obviously again as above, we need to simplify this and get rid of this
whole bit.

>  out_up_write:
> +	if (pte)
> +		pte_unmap(pte);

OK I guess you moved this from above down here? Is this a valid place to do this?

>  	mmap_write_unlock(mm);
>  out_nolock:
> +	*mmap_locked = false;

This is kind of horrible, we now have pretty mad logic around who sets
mmap_locked and where.

Can we just do this at the call sites so we avoid that?

I mean anything we do with this is hideous, but that'd be less confusing It
hink.

>  	if (folio)
>  		folio_put(folio);
>  	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
>  	return result;
>  }
>
> +/* Recursive function to consume the bitmap */

Err... please don't? Kernel stack is a seriously finite resource, we do not
want recursion at all.

But I'm not actually seeing any recursion here? Am I missing something?

> +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> +			int referenced, int unmapped, struct collapse_control *cc,
> +			bool *mmap_locked, unsigned long enabled_orders)

This is a complicated and confusing function, it requires a comment
describing how it works.

> +{
> +	u8 order, next_order;
> +	u16 offset, mid_offset;
> +	int num_chunks;
> +	int bits_set, threshold_bits;
> +	int top = -1;

Err why do we start at -1 then immediately increment it?

> +	int collapsed = 0;
> +	int ret;
> +	struct scan_bit_state state;
> +	bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));

Extraneous outer parens.

> +
> +	cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +		{ HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };

This is the same as

	cc->mthp_bitmap_stack[0] = ...;
	top = 1;

No?


This is really horrible. Can we just have a helper function for this
please?

Like:

	static int mthp_push_stack(struct collapse_control *cc,
		int index, u8 order, u16 offset)
	{
		struct scan_bit_state *state = &cc->mthp_bitmap_stack[index];

		VM_WARN_ON(index >= MAX_MTHP_BITMAP_SIZE);

		state->order = order;
		state->offset = offset;

		return index + 1;
	}

And can invoke via:

	top = mthp_push_stack(cc, top, order, offset);

Or pass index as a pointer possibly also.

> +
> +	while (top >= 0) {
> +		state = cc->mthp_bitmap_stack[top--];

OK so this is the recursive bit...

Oh man this function so needs a comment describing what it does, seriously.

I think honestly for sake of my own sanity I'm going to hold off reviewing
the rest of this until there's something describing the algorithm, in
detail here, above the function.

> +		order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> +		offset = state.offset;
> +		num_chunks = 1 << (state.order);
> +		/* Skip mTHP orders that are not enabled */
> +		if (!test_bit(order, &enabled_orders))
> +			goto next_order;
> +
> +		/* copy the relavant section to a new bitmap */
> +		bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> +				  MTHP_BITMAP_SIZE);
> +
> +		bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> +		threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
> +				>> (HPAGE_PMD_ORDER - state.order);
> +
> +		/* Check if the region is "almost full" based on the threshold */
> +		if (bits_set > threshold_bits || is_pmd_only
> +			|| test_bit(order, &huge_anon_orders_always)) {
> +			ret = collapse_huge_page(mm, address, referenced, unmapped,
> +						 cc, mmap_locked, order,
> +						 offset * KHUGEPAGED_MIN_MTHP_NR);
> +			if (ret == SCAN_SUCCEED) {
> +				collapsed += (1 << order);
> +				continue;
> +			}
> +		}
> +
> +next_order:
> +		if (state.order > 0) {
> +			next_order = state.order - 1;
> +			mid_offset = offset + (num_chunks / 2);
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, mid_offset };
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, offset };
> +		}
> +	}
> +	return collapsed;
> +}
> +
>  static int collapse_scan_pmd(struct mm_struct *mm,
>  			     struct vm_area_struct *vma,
>  			     unsigned long address, bool *mmap_locked,
> @@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, *_pte;
> +	int i;
>  	int result = SCAN_FAIL, referenced = 0;
>  	int none_or_zero = 0, shared = 0;
>  	struct page *page = NULL;
>  	struct folio *folio = NULL;
>  	unsigned long _address;
> +	unsigned long enabled_orders;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
> +	bool is_pmd_only;
>  	bool writable = false;
> -
> +	int chunk_none_count = 0;
> +	int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
> +	unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>
>  	result = find_pmd_or_thp_or_none(mm, address, &pmd);
>  	if (result != SCAN_SUCCEED)
>  		goto out;
>
> +	bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> +	bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);

Having this 'temp' thing on the stack for everyone is just horrid.

>  	memset(cc->node_load, 0, sizeof(cc->node_load));
>  	nodes_clear(cc->alloc_nmask);
> +
> +	if (cc->is_khugepaged)
> +		enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> +			tva_flags, THP_ORDERS_ALL_ANON);
> +	else
> +		enabled_orders = BIT(HPAGE_PMD_ORDER);
> +
> +	is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));

This is horrid, can we have a function broken out to do this please?

In general if you keep open coding stuff, just write a static function for
it, the compiler is smart enough to inline.

> +
>  	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
>  	if (!pte) {
>  		result = SCAN_PMD_NULL;
>  		goto out;
>  	}
>
> -	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> -	     _pte++, _address += PAGE_SIZE) {
> +	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +		/*
> +		 * we are reading in KHUGEPAGED_MIN_MTHP_NR page chunks. if
> +		 * there are pages in this chunk keep track of it in the bitmap
> +		 * for mTHP collapsing.
> +		 */
> +		if (i % KHUGEPAGED_MIN_MTHP_NR == 0) {
> +			if (i > 0 && chunk_none_count <= scaled_none)
> +				bitmap_set(cc->mthp_bitmap,
> +					   (i - 1) / KHUGEPAGED_MIN_MTHP_NR, 1);
> +			chunk_none_count = 0;
> +		}

This whole thing is really confusing and you are not explaining the
algoritm here at all.

This requires a comment, and really this bit should be separated out please.

> +
> +		_pte = pte + i;
> +		_address = address + i * PAGE_SIZE;
>  		pte_t pteval = ptep_get(_pte);
>  		if (is_swap_pte(pteval)) {
>  			++unmapped;
> @@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
>  			}
>  		}
>  		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> +			++chunk_none_count;
>  			++none_or_zero;
>  			if (!userfaultfd_armed(vma) &&
> -			    (!cc->is_khugepaged ||
> -			     none_or_zero <= khugepaged_max_ptes_none)) {
> +			    (!cc->is_khugepaged || !is_pmd_only ||
> +				none_or_zero <= khugepaged_max_ptes_none)) {
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
>  								     address)))
>  			referenced++;
>  	}
> +
>  	if (!writable) {
>  		result = SCAN_PAGE_RO;
>  	} else if (cc->is_khugepaged &&
> @@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
>  	if (result == SCAN_SUCCEED) {
> -		result = collapse_huge_page(mm, address, referenced,
> -					    unmapped, cc);
> -		/* collapse_huge_page will return with the mmap_lock released */
> -		*mmap_locked = false;
> +		result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
> +					      mmap_locked, enabled_orders);
> +		if (result > 0)
> +			result = SCAN_SUCCEED;
> +		else
> +			result = SCAN_FAIL;

We're reusing result as both an enum value and as a storage for unmber
colapsed PTE entries?

Can we just use a new local variable? Thanks

>  	}
>  out:
>  	trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
> --
> 2.50.1
>

I will review the bitmap/chunk stuff in more detail once the algorithm is
commented.

Cheers, Lorenzo
Re: [PATCH v10 06/13] khugepaged: add mTHP support
Posted by Nico Pache 1 month ago
On Wed, Aug 20, 2025 at 12:30 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Aug 19, 2025 at 07:41:58AM -0600, Nico Pache wrote:
> > Introduce the ability for khugepaged to collapse to different mTHP sizes.
> > While scanning PMD ranges for potential collapse candidates, keep track
> > of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
> > represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
> > mTHPs are enabled we remove the restriction of max_ptes_none during the
> > scan phase so we don't bailout early and miss potential mTHP candidates.
> >
> > A new function collapse_scan_bitmap is used to perform binary recursion on
> > the bitmap and determine the best eligible order for the collapse.
> > A stack struct is used instead of traditional recursion. max_ptes_none
> > will be scaled by the attempted collapse order to determine how "full" an
> > order must be before being considered for collapse.
> >
> > Once we determine what mTHP sizes fits best in that PMD range a collapse
> > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > order supported by anon memory.
> >
> > For orders configured with "always", we perform greedy collapsing
> > to that order without considering bit density.
> >
> > If a mTHP collapse is attempted, but contains swapped out, or shared
> > pages, we don't perform the collapse. This is because adding new entries
> > can lead to new none pages, and these may lead to constant promotion into
> > a higher order (m)THP. A similar issue can occur with "max_ptes_none >
> > HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
> > the number of pages, and on a future scan will satisfy the promotion
> > condition once again.
> >
> > For non-PMD collapse we must leave the anon VMA write locked until after
> > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> > the non-PMD case this is not true, and we must keep the lock to prevent
> > changes to the VMA from occurring.
> >
> > Currently madv_collapse is not supported and will only attempt PMD
> > collapse.
>
> Yes I think this has to remain the case unfortunately as we override
> sysfs-specified orders for MADV_COLLAPSE and there's no sensible way to
> determine what order we ought to be using.
>
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
>
> You've gone from small incremental changes to a huge one here... for the
> sake of reviewer sanity at least, any chance of breaking this up?
I had this as two patches (one for the bitmap and one for implementing
it), but I was asked to squash them :/
>
> > ---
> >  include/linux/khugepaged.h |   4 +
> >  mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
> >  2 files changed, 188 insertions(+), 52 deletions(-)
> >
> > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > index eb1946a70cff..d12cdb9ef3ba 100644
> > --- a/include/linux/khugepaged.h
> > +++ b/include/linux/khugepaged.h
> > @@ -1,6 +1,10 @@
> >  /* SPDX-License-Identifier: GPL-2.0 */
> >  #ifndef _LINUX_KHUGEPAGED_H
> >  #define _LINUX_KHUGEPAGED_H
> > +#define KHUGEPAGED_MIN_MTHP_ORDER    2
>
> I guess this makes sense as by definition 2 pages is least it could
> possibly be.
Order, so 4 pages, 16kB mTHP
>
> > +#define KHUGEPAGED_MIN_MTHP_NR       (1 << KHUGEPAGED_MIN_MTHP_ORDER)
>
> Surely KHUGEPAGED_MIN_NR_MTHP_PTES would be more meaningful?
Sure!
>
> > +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> This is confusing - size of what?
We need it like this due to ppc64 (and maybe others?), it used to be
based on PMD_ORDER, but some arches fail to compile due to the PMD
size only being known at boot time.

This compiles to 9 on arches that have 512 ptes.
so 1 << (9 - 2) == 128
>
> If it's number of bits surely this should be ilog2(MAX_PTRS_PER_PTE) -
> KHUGEPAGED_MIN_MTHP_ORDER?
This would only be 7? We need a 128 bit bitmap
>
> This seems to be more so 'the maximum value that could contain the bits right?
>
> I think this is just wrong though, see below at DECLARE_BITMAP() stuff.
>
> > +#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))
>
> Hard to know how this relates to MAX_MTHP_BITMAP_SIZE?
>
> I guess this is the current bitmap size indicating all that is possible,
> but if these are all #define's what is this accomplishing?
One for compile time one for runtime. Kind of annoying but it was the
easiest solution given the architecture limitations.
>
> For all - please do not do (1 << xxx)! This can lead to sign-extension bugs at least
> in theory, use _BITUL(...), it's neater too.
ack, thanks!
>
> NIT but the whitespace is all screwed up here.
>
> KHUGEPAGED_MIN_MTHP_ORDER and KHUGEPAGED_MIN_MTHP_NR
>
> >
> >  #include <linux/mm.h>
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 074101d03c9d..1ad7e00d3fd6 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> >  static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> > +struct scan_bit_state {
> > +     u8 order;
> > +     u16 offset;
> > +};
> > +
> >  struct collapse_control {
> >       bool is_khugepaged;
> >
> > @@ -102,6 +107,18 @@ struct collapse_control {
> >
> >       /* nodemask for allocation fallback */
> >       nodemask_t alloc_nmask;
> > +
> > +     /*
> > +      * bitmap used to collapse mTHP sizes.
> > +      * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
>
> I'm not sure what this '1bit = xxx' comment means?
A single bit represents 1 << MIN_MTHP_ORDER (4) pages. Ill express that better
>
> > +      */
> > +     DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
>
> Hmm this seems wrong.
Should be a bitmap with 128 bits (for 4k page size). Not sure what's wrong here.
>
> DECLARE_BITMAP(..., val) is expessed as:
>
> #define DECLARE_BITMAP(name,bits) \
>         unsigned long name[BITS_TO_LONGS(bits)]
>
> So the 2nd param should be number of bits.
>
> But MAX_MTHP_BITMAP_SIZE is:
>
> (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> So typically:
>
> (1 << (9 - 2)) = 128
>
> And BITS_TO_LONGS is defined as:
>
> __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
>
> So essentially this will be 128 / 8 on a 64-bit system so 16 bytes to
> store... 7 bits?
I think you mean 64. 8 would be BYTES_PER_TYPE
>
> Unless I'm missing something here?
Hmm, unless the DECLARE_BITMAP is being used incorrectly in multiple
places, DECLARE_BITMAP(..., # of bits) is how this is intended to be
used.

I think it's an array of unsigned longs, so each part of the name[] is
already 64 bits. hence the divide.
>
> > +     DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
>
> Same comment as above obviously. But also this is kind of horrible, why are
> we putting a copy of this entire bitmap on the stack every time we declare
> a cc?
The temp one is used as a scratch pad, Baolin also finds this useful
in his file mTHP collapse useful for another use as well.

In general khugepaged always uses the same CC, so it doesn't not
having to constantly allocate this.
>
> > +     struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
> > +};
> > +
> > +struct collapse_control khugepaged_collapse_control = {
> > +     .is_khugepaged = true,
> >  };
>
> Why are we moving this here?
Because if not it doesn't compile.
>
> >
> >  /**
> > @@ -854,10 +871,6 @@ static void khugepaged_alloc_sleep(void)
> >       remove_wait_queue(&khugepaged_wait, &wait);
> >  }
> >
> > -struct collapse_control khugepaged_collapse_control = {
> > -     .is_khugepaged = true,
> > -};
> > -
> >  static bool collapse_scan_abort(int nid, struct collapse_control *cc)
> >  {
> >       int i;
> > @@ -1136,17 +1149,19 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> >
> >  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >                             int referenced, int unmapped,
> > -                           struct collapse_control *cc)
> > +                           struct collapse_control *cc, bool *mmap_locked,
> > +                           unsigned int order, unsigned long offset)
> >  {
> >       LIST_HEAD(compound_pagelist);
> >       pmd_t *pmd, _pmd;
> > -     pte_t *pte;
> > +     pte_t *pte = NULL, mthp_pte;
> >       pgtable_t pgtable;
> >       struct folio *folio;
> >       spinlock_t *pmd_ptl, *pte_ptl;
> >       int result = SCAN_FAIL;
> >       struct vm_area_struct *vma;
> >       struct mmu_notifier_range range;
> > +     unsigned long _address = address + offset * PAGE_SIZE;
>
> This name is really horrible. please name it sensibly.
>
> It feels like address ought to be consistently the base of the THP or mTHP
> we wish to collapse, and if we need something PMD aligned for some reason
> we should rename _that_ to e.g. pmd_address.
>
> Orrr it could be mthp_address...
>
> Perhaps we could just figure that out here and pass only the
> address... aligning to PMD boundary shouldn't be hard/costly.
>
> But it may indicate we need further refactorisation so we don't need to
> paper over cracks + pass around a PMD address to do things when that may
> not be where the (m)THP range begins.
Ok i'll rename them, but we still need to know the PMD address as we
rely on it for a few key operations.
Can we leave _address and rename address to pmd_address?
>
> >
> >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >
> > @@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >        * The allocation can take potentially a long time if it involves
> >        * sync compaction, and we do not need to hold the mmap_lock during
> >        * that. We will recheck the vma after taking it again in write mode.
> > +      * If collapsing mTHPs we may have already released the read_lock.
> >        */
> > -     mmap_read_unlock(mm);
> > +     if (*mmap_locked) {
> > +             mmap_read_unlock(mm);
> > +             *mmap_locked = false;
> > +     }
> >
> > -     result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > +     result = alloc_charge_folio(&folio, mm, cc, order);
> >       if (result != SCAN_SUCCEED)
> >               goto out_nolock;
> >
> >       mmap_read_lock(mm);
> > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > -                                      BIT(HPAGE_PMD_ORDER));
> > +     *mmap_locked = true;
> > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
>
> I mean this is kind of going back to previous commits, but it's really ugly
> to pass a BIT(xxx) here, is that really necessary? Can't we just pass in
> the order?
Yes and no... currently we only ever pass the bit of the current order
so we could get away with it, but to generalize it we want the ability
to pass a bitmap of the available orders. Like in the case of future
madvise_collapse support, we would need to pass a bitmap of possible
orders.
>
> It's also inconsistent with other calls like
> e.g. __collapse_huge_page_swapin() below which passes the order.
>
> Same goes obv. for all such invocations.
>
> >       if (result != SCAN_SUCCEED) {
> >               mmap_read_unlock(mm);
> >               goto out_nolock;
> > @@ -1182,13 +1201,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >                * released when it fails. So we jump out_nolock directly in
> >                * that case.  Continuing to collapse causes inconsistency.
> >                */
> > -             result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > -                                                  referenced, HPAGE_PMD_ORDER);
> > +             result = __collapse_huge_page_swapin(mm, vma, _address, pmd,
> > +                                                  referenced, order);
> >               if (result != SCAN_SUCCEED)
> >                       goto out_nolock;
> >       }
> >
> >       mmap_read_unlock(mm);
> > +     *mmap_locked = false;
> >       /*
> >        * Prevent all access to pagetables with the exception of
> >        * gup_fast later handled by the ptep_clear_flush and the VM
> > @@ -1198,8 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >        * mmap_lock.
> >        */
> >       mmap_write_lock(mm);
> > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > -                                      BIT(HPAGE_PMD_ORDER));
> > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> >       if (result != SCAN_SUCCEED)
> >               goto out_up_write;
> >       /* check if the pmd is still valid */
> > @@ -1210,11 +1229,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >
> >       anon_vma_lock_write(vma->anon_vma);
> >
> > -     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> > -                             address + HPAGE_PMD_SIZE);
> > +     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, _address,
> > +                             _address + (PAGE_SIZE << order));
>
> This _address is horrible. That really does have to change.
>
> >       mmu_notifier_invalidate_range_start(&range);
> >
> >       pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> > +
>
> Odd whitespace...
>
> >       /*
> >        * This removes any huge TLB entry from the CPU so we won't allow
> >        * huge and small TLB entries for the same virtual address to
> > @@ -1228,19 +1248,16 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >       mmu_notifier_invalidate_range_end(&range);
> >       tlb_remove_table_sync_one();
> >
> > -     pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > +     pte = pte_offset_map_lock(mm, &_pmd, _address, &pte_ptl);
>
> I see we already have a 'convention' of _ prefix on the pmd param, but two
> wrongs don't make a right...
>
> >       if (pte) {
> > -             result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > -                                                   &compound_pagelist,
> > -                                                   HPAGE_PMD_ORDER);
> > +             result = __collapse_huge_page_isolate(vma, _address, pte, cc,
> > +                                                   &compound_pagelist, order);
> >               spin_unlock(pte_ptl);
> >       } else {
> >               result = SCAN_PMD_NULL;
> >       }
> >
> >       if (unlikely(result != SCAN_SUCCEED)) {
> > -             if (pte)
> > -                     pte_unmap(pte);
>
> Why are we removing this?
>
> >               spin_lock(pmd_ptl);
> >               BUG_ON(!pmd_none(*pmd));
> >               /*
> > @@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >       }
> >
> >       /*
> > -      * All pages are isolated and locked so anon_vma rmap
> > -      * can't run anymore.
> > +      * For PMD collapse all pages are isolated and locked so anon_vma
> > +      * rmap can't run anymore
> >        */
> > -     anon_vma_unlock_write(vma->anon_vma);
> > +     if (order == HPAGE_PMD_ORDER)
> > +             anon_vma_unlock_write(vma->anon_vma);
>
> Hmm this is introducing a horrible new way for things to go wrong. And
> there's now a whole host of terrible error paths that can go wrong very
> easily around rmap locks and yeah, no way we cannot do it this way.
>
> rmap locks are VERY sensitive and the ordering of the locking matters a
> great deal (see top of mm/rmap.c). So we have to be SO careful here.
>
> I suggest you simply have a boolean 'anon_vma_locked' or something like
> this, and get rid of these horrible additional code paths and the second
> order == HPAGE_PMD_ORDER check.
>
> We'll track whether or not the lock is held and thereby needs releasing
> that way instead.
>
> Also, and very importantly - are you 100% sure you can't possibly have a
> deadlock or issue beyond this point if you don't release the rmap lock?
I double checked, this was added as a fix to an issue Hugh reported.
The gap between these callers is rather small, and I see no way that
it could skip the lock/unlock cycle.
>
> This is veeeery important, as there can be implicit assumptions around
> whether or not one can acquire these locks and you basically have to audit
> ALL code over which this lock is held.
>
> I'm speaking from hard experience here having bumped into this in various
> attempts at work relating to this stuff...
>
> >
> >       result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > -                                        vma, address, pte_ptl,
> > -                                        &compound_pagelist, HPAGE_PMD_ORDER);
> > -     pte_unmap(pte);
> > +                                        vma, _address, pte_ptl,
> > +                                        &compound_pagelist, order);
> >       if (unlikely(result != SCAN_SUCCEED))
> > -             goto out_up_write;
> > +             goto out_unlock_anon_vma;
>
> See above...
>
> >
> >       /*
> >        * The smp_wmb() inside __folio_mark_uptodate() ensures the
> > @@ -1273,33 +1290,115 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >        * write.
> >        */
> >       __folio_mark_uptodate(folio);
> > -     pgtable = pmd_pgtable(_pmd);
> > -
> > -     _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > -     _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > -
> > -     spin_lock(pmd_ptl);
> > -     BUG_ON(!pmd_none(*pmd));
> > -     folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> > -     folio_add_lru_vma(folio, vma);
> > -     pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > -     set_pmd_at(mm, address, pmd, _pmd);
> > -     update_mmu_cache_pmd(vma, address, pmd);
> > -     deferred_split_folio(folio, false);
> > -     spin_unlock(pmd_ptl);
> > +     if (order == HPAGE_PMD_ORDER) {
> > +             pgtable = pmd_pgtable(_pmd);
> > +             _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > +             _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > +
> > +             spin_lock(pmd_ptl);
> > +             BUG_ON(!pmd_none(*pmd));
>
> I know you're refactoring this, but be good to change this to a
> WARN_ON_ONCE(), BUG_ON() is verboten unless it's absolutely definitely
> going to be a kernel nuclear event, so worth changing things up as we go.
Yeah i keep seeing those warning in checkpatch, so Ill go ahead and edit it.
>
> > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > +             folio_add_lru_vma(folio, vma);
> > +             pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > +             set_pmd_at(mm, address, pmd, _pmd);
> > +             update_mmu_cache_pmd(vma, address, pmd);
> > +             deferred_split_folio(folio, false);
> > +             spin_unlock(pmd_ptl);
> > +     } else { /* mTHP collapse */
> > +             mthp_pte = mk_pte(&folio->page, vma->vm_page_prot);
>
> I guess it's a rule that each THP or mTHP range spanned must span one and
> only one folio.
>
> Not sure &folio->page has a future though.
>
> Maybe better to use folio_page(folio, 0)?
Ok sounds good I'll use that.
>
> > +             mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
> > +
> > +             spin_lock(pmd_ptl);
> > +             BUG_ON(!pmd_none(*pmd));
>
> having said the above, this is trictly introducing a new BUG_ON() which is
> a no-no, please make it a WARN_ON_ONCE().
>
> > +             folio_ref_add(folio, (1 << order) - 1);
>
> Again no 1 << x please.
>
> Do we do something similar somewhere else for mthp ref counting? Can we
> share code somehow?
Yeah but IIRC its only like 2 or 3 places that do something like
this... most callers to folio_add_* do things in slightly different
manners. Maybe something to look into for the future, but I think it
will be difficult to generalize it.
>
> > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > +             folio_add_lru_vma(folio, vma);
> > +             set_ptes(vma->vm_mm, _address, pte, mthp_pte, (1 << order));
>
> Please avoid 1 << order, and I think at this point since you reference it a
> bunch of times, just store a local var like nr_pages or sth?
yeah not a bad idea!
>
> > +             update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
> > +
> > +             smp_wmb(); /* make pte visible before pmd */
>
> Can you give some detail as to why this will work here and why it is
> necessary?
Other parts of the kernel do it when setting ptes before updating the
PMD. I'm not sure if it's necessary, but better safe than sorry.
>
> > +             pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>
> If we're updating PTE entriess why do we need to assign the PMD entry?
We removed the PMD entry for GUP_fast reasons, then we reinstall the
PMD entry after the mTHP is in place. Same as for PMD collapse.
>
> > +             spin_unlock(pmd_ptl);
> > +     }
>
> This deeply, badly needs to be refactored into something that both shares
> code and separates out these two operations.
>
> This function is disgustingly long as it is, and that's not your fault, but
> let's try to make things better as we go.
>
> >
> >       folio = NULL;
> >
> >       result = SCAN_SUCCEED;
> > +out_unlock_anon_vma:
> > +     if (order != HPAGE_PMD_ORDER)
> > +             anon_vma_unlock_write(vma->anon_vma);
>
> Obviously again as above, we need to simplify this and get rid of this
> whole bit.
>
> >  out_up_write:
> > +     if (pte)
> > +             pte_unmap(pte);
>
> OK I guess you moved this from above down here? Is this a valid place to do this?
Yes if not we were potentially unmapping a pte early.
>
> >       mmap_write_unlock(mm);
> >  out_nolock:
> > +     *mmap_locked = false;
>
> This is kind of horrible, we now have pretty mad logic around who sets
> mmap_locked and where.
>
> Can we just do this at the call sites so we avoid that?
>
> I mean anything we do with this is hideous, but that'd be less confusing It
> hink.
>
> >       if (folio)
> >               folio_put(folio);
> >       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> >       return result;
> >  }
> >
> > +/* Recursive function to consume the bitmap */
>
> Err... please don't? Kernel stack is a seriously finite resource, we do not
> want recursion at all.
>
> But I'm not actually seeing any recursion here? Am I missing something?
>
> > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > +                     int referenced, int unmapped, struct collapse_control *cc,
> > +                     bool *mmap_locked, unsigned long enabled_orders)
>
> This is a complicated and confusing function, it requires a comment
> describing how it works.
Ok will do!
>
> > +{
> > +     u8 order, next_order;
> > +     u16 offset, mid_offset;
> > +     int num_chunks;
> > +     int bits_set, threshold_bits;
> > +     int top = -1;
>
> Err why do we start at -1 then immediately increment it?
You are correct, it was probably a leftover bit from my development
phase. Seems I can just set it to 0 to begin with.
>
> > +     int collapsed = 0;
> > +     int ret;
> > +     struct scan_bit_state state;
> > +     bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
>
> Extraneous outer parens.
ack
>
> > +
> > +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +             { HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };
>
> This is the same as
>
>         cc->mthp_bitmap_stack[0] = ...;
>         top = 1;
>
> No?
no it would be bitmap_stack[0] = ...
then top goes to -1 (at state =... ), and if we add more items
(next_order) to the stack it would go top = 1 (adds one for each half
of the split)
>
>
> This is really horrible. Can we just have a helper function for this
> please?
Seems kinda excessive for 4 lines and one caller.
>
> Like:
>
>         static int mthp_push_stack(struct collapse_control *cc,
>                 int index, u8 order, u16 offset)
>         {
>                 struct scan_bit_state *state = &cc->mthp_bitmap_stack[index];
>
>                 VM_WARN_ON(index >= MAX_MTHP_BITMAP_SIZE);
>
>                 state->order = order;
>                 state->offset = offset;
>
>                 return index + 1;
>         }

This would not work in its current state because its ++index in the
current implementation. I would need to refactor, but the general idea
still stands
>
> And can invoke via:
>
>         top = mthp_push_stack(cc, top, order, offset);
>
> Or pass index as a pointer possibly also.
>
> > +
> > +     while (top >= 0) {
> > +             state = cc->mthp_bitmap_stack[top--];
>
> OK so this is the recursive bit...
>
> Oh man this function so needs a comment describing what it does, seriously.
>
> I think honestly for sake of my own sanity I'm going to hold off reviewing
> the rest of this until there's something describing the algorithm, in
> detail here, above the function.
It's basically binary recursion with a stack structure, that checks
regions of the bitmap in descending order (ie order 9, order 8, ...)
if we go to the next order we add two items to the stack (left and
right half). I will add a comment describing it at the top of the
function.
>
> > +             order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> > +             offset = state.offset;
> > +             num_chunks = 1 << (state.order);
> > +             /* Skip mTHP orders that are not enabled */
> > +             if (!test_bit(order, &enabled_orders))
> > +                     goto next_order;
> > +
> > +             /* copy the relavant section to a new bitmap */
> > +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> > +                               MTHP_BITMAP_SIZE);
> > +
> > +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> > +             threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
> > +                             >> (HPAGE_PMD_ORDER - state.order);
> > +
> > +             /* Check if the region is "almost full" based on the threshold */
> > +             if (bits_set > threshold_bits || is_pmd_only
> > +                     || test_bit(order, &huge_anon_orders_always)) {
> > +                     ret = collapse_huge_page(mm, address, referenced, unmapped,
> > +                                              cc, mmap_locked, order,
> > +                                              offset * KHUGEPAGED_MIN_MTHP_NR);
> > +                     if (ret == SCAN_SUCCEED) {
> > +                             collapsed += (1 << order);
> > +                             continue;
> > +                     }
> > +             }
> > +
> > +next_order:
> > +             if (state.order > 0) {
> > +                     next_order = state.order - 1;
> > +                     mid_offset = offset + (num_chunks / 2);
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, mid_offset };
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, offset };
> > +             }
> > +     }
> > +     return collapsed;
> > +}
> > +
> >  static int collapse_scan_pmd(struct mm_struct *mm,
> >                            struct vm_area_struct *vma,
> >                            unsigned long address, bool *mmap_locked,
> > @@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> >  {
> >       pmd_t *pmd;
> >       pte_t *pte, *_pte;
> > +     int i;
> >       int result = SCAN_FAIL, referenced = 0;
> >       int none_or_zero = 0, shared = 0;
> >       struct page *page = NULL;
> >       struct folio *folio = NULL;
> >       unsigned long _address;
> > +     unsigned long enabled_orders;
> >       spinlock_t *ptl;
> >       int node = NUMA_NO_NODE, unmapped = 0;
> > +     bool is_pmd_only;
> >       bool writable = false;
> > -
> > +     int chunk_none_count = 0;
> > +     int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
> > +     unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >
> >       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> >       if (result != SCAN_SUCCEED)
> >               goto out;
> >
> > +     bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> > +     bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
>
> Having this 'temp' thing on the stack for everyone is just horrid.
As I mention above this serves a very good purpose, and is also
expanded in another series by Baolin to serve another similar purpose
too.
>
> >       memset(cc->node_load, 0, sizeof(cc->node_load));
> >       nodes_clear(cc->alloc_nmask);
> > +
> > +     if (cc->is_khugepaged)
> > +             enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> > +                     tva_flags, THP_ORDERS_ALL_ANON);
> > +     else
> > +             enabled_orders = BIT(HPAGE_PMD_ORDER);
> > +
> > +     is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
>
> This is horrid, can we have a function broken out to do this please?
>
> In general if you keep open coding stuff, just write a static function for
> it, the compiler is smart enough to inline.
ok, we do this is a few places so perhaps its the best approach.
>
> > +
> >       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> >       if (!pte) {
> >               result = SCAN_PMD_NULL;
> >               goto out;
> >       }
> >
> > -     for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > -          _pte++, _address += PAGE_SIZE) {
> > +     for (i = 0; i < HPAGE_PMD_NR; i++) {
> > +             /*
> > +              * we are reading in KHUGEPAGED_MIN_MTHP_NR page chunks. if
> > +              * there are pages in this chunk keep track of it in the bitmap
> > +              * for mTHP collapsing.
> > +              */
> > +             if (i % KHUGEPAGED_MIN_MTHP_NR == 0) {
> > +                     if (i > 0 && chunk_none_count <= scaled_none)
> > +                             bitmap_set(cc->mthp_bitmap,
> > +                                        (i - 1) / KHUGEPAGED_MIN_MTHP_NR, 1);
> > +                     chunk_none_count = 0;
> > +             }
>
> This whole thing is really confusing and you are not explaining the
> algoritm here at all.
>
> This requires a comment, and really this bit should be separated out please.
This used to be its own commit, but multiple people wanted it
squashed... ugh. Which should we go with?
>
> > +
> > +             _pte = pte + i;
> > +             _address = address + i * PAGE_SIZE;
> >               pte_t pteval = ptep_get(_pte);
> >               if (is_swap_pte(pteval)) {
> >                       ++unmapped;
> > @@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> >                       }
> >               }
> >               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > +                     ++chunk_none_count;
> >                       ++none_or_zero;
> >                       if (!userfaultfd_armed(vma) &&
> > -                         (!cc->is_khugepaged ||
> > -                          none_or_zero <= khugepaged_max_ptes_none)) {
> > +                         (!cc->is_khugepaged || !is_pmd_only ||
> > +                             none_or_zero <= khugepaged_max_ptes_none)) {
> >                               continue;
> >                       } else {
> >                               result = SCAN_EXCEED_NONE_PTE;
> > @@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> >                                                                    address)))
> >                       referenced++;
> >       }
> > +
> >       if (!writable) {
> >               result = SCAN_PAGE_RO;
> >       } else if (cc->is_khugepaged &&
> > @@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> >  out_unmap:
> >       pte_unmap_unlock(pte, ptl);
> >       if (result == SCAN_SUCCEED) {
> > -             result = collapse_huge_page(mm, address, referenced,
> > -                                         unmapped, cc);
> > -             /* collapse_huge_page will return with the mmap_lock released */
> > -             *mmap_locked = false;
> > +             result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
> > +                                           mmap_locked, enabled_orders);
> > +             if (result > 0)
> > +                     result = SCAN_SUCCEED;
> > +             else
> > +                     result = SCAN_FAIL;
>
> We're reusing result as both an enum value and as a storage for unmber
> colapsed PTE entries?
>
> Can we just use a new local variable? Thanks
>
> >       }
> >  out:
> >       trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
> > --
> > 2.50.1
> >
>
> I will review the bitmap/chunk stuff in more detail once the algorithm is
> commented.
ok thanks for the review.
>
> Cheers, Lorenzo
>
Re: [PATCH v10 06/13] khugepaged: add mTHP support
Posted by Lorenzo Stoakes 4 weeks ago
You've not responded to a ton of comments, I'm guesing I should assume in
those cases you're acking the comments implicitly? :) Do let me know.

On Tue, Sep 02, 2025 at 02:12:38PM -0600, Nico Pache wrote:
> On Wed, Aug 20, 2025 at 12:30 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Tue, Aug 19, 2025 at 07:41:58AM -0600, Nico Pache wrote:
> > > Introduce the ability for khugepaged to collapse to different mTHP sizes.
> > > While scanning PMD ranges for potential collapse candidates, keep track
> > > of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
> > > represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
> > > mTHPs are enabled we remove the restriction of max_ptes_none during the
> > > scan phase so we don't bailout early and miss potential mTHP candidates.
> > >
> > > A new function collapse_scan_bitmap is used to perform binary recursion on
> > > the bitmap and determine the best eligible order for the collapse.
> > > A stack struct is used instead of traditional recursion. max_ptes_none
> > > will be scaled by the attempted collapse order to determine how "full" an
> > > order must be before being considered for collapse.
> > >
> > > Once we determine what mTHP sizes fits best in that PMD range a collapse
> > > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > > order supported by anon memory.
> > >
> > > For orders configured with "always", we perform greedy collapsing
> > > to that order without considering bit density.
> > >
> > > If a mTHP collapse is attempted, but contains swapped out, or shared
> > > pages, we don't perform the collapse. This is because adding new entries
> > > can lead to new none pages, and these may lead to constant promotion into
> > > a higher order (m)THP. A similar issue can occur with "max_ptes_none >
> > > HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
> > > the number of pages, and on a future scan will satisfy the promotion
> > > condition once again.
> > >
> > > For non-PMD collapse we must leave the anon VMA write locked until after
> > > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> > > the non-PMD case this is not true, and we must keep the lock to prevent
> > > changes to the VMA from occurring.
> > >
> > > Currently madv_collapse is not supported and will only attempt PMD
> > > collapse.
> >
> > Yes I think this has to remain the case unfortunately as we override
> > sysfs-specified orders for MADV_COLLAPSE and there's no sensible way to
> > determine what order we ought to be using.
> >
> > >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> >
> > You've gone from small incremental changes to a huge one here... for the
> > sake of reviewer sanity at least, any chance of breaking this up?
> I had this as two patches (one for the bitmap and one for implementing
> it), but I was asked to squash them :/

Yeah it makes sense to combine those two, but maybe there's a better way of
splitting things out.

> >
> > > ---
> > >  include/linux/khugepaged.h |   4 +
> > >  mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
> > >  2 files changed, 188 insertions(+), 52 deletions(-)
> > >
> > > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > > index eb1946a70cff..d12cdb9ef3ba 100644
> > > --- a/include/linux/khugepaged.h
> > > +++ b/include/linux/khugepaged.h
> > > @@ -1,6 +1,10 @@
> > >  /* SPDX-License-Identifier: GPL-2.0 */
> > >  #ifndef _LINUX_KHUGEPAGED_H
> > >  #define _LINUX_KHUGEPAGED_H
> > > +#define KHUGEPAGED_MIN_MTHP_ORDER    2
> >
> > I guess this makes sense as by definition 2 pages is least it could
> > possibly be.
> Order, so 4 pages, 16kB mTHP

Right, misread that! :) sorry. I guess then in fact there is no such thing
as an order-1 mTHP? I did wonder how useful that'd be so makes more sense

> >
> > > +#define KHUGEPAGED_MIN_MTHP_NR       (1 << KHUGEPAGED_MIN_MTHP_ORDER)
> >
> > Surely KHUGEPAGED_MIN_NR_MTHP_PTES would be more meaningful?
> Sure!

Thanks!

> >
> > > +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> >
> > This is confusing - size of what?
> We need it like this due to ppc64 (and maybe others?), it used to be
> based on PMD_ORDER, but some arches fail to compile due to the PMD
> size only being known at boot time.

That really sucks. Do please put this context in a comment though (see
below for more discussion on this).

>
> This compiles to 9 on arches that have 512 ptes.
> so 1 << (9 - 2) == 128

What I'm saying is - what is this expressed in? There's no information on
that here.

So from what you say I can see it's the number of bits required in the
bitmap, because we're being smart and only bothering to mark entries at a
granularity of minimum mTHP size.

OK so the idea is this is the number of PTE entries per entry in the
bitmap.

This is KEY missing context. We really need to spell things out here, the
THP code is... confusing :) to put it politely, so we need to be especially
careful to be mega clear.

So please please PLEASE add a comment explaining all this. And clearly
state that the unit here is bits.


> >
> > If it's number of bits surely this should be ilog2(MAX_PTRS_PER_PTE) -
> > KHUGEPAGED_MIN_MTHP_ORDER?
> This would only be 7? We need a 128 bit bitmap

Again missing context is you're storing bits per minimum mTHP as per above.

> >
> > This seems to be more so 'the maximum value that could contain the bits right?
> >
> > I think this is just wrong though, see below at DECLARE_BITMAP() stuff.
> >
> > > +#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))
> >
> > Hard to know how this relates to MAX_MTHP_BITMAP_SIZE?
> >
> > I guess this is the current bitmap size indicating all that is possible,
> > but if these are all #define's what is this accomplishing?
> One for compile time one for runtime. Kind of annoying but it was the
> easiest solution given the architecture limitations.

OK, this context is fantastic + important for understanding, so let's put
it in a comment :)

> >
> > For all - please do not do (1 << xxx)! This can lead to sign-extension bugs at least
> > in theory, use _BITUL(...), it's neater too.
> ack, thanks!

OK sorry, based on David's feedback on this - just use 1UL << xxx here instead.

(An aside that isn't really relevant now but - also I was wrong to suggest
_BITUL() anyway BIT() is The Way (TM), for some reason I had it in my head
that the former was better :P)

> >
> > NIT but the whitespace is all screwed up here.
> >
> > KHUGEPAGED_MIN_MTHP_ORDER and KHUGEPAGED_MIN_MTHP_NR
> >
> > >
> > >  #include <linux/mm.h>
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 074101d03c9d..1ad7e00d3fd6 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> > >
> > >  static struct kmem_cache *mm_slot_cache __ro_after_init;
> > >
> > > +struct scan_bit_state {
> > > +     u8 order;
> > > +     u16 offset;
> > > +};
> > > +
> > >  struct collapse_control {
> > >       bool is_khugepaged;
> > >
> > > @@ -102,6 +107,18 @@ struct collapse_control {
> > >
> > >       /* nodemask for allocation fallback */
> > >       nodemask_t alloc_nmask;
> > > +
> > > +     /*
> > > +      * bitmap used to collapse mTHP sizes.
> > > +      * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
> >
> > I'm not sure what this '1bit = xxx' comment means?
> A single bit represents 1 << MIN_MTHP_ORDER (4) pages. Ill express that better

Yeah again, this is the missing context that would have helped me feel a
lot less confused here :P

> >
> > > +      */
> > > +     DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> >
> > Hmm this seems wrong.
> Should be a bitmap with 128 bits (for 4k page size). Not sure what's wrong here.
> >
> > DECLARE_BITMAP(..., val) is expessed as:
> >
> > #define DECLARE_BITMAP(name,bits) \
> >         unsigned long name[BITS_TO_LONGS(bits)]
> >
> > So the 2nd param should be number of bits.
> >
> > But MAX_MTHP_BITMAP_SIZE is:
> >
> > (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> >
> > So typically:
> >
> > (1 << (9 - 2)) = 128
> >
> > And BITS_TO_LONGS is defined as:
> >
> > __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
> >
> > So essentially this will be 128 / 8 on a 64-bit system so 16 bytes to
> > store... 7 bits?
> I think you mean 64. 8 would be BYTES_PER_TYPE
> >
> > Unless I'm missing something here?
> Hmm, unless the DECLARE_BITMAP is being used incorrectly in multiple
> places, DECLARE_BITMAP(..., # of bits) is how this is intended to be
> used.
>
> I think it's an array of unsigned longs, so each part of the name[] is
> already 64 bits. hence the divide.

Yup again this is due to the missing context above.

> >
> > > +     DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> >
> > Same comment as above obviously. But also this is kind of horrible, why are
> > we putting a copy of this entire bitmap on the stack every time we declare
> > a cc?
> The temp one is used as a scratch pad, Baolin also finds this useful
> in his file mTHP collapse useful for another use as well.

Saying 'scratch pad' is really just saying 'temporary' using different
words :) this isn't hugely helpful.

If we _need_ this we should give this a better name, and I also don't know
why we need this on the stack for all collapse_control users. Can't you
just have your 'scratch pad' local to the function that needs it or passed
as a pointer?

I'm sure it'd be useful to somebody to stick various temporary things in
vm_area_struct for instance, but it'd be hugely egregious to do so...

>
> In general khugepaged always uses the same CC, so it doesn't not
> having to constantly allocate this.

You're putting a LOT of data on the stack, and kernel stacks are very
delicate, I'm quite concerned about this.

BEFORE:

struct collapse_control {
	bool                       is_khugepaged;        /*     0     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        node_load[64];        /*     4   256 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	nodemask_t                 alloc_nmask;          /*   264     8 */

	/* size: 272, cachelines: 5, members: 3 */
	/* sum members: 265, holes: 2, sum holes: 7 */
	/* last cacheline: 16 bytes */
};

272 bytes.

AFTER:

struct collapse_control {
	bool                       is_khugepaged;        /*     0     1 */

	/* XXX 3 bytes hole, try to pack */

	u32                        node_load[64];        /*     4   256 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	nodemask_t                 alloc_nmask;          /*   264     8 */
	long unsigned int          mthp_bitmap[2];       /*   272    16 */
	long unsigned int          mthp_bitmap_temp[2];  /*   288    16 */
	struct scan_bit_state      mthp_bitmap_stack[128]; /*   304   512 */

	/* size: 816, cachelines: 13, members: 6 */
	/* sum members: 809, holes: 2, sum holes: 7 */
	/* last cacheline: 48 bytes */
};

816 bytes.

So you're roughly tripling the size of this struct and making this a thing
in all callstacks that reference collapse_control.

Kernel stack is a _super_ finite resource.

I mean I can't really review your stack implementation thing until you've
added a comment to help me understand what you are doing there (sorry but
it's just too fiddly for me to first principles it), but I wonder if we
truly need to be doing this?

I wonder if we can just put this into somewhere allocated...

> >
> > > +     struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
> > > +};
> > > +
> > > +struct collapse_control khugepaged_collapse_control = {
> > > +     .is_khugepaged = true,
> > >  };
> >
> > Why are we moving this here?
> Because if not it doesn't compile.

A reason as to why this is the case would be helpful :)

But fair enough!

> >
> > >
> > >  /**
> > > @@ -854,10 +871,6 @@ static void khugepaged_alloc_sleep(void)
> > >       remove_wait_queue(&khugepaged_wait, &wait);
> > >  }
> > >
> > > -struct collapse_control khugepaged_collapse_control = {
> > > -     .is_khugepaged = true,
> > > -};
> > > -
> > >  static bool collapse_scan_abort(int nid, struct collapse_control *cc)
> > >  {
> > >       int i;
> > > @@ -1136,17 +1149,19 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> > >
> > >  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >                             int referenced, int unmapped,
> > > -                           struct collapse_control *cc)
> > > +                           struct collapse_control *cc, bool *mmap_locked,
> > > +                           unsigned int order, unsigned long offset)
> > >  {
> > >       LIST_HEAD(compound_pagelist);
> > >       pmd_t *pmd, _pmd;
> > > -     pte_t *pte;
> > > +     pte_t *pte = NULL, mthp_pte;
> > >       pgtable_t pgtable;
> > >       struct folio *folio;
> > >       spinlock_t *pmd_ptl, *pte_ptl;
> > >       int result = SCAN_FAIL;
> > >       struct vm_area_struct *vma;
> > >       struct mmu_notifier_range range;
> > > +     unsigned long _address = address + offset * PAGE_SIZE;
> >
> > This name is really horrible. please name it sensibly.
> >
> > It feels like address ought to be consistently the base of the THP or mTHP
> > we wish to collapse, and if we need something PMD aligned for some reason
> > we should rename _that_ to e.g. pmd_address.
> >
> > Orrr it could be mthp_address...
> >
> > Perhaps we could just figure that out here and pass only the
> > address... aligning to PMD boundary shouldn't be hard/costly.
> >
> > But it may indicate we need further refactorisation so we don't need to
> > paper over cracks + pass around a PMD address to do things when that may
> > not be where the (m)THP range begins.
> Ok i'll rename them, but we still need to know the PMD address as we
> rely on it for a few key operations.
> Can we leave _address and rename address to pmd_address?

No, we absolutely cannot leave _address as '_address', that's terrible and
I'm just not going to live with that.

I know it's trivial, but it's just such a horrid naming convention.

> >
> > >
> > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > >
> > > @@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >        * The allocation can take potentially a long time if it involves
> > >        * sync compaction, and we do not need to hold the mmap_lock during
> > >        * that. We will recheck the vma after taking it again in write mode.
> > > +      * If collapsing mTHPs we may have already released the read_lock.
> > >        */
> > > -     mmap_read_unlock(mm);
> > > +     if (*mmap_locked) {
> > > +             mmap_read_unlock(mm);
> > > +             *mmap_locked = false;
> > > +     }
> > >
> > > -     result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > > +     result = alloc_charge_folio(&folio, mm, cc, order);
> > >       if (result != SCAN_SUCCEED)
> > >               goto out_nolock;
> > >
> > >       mmap_read_lock(mm);
> > > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > > -                                      BIT(HPAGE_PMD_ORDER));
> > > +     *mmap_locked = true;
> > > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> >
> > I mean this is kind of going back to previous commits, but it's really ugly
> > to pass a BIT(xxx) here, is that really necessary? Can't we just pass in
> > the order?
> Yes and no... currently we only ever pass the bit of the current order
> so we could get away with it, but to generalize it we want the ability
> to pass a bitmap of the available orders. Like in the case of future
> madvise_collapse support, we would need to pass a bitmap of possible
> orders.

Can we just change that when we need it?

'Future proofing' for an possible future implementation detail is just not
how we should implement things.

Right now we don't do that, so let's just pass the order. If in future we
want to change it we can.

> >
> > It's also inconsistent with other calls like
> > e.g. __collapse_huge_page_swapin() below which passes the order.
> >
> > Same goes obv. for all such invocations.
> >
> > >       if (result != SCAN_SUCCEED) {
> > >               mmap_read_unlock(mm);
> > >               goto out_nolock;
> > > @@ -1182,13 +1201,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >                * released when it fails. So we jump out_nolock directly in
> > >                * that case.  Continuing to collapse causes inconsistency.
> > >                */
> > > -             result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > > -                                                  referenced, HPAGE_PMD_ORDER);
> > > +             result = __collapse_huge_page_swapin(mm, vma, _address, pmd,
> > > +                                                  referenced, order);
> > >               if (result != SCAN_SUCCEED)
> > >                       goto out_nolock;
> > >       }
> > >
> > >       mmap_read_unlock(mm);
> > > +     *mmap_locked = false;
> > >       /*
> > >        * Prevent all access to pagetables with the exception of
> > >        * gup_fast later handled by the ptep_clear_flush and the VM
> > > @@ -1198,8 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >        * mmap_lock.
> > >        */
> > >       mmap_write_lock(mm);
> > > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > > -                                      BIT(HPAGE_PMD_ORDER));
> > > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> > >       if (result != SCAN_SUCCEED)
> > >               goto out_up_write;
> > >       /* check if the pmd is still valid */
> > > @@ -1210,11 +1229,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >
> > >       anon_vma_lock_write(vma->anon_vma);
> > >
> > > -     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> > > -                             address + HPAGE_PMD_SIZE);
> > > +     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, _address,
> > > +                             _address + (PAGE_SIZE << order));
> >
> > This _address is horrible. That really does have to change.
> >
> > >       mmu_notifier_invalidate_range_start(&range);
> > >
> > >       pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> > > +
> >
> > Odd whitespace...
> >
> > >       /*
> > >        * This removes any huge TLB entry from the CPU so we won't allow
> > >        * huge and small TLB entries for the same virtual address to
> > > @@ -1228,19 +1248,16 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >       mmu_notifier_invalidate_range_end(&range);
> > >       tlb_remove_table_sync_one();
> > >
> > > -     pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > > +     pte = pte_offset_map_lock(mm, &_pmd, _address, &pte_ptl);
> >
> > I see we already have a 'convention' of _ prefix on the pmd param, but two
> > wrongs don't make a right...
> >
> > >       if (pte) {
> > > -             result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > > -                                                   &compound_pagelist,
> > > -                                                   HPAGE_PMD_ORDER);
> > > +             result = __collapse_huge_page_isolate(vma, _address, pte, cc,
> > > +                                                   &compound_pagelist, order);
> > >               spin_unlock(pte_ptl);
> > >       } else {
> > >               result = SCAN_PMD_NULL;
> > >       }
> > >
> > >       if (unlikely(result != SCAN_SUCCEED)) {
> > > -             if (pte)
> > > -                     pte_unmap(pte);
> >
> > Why are we removing this?

You're missing some of the comments, I'm guesing for most of the smaller
stuff you're just implicitly acking them but this one was a question :)

> >
> > >               spin_lock(pmd_ptl);
> > >               BUG_ON(!pmd_none(*pmd));
> > >               /*
> > > @@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >       }
> > >
> > >       /*
> > > -      * All pages are isolated and locked so anon_vma rmap
> > > -      * can't run anymore.
> > > +      * For PMD collapse all pages are isolated and locked so anon_vma
> > > +      * rmap can't run anymore
> > >        */
> > > -     anon_vma_unlock_write(vma->anon_vma);
> > > +     if (order == HPAGE_PMD_ORDER)
> > > +             anon_vma_unlock_write(vma->anon_vma);
> >
> > Hmm this is introducing a horrible new way for things to go wrong. And
> > there's now a whole host of terrible error paths that can go wrong very
> > easily around rmap locks and yeah, no way we cannot do it this way.
> >
> > rmap locks are VERY sensitive and the ordering of the locking matters a
> > great deal (see top of mm/rmap.c). So we have to be SO careful here.
> >
> > I suggest you simply have a boolean 'anon_vma_locked' or something like
> > this, and get rid of these horrible additional code paths and the second
> > order == HPAGE_PMD_ORDER check.
> >
> > We'll track whether or not the lock is held and thereby needs releasing
> > that way instead.

You've not responded to this suggestion re: refactoring here.

I'm really not a fan of us arbitrarily messing with rmap locks like this,
and we should very carefully keep track of whether we have/have not
released them.

Again rmap locking is a dangerous area, I've got personal experience of
this (see top of mm/rmap.c for an indication of complexity here as well as
https://kernel.org/doc/html/latest/mm/process_addrs.html).

> >
> > Also, and very importantly - are you 100% sure you can't possibly have a
> > deadlock or issue beyond this point if you don't release the rmap lock?
> I double checked, this was added as a fix to an issue Hugh reported.

> The gap between these callers is rather small, and I see no way that
> it could skip the lock/unlock cycle.

We're going to need more than this to be confident, you need to clearly
justify why we won't encounter issues this way.

> >
> > This is veeeery important, as there can be implicit assumptions around
> > whether or not one can acquire these locks and you basically have to audit
> > ALL code over which this lock is held.
> >
> > I'm speaking from hard experience here having bumped into this in various
> > attempts at work relating to this stuff...
> >
> > >
> > >       result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > > -                                        vma, address, pte_ptl,
> > > -                                        &compound_pagelist, HPAGE_PMD_ORDER);
> > > -     pte_unmap(pte);
> > > +                                        vma, _address, pte_ptl,
> > > +                                        &compound_pagelist, order);
> > >       if (unlikely(result != SCAN_SUCCEED))
> > > -             goto out_up_write;
> > > +             goto out_unlock_anon_vma;
> >
> > See above...
> >
> > >
> > >       /*
> > >        * The smp_wmb() inside __folio_mark_uptodate() ensures the
> > > @@ -1273,33 +1290,115 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > >        * write.
> > >        */
> > >       __folio_mark_uptodate(folio);
> > > -     pgtable = pmd_pgtable(_pmd);
> > > -
> > > -     _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > > -     _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > > -
> > > -     spin_lock(pmd_ptl);
> > > -     BUG_ON(!pmd_none(*pmd));
> > > -     folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> > > -     folio_add_lru_vma(folio, vma);
> > > -     pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > > -     set_pmd_at(mm, address, pmd, _pmd);
> > > -     update_mmu_cache_pmd(vma, address, pmd);
> > > -     deferred_split_folio(folio, false);
> > > -     spin_unlock(pmd_ptl);
> > > +     if (order == HPAGE_PMD_ORDER) {
> > > +             pgtable = pmd_pgtable(_pmd);
> > > +             _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > > +             _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > > +
> > > +             spin_lock(pmd_ptl);
> > > +             BUG_ON(!pmd_none(*pmd));
> >
> > I know you're refactoring this, but be good to change this to a
> > WARN_ON_ONCE(), BUG_ON() is verboten unless it's absolutely definitely
> > going to be a kernel nuclear event, so worth changing things up as we go.

> Yeah i keep seeing those warning in checkpatch, so Ill go ahead and edit it.

Thanks!

> >
> > > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > > +             folio_add_lru_vma(folio, vma);
> > > +             pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > > +             set_pmd_at(mm, address, pmd, _pmd);
> > > +             update_mmu_cache_pmd(vma, address, pmd);
> > > +             deferred_split_folio(folio, false);
> > > +             spin_unlock(pmd_ptl);
> > > +     } else { /* mTHP collapse */
> > > +             mthp_pte = mk_pte(&folio->page, vma->vm_page_prot);
> >
> > I guess it's a rule that each THP or mTHP range spanned must span one and
> > only one folio.
> >
> > Not sure &folio->page has a future though.
> >
> > Maybe better to use folio_page(folio, 0)?
> Ok sounds good I'll use that.

Thanks!

> >
> > > +             mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
> > > +
> > > +             spin_lock(pmd_ptl);
> > > +             BUG_ON(!pmd_none(*pmd));
> >
> > having said the above, this is trictly introducing a new BUG_ON() which is
> > a no-no, please make it a WARN_ON_ONCE().

This one is more important, please do do this.

> >
> > > +             folio_ref_add(folio, (1 << order) - 1);
> >
> > Again no 1 << x please.

(as per David feedback elsewhere, 1UL << instead)

> >
> > Do we do something similar somewhere else for mthp ref counting? Can we
> > share code somehow?

> Yeah but IIRC its only like 2 or 3 places that do something like
> this... most callers to folio_add_* do things in slightly different
> manners. Maybe something to look into for the future, but I think it
> will be difficult to generalize it.

OK.

> >
> > > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > > +             folio_add_lru_vma(folio, vma);
> > > +             set_ptes(vma->vm_mm, _address, pte, mthp_pte, (1 << order));
> >
> > Please avoid 1 << order, and I think at this point since you reference it a
> > bunch of times, just store a local var like nr_pages or sth?

> yeah not a bad idea!

Thanks!

> >
> > > +             update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
> > > +
> > > +             smp_wmb(); /* make pte visible before pmd */
> >
> > Can you give some detail as to why this will work here and why it is
> > necessary?

> Other parts of the kernel do it when setting ptes before updating the
> PMD. I'm not sure if it's necessary, but better safe than sorry.

Unfortunately this is a _totally_ unacceptable justification. We can't put
in barriers based on 'better safe than sorry'. You need to analysis this
and determine whether or not it's necessary.

So the comment in pmd_install() seems to give an indication:

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}

> >
> > > +             pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> >
> > If we're updating PTE entriess why do we need to assign the PMD entry?

> We removed the PMD entry for GUP_fast reasons, then we reinstall the
> PMD entry after the mTHP is in place. Same as for PMD collapse.

OK a comment to this effect would be useful.

> >
> > > +             spin_unlock(pmd_ptl);
> > > +     }
> >
> > This deeply, badly needs to be refactored into something that both shares
> > code and separates out these two operations.
> >
> > This function is disgustingly long as it is, and that's not your fault, but
> > let's try to make things better as we go.
> >
> > >
> > >       folio = NULL;
> > >
> > >       result = SCAN_SUCCEED;
> > > +out_unlock_anon_vma:
> > > +     if (order != HPAGE_PMD_ORDER)
> > > +             anon_vma_unlock_write(vma->anon_vma);
> >
> > Obviously again as above, we need to simplify this and get rid of this
> > whole bit.
> >
> > >  out_up_write:
> > > +     if (pte)
> > > +             pte_unmap(pte);
> >
> > OK I guess you moved this from above down here? Is this a valid place to do this?
> Yes if not we were potentially unmapping a pte early.
> >
> > >       mmap_write_unlock(mm);
> > >  out_nolock:
> > > +     *mmap_locked = false;
> >
> > This is kind of horrible, we now have pretty mad logic around who sets
> > mmap_locked and where.
> >
> > Can we just do this at the call sites so we avoid that?
> >
> > I mean anything we do with this is hideous, but that'd be less confusing It
> > hink.
> >
> > >       if (folio)
> > >               folio_put(folio);
> > >       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > >       return result;
> > >  }
> > >
> > > +/* Recursive function to consume the bitmap */
> >
> > Err... please don't? Kernel stack is a seriously finite resource, we do not
> > want recursion at all.
> >
> > But I'm not actually seeing any recursion here? Am I missing something?
> >

Yup this was before I realised it was an iterative implementation :)

Though we are putting load on the stack anyway...

> > > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > > +                     int referenced, int unmapped, struct collapse_control *cc,
> > > +                     bool *mmap_locked, unsigned long enabled_orders)
> >
> > This is a complicated and confusing function, it requires a comment
> > describing how it works.
> Ok will do!

This is _VERY_ key - I need this I think to be able to sanely review this
code.

> >
> > > +{
> > > +     u8 order, next_order;
> > > +     u16 offset, mid_offset;
> > > +     int num_chunks;
> > > +     int bits_set, threshold_bits;
> > > +     int top = -1;
> >
> > Err why do we start at -1 then immediately increment it?
> You are correct, it was probably a leftover bit from my development
> phase. Seems I can just set it to 0 to begin with.

Thanks!

> >
> > > +     int collapsed = 0;
> > > +     int ret;
> > > +     struct scan_bit_state state;
> > > +     bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
> >
> > Extraneous outer parens.
> ack

Thanks!

> >
> > > +
> > > +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > +             { HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };
> >
> > This is the same as
> >
> >         cc->mthp_bitmap_stack[0] = ...;
> >         top = 1;
> >
> > No?


> no it would be bitmap_stack[0] = ...
> then top goes to -1 (at state =... ), and if we add more items
> (next_order) to the stack it would go top = 1 (adds one for each half
> of the split)

OK, again going to need that comment to really grok all this... :)

> >
> >
> > This is really horrible. Can we just have a helper function for this
> > please?

> Seems kinda excessive for 4 lines and one caller.

It's code that's very horrible to understand using a very unusual way of
initialising a struct within the kernel.

And having code be understandable when the compiler can inline for us does
indeed justify this even in the instance of 4 lines and one caller :)

> >
> > Like:
> >
> >         static int mthp_push_stack(struct collapse_control *cc,
> >                 int index, u8 order, u16 offset)
> >         {
> >                 struct scan_bit_state *state = &cc->mthp_bitmap_stack[index];
> >
> >                 VM_WARN_ON(index >= MAX_MTHP_BITMAP_SIZE);
> >
> >                 state->order = order;
> >                 state->offset = offset;
> >
> >                 return index + 1;
> >         }
>
> This would not work in its current state because its ++index in the
> current implementation. I would need to refactor, but the general idea
> still stands

OK thanks.

Something _like_ this would make things a great deal clearer.

> >
> > And can invoke via:
> >
> >         top = mthp_push_stack(cc, top, order, offset);
> >
> > Or pass index as a pointer possibly also.
> >
> > > +
> > > +     while (top >= 0) {
> > > +             state = cc->mthp_bitmap_stack[top--];
> >
> > OK so this is the recursive bit...
> >
> > Oh man this function so needs a comment describing what it does, seriously.
> >
> > I think honestly for sake of my own sanity I'm going to hold off reviewing
> > the rest of this until there's something describing the algorithm, in
> > detail here, above the function.

> It's basically binary recursion with a stack structure, that checks
> regions of the bitmap in descending order (ie order 9, order 8, ...)
> if we go to the next order we add two items to the stack (left and
> right half). I will add a comment describing it at the top of the
> function.

Right, to reiterate - this needs a _big_ comment.

I'm sorry to split the review of this patch in two on this, because once
you do that I'm going to inevitably do the deferred review on the
algorithm, but I just feel this is the only sensible way I can determine
whether what you intend to do here makes sense and is correctly
implemented.

Kinda need that 'what you intend to do' bit. So it should be an expansive
and detailed comment explicitly explaining the algorithm and why you're
doing it.

Thanks!

> >
> > > +             order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> > > +             offset = state.offset;
> > > +             num_chunks = 1 << (state.order);
> > > +             /* Skip mTHP orders that are not enabled */
> > > +             if (!test_bit(order, &enabled_orders))
> > > +                     goto next_order;
> > > +
> > > +             /* copy the relavant section to a new bitmap */
> > > +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> > > +                               MTHP_BITMAP_SIZE);
> > > +
> > > +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> > > +             threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
> > > +                             >> (HPAGE_PMD_ORDER - state.order);
> > > +
> > > +             /* Check if the region is "almost full" based on the threshold */
> > > +             if (bits_set > threshold_bits || is_pmd_only
> > > +                     || test_bit(order, &huge_anon_orders_always)) {
> > > +                     ret = collapse_huge_page(mm, address, referenced, unmapped,
> > > +                                              cc, mmap_locked, order,
> > > +                                              offset * KHUGEPAGED_MIN_MTHP_NR);
> > > +                     if (ret == SCAN_SUCCEED) {
> > > +                             collapsed += (1 << order);
> > > +                             continue;
> > > +                     }
> > > +             }
> > > +
> > > +next_order:
> > > +             if (state.order > 0) {
> > > +                     next_order = state.order - 1;
> > > +                     mid_offset = offset + (num_chunks / 2);
> > > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > +                             { next_order, mid_offset };
> > > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > +                             { next_order, offset };
> > > +             }
> > > +     }
> > > +     return collapsed;
> > > +}
> > > +
> > >  static int collapse_scan_pmd(struct mm_struct *mm,
> > >                            struct vm_area_struct *vma,
> > >                            unsigned long address, bool *mmap_locked,
> > > @@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > >  {
> > >       pmd_t *pmd;
> > >       pte_t *pte, *_pte;
> > > +     int i;
> > >       int result = SCAN_FAIL, referenced = 0;
> > >       int none_or_zero = 0, shared = 0;
> > >       struct page *page = NULL;
> > >       struct folio *folio = NULL;
> > >       unsigned long _address;
> > > +     unsigned long enabled_orders;
> > >       spinlock_t *ptl;
> > >       int node = NUMA_NO_NODE, unmapped = 0;
> > > +     bool is_pmd_only;
> > >       bool writable = false;
> > > -
> > > +     int chunk_none_count = 0;
> > > +     int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
> > > +     unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > >
> > >       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > >       if (result != SCAN_SUCCEED)
> > >               goto out;
> > >
> > > +     bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> > > +     bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> >
> > Having this 'temp' thing on the stack for everyone is just horrid.
> As I mention above this serves a very good purpose, and is also
> expanded in another series by Baolin to serve another similar purpose
> too.

Yeah, I'm not hugely convinced :) responded there.

I can probably give more useful feedback on this once the big comment is
added!

> >
> > >       memset(cc->node_load, 0, sizeof(cc->node_load));
> > >       nodes_clear(cc->alloc_nmask);
> > > +
> > > +     if (cc->is_khugepaged)
> > > +             enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> > > +                     tva_flags, THP_ORDERS_ALL_ANON);
> > > +     else
> > > +             enabled_orders = BIT(HPAGE_PMD_ORDER);
> > > +
> > > +     is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
> >
> > This is horrid, can we have a function broken out to do this please?
> >
> > In general if you keep open coding stuff, just write a static function for
> > it, the compiler is smart enough to inline.
> ok, we do this is a few places so perhaps its the best approach.
> >
> > > +
> > >       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> > >       if (!pte) {
> > >               result = SCAN_PMD_NULL;
> > >               goto out;
> > >       }
> > >
> > > -     for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > > -          _pte++, _address += PAGE_SIZE) {
> > > +     for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > +             /*
> > > +              * we are reading in KHUGEPAGED_MIN_MTHP_NR page chunks. if
> > > +              * there are pages in this chunk keep track of it in the bitmap
> > > +              * for mTHP collapsing.
> > > +              */
> > > +             if (i % KHUGEPAGED_MIN_MTHP_NR == 0) {
> > > +                     if (i > 0 && chunk_none_count <= scaled_none)
> > > +                             bitmap_set(cc->mthp_bitmap,
> > > +                                        (i - 1) / KHUGEPAGED_MIN_MTHP_NR, 1);
> > > +                     chunk_none_count = 0;
> > > +             }
> >
> > This whole thing is really confusing and you are not explaining the
> > algoritm here at all.
> >
> > This requires a comment, and really this bit should be separated out please.
> This used to be its own commit, but multiple people wanted it
> squashed... ugh. Which should we go with?
> >
> > > +
> > > +             _pte = pte + i;
> > > +             _address = address + i * PAGE_SIZE;
> > >               pte_t pteval = ptep_get(_pte);
> > >               if (is_swap_pte(pteval)) {
> > >                       ++unmapped;
> > > @@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > >                       }
> > >               }
> > >               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > > +                     ++chunk_none_count;
> > >                       ++none_or_zero;
> > >                       if (!userfaultfd_armed(vma) &&
> > > -                         (!cc->is_khugepaged ||
> > > -                          none_or_zero <= khugepaged_max_ptes_none)) {
> > > +                         (!cc->is_khugepaged || !is_pmd_only ||
> > > +                             none_or_zero <= khugepaged_max_ptes_none)) {
> > >                               continue;
> > >                       } else {
> > >                               result = SCAN_EXCEED_NONE_PTE;
> > > @@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > >                                                                    address)))
> > >                       referenced++;
> > >       }
> > > +
> > >       if (!writable) {
> > >               result = SCAN_PAGE_RO;
> > >       } else if (cc->is_khugepaged &&
> > > @@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > >  out_unmap:
> > >       pte_unmap_unlock(pte, ptl);
> > >       if (result == SCAN_SUCCEED) {
> > > -             result = collapse_huge_page(mm, address, referenced,
> > > -                                         unmapped, cc);
> > > -             /* collapse_huge_page will return with the mmap_lock released */
> > > -             *mmap_locked = false;
> > > +             result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
> > > +                                           mmap_locked, enabled_orders);
> > > +             if (result > 0)
> > > +                     result = SCAN_SUCCEED;
> > > +             else
> > > +                     result = SCAN_FAIL;
> >
> > We're reusing result as both an enum value and as a storage for unmber
> > colapsed PTE entries?
> >
> > Can we just use a new local variable? Thanks

Again you've skipped a ton of review comments here. You need to respond to
everything.

> >
> > >       }
> > >  out:
> > >       trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
> > > --
> > > 2.50.1
> > >
> >
> > I will review the bitmap/chunk stuff in more detail once the algorithm is
> > commented.
> ok thanks for the review.

No problem! :)
Re: [PATCH v10 06/13] khugepaged: add mTHP support
Posted by Nico Pache 3 weeks, 4 days ago
On Fri, Sep 5, 2025 at 4:15 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> You've not responded to a ton of comments, I'm guesing I should assume in
> those cases you're acking the comments implicitly? :) Do let me know.
Yes! If they are obvious nits, then implicit ack.

Sorry I wrote my response over a couple of days while making changes
based on the review and trying to figure out a solution to the "creep"
issue, and forgot to thoroughly review all the comments before sending
this out. I'll make sure to go through it again.
>
> On Tue, Sep 02, 2025 at 02:12:38PM -0600, Nico Pache wrote:
> > On Wed, Aug 20, 2025 at 12:30 PM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > On Tue, Aug 19, 2025 at 07:41:58AM -0600, Nico Pache wrote:
> > > > Introduce the ability for khugepaged to collapse to different mTHP sizes.
> > > > While scanning PMD ranges for potential collapse candidates, keep track
> > > > of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
> > > > represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
> > > > mTHPs are enabled we remove the restriction of max_ptes_none during the
> > > > scan phase so we don't bailout early and miss potential mTHP candidates.
> > > >
> > > > A new function collapse_scan_bitmap is used to perform binary recursion on
> > > > the bitmap and determine the best eligible order for the collapse.
> > > > A stack struct is used instead of traditional recursion. max_ptes_none
> > > > will be scaled by the attempted collapse order to determine how "full" an
> > > > order must be before being considered for collapse.
> > > >
> > > > Once we determine what mTHP sizes fits best in that PMD range a collapse
> > > > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > > > order supported by anon memory.
> > > >
> > > > For orders configured with "always", we perform greedy collapsing
> > > > to that order without considering bit density.
> > > >
> > > > If a mTHP collapse is attempted, but contains swapped out, or shared
> > > > pages, we don't perform the collapse. This is because adding new entries
> > > > can lead to new none pages, and these may lead to constant promotion into
> > > > a higher order (m)THP. A similar issue can occur with "max_ptes_none >
> > > > HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
> > > > the number of pages, and on a future scan will satisfy the promotion
> > > > condition once again.
> > > >
> > > > For non-PMD collapse we must leave the anon VMA write locked until after
> > > > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> > > > the non-PMD case this is not true, and we must keep the lock to prevent
> > > > changes to the VMA from occurring.
> > > >
> > > > Currently madv_collapse is not supported and will only attempt PMD
> > > > collapse.
> > >
> > > Yes I think this has to remain the case unfortunately as we override
> > > sysfs-specified orders for MADV_COLLAPSE and there's no sensible way to
> > > determine what order we ought to be using.
> > >
> > > >
> > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > >
> > > You've gone from small incremental changes to a huge one here... for the
> > > sake of reviewer sanity at least, any chance of breaking this up?
> > I had this as two patches (one for the bitmap and one for implementing
> > it), but I was asked to squash them :/
>
> Yeah it makes sense to combine those two, but maybe there's a better way of
> splitting things out.
I'll give it a shot but most of this patch belongs together in my eyes.
>
> > >
> > > > ---
> > > >  include/linux/khugepaged.h |   4 +
> > > >  mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
> > > >  2 files changed, 188 insertions(+), 52 deletions(-)
> > > >
> > > > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > > > index eb1946a70cff..d12cdb9ef3ba 100644
> > > > --- a/include/linux/khugepaged.h
> > > > +++ b/include/linux/khugepaged.h
> > > > @@ -1,6 +1,10 @@
> > > >  /* SPDX-License-Identifier: GPL-2.0 */
> > > >  #ifndef _LINUX_KHUGEPAGED_H
> > > >  #define _LINUX_KHUGEPAGED_H
> > > > +#define KHUGEPAGED_MIN_MTHP_ORDER    2
> > >
> > > I guess this makes sense as by definition 2 pages is least it could
> > > possibly be.
> > Order, so 4 pages, 16kB mTHP
>
> Right, misread that! :) sorry. I guess then in fact there is no such thing
> as an order-1 mTHP? I did wonder how useful that'd be so makes more sense
Anon memory does not support order 1 THPs, im not sure if there are
any plans do so either.
>
> > >
> > > > +#define KHUGEPAGED_MIN_MTHP_NR       (1 << KHUGEPAGED_MIN_MTHP_ORDER)
> > >
> > > Surely KHUGEPAGED_MIN_NR_MTHP_PTES would be more meaningful?
> > Sure!
>
> Thanks!
>
> > >
> > > > +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> > >
> > > This is confusing - size of what?
> > We need it like this due to ppc64 (and maybe others?), it used to be
> > based on PMD_ORDER, but some arches fail to compile due to the PMD
> > size only being known at boot time.
>
> That really sucks. Do please put this context in a comment though (see
> below for more discussion on this).
Ok, will do. although with my most recent changes we might just be
able to get rid of this all together.
>
> >
> > This compiles to 9 on arches that have 512 ptes.
> > so 1 << (9 - 2) == 128
>
> What I'm saying is - what is this expressed in? There's no information on
> that here.
In my next implementation its probably going to be gone, but what it
indicates is the bitmap size (128), where each bit equals a range of 4
pages. 128 *4 = 512.
>
> So from what you say I can see it's the number of bits required in the
> bitmap, because we're being smart and only bothering to mark entries at a
> granularity of minimum mTHP size.
>
> OK so the idea is this is the number of PTE entries per entry in the
> bitmap.
Yeah it was an optimization that I was going for, but on second
thought it's probably pretty useless and causes some inaccuracies in
the max_ptes_none scaling and collapsing.
>
> This is KEY missing context. We really need to spell things out here, the
> THP code is... confusing :) to put it politely, so we need to be especially
> careful to be mega clear.
Yeah its rather confusing, sorry, but I think my newer implementation
that gets rid of this is actually much cleaner.
>
> So please please PLEASE add a comment explaining all this. And clearly
> state that the unit here is bits.
ack, I'll be more clear with my newer implementation, and it should be
easier to understand anyways.
>
>
> > >
> > > If it's number of bits surely this should be ilog2(MAX_PTRS_PER_PTE) -
> > > KHUGEPAGED_MIN_MTHP_ORDER?
> > This would only be 7? We need a 128 bit bitmap
>
> Again missing context is you're storing bits per minimum mTHP as per above.
>
> > >
> > > This seems to be more so 'the maximum value that could contain the bits right?
> > >
> > > I think this is just wrong though, see below at DECLARE_BITMAP() stuff.
> > >
> > > > +#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))
> > >
> > > Hard to know how this relates to MAX_MTHP_BITMAP_SIZE?
> > >
> > > I guess this is the current bitmap size indicating all that is possible,
> > > but if these are all #define's what is this accomplishing?
> > One for compile time one for runtime. Kind of annoying but it was the
> > easiest solution given the architecture limitations.
>
> OK, this context is fantastic + important for understanding, so let's put
> it in a comment :)
Or just get rid of it ;)
>
> > >
> > > For all - please do not do (1 << xxx)! This can lead to sign-extension bugs at least
> > > in theory, use _BITUL(...), it's neater too.
> > ack, thanks!
>
> OK sorry, based on David's feedback on this - just use 1UL << xxx here instead.
ack
>
> (An aside that isn't really relevant now but - also I was wrong to suggest
> _BITUL() anyway BIT() is The Way (TM), for some reason I had it in my head
> that the former was better :P)
>
> > >
> > > NIT but the whitespace is all screwed up here.
> > >
> > > KHUGEPAGED_MIN_MTHP_ORDER and KHUGEPAGED_MIN_MTHP_NR
> > >
> > > >
> > > >  #include <linux/mm.h>
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 074101d03c9d..1ad7e00d3fd6 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> > > >
> > > >  static struct kmem_cache *mm_slot_cache __ro_after_init;
> > > >
> > > > +struct scan_bit_state {
> > > > +     u8 order;
> > > > +     u16 offset;
> > > > +};
> > > > +
> > > >  struct collapse_control {
> > > >       bool is_khugepaged;
> > > >
> > > > @@ -102,6 +107,18 @@ struct collapse_control {
> > > >
> > > >       /* nodemask for allocation fallback */
> > > >       nodemask_t alloc_nmask;
> > > > +
> > > > +     /*
> > > > +      * bitmap used to collapse mTHP sizes.
> > > > +      * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
> > >
> > > I'm not sure what this '1bit = xxx' comment means?
> > A single bit represents 1 << MIN_MTHP_ORDER (4) pages. Ill express that better
>
> Yeah again, this is the missing context that would have helped me feel a
> lot less confused here :P
>
> > >
> > > > +      */
> > > > +     DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> > >
> > > Hmm this seems wrong.
> > Should be a bitmap with 128 bits (for 4k page size). Not sure what's wrong here.
> > >
> > > DECLARE_BITMAP(..., val) is expessed as:
> > >
> > > #define DECLARE_BITMAP(name,bits) \
> > >         unsigned long name[BITS_TO_LONGS(bits)]
> > >
> > > So the 2nd param should be number of bits.
> > >
> > > But MAX_MTHP_BITMAP_SIZE is:
> > >
> > > (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> > >
> > > So typically:
> > >
> > > (1 << (9 - 2)) = 128
> > >
> > > And BITS_TO_LONGS is defined as:
> > >
> > > __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
> > >
> > > So essentially this will be 128 / 8 on a 64-bit system so 16 bytes to
> > > store... 7 bits?
> > I think you mean 64. 8 would be BYTES_PER_TYPE
> > >
> > > Unless I'm missing something here?
> > Hmm, unless the DECLARE_BITMAP is being used incorrectly in multiple
> > places, DECLARE_BITMAP(..., # of bits) is how this is intended to be
> > used.
> >
> > I think it's an array of unsigned longs, so each part of the name[] is
> > already 64 bits. hence the divide.
>
> Yup again this is due to the missing context above.
>
> > >
> > > > +     DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> > >
> > > Same comment as above obviously. But also this is kind of horrible, why are
> > > we putting a copy of this entire bitmap on the stack every time we declare
> > > a cc?
> > The temp one is used as a scratch pad, Baolin also finds this useful
> > in his file mTHP collapse useful for another use as well.
>
> Saying 'scratch pad' is really just saying 'temporary' using different
> words :) this isn't hugely helpful.
When doing the "recursion", there is no easy way to read the bitmap
between bits N0-N1, so we must manipulate it to extract that data;
however, we still want to preserve the original state of the bitmap.
The temp_bitmap is used to do these bit manipulations while not
destroying the original state. I will see if there's a better way to
read arbitrary ranges of the bitmap so we dont need to have two
copies, but IIRC there is not-- I might have to expand the bitmap API
to include this.
>
> If we _need_ this we should give this a better name, and I also don't know
> why we need this on the stack for all collapse_control users. Can't you
> just have your 'scratch pad' local to the function that needs it or passed
> as a pointer?
If I cant find a solution to the problem above I will look into moving
it to the local stack.
>
> I'm sure it'd be useful to somebody to stick various temporary things in
> vm_area_struct for instance, but it'd be hugely egregious to do so...
The reason I didnt see this as a huge issue, is there is only one
global CC that is reused for all khugepaged runs. It's a one time
allocation and does not scale.
>
> >
> > In general khugepaged always uses the same CC, so it doesn't not
> > having to constantly allocate this.
>
> You're putting a LOT of data on the stack, and kernel stacks are very
> delicate, I'm quite concerned about this.
>
> BEFORE:
>
> struct collapse_control {
>         bool                       is_khugepaged;        /*     0     1 */
>
>         /* XXX 3 bytes hole, try to pack */
>
>         u32                        node_load[64];        /*     4   256 */
>
>         /* XXX 4 bytes hole, try to pack */
>
>         /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
>         nodemask_t                 alloc_nmask;          /*   264     8 */
>
>         /* size: 272, cachelines: 5, members: 3 */
>         /* sum members: 265, holes: 2, sum holes: 7 */
>         /* last cacheline: 16 bytes */
> };
>
> 272 bytes.
>
> AFTER:
>
> struct collapse_control {
>         bool                       is_khugepaged;        /*     0     1 */
>
>         /* XXX 3 bytes hole, try to pack */
>
>         u32                        node_load[64];        /*     4   256 */
>
>         /* XXX 4 bytes hole, try to pack */
>
>         /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
>         nodemask_t                 alloc_nmask;          /*   264     8 */
>         long unsigned int          mthp_bitmap[2];       /*   272    16 */
>         long unsigned int          mthp_bitmap_temp[2];  /*   288    16 */
>         struct scan_bit_state      mthp_bitmap_stack[128]; /*   304   512 */
>
>         /* size: 816, cachelines: 13, members: 6 */
>         /* sum members: 809, holes: 2, sum holes: 7 */
>         /* last cacheline: 48 bytes */
> };
>
> 816 bytes.
>
> So you're roughly tripling the size of this struct and making this a thing
> in all callstacks that reference collapse_control.
>
> Kernel stack is a _super_ finite resource.
>
> I mean I can't really review your stack implementation thing until you've
> added a comment to help me understand what you are doing there (sorry but
> it's just too fiddly for me to first principles it), but I wonder if we
> truly need to be doing this?
>
> I wonder if we can just put this into somewhere allocated...
>
> > >
> > > > +     struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
> > > > +};
> > > > +
> > > > +struct collapse_control khugepaged_collapse_control = {
> > > > +     .is_khugepaged = true,
> > > >  };
> > >
> > > Why are we moving this here?
> > Because if not it doesn't compile.
>
> A reason as to why this is the case would be helpful :)
>
> But fair enough!
>
> > >
> > > >
> > > >  /**
> > > > @@ -854,10 +871,6 @@ static void khugepaged_alloc_sleep(void)
> > > >       remove_wait_queue(&khugepaged_wait, &wait);
> > > >  }
> > > >
> > > > -struct collapse_control khugepaged_collapse_control = {
> > > > -     .is_khugepaged = true,
> > > > -};
> > > > -
> > > >  static bool collapse_scan_abort(int nid, struct collapse_control *cc)
> > > >  {
> > > >       int i;
> > > > @@ -1136,17 +1149,19 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> > > >
> > > >  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >                             int referenced, int unmapped,
> > > > -                           struct collapse_control *cc)
> > > > +                           struct collapse_control *cc, bool *mmap_locked,
> > > > +                           unsigned int order, unsigned long offset)
> > > >  {
> > > >       LIST_HEAD(compound_pagelist);
> > > >       pmd_t *pmd, _pmd;
> > > > -     pte_t *pte;
> > > > +     pte_t *pte = NULL, mthp_pte;
> > > >       pgtable_t pgtable;
> > > >       struct folio *folio;
> > > >       spinlock_t *pmd_ptl, *pte_ptl;
> > > >       int result = SCAN_FAIL;
> > > >       struct vm_area_struct *vma;
> > > >       struct mmu_notifier_range range;
> > > > +     unsigned long _address = address + offset * PAGE_SIZE;
> > >
> > > This name is really horrible. please name it sensibly.
> > >
> > > It feels like address ought to be consistently the base of the THP or mTHP
> > > we wish to collapse, and if we need something PMD aligned for some reason
> > > we should rename _that_ to e.g. pmd_address.
> > >
> > > Orrr it could be mthp_address...
> > >
> > > Perhaps we could just figure that out here and pass only the
> > > address... aligning to PMD boundary shouldn't be hard/costly.
> > >
> > > But it may indicate we need further refactorisation so we don't need to
> > > paper over cracks + pass around a PMD address to do things when that may
> > > not be where the (m)THP range begins.
> > Ok i'll rename them, but we still need to know the PMD address as we
> > rely on it for a few key operations.
> > Can we leave _address and rename address to pmd_address?
>
> No, we absolutely cannot leave _address as '_address', that's terrible and
> I'm just not going to live with that.
>
> I know it's trivial, but it's just such a horrid naming convention.
Ok Ill rename it to something else!
>
> > >
> > > >
> > > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > > >
> > > > @@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >        * The allocation can take potentially a long time if it involves
> > > >        * sync compaction, and we do not need to hold the mmap_lock during
> > > >        * that. We will recheck the vma after taking it again in write mode.
> > > > +      * If collapsing mTHPs we may have already released the read_lock.
> > > >        */
> > > > -     mmap_read_unlock(mm);
> > > > +     if (*mmap_locked) {
> > > > +             mmap_read_unlock(mm);
> > > > +             *mmap_locked = false;
> > > > +     }
> > > >
> > > > -     result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > > > +     result = alloc_charge_folio(&folio, mm, cc, order);
> > > >       if (result != SCAN_SUCCEED)
> > > >               goto out_nolock;
> > > >
> > > >       mmap_read_lock(mm);
> > > > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > > > -                                      BIT(HPAGE_PMD_ORDER));
> > > > +     *mmap_locked = true;
> > > > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> > >
> > > I mean this is kind of going back to previous commits, but it's really ugly
> > > to pass a BIT(xxx) here, is that really necessary? Can't we just pass in
> > > the order?
> > Yes and no... currently we only ever pass the bit of the current order
> > so we could get away with it, but to generalize it we want the ability
> > to pass a bitmap of the available orders. Like in the case of future
> > madvise_collapse support, we would need to pass a bitmap of possible
> > orders.
>
> Can we just change that when we need it?
Sure, it was more so that I generalized it for madv_collapse and
khugepaged, but then we dropped madv_collapse support, and I never
undid the change.
>
> 'Future proofing' for an possible future implementation detail is just not
> how we should implement things.
>
> Right now we don't do that, so let's just pass the order. If in future we
> want to change it we can.
ack
>
> > >
> > > It's also inconsistent with other calls like
> > > e.g. __collapse_huge_page_swapin() below which passes the order.
> > >
> > > Same goes obv. for all such invocations.
> > >
> > > >       if (result != SCAN_SUCCEED) {
> > > >               mmap_read_unlock(mm);
> > > >               goto out_nolock;
> > > > @@ -1182,13 +1201,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >                * released when it fails. So we jump out_nolock directly in
> > > >                * that case.  Continuing to collapse causes inconsistency.
> > > >                */
> > > > -             result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > > > -                                                  referenced, HPAGE_PMD_ORDER);
> > > > +             result = __collapse_huge_page_swapin(mm, vma, _address, pmd,
> > > > +                                                  referenced, order);
> > > >               if (result != SCAN_SUCCEED)
> > > >                       goto out_nolock;
> > > >       }
> > > >
> > > >       mmap_read_unlock(mm);
> > > > +     *mmap_locked = false;
> > > >       /*
> > > >        * Prevent all access to pagetables with the exception of
> > > >        * gup_fast later handled by the ptep_clear_flush and the VM
> > > > @@ -1198,8 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >        * mmap_lock.
> > > >        */
> > > >       mmap_write_lock(mm);
> > > > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > > > -                                      BIT(HPAGE_PMD_ORDER));
> > > > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> > > >       if (result != SCAN_SUCCEED)
> > > >               goto out_up_write;
> > > >       /* check if the pmd is still valid */
> > > > @@ -1210,11 +1229,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >
> > > >       anon_vma_lock_write(vma->anon_vma);
> > > >
> > > > -     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> > > > -                             address + HPAGE_PMD_SIZE);
> > > > +     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, _address,
> > > > +                             _address + (PAGE_SIZE << order));
> > >
> > > This _address is horrible. That really does have to change.
> > >
> > > >       mmu_notifier_invalidate_range_start(&range);
> > > >
> > > >       pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> > > > +
> > >
> > > Odd whitespace...
> > >
> > > >       /*
> > > >        * This removes any huge TLB entry from the CPU so we won't allow
> > > >        * huge and small TLB entries for the same virtual address to
> > > > @@ -1228,19 +1248,16 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >       mmu_notifier_invalidate_range_end(&range);
> > > >       tlb_remove_table_sync_one();
> > > >
> > > > -     pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > > > +     pte = pte_offset_map_lock(mm, &_pmd, _address, &pte_ptl);
> > >
> > > I see we already have a 'convention' of _ prefix on the pmd param, but two
> > > wrongs don't make a right...
> > >
> > > >       if (pte) {
> > > > -             result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > > > -                                                   &compound_pagelist,
> > > > -                                                   HPAGE_PMD_ORDER);
> > > > +             result = __collapse_huge_page_isolate(vma, _address, pte, cc,
> > > > +                                                   &compound_pagelist, order);
> > > >               spin_unlock(pte_ptl);
> > > >       } else {
> > > >               result = SCAN_PMD_NULL;
> > > >       }
> > > >
> > > >       if (unlikely(result != SCAN_SUCCEED)) {
> > > > -             if (pte)
> > > > -                     pte_unmap(pte);
> > >
> > > Why are we removing this?
>
> You're missing some of the comments, I'm guesing for most of the smaller
> stuff you're just implicitly acking them but this one was a question :)
Ah, for some of your reviews you comment, then later when you see
another piece you say something like "ah ok this is why", for a lot of
these cases I might have skipped replying because it seems that you
figured it out.
For this one we moved the pte_unmap further down, because if not, we
could be prematurely unmapping a PTE, which is later referenced during
the mTHP collapse, which was not the case when we only did PMD
collapse.
>
> > >
> > > >               spin_lock(pmd_ptl);
> > > >               BUG_ON(!pmd_none(*pmd));
> > > >               /*
> > > > @@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >       }
> > > >
> > > >       /*
> > > > -      * All pages are isolated and locked so anon_vma rmap
> > > > -      * can't run anymore.
> > > > +      * For PMD collapse all pages are isolated and locked so anon_vma
> > > > +      * rmap can't run anymore
> > > >        */
> > > > -     anon_vma_unlock_write(vma->anon_vma);
> > > > +     if (order == HPAGE_PMD_ORDER)
> > > > +             anon_vma_unlock_write(vma->anon_vma);
> > >
> > > Hmm this is introducing a horrible new way for things to go wrong. And
> > > there's now a whole host of terrible error paths that can go wrong very
> > > easily around rmap locks and yeah, no way we cannot do it this way.
> > >
> > > rmap locks are VERY sensitive and the ordering of the locking matters a
> > > great deal (see top of mm/rmap.c). So we have to be SO careful here.
> > >
> > > I suggest you simply have a boolean 'anon_vma_locked' or something like
> > > this, and get rid of these horrible additional code paths and the second
> > > order == HPAGE_PMD_ORDER check.
> > >
> > > We'll track whether or not the lock is held and thereby needs releasing
> > > that way instead.
>
> You've not responded to this suggestion re: refactoring here.
I think if anything this refactoring adds complexity... we simply
postpone the unlock we normally do, to later if its mTHP collapse,
because we havent fully isolated all the pages (like we do in the PMD
case).
We arent doing anything extremely different other than hold the
already held lock for a little longer. We would still need these order
== HPAGE_PMD_ORDER checks, but just be flipping a flag that says its
locked, essentially adding more complexity for the same outcome.
>
> I'm really not a fan of us arbitrarily messing with rmap locks like this,
> and we should very carefully keep track of whether we have/have not
> released them.
Its important to note, the code was already doing this, as I stated
above, we simply hold it for longer because we haven't isolated all
the pages.
>
> Again rmap locking is a dangerous area, I've got personal experience of
> this (see top of mm/rmap.c for an indication of complexity here as well as
> https://kernel.org/doc/html/latest/mm/process_addrs.html).
>
> > >
> > > Also, and very importantly - are you 100% sure you can't possibly have a
> > > deadlock or issue beyond this point if you don't release the rmap lock?
> > I double checked, this was added as a fix to an issue Hugh reported.
>
> > The gap between these callers is rather small, and I see no way that
> > it could skip the lock/unlock cycle.
>
> We're going to need more than this to be confident, you need to clearly
> justify why we won't encounter issues this way.

// WE ARE HOLDING THE LOCK ALREADY
if (order == HPAGE_PMD_ORDER) //ONLY RELEASE IF ITS PMD
        anon_vma_unlock_write(vma->anon_vma);

    result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
                       vma, _address, pte_ptl,
                       order, &compound_pagelist);
    if (unlikely(result != SCAN_SUCCEED))
        goto out_unlock_anon_vma; //LETS RELEASE THE LOCK IF WE FAIL

if (order == HPAGE_PMD_ORDER) {
      ...
     // NO EXIT PATHS
    } else { /* mTHP collapse */
        ...
       // NO EXIT PATHS
    }

    folio = NULL;

    result = SCAN_SUCCEED;
out_unlock_anon_vma:
    if (order != HPAGE_PMD_ORDER)
        anon_vma_unlock_write(vma->anon_vma); //RELEASE IF IT WASN'T
RELEASED EARLIER

Not sure if this is the verification you are looking for, but I really
dont see how this could go wrong, or how this is any different than
what the khugepaged code is already doing in a number of places.
>
> > >
> > > This is veeeery important, as there can be implicit assumptions around
> > > whether or not one can acquire these locks and you basically have to audit
> > > ALL code over which this lock is held.
> > >
> > > I'm speaking from hard experience here having bumped into this in various
> > > attempts at work relating to this stuff...
> > >
> > > >
> > > >       result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > > > -                                        vma, address, pte_ptl,
> > > > -                                        &compound_pagelist, HPAGE_PMD_ORDER);
> > > > -     pte_unmap(pte);
> > > > +                                        vma, _address, pte_ptl,
> > > > +                                        &compound_pagelist, order);
> > > >       if (unlikely(result != SCAN_SUCCEED))
> > > > -             goto out_up_write;
> > > > +             goto out_unlock_anon_vma;
> > >
> > > See above...
> > >
> > > >
> > > >       /*
> > > >        * The smp_wmb() inside __folio_mark_uptodate() ensures the
> > > > @@ -1273,33 +1290,115 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > >        * write.
> > > >        */
> > > >       __folio_mark_uptodate(folio);
> > > > -     pgtable = pmd_pgtable(_pmd);
> > > > -
> > > > -     _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > > > -     _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > > > -
> > > > -     spin_lock(pmd_ptl);
> > > > -     BUG_ON(!pmd_none(*pmd));
> > > > -     folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> > > > -     folio_add_lru_vma(folio, vma);
> > > > -     pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > > > -     set_pmd_at(mm, address, pmd, _pmd);
> > > > -     update_mmu_cache_pmd(vma, address, pmd);
> > > > -     deferred_split_folio(folio, false);
> > > > -     spin_unlock(pmd_ptl);
> > > > +     if (order == HPAGE_PMD_ORDER) {
> > > > +             pgtable = pmd_pgtable(_pmd);
> > > > +             _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
> > > > +             _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
> > > > +
> > > > +             spin_lock(pmd_ptl);
> > > > +             BUG_ON(!pmd_none(*pmd));
> > >
> > > I know you're refactoring this, but be good to change this to a
> > > WARN_ON_ONCE(), BUG_ON() is verboten unless it's absolutely definitely
> > > going to be a kernel nuclear event, so worth changing things up as we go.
>
> > Yeah i keep seeing those warning in checkpatch, so Ill go ahead and edit it.
>
> Thanks!
>
> > >
> > > > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > > > +             folio_add_lru_vma(folio, vma);
> > > > +             pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > > > +             set_pmd_at(mm, address, pmd, _pmd);
> > > > +             update_mmu_cache_pmd(vma, address, pmd);
> > > > +             deferred_split_folio(folio, false);
> > > > +             spin_unlock(pmd_ptl);
> > > > +     } else { /* mTHP collapse */
> > > > +             mthp_pte = mk_pte(&folio->page, vma->vm_page_prot);
> > >
> > > I guess it's a rule that each THP or mTHP range spanned must span one and
> > > only one folio.
> > >
> > > Not sure &folio->page has a future though.
> > >
> > > Maybe better to use folio_page(folio, 0)?
> > Ok sounds good I'll use that.
>
> Thanks!
>
> > >
> > > > +             mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
> > > > +
> > > > +             spin_lock(pmd_ptl);
> > > > +             BUG_ON(!pmd_none(*pmd));
> > >
> > > having said the above, this is trictly introducing a new BUG_ON() which is
> > > a no-no, please make it a WARN_ON_ONCE().
>
> This one is more important, please do do this.
>
> > >
> > > > +             folio_ref_add(folio, (1 << order) - 1);
> > >
> > > Again no 1 << x please.
>
> (as per David feedback elsewhere, 1UL << instead)
ack
>
> > >
> > > Do we do something similar somewhere else for mthp ref counting? Can we
> > > share code somehow?
>
> > Yeah but IIRC its only like 2 or 3 places that do something like
> > this... most callers to folio_add_* do things in slightly different
> > manners. Maybe something to look into for the future, but I think it
> > will be difficult to generalize it.
>
> OK.
>
> > >
> > > > +             folio_add_new_anon_rmap(folio, vma, _address, RMAP_EXCLUSIVE);
> > > > +             folio_add_lru_vma(folio, vma);
> > > > +             set_ptes(vma->vm_mm, _address, pte, mthp_pte, (1 << order));
> > >
> > > Please avoid 1 << order, and I think at this point since you reference it a
> > > bunch of times, just store a local var like nr_pages or sth?
>
> > yeah not a bad idea!
>
> Thanks!
>
> > >
> > > > +             update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
> > > > +
> > > > +             smp_wmb(); /* make pte visible before pmd */
> > >
> > > Can you give some detail as to why this will work here and why it is
> > > necessary?
>
> > Other parts of the kernel do it when setting ptes before updating the
> > PMD. I'm not sure if it's necessary, but better safe than sorry.
>
> Unfortunately this is a _totally_ unacceptable justification. We can't put
> in barriers based on 'better safe than sorry'. You need to analysis this
> and determine whether or not it's necessary.
>
> So the comment in pmd_install() seems to give an indication:
>
>         if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
>                 mm_inc_nr_ptes(mm);
>                 /*
>                  * Ensure all pte setup (eg. pte page lock and page clearing) are
>                  * visible before the pte is made visible to other CPUs by being
>                  * put into page tables.
>                  *
>                  * The other side of the story is the pointer chasing in the page
>                  * table walking code (when walking the page table without locking;
>                  * ie. most of the time). Fortunately, these data accesses consist
>                  * of a chain of data-dependent loads, meaning most CPUs (alpha
>                  * being the notable exception) will already guarantee loads are
>                  * seen in-order. See the alpha page table accessors for the
>                  * smp_rmb() barriers in page table walking code.
>                  */
>                 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
>                 pmd_populate(mm, pmd, *pte);
>                 *pte = NULL;
>         }

So this seems to indicate we do need it for some rare edge
cases/arches, and that's why it exists in other callers too.
>
> > >
> > > > +             pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> > >
> > > If we're updating PTE entriess why do we need to assign the PMD entry?
>
> > We removed the PMD entry for GUP_fast reasons, then we reinstall the
> > PMD entry after the mTHP is in place. Same as for PMD collapse.
>
> OK a comment to this effect would be useful.
Ok ill add a small comment. feel free to take a look at the failure
handling around __collapse_huge_page_isolate to see another example of
this. We remove the pmd (pmdp_collapse_flush), then if the isolation
fails, we repopulate the original PMD. In the mTHP case, because we
are not installing a new PMD, we still remove the PMD (for gup-fast
reasons), modify the PTE entries within that PMD, then reinstall the
original PMD. This prevent GUP-fast from accessing these PTEs/PMD mid
change.
>
> > >
> > > > +             spin_unlock(pmd_ptl);
> > > > +     }
> > >
> > > This deeply, badly needs to be refactored into something that both shares
> > > code and separates out these two operations.
> > >
> > > This function is disgustingly long as it is, and that's not your fault, but
> > > let's try to make things better as we go.
> > >
> > > >
> > > >       folio = NULL;
> > > >
> > > >       result = SCAN_SUCCEED;
> > > > +out_unlock_anon_vma:
> > > > +     if (order != HPAGE_PMD_ORDER)
> > > > +             anon_vma_unlock_write(vma->anon_vma);
> > >
> > > Obviously again as above, we need to simplify this and get rid of this
> > > whole bit.
> > >
> > > >  out_up_write:
> > > > +     if (pte)
> > > > +             pte_unmap(pte);
> > >
> > > OK I guess you moved this from above down here? Is this a valid place to do this?
> > Yes if not we were potentially unmapping a pte early.
> > >
> > > >       mmap_write_unlock(mm);
> > > >  out_nolock:
> > > > +     *mmap_locked = false;
> > >
> > > This is kind of horrible, we now have pretty mad logic around who sets
> > > mmap_locked and where.
> > >
> > > Can we just do this at the call sites so we avoid that?
> > >
> > > I mean anything we do with this is hideous, but that'd be less confusing It
> > > hink.
> > >
> > > >       if (folio)
> > > >               folio_put(folio);
> > > >       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > > >       return result;
> > > >  }
> > > >
> > > > +/* Recursive function to consume the bitmap */
> > >
> > > Err... please don't? Kernel stack is a seriously finite resource, we do not
> > > want recursion at all.
> > >
> > > But I'm not actually seeing any recursion here? Am I missing something?
> > >
>
> Yup this was before I realised it was an iterative implementation :)
>
> Though we are putting load on the stack anyway...
Ill see if I can shrink this load!
>
> > > > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > > > +                     int referenced, int unmapped, struct collapse_control *cc,
> > > > +                     bool *mmap_locked, unsigned long enabled_orders)
> > >
> > > This is a complicated and confusing function, it requires a comment
> > > describing how it works.
> > Ok will do!
>
> This is _VERY_ key - I need this I think to be able to sanely review this
> code.
ack, im also simplifying it a bit in the next iteration.
>
> > >
> > > > +{
> > > > +     u8 order, next_order;
> > > > +     u16 offset, mid_offset;
> > > > +     int num_chunks;
> > > > +     int bits_set, threshold_bits;
> > > > +     int top = -1;
> > >
> > > Err why do we start at -1 then immediately increment it?
> > You are correct, it was probably a leftover bit from my development
> > phase. Seems I can just set it to 0 to begin with.
>
> Thanks!
>
> > >
> > > > +     int collapsed = 0;
> > > > +     int ret;
> > > > +     struct scan_bit_state state;
> > > > +     bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
> > >
> > > Extraneous outer parens.
> > ack
>
> Thanks!
>
> > >
> > > > +
> > > > +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > > +             { HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };
> > >
> > > This is the same as
> > >
> > >         cc->mthp_bitmap_stack[0] = ...;
> > >         top = 1;
> > >
> > > No?
>
>
> > no it would be bitmap_stack[0] = ...
> > then top goes to -1 (at state =... ), and if we add more items
> > (next_order) to the stack it would go top = 1 (adds one for each half
> > of the split)
>
> OK, again going to need that comment to really grok all this... :)
>
> > >
> > >
> > > This is really horrible. Can we just have a helper function for this
> > > please?
>
> > Seems kinda excessive for 4 lines and one caller.
>
> It's code that's very horrible to understand using a very unusual way of
> initialising a struct within the kernel.
ok, ill see if i can clean it up.
>
> And having code be understandable when the compiler can inline for us does
> indeed justify this even in the instance of 4 lines and one caller :)
ack
>
> > >
> > > Like:
> > >
> > >         static int mthp_push_stack(struct collapse_control *cc,
> > >                 int index, u8 order, u16 offset)
> > >         {
> > >                 struct scan_bit_state *state = &cc->mthp_bitmap_stack[index];
> > >
> > >                 VM_WARN_ON(index >= MAX_MTHP_BITMAP_SIZE);
> > >
> > >                 state->order = order;
> > >                 state->offset = offset;
> > >
> > >                 return index + 1;
> > >         }
> >
> > This would not work in its current state because its ++index in the
> > current implementation. I would need to refactor, but the general idea
> > still stands
>
> OK thanks.
>
> Something _like_ this would make things a great deal clearer.
>
> > >
> > > And can invoke via:
> > >
> > >         top = mthp_push_stack(cc, top, order, offset);
> > >
> > > Or pass index as a pointer possibly also.
> > >
> > > > +
> > > > +     while (top >= 0) {
> > > > +             state = cc->mthp_bitmap_stack[top--];
> > >
> > > OK so this is the recursive bit...
> > >
> > > Oh man this function so needs a comment describing what it does, seriously.
> > >
> > > I think honestly for sake of my own sanity I'm going to hold off reviewing
> > > the rest of this until there's something describing the algorithm, in
> > > detail here, above the function.
>
> > It's basically binary recursion with a stack structure, that checks
> > regions of the bitmap in descending order (ie order 9, order 8, ...)
> > if we go to the next order we add two items to the stack (left and
> > right half). I will add a comment describing it at the top of the
> > function.
>
> Right, to reiterate - this needs a _big_ comment.
>
> I'm sorry to split the review of this patch in two on this, because once
> you do that I'm going to inevitably do the deferred review on the
> algorithm, but I just feel this is the only sensible way I can determine
> whether what you intend to do here makes sense and is correctly
> implemented.
Its all good! This pushed me to come up with a better solution, by
both removing the compression into 128 bits, and also come up with a
solution to the creep (although its not the 0/511 that you and David
were talking about).
>
> Kinda need that 'what you intend to do' bit. So it should be an expansive
> and detailed comment explicitly explaining the algorithm and why you're
> doing it.
>
> Thanks!
>
> > >
> > > > +             order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> > > > +             offset = state.offset;
> > > > +             num_chunks = 1 << (state.order);
> > > > +             /* Skip mTHP orders that are not enabled */
> > > > +             if (!test_bit(order, &enabled_orders))
> > > > +                     goto next_order;
> > > > +
> > > > +             /* copy the relavant section to a new bitmap */
> > > > +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> > > > +                               MTHP_BITMAP_SIZE);
> > > > +
> > > > +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> > > > +             threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
> > > > +                             >> (HPAGE_PMD_ORDER - state.order);
> > > > +
> > > > +             /* Check if the region is "almost full" based on the threshold */
> > > > +             if (bits_set > threshold_bits || is_pmd_only
> > > > +                     || test_bit(order, &huge_anon_orders_always)) {
> > > > +                     ret = collapse_huge_page(mm, address, referenced, unmapped,
> > > > +                                              cc, mmap_locked, order,
> > > > +                                              offset * KHUGEPAGED_MIN_MTHP_NR);
> > > > +                     if (ret == SCAN_SUCCEED) {
> > > > +                             collapsed += (1 << order);
> > > > +                             continue;
> > > > +                     }
> > > > +             }
> > > > +
> > > > +next_order:
> > > > +             if (state.order > 0) {
> > > > +                     next_order = state.order - 1;
> > > > +                     mid_offset = offset + (num_chunks / 2);
> > > > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > > +                             { next_order, mid_offset };
> > > > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > > > +                             { next_order, offset };
> > > > +             }
> > > > +     }
> > > > +     return collapsed;
> > > > +}
> > > > +
> > > >  static int collapse_scan_pmd(struct mm_struct *mm,
> > > >                            struct vm_area_struct *vma,
> > > >                            unsigned long address, bool *mmap_locked,
> > > > @@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > >  {
> > > >       pmd_t *pmd;
> > > >       pte_t *pte, *_pte;
> > > > +     int i;
> > > >       int result = SCAN_FAIL, referenced = 0;
> > > >       int none_or_zero = 0, shared = 0;
> > > >       struct page *page = NULL;
> > > >       struct folio *folio = NULL;
> > > >       unsigned long _address;
> > > > +     unsigned long enabled_orders;
> > > >       spinlock_t *ptl;
> > > >       int node = NUMA_NO_NODE, unmapped = 0;
> > > > +     bool is_pmd_only;
> > > >       bool writable = false;
> > > > -
> > > > +     int chunk_none_count = 0;
> > > > +     int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
> > > > +     unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > > >
> > > >       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > > >       if (result != SCAN_SUCCEED)
> > > >               goto out;
> > > >
> > > > +     bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> > > > +     bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> > >
> > > Having this 'temp' thing on the stack for everyone is just horrid.
> > As I mention above this serves a very good purpose, and is also
> > expanded in another series by Baolin to serve another similar purpose
> > too.
>
> Yeah, I'm not hugely convinced :) responded there.
>
> I can probably give more useful feedback on this once the big comment is
> added!
Ok! hopefully I find a solution so I can remove the temp one.
>
> > >
> > > >       memset(cc->node_load, 0, sizeof(cc->node_load));
> > > >       nodes_clear(cc->alloc_nmask);
> > > > +
> > > > +     if (cc->is_khugepaged)
> > > > +             enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> > > > +                     tva_flags, THP_ORDERS_ALL_ANON);
> > > > +     else
> > > > +             enabled_orders = BIT(HPAGE_PMD_ORDER);
> > > > +
> > > > +     is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
> > >
> > > This is horrid, can we have a function broken out to do this please?
> > >
> > > In general if you keep open coding stuff, just write a static function for
> > > it, the compiler is smart enough to inline.
> > ok, we do this is a few places so perhaps its the best approach.
> > >
> > > > +
> > > >       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> > > >       if (!pte) {
> > > >               result = SCAN_PMD_NULL;
> > > >               goto out;
> > > >       }
> > > >
> > > > -     for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > > > -          _pte++, _address += PAGE_SIZE) {
> > > > +     for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > > +             /*
> > > > +              * we are reading in KHUGEPAGED_MIN_MTHP_NR page chunks. if
> > > > +              * there are pages in this chunk keep track of it in the bitmap
> > > > +              * for mTHP collapsing.
> > > > +              */
> > > > +             if (i % KHUGEPAGED_MIN_MTHP_NR == 0) {
> > > > +                     if (i > 0 && chunk_none_count <= scaled_none)
> > > > +                             bitmap_set(cc->mthp_bitmap,
> > > > +                                        (i - 1) / KHUGEPAGED_MIN_MTHP_NR, 1);
> > > > +                     chunk_none_count = 0;
> > > > +             }
> > >
> > > This whole thing is really confusing and you are not explaining the
> > > algoritm here at all.
> > >
> > > This requires a comment, and really this bit should be separated out please.
> > This used to be its own commit, but multiple people wanted it
> > squashed... ugh. Which should we go with?
> > >
> > > > +
> > > > +             _pte = pte + i;
> > > > +             _address = address + i * PAGE_SIZE;
> > > >               pte_t pteval = ptep_get(_pte);
> > > >               if (is_swap_pte(pteval)) {
> > > >                       ++unmapped;
> > > > @@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > >                       }
> > > >               }
> > > >               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > > > +                     ++chunk_none_count;
> > > >                       ++none_or_zero;
> > > >                       if (!userfaultfd_armed(vma) &&
> > > > -                         (!cc->is_khugepaged ||
> > > > -                          none_or_zero <= khugepaged_max_ptes_none)) {
> > > > +                         (!cc->is_khugepaged || !is_pmd_only ||
> > > > +                             none_or_zero <= khugepaged_max_ptes_none)) {
> > > >                               continue;
> > > >                       } else {
> > > >                               result = SCAN_EXCEED_NONE_PTE;
> > > > @@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > >                                                                    address)))
> > > >                       referenced++;
> > > >       }
> > > > +
> > > >       if (!writable) {
> > > >               result = SCAN_PAGE_RO;
> > > >       } else if (cc->is_khugepaged &&
> > > > @@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > >  out_unmap:
> > > >       pte_unmap_unlock(pte, ptl);
> > > >       if (result == SCAN_SUCCEED) {
> > > > -             result = collapse_huge_page(mm, address, referenced,
> > > > -                                         unmapped, cc);
> > > > -             /* collapse_huge_page will return with the mmap_lock released */
> > > > -             *mmap_locked = false;
> > > > +             result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
> > > > +                                           mmap_locked, enabled_orders);
> > > > +             if (result > 0)
> > > > +                     result = SCAN_SUCCEED;
> > > > +             else
> > > > +                     result = SCAN_FAIL;
> > >
> > > We're reusing result as both an enum value and as a storage for unmber
> > > colapsed PTE entries?
> > >
> > > Can we just use a new local variable? Thanks
>
> Again you've skipped a ton of review comments here. You need to respond to
> everything.
Hmm we could, although it would serve no purpose in my eyes...
nr_collapsed = collapse_scan_bitmap()
if(nr_collapsed > 0)
    result = SCAN_SUCCEED.

To me that is basically the same thing with extra steps. But maybe its
better for a code readiblity standpoint.
>
> > >
> > > >       }
> > > >  out:
> > > >       trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
> > > > --
> > > > 2.50.1
> > > >
> > >
> > > I will review the bitmap/chunk stuff in more detail once the algorithm is
> > > commented.
> > ok thanks for the review.
>
> No problem! :)
>
Re: [PATCH v10 06/13] khugepaged: add mTHP support
Posted by Lorenzo Stoakes 3 weeks, 1 day ago
On Mon, Sep 08, 2025 at 04:29:51PM -0600, Nico Pache wrote:
> On Fri, Sep 5, 2025 at 4:15 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > You've not responded to a ton of comments, I'm guesing I should assume in
> > those cases you're acking the comments implicitly? :) Do let me know.

> Yes! If they are obvious nits, then implicit ack.

OK that's fine, thanks!

>
> Sorry I wrote my response over a couple of days while making changes
> based on the review and trying to figure out a solution to the "creep"
> issue, and forgot to thoroughly review all the comments before sending
> this out. I'll make sure to go through it again.

Thanks!

> >
> > On Tue, Sep 02, 2025 at 02:12:38PM -0600, Nico Pache wrote:
> > > On Wed, Aug 20, 2025 at 12:30 PM Lorenzo Stoakes
> > > <lorenzo.stoakes@oracle.com> wrote:
> > > >
> > > > On Tue, Aug 19, 2025 at 07:41:58AM -0600, Nico Pache wrote:
> > > > > Introduce the ability for khugepaged to collapse to different mTHP sizes.
> > > > > While scanning PMD ranges for potential collapse candidates, keep track
> > > > > of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
> > > > > represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER ptes. If
> > > > > mTHPs are enabled we remove the restriction of max_ptes_none during the
> > > > > scan phase so we don't bailout early and miss potential mTHP candidates.
> > > > >
> > > > > A new function collapse_scan_bitmap is used to perform binary recursion on
> > > > > the bitmap and determine the best eligible order for the collapse.
> > > > > A stack struct is used instead of traditional recursion. max_ptes_none
> > > > > will be scaled by the attempted collapse order to determine how "full" an
> > > > > order must be before being considered for collapse.
> > > > >
> > > > > Once we determine what mTHP sizes fits best in that PMD range a collapse
> > > > > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > > > > order supported by anon memory.
> > > > >
> > > > > For orders configured with "always", we perform greedy collapsing
> > > > > to that order without considering bit density.
> > > > >
> > > > > If a mTHP collapse is attempted, but contains swapped out, or shared
> > > > > pages, we don't perform the collapse. This is because adding new entries
> > > > > can lead to new none pages, and these may lead to constant promotion into
> > > > > a higher order (m)THP. A similar issue can occur with "max_ptes_none >
> > > > > HPAGE_PMD_NR/2" due to the fact that a collapse will introduce at least 2x
> > > > > the number of pages, and on a future scan will satisfy the promotion
> > > > > condition once again.
> > > > >
> > > > > For non-PMD collapse we must leave the anon VMA write locked until after
> > > > > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> > > > > the non-PMD case this is not true, and we must keep the lock to prevent
> > > > > changes to the VMA from occurring.
> > > > >
> > > > > Currently madv_collapse is not supported and will only attempt PMD
> > > > > collapse.
> > > >
> > > > Yes I think this has to remain the case unfortunately as we override
> > > > sysfs-specified orders for MADV_COLLAPSE and there's no sensible way to
> > > > determine what order we ought to be using.
> > > >
> > > > >
> > > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > >
> > > > You've gone from small incremental changes to a huge one here... for the
> > > > sake of reviewer sanity at least, any chance of breaking this up?
> > > I had this as two patches (one for the bitmap and one for implementing
> > > it), but I was asked to squash them :/
> >
> > Yeah it makes sense to combine those two, but maybe there's a better way of
> > splitting things out.

> I'll give it a shot but most of this patch belongs together in my eyes.

OK, let's see what's most sensible.

> >
> > > >
> > > > > ---
> > > > >  include/linux/khugepaged.h |   4 +
> > > > >  mm/khugepaged.c            | 236 +++++++++++++++++++++++++++++--------
> > > > >  2 files changed, 188 insertions(+), 52 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > > > > index eb1946a70cff..d12cdb9ef3ba 100644
> > > > > --- a/include/linux/khugepaged.h
> > > > > +++ b/include/linux/khugepaged.h
> > > > > @@ -1,6 +1,10 @@
> > > > >  /* SPDX-License-Identifier: GPL-2.0 */
> > > > >  #ifndef _LINUX_KHUGEPAGED_H
> > > > >  #define _LINUX_KHUGEPAGED_H
> > > > > +#define KHUGEPAGED_MIN_MTHP_ORDER    2
> > > >
> > > > I guess this makes sense as by definition 2 pages is least it could
> > > > possibly be.
> > > Order, so 4 pages, 16kB mTHP
> >
> > Right, misread that! :) sorry. I guess then in fact there is no such thing
> > as an order-1 mTHP? I did wonder how useful that'd be so makes more sense
> Anon memory does not support order 1 THPs, im not sure if there are
> any plans do so either.

Yeah right, this was me being dumb!

> >
> > > >
> > > > > +#define KHUGEPAGED_MIN_MTHP_NR       (1 << KHUGEPAGED_MIN_MTHP_ORDER)
> > > >
> > > > Surely KHUGEPAGED_MIN_NR_MTHP_PTES would be more meaningful?
> > > Sure!
> >
> > Thanks!
> >
> > > >
> > > > > +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> > > >
> > > > This is confusing - size of what?
> > > We need it like this due to ppc64 (and maybe others?), it used to be
> > > based on PMD_ORDER, but some arches fail to compile due to the PMD
> > > size only being known at boot time.
> >
> > That really sucks. Do please put this context in a comment though (see
> > below for more discussion on this).
> Ok, will do. although with my most recent changes we might just be
> able to get rid of this all together.
> >
> > >
> > > This compiles to 9 on arches that have 512 ptes.
> > > so 1 << (9 - 2) == 128
> >
> > What I'm saying is - what is this expressed in? There's no information on
> > that here.

> In my next implementation its probably going to be gone, but what it
> indicates is the bitmap size (128), where each bit equals a range of 4
> pages. 128 *4 = 512.

OK thanks. Let's see what the new implementation is like.

> > > >
> > > > > +     DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> > > >
> > > > Same comment as above obviously. But also this is kind of horrible, why are
> > > > we putting a copy of this entire bitmap on the stack every time we declare
> > > > a cc?
> > > The temp one is used as a scratch pad, Baolin also finds this useful
> > > in his file mTHP collapse useful for another use as well.
> >
> > Saying 'scratch pad' is really just saying 'temporary' using different
> > words :) this isn't hugely helpful.

> When doing the "recursion", there is no easy way to read the bitmap
> between bits N0-N1, so we must manipulate it to extract that data;
> however, we still want to preserve the original state of the bitmap.
> The temp_bitmap is used to do these bit manipulations while not
> destroying the original state. I will see if there's a better way to
> read arbitrary ranges of the bitmap so we dont need to have two
> copies, but IIRC there is not-- I might have to expand the bitmap API
> to include this.

It at the very least needs a better name.

> >
> > If we _need_ this we should give this a better name, and I also don't know
> > why we need this on the stack for all collapse_control users. Can't you
> > just have your 'scratch pad' local to the function that needs it or passed
> > as a pointer?
> If I cant find a solution to the problem above I will look into moving
> it to the local stack.

Yeah, maybe not necessary after I realised that in fact we don't put cc on the
stack... :)

It does definitely need a better name, however...

> >
> > I'm sure it'd be useful to somebody to stick various temporary things in
> > vm_area_struct for instance, but it'd be hugely egregious to do so...

> The reason I didnt see this as a huge issue, is there is only one
> global CC that is reused for all khugepaged runs. It's a one time
> allocation and does not scale.

Yeah, see below, I realised cc isn't on the stack so. Fine :)

> >
> > >
> > > In general khugepaged always uses the same CC, so it doesn't not
> > > having to constantly allocate this.
> >
> > You're putting a LOT of data on the stack, and kernel stacks are very
> > delicate, I'm quite concerned about this.
> >
> > BEFORE:
> >
> > struct collapse_control {
> >         bool                       is_khugepaged;        /*     0     1 */
> >
> >         /* XXX 3 bytes hole, try to pack */
> >
> >         u32                        node_load[64];        /*     4   256 */
> >
> >         /* XXX 4 bytes hole, try to pack */
> >
> >         /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
> >         nodemask_t                 alloc_nmask;          /*   264     8 */
> >
> >         /* size: 272, cachelines: 5, members: 3 */
> >         /* sum members: 265, holes: 2, sum holes: 7 */
> >         /* last cacheline: 16 bytes */
> > };
> >
> > 272 bytes.
> >
> > AFTER:
> >
> > struct collapse_control {
> >         bool                       is_khugepaged;        /*     0     1 */
> >
> >         /* XXX 3 bytes hole, try to pack */
> >
> >         u32                        node_load[64];        /*     4   256 */
> >
> >         /* XXX 4 bytes hole, try to pack */
> >
> >         /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
> >         nodemask_t                 alloc_nmask;          /*   264     8 */
> >         long unsigned int          mthp_bitmap[2];       /*   272    16 */
> >         long unsigned int          mthp_bitmap_temp[2];  /*   288    16 */
> >         struct scan_bit_state      mthp_bitmap_stack[128]; /*   304   512 */
> >
> >         /* size: 816, cachelines: 13, members: 6 */
> >         /* sum members: 809, holes: 2, sum holes: 7 */
> >         /* last cacheline: 48 bytes */
> > };
> >
> > 816 bytes.
> >
> > So you're roughly tripling the size of this struct and making this a thing
> > in all callstacks that reference collapse_control.
> >
> > Kernel stack is a _super_ finite resource.
> >
> > I mean I can't really review your stack implementation thing until you've
> > added a comment to help me understand what you are doing there (sorry but
> > it's just too fiddly for me to first principles it), but I wonder if we
> > truly need to be doing this?
> >
> > I wonder if we can just put this into somewhere allocated...

OK hang on... HANG ON :) have I been dumb here?

Sorry... this isn't on the stack is it?

It just looks like, from the function signatures _that it should be_... or at
least as per how we usually implement xxx_control structs.

OK that's really good. So you can ignore this bit... Probably we're fine with
the size being larger.

I see we kmalloc() a cc in madvise_collapse() (fine) or have it static in
khugepaged_collapse_control.

So ok. Never mind :)

> > > > >
> > > > >  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > > >                             int referenced, int unmapped,
> > > > > -                           struct collapse_control *cc)
> > > > > +                           struct collapse_control *cc, bool *mmap_locked,
> > > > > +                           unsigned int order, unsigned long offset)
> > > > >  {
> > > > >       LIST_HEAD(compound_pagelist);
> > > > >       pmd_t *pmd, _pmd;
> > > > > -     pte_t *pte;
> > > > > +     pte_t *pte = NULL, mthp_pte;
> > > > >       pgtable_t pgtable;
> > > > >       struct folio *folio;
> > > > >       spinlock_t *pmd_ptl, *pte_ptl;
> > > > >       int result = SCAN_FAIL;
> > > > >       struct vm_area_struct *vma;
> > > > >       struct mmu_notifier_range range;
> > > > > +     unsigned long _address = address + offset * PAGE_SIZE;
> > > >
> > > > This name is really horrible. please name it sensibly.
> > > >
> > > > It feels like address ought to be consistently the base of the THP or mTHP
> > > > we wish to collapse, and if we need something PMD aligned for some reason
> > > > we should rename _that_ to e.g. pmd_address.
> > > >
> > > > Orrr it could be mthp_address...
> > > >
> > > > Perhaps we could just figure that out here and pass only the
> > > > address... aligning to PMD boundary shouldn't be hard/costly.
> > > >
> > > > But it may indicate we need further refactorisation so we don't need to
> > > > paper over cracks + pass around a PMD address to do things when that may
> > > > not be where the (m)THP range begins.
> > > Ok i'll rename them, but we still need to know the PMD address as we
> > > rely on it for a few key operations.
> > > Can we leave _address and rename address to pmd_address?
> >
> > No, we absolutely cannot leave _address as '_address', that's terrible and
> > I'm just not going to live with that.
> >
> > I know it's trivial, but it's just such a horrid naming convention.

> Ok Ill rename it to something else!

Thanks!

> >
> > > >
> > > > >
> > > > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > > > >
> > > > > @@ -1155,16 +1170,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > > >        * The allocation can take potentially a long time if it involves
> > > > >        * sync compaction, and we do not need to hold the mmap_lock during
> > > > >        * that. We will recheck the vma after taking it again in write mode.
> > > > > +      * If collapsing mTHPs we may have already released the read_lock.
> > > > >        */
> > > > > -     mmap_read_unlock(mm);
> > > > > +     if (*mmap_locked) {
> > > > > +             mmap_read_unlock(mm);
> > > > > +             *mmap_locked = false;
> > > > > +     }
> > > > >
> > > > > -     result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > > > > +     result = alloc_charge_folio(&folio, mm, cc, order);
> > > > >       if (result != SCAN_SUCCEED)
> > > > >               goto out_nolock;
> > > > >
> > > > >       mmap_read_lock(mm);
> > > > > -     result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > > > > -                                      BIT(HPAGE_PMD_ORDER));
> > > > > +     *mmap_locked = true;
> > > > > +     result = hugepage_vma_revalidate(mm, address, true, &vma, cc, BIT(order));
> > > >
> > > > I mean this is kind of going back to previous commits, but it's really ugly
> > > > to pass a BIT(xxx) here, is that really necessary? Can't we just pass in
> > > > the order?
> > > Yes and no... currently we only ever pass the bit of the current order
> > > so we could get away with it, but to generalize it we want the ability
> > > to pass a bitmap of the available orders. Like in the case of future
> > > madvise_collapse support, we would need to pass a bitmap of possible
> > > orders.
> >
> > Can we just change that when we need it?

> Sure, it was more so that I generalized it for madv_collapse and
> khugepaged, but then we dropped madv_collapse support, and I never
> undid the change.

Thanks, I think best to undo it until we need it.

> >
> > 'Future proofing' for an possible future implementation detail is just not
> > how we should implement things.
> >
> > Right now we don't do that, so let's just pass the order. If in future we
> > want to change it we can.

> ack

Thanks!

> > > >
> > > > >       if (pte) {
> > > > > -             result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > > > > -                                                   &compound_pagelist,
> > > > > -                                                   HPAGE_PMD_ORDER);
> > > > > +             result = __collapse_huge_page_isolate(vma, _address, pte, cc,
> > > > > +                                                   &compound_pagelist, order);
> > > > >               spin_unlock(pte_ptl);
> > > > >       } else {
> > > > >               result = SCAN_PMD_NULL;
> > > > >       }
> > > > >
> > > > >       if (unlikely(result != SCAN_SUCCEED)) {
> > > > > -             if (pte)
> > > > > -                     pte_unmap(pte);
> > > >
> > > > Why are we removing this?
> >
> > You're missing some of the comments, I'm guesing for most of the smaller
> > stuff you're just implicitly acking them but this one was a question :)

> Ah, for some of your reviews you comment, then later when you see
> another piece you say something like "ah ok this is why", for a lot of
> these cases I might have skipped replying because it seems that you
> figured it out.

Sure it's fine, I'm just sort of trying to draw your attention to stuff that
needs some discussion :)

There's a lot here so it's easy to lose track from both sides (I may very well
be repeating myself, apologies if so!)

> For this one we moved the pte_unmap further down, because if not, we
> could be prematurely unmapping a PTE, which is later referenced during
> the mTHP collapse, which was not the case when we only did PMD
> collapse.

Ack!

> >
> > > >
> > > > >               spin_lock(pmd_ptl);
> > > > >               BUG_ON(!pmd_none(*pmd));
> > > > >               /*
> > > > > @@ -1255,17 +1272,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > > > >       }
> > > > >
> > > > >       /*
> > > > > -      * All pages are isolated and locked so anon_vma rmap
> > > > > -      * can't run anymore.
> > > > > +      * For PMD collapse all pages are isolated and locked so anon_vma
> > > > > +      * rmap can't run anymore
> > > > >        */
> > > > > -     anon_vma_unlock_write(vma->anon_vma);
> > > > > +     if (order == HPAGE_PMD_ORDER)
> > > > > +             anon_vma_unlock_write(vma->anon_vma);
> > > >
> > > > Hmm this is introducing a horrible new way for things to go wrong. And
> > > > there's now a whole host of terrible error paths that can go wrong very
> > > > easily around rmap locks and yeah, no way we cannot do it this way.
> > > >
> > > > rmap locks are VERY sensitive and the ordering of the locking matters a
> > > > great deal (see top of mm/rmap.c). So we have to be SO careful here.
> > > >
> > > > I suggest you simply have a boolean 'anon_vma_locked' or something like
> > > > this, and get rid of these horrible additional code paths and the second
> > > > order == HPAGE_PMD_ORDER check.
> > > >
> > > > We'll track whether or not the lock is held and thereby needs releasing
> > > > that way instead.
> >
> > You've not responded to this suggestion re: refactoring here.
> I think if anything this refactoring adds complexity... we simply

I don't really think it does.

We have an open-coded condition that you just have to happen to know when
reading the code determines whether a lock is held or released.

Or, we could just have a boolean that explicitly encodes this.

> postpone the unlock we normally do, to later if its mTHP collapse,
> because we havent fully isolated all the pages (like we do in the PMD
> case).
> We arent doing anything extremely different other than hold the
> already held lock for a little longer. We would still need these order
> == HPAGE_PMD_ORDER checks, but just be flipping a flag that says its
> locked, essentially adding more complexity for the same outcome.

Yeah, I don't agree sorry.

I really do not like this, we should have the boolean.

> >
> > I'm really not a fan of us arbitrarily messing with rmap locks like this,
> > and we should very carefully keep track of whether we have/have not
> > released them.
> Its important to note, the code was already doing this, as I stated
> above, we simply hold it for longer because we haven't isolated all
> the pages.
> >
> > Again rmap locking is a dangerous area, I've got personal experience of
> > this (see top of mm/rmap.c for an indication of complexity here as well as
> > https://kernel.org/doc/html/latest/mm/process_addrs.html).
> >
> > > >
> > > > Also, and very importantly - are you 100% sure you can't possibly have a
> > > > deadlock or issue beyond this point if you don't release the rmap lock?
> > > I double checked, this was added as a fix to an issue Hugh reported.
> >
> > > The gap between these callers is rather small, and I see no way that
> > > it could skip the lock/unlock cycle.
> >
> > We're going to need more than this to be confident, you need to clearly
> > justify why we won't encounter issues this way.
>
> // WE ARE HOLDING THE LOCK ALREADY
> if (order == HPAGE_PMD_ORDER) //ONLY RELEASE IF ITS PMD
>         anon_vma_unlock_write(vma->anon_vma);
>
>     result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>                        vma, _address, pte_ptl,
>                        order, &compound_pagelist);
>     if (unlikely(result != SCAN_SUCCEED))
>         goto out_unlock_anon_vma; //LETS RELEASE THE LOCK IF WE FAIL
>
> if (order == HPAGE_PMD_ORDER) {
>       ...
>      // NO EXIT PATHS
>     } else { /* mTHP collapse */
>         ...
>        // NO EXIT PATHS
>     }
>
>     folio = NULL;
>
>     result = SCAN_SUCCEED;
> out_unlock_anon_vma:
>     if (order != HPAGE_PMD_ORDER)
>         anon_vma_unlock_write(vma->anon_vma); //RELEASE IF IT WASN'T
> RELEASED EARLIER

Thanks for this, but I think this only makes my point re: a booelan: now you
have to just realise at each point in the code that the open-coded condition is
what determines whether the lock is held or not.

A boolean that does this is a _lot_ clearer.

>
> Not sure if this is the verification you are looking for, but I really
> dont see how this could go wrong, or how this is any different than
> what the khugepaged code is already doing in a number of places.

Existing khugepaged code is not a good guideline for future development :) the
code is not always entirely delightful there or in huge_memory.c either...

> >
> > > >
> > > > > +             update_mmu_cache_range(NULL, vma, _address, pte, (1 << order));
> > > > > +
> > > > > +             smp_wmb(); /* make pte visible before pmd */
> > > >
> > > > Can you give some detail as to why this will work here and why it is
> > > > necessary?
> >
> > > Other parts of the kernel do it when setting ptes before updating the
> > > PMD. I'm not sure if it's necessary, but better safe than sorry.
> >
> > Unfortunately this is a _totally_ unacceptable justification. We can't put
> > in barriers based on 'better safe than sorry'. You need to analysis this
> > and determine whether or not it's necessary.
> >
> > So the comment in pmd_install() seems to give an indication:
> >
> >         if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
> >                 mm_inc_nr_ptes(mm);
> >                 /*
> >                  * Ensure all pte setup (eg. pte page lock and page clearing) are
> >                  * visible before the pte is made visible to other CPUs by being
> >                  * put into page tables.
> >                  *
> >                  * The other side of the story is the pointer chasing in the page
> >                  * table walking code (when walking the page table without locking;
> >                  * ie. most of the time). Fortunately, these data accesses consist
> >                  * of a chain of data-dependent loads, meaning most CPUs (alpha
> >                  * being the notable exception) will already guarantee loads are
> >                  * seen in-order. See the alpha page table accessors for the
> >                  * smp_rmb() barriers in page table walking code.
> >                  */
> >                 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
> >                 pmd_populate(mm, pmd, *pte);
> >                 *pte = NULL;
> >         }
>
> So this seems to indicate we do need it for some rare edge
> cases/arches, and that's why it exists in other callers too.

Yup, so let's be explicit and say something like 'see pmd_install()' or
something.

Open-coded 'just have to know' stuff is just something we want to avoid in
general.

> >
> > > >
> > > > > +             pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> > > >
> > > > If we're updating PTE entriess why do we need to assign the PMD entry?
> >
> > > We removed the PMD entry for GUP_fast reasons, then we reinstall the
> > > PMD entry after the mTHP is in place. Same as for PMD collapse.
> >
> > OK a comment to this effect would be useful.
> Ok ill add a small comment. feel free to take a look at the failure
> handling around __collapse_huge_page_isolate to see another example of
> this. We remove the pmd (pmdp_collapse_flush), then if the isolation
> fails, we repopulate the original PMD. In the mTHP case, because we
> are not installing a new PMD, we still remove the PMD (for gup-fast
> reasons), modify the PTE entries within that PMD, then reinstall the
> original PMD. This prevent GUP-fast from accessing these PTEs/PMD mid
> change.

Thanks. We need this kind of context in the code.

> >
> > > >
> > > > > +             spin_unlock(pmd_ptl);
> > > > > +     }
> > > >
> > > > This deeply, badly needs to be refactored into something that both shares
> > > > code and separates out these two operations.
> > > >
> > > > This function is disgustingly long as it is, and that's not your fault, but
> > > > let's try to make things better as we go.
> > > >
> > > > >
> > > > >       folio = NULL;
> > > > >
> > > > >       result = SCAN_SUCCEED;
> > > > > +out_unlock_anon_vma:
> > > > > +     if (order != HPAGE_PMD_ORDER)
> > > > > +             anon_vma_unlock_write(vma->anon_vma);
> > > >
> > > > Obviously again as above, we need to simplify this and get rid of this
> > > > whole bit.
> > > >
> > > > >  out_up_write:
> > > > > +     if (pte)
> > > > > +             pte_unmap(pte);
> > > >
> > > > OK I guess you moved this from above down here? Is this a valid place to do this?
> > > Yes if not we were potentially unmapping a pte early.
> > > >
> > > > >       mmap_write_unlock(mm);
> > > > >  out_nolock:
> > > > > +     *mmap_locked = false;
> > > >
> > > > This is kind of horrible, we now have pretty mad logic around who sets
> > > > mmap_locked and where.
> > > >
> > > > Can we just do this at the call sites so we avoid that?
> > > >
> > > > I mean anything we do with this is hideous, but that'd be less confusing It
> > > > hink.
> > > >
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > >       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > > > >       return result;
> > > > >  }
> > > > >
> > > > > +/* Recursive function to consume the bitmap */
> > > >
> > > > Err... please don't? Kernel stack is a seriously finite resource, we do not
> > > > want recursion at all.
> > > >
> > > > But I'm not actually seeing any recursion here? Am I missing something?
> > > >
> >
> > Yup this was before I realised it was an iterative implementation :)
> >
> > Though we are putting load on the stack anyway...

> Ill see if I can shrink this load!

Thanks!

Things are less problematic with cc not on the stack (my mistake that, sorry),
but obviously anything that could help reduce the stack usage here is good
anyway.

> >
> > > > > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > > > > +                     int referenced, int unmapped, struct collapse_control *cc,
> > > > > +                     bool *mmap_locked, unsigned long enabled_orders)
> > > >
> > > > This is a complicated and confusing function, it requires a comment
> > > > describing how it works.
> > > Ok will do!
> >
> > This is _VERY_ key - I need this I think to be able to sanely review this
> > code.
> ack, im also simplifying it a bit in the next iteration.

Thanks, much appreciated :)

I do wonder if this could be separate, but perhaps not.

> > > > >  static int collapse_scan_pmd(struct mm_struct *mm,
> > > > >                            struct vm_area_struct *vma,
> > > > >                            unsigned long address, bool *mmap_locked,
> > > > > @@ -1307,31 +1406,60 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > > >  {
> > > > >       pmd_t *pmd;
> > > > >       pte_t *pte, *_pte;
> > > > > +     int i;
> > > > >       int result = SCAN_FAIL, referenced = 0;
> > > > >       int none_or_zero = 0, shared = 0;
> > > > >       struct page *page = NULL;
> > > > >       struct folio *folio = NULL;
> > > > >       unsigned long _address;
> > > > > +     unsigned long enabled_orders;
> > > > >       spinlock_t *ptl;
> > > > >       int node = NUMA_NO_NODE, unmapped = 0;
> > > > > +     bool is_pmd_only;
> > > > >       bool writable = false;
> > > > > -
> > > > > +     int chunk_none_count = 0;
> > > > > +     int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
> > > > > +     unsigned long tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > > > >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > > > >
> > > > >       result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > > > >       if (result != SCAN_SUCCEED)
> > > > >               goto out;
> > > > >
> > > > > +     bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> > > > > +     bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> > > >
> > > > Having this 'temp' thing on the stack for everyone is just horrid.
> > > As I mention above this serves a very good purpose, and is also
> > > expanded in another series by Baolin to serve another similar purpose
> > > too.
> >
> > Yeah, I'm not hugely convinced :) responded there.
> >
> > I can probably give more useful feedback on this once the big comment is
> > added!

> Ok! hopefully I find a solution so I can remove the temp one.

Thanks!

Again sorry to separate this out (+ force more review back-and-forth), it's just
the only way I can sensibly review this I think!

> > > > > +
> > > > > +             _pte = pte + i;
> > > > > +             _address = address + i * PAGE_SIZE;
> > > > >               pte_t pteval = ptep_get(_pte);
> > > > >               if (is_swap_pte(pteval)) {
> > > > >                       ++unmapped;
> > > > > @@ -1354,10 +1482,11 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > > >                       }
> > > > >               }
> > > > >               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > > > > +                     ++chunk_none_count;
> > > > >                       ++none_or_zero;
> > > > >                       if (!userfaultfd_armed(vma) &&
> > > > > -                         (!cc->is_khugepaged ||
> > > > > -                          none_or_zero <= khugepaged_max_ptes_none)) {
> > > > > +                         (!cc->is_khugepaged || !is_pmd_only ||
> > > > > +                             none_or_zero <= khugepaged_max_ptes_none)) {
> > > > >                               continue;
> > > > >                       } else {
> > > > >                               result = SCAN_EXCEED_NONE_PTE;
> > > > > @@ -1453,6 +1582,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > > >                                                                    address)))
> > > > >                       referenced++;
> > > > >       }
> > > > > +
> > > > >       if (!writable) {
> > > > >               result = SCAN_PAGE_RO;
> > > > >       } else if (cc->is_khugepaged &&
> > > > > @@ -1465,10 +1595,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > > > >  out_unmap:
> > > > >       pte_unmap_unlock(pte, ptl);
> > > > >       if (result == SCAN_SUCCEED) {
> > > > > -             result = collapse_huge_page(mm, address, referenced,
> > > > > -                                         unmapped, cc);
> > > > > -             /* collapse_huge_page will return with the mmap_lock released */
> > > > > -             *mmap_locked = false;
> > > > > +             result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
> > > > > +                                           mmap_locked, enabled_orders);
> > > > > +             if (result > 0)
> > > > > +                     result = SCAN_SUCCEED;
> > > > > +             else
> > > > > +                     result = SCAN_FAIL;
> > > >
> > > > We're reusing result as both an enum value and as a storage for unmber
> > > > colapsed PTE entries?
> > > >
> > > > Can we just use a new local variable? Thanks
> >
> > Again you've skipped a ton of review comments here. You need to respond to
> > everything.
> Hmm we could, although it would serve no purpose in my eyes...
> nr_collapsed = collapse_scan_bitmap()
> if(nr_collapsed > 0)
>     result = SCAN_SUCCEED.
>
> To me that is basically the same thing with extra steps. But maybe its
> better for a code readiblity standpoint.

Code readability is a very good purpose to serve, unless we discover perf issues
that suggest we should behave otherwise, this should be our _primary_ aim in my
view.

Even more so the case in THP code which is, often, well let's just say 'not
beautiful' :)

Thanks for responding so thoroughly, we'll get this over the line I promise!

Cheers, Lorenzo