[v1] khugepaged: mTHP support

[RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Nico Pache 1 year, 1 month ago

khugepaged scans PMD ranges for potential collapse to a hugepage. To add
mTHP support we use this scan to instead record chunks of fully utilized
sections of the PMD.

create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
by default we will set this to order 3. The reasoning is that for 4K 512
PMD size this results in a 64 bit bitmap which has some optimizations.
For other arches like ARM64 64K, we can set a larger order if needed.

khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
that represents chunks of fully utilized regions. We can then determine
what mTHP size fits best and in the following patch, we set this bitmap
while scanning the PMD.

max_ptes_none is used as a scale to determine how "full" an order must
be before being considered for collapse.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/khugepaged.h |   4 +-
 mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
 2 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 1f46046080f5..31cff8aeec4a 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,7 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_KHUGEPAGED_H
 #define _LINUX_KHUGEPAGED_H
-
+#define MIN_MTHP_ORDER	3
+#define MIN_MTHP_NR	(1<<MIN_MTHP_ORDER)
+#define MTHP_BITMAP_SIZE  (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
 extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9eb161b04ee4..de1dc6ea3c71 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __ro_after_init;
 
+struct scan_bit_state {
+	u8 order;
+	u8 offset;
+};
+
 struct collapse_control {
 	bool is_khugepaged;
 
@@ -102,6 +107,15 @@ struct collapse_control {
 
 	/* nodemask for allocation fallback */
 	nodemask_t alloc_nmask;
+
+	/* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
+	unsigned long *mthp_bitmap;
+	unsigned long *mthp_bitmap_temp;
+	struct scan_bit_state *mthp_bitmap_stack;
+};
+
+struct collapse_control khugepaged_collapse_control = {
+	.is_khugepaged = true,
 };
 
 /**
@@ -389,6 +403,25 @@ int __init khugepaged_init(void)
 	if (!mm_slot_cache)
 		return -ENOMEM;
 
+	/*
+	 * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
+	 * compile time for some architectures.
+	 */
+	khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
+		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+	if (!khugepaged_collapse_control.mthp_bitmap)
+		return -ENOMEM;
+
+	khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
+		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+	if (!khugepaged_collapse_control.mthp_bitmap_temp)
+		return -ENOMEM;
+
+	khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
+		MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
+	if (!khugepaged_collapse_control.mthp_bitmap_stack)
+		return -ENOMEM;
+
 	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
 	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
 	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
@@ -400,6 +433,9 @@ int __init khugepaged_init(void)
 void __init khugepaged_destroy(void)
 {
 	kmem_cache_destroy(mm_slot_cache);
+	kfree(khugepaged_collapse_control.mthp_bitmap);
+	kfree(khugepaged_collapse_control.mthp_bitmap_temp);
+	kfree(khugepaged_collapse_control.mthp_bitmap_stack);
 }
 
 static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
 	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
-struct collapse_control khugepaged_collapse_control = {
-	.is_khugepaged = true,
-};
-
 static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
 {
 	int i;
@@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
 
 static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 			      int referenced, int unmapped,
-			      struct collapse_control *cc)
+			      struct collapse_control *cc, bool *mmap_locked,
+				  int order, int offset)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
@@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	struct mmu_notifier_range range;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+	/* if collapsing mTHPs we may have already released the read_lock, and
+	 * need to reaquire it to keep the proper locking order.
+	 */
+	if (!*mmap_locked)
+		mmap_read_lock(mm);
 	/*
 	 * Before allocating the hugepage, release the mmap_lock read lock.
 	 * The allocation can take potentially a long time if it involves
@@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * that. We will recheck the vma after taking it again in write mode.
 	 */
 	mmap_read_unlock(mm);
+	*mmap_locked = false;
 
 	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
 	if (result != SCAN_SUCCEED)
@@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 out_up_write:
 	mmap_write_unlock(mm);
 out_nolock:
+	*mmap_locked = false;
 	if (folio)
 		folio_put(folio);
 	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
 	return result;
 }
 
+// Recursive function to consume the bitmap
+static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
+			int referenced, int unmapped, struct collapse_control *cc,
+			bool *mmap_locked, unsigned long enabled_orders)
+{
+	u8 order, offset;
+	int num_chunks;
+	int bits_set, max_percent, threshold_bits;
+	int next_order, mid_offset;
+	int top = -1;
+	int collapsed = 0;
+	int ret;
+	struct scan_bit_state state;
+
+	cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+		{ HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
+
+	while (top >= 0) {
+		state = cc->mthp_bitmap_stack[top--];
+		order = state.order;
+		offset = state.offset;
+		num_chunks = 1 << order;
+		// Skip mTHP orders that are not enabled
+		if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
+			goto next;
+
+		// copy the relavant section to a new bitmap
+		bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
+				  MTHP_BITMAP_SIZE);
+
+		bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
+
+		// Check if the region is "almost full" based on the threshold
+		max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
+						/ (HPAGE_PMD_NR - 1);
+		threshold_bits = (max_percent * num_chunks) / 100;
+
+		if (bits_set >= threshold_bits) {
+			ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
+					mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
+			if (ret == SCAN_SUCCEED)
+				collapsed += (1 << (order + MIN_MTHP_ORDER));
+			continue;
+		}
+
+next:
+		if (order > 0) {
+			next_order = order - 1;
+			mid_offset = offset + (num_chunks / 2);
+			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+				{ next_order, mid_offset };
+			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+				{ next_order, offset };
+			}
+	}
+	return collapsed;
+}
+
 static int khugepaged_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
 		result = collapse_huge_page(mm, address, referenced,
-					    unmapped, cc);
+					    unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
 		/* collapse_huge_page will return with the mmap_lock released */
 		*mmap_locked = false;
 	}
@@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return -ENOMEM;
 	cc->is_khugepaged = false;
 
+	cc->mthp_bitmap = kmalloc_array(
+		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+	if (!cc->mthp_bitmap)
+		return -ENOMEM;
+
+	cc->mthp_bitmap_temp = kmalloc_array(
+		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+	if (!cc->mthp_bitmap_temp)
+		return -ENOMEM;
+
+	cc->mthp_bitmap_stack = kmalloc_array(
+		MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
+	if (!cc->mthp_bitmap_stack)
+		return -ENOMEM;
+
 	mmgrab(mm);
 	lru_add_drain_all();
 
@@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 out_nolock:
 	mmap_assert_locked(mm);
 	mmdrop(mm);
+	kfree(cc->mthp_bitmap);
+	kfree(cc->mthp_bitmap_temp);
+	kfree(cc->mthp_bitmap_stack);
 	kfree(cc);
 
+
 	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
 			: madvise_collapse_errno(last_fail);
 }
-- 
2.47.1

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Dev Jain 1 year ago

On 09/01/25 5:01 am, Nico Pache wrote:
> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> mTHP support we use this scan to instead record chunks of fully utilized
> sections of the PMD.
> 
> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> by default we will set this to order 3. The reasoning is that for 4K 512
> PMD size this results in a 64 bit bitmap which has some optimizations.
> For other arches like ARM64 64K, we can set a larger order if needed.
> 
> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> that represents chunks of fully utilized regions. We can then determine
> what mTHP size fits best and in the following patch, we set this bitmap
> while scanning the PMD.
> 
> max_ptes_none is used as a scale to determine how "full" an order must
> be before being considered for collapse.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>

Here is my objective and subjective analysis.

--------------------- Mathematical Analysis ----------------------------

First off, in my series, I am missing one thing: When I fail to collapse 
a range as a result of exhausting all orders, I should jump to the next 
range starting with the alignment of order at which I just failed (i.e, 
the minimum allowable order). Currently I am exiting which is wasteful. 
This should be easy to extend.

Let's call Nico Pache's method NP, and Dev Jain's method DJ.

The only difference between NP and DJ is the remembrance of the state of 
the PTEs (I have already reverted to using write lock for my v2, see 
this reply: 
https://lore.kernel.org/all/71a2f471-3082-4ca2-ac48-2f664977282f@arm.com/). 
NP saves empty and filled PTEs in a bitmap, and then uses the optimized 
(let us assume them to be constant time operations, hopefully?) bitmap 
APIs, like bitmap_shift_right(), and bitmap_weight(). The latter is what 
determines whether for a particular order, the range has enough filled 
PTEs to justify calling collapse_huge_page(). DJ does this naively with 
a brute force iteration. Now, the edge NP has over DJ is just before 
calling collapse_huge_page(). Post calling that, everything remains the 
same; assuming that both DJ and NP derive the same collapsed ranges, 
then, collapse_huge_page() succeeds in NP if and only if it succeeds in 
DJ. NP knows quickly, when and when not to call collapse_huge_page().

So the question is, how many iterations of PTE scans does NP save over 
DJ? We prove a stronger result:

Let the PTE table consist of 2^x pte entries, where x >= 2 belongs to 
the set of natural numbers (x >= 2 because anon collapse is not 
supported for x < 2). Let f(x) = #iterations performed by DJ in the 
worst case. The worst case is, all orders are enabled, and we have some 
distribution of the PTEs.

Lemma: f(x) <= 2^x * (x-1).

Proof: We perform weak mathematical induction on x. Assume 
zero-indexing, and assume the worst case that all orders are enabled.

Base case: Let x = 4. We have 16 entries. NP does 16 iterations. In the 
worst case, this what DJ may do: it will iterate all 16, and not 
collapse. Then it will iterate from 0-7 pte entries, and not collapse. 
Then, it will iterate from 0-3, and may or may not collapse. Here is the 
worst case (When I write l-r below, I mean the range l-r, both inclusive):

0-15 fail -> 0-7 fail -> 0-3 fail/success -> 4-7 fail/success -> 8-15 
fail -> 8-11 fail/success -> 12-15 fail/success

#iterations = 16+8+4+4+8+4+4 = 48 = 2^4 * (4-1).
Convince yourself that f(2) == 4 and f(3) <= 16.

Inductive hypothesis: Assume the lemma is true for some x > 4.

We need to prove for x+1. Let X = 2^(x+1) - 1, and Y = 2^x - 1.
Let DJ start scanning from 0. If 0-X is success, we are done. So, assume 
0-X fails. Now, DJ looks at 0-Y. Note that, for any x s.t 0 <= x <= X, 
if DJ starts scanning from x, there is no way it will cross the scan 
into the next half, i.e Y+1-X, since the scan length from x will be 
atmost the highest power-of-two alignment of x. Given this, we scan 0-Y 
completely, and then start from Y+1. Having established the above 
argument, we can use the inductive hypothesis on 0-Y and Y+1-X to derive 
that f(x) <= 2^(x+1) + 2f(x) <= 2^(x+1) + 2(2^x * (x-1)) = 2^(x+1) + 
2^(x+1) * (x-1) = 2^(x+1) * (x). Q.E.D
(You can simulate the proof for x=9; what I mean to say is, we can 
divide 0-511 into 0-255 and 256-511).

So, for our case, NP performs 512 iterations, and DJ performs in the 
worst case, 512 * 8 = 4096 iterations. Hmm...

----------------------- Subjective Analysis --------------------------

[1] The worst case is, well, the worst case; the practical case on arm64 
machines is, only 2048k and 64k is enabled. So, NP performs 512 
iterations, and DJ performs 512 + 16 * (number of 64K chunks) = 512 + 
512 = 1024 iterations. That is not much difference.

[2] Both implementations have the creep problem described here:
https://lore.kernel.org/all/20241216165105.56185-13-dev.jain@arm.com/

[3] The bitmaps are being created only for pte_none case, whereas we 
also have the shared and the swap case. In fact, for the none case, if 
we have PMD-order enabled, we will almost surely collapse to PMD size, 
given that the common case is khugepaged_max_ptes_none = 511: if we have 
one PTE filled, we will call collapse_huge_page(), and both DJ and NP 
will perform 512 iterations. Therefore, the bitmaps also need to be 
extended to the shared and the swap case so as to get any potential 
benefit from the idea in a practical scenario.

[4] NP does not handle scanning VMAs of size less than PMD. Since NP 
introduces a single entry point of khugepaged_collapse_single_pmd(), I 
am not sure how difficult it will be to extend the implementation, and 
given that, MTHP_BITMAP_SIZE is a compile time constant. I have extended 
this in my v2 and it works.

[5] In NP, for a bit to be set, the chunk completely needs to be 
filled/shared/swapped out. This completely changes the meaning of the 
sysfs parameters max_ptes_*. It also makes it very hard to debug since 
it may happen that, distribution D1 has more PTEs filled but less bits 
in the bitmap set than distribution D2. DJ also changes the meaning of 
the parameters due to scaling errors, but that is only an off-by-one 
error, therefore, the behaviour is easier to predict.

[6] In NP, we have: remember the state of the PTEs -> 
alloc_charge_folio() -> read_lock(), unlock() -> mmap_write_lock() -> 
anon_vma_lock_write() -> TLB flush for PMD. There is a significant time 
difference here, and the remembered PTEs may be vastly different from 
what we have now. Obviously I cannot pinpoint an exact number as to how 
bad this is or not for the accuracy of khugepaged. For DJ, since a 
particular PTE may come into the scan range multiple times, DJ gives the 
range a chance if the distribution changed recently.

[7] The last time I tried to save on #iterations of PTE entries, this 
happened:

https://lore.kernel.org/all/ZugxqJ-CjEi5lEW_@casper.infradead.org/

Matthew Wilcox pointed out a potential regression in a patch which was 
an "obvious optimization" to me on paper; I tested and it turned out he 
was correct:

https://lore.kernel.org/all/8700274f-b521-444e-8d17-c06039a1376c@arm.com/

We could argue whether it is worth to have the bitmap memory 
initialization, copying, weight checking, and recursion overhead.

This is the most I can come up with by analyzing from a third person 
perspective :)

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Dev Jain 1 year ago


On 12/01/25 8:43 pm, Dev Jain wrote:
> 
> 
> On 09/01/25 5:01 am, Nico Pache wrote:
>> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
>> mTHP support we use this scan to instead record chunks of fully utilized
>> sections of the PMD.
>>
>> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
>> by default we will set this to order 3. The reasoning is that for 4K 512
>> PMD size this results in a 64 bit bitmap which has some optimizations.
>> For other arches like ARM64 64K, we can set a larger order if needed.
>>
>> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
>> that represents chunks of fully utilized regions. We can then determine
>> what mTHP size fits best and in the following patch, we set this bitmap
>> while scanning the PMD.
>>
>> max_ptes_none is used as a scale to determine how "full" an order must
>> be before being considered for collapse.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
> 
> Here is my objective and subjective analysis.
> 
> --------------------- Mathematical Analysis ----------------------------
> 
> First off, in my series, I am missing one thing: When I fail to collapse 
> a range as a result of exhausting all orders, I should jump to the next 
> range starting with the alignment of order at which I just failed (i.e, 
> the minimum allowable order). Currently I am exiting which is wasteful. 
> This should be easy to extend.
> 
> Let's call Nico Pache's method NP, and Dev Jain's method DJ.
> 
> The only difference between NP and DJ is the remembrance of the state of 
> the PTEs (I have already reverted to using write lock for my v2, see 
> this reply: https://lore.kernel.org/all/71a2f471-3082-4ca2- 
> ac48-2f664977282f@arm.com/). NP saves empty and filled PTEs in a bitmap, 
> and then uses the optimized (let us assume them to be constant time 
> operations, hopefully?) bitmap APIs, like bitmap_shift_right(), and 
> bitmap_weight(). The latter is what determines whether for a particular 
> order, the range has enough filled PTEs to justify calling 
> collapse_huge_page(). DJ does this naively with a brute force iteration. 
> Now, the edge NP has over DJ is just before calling 
> collapse_huge_page(). Post calling that, everything remains the same; 
> assuming that both DJ and NP derive the same collapsed ranges, then, 
> collapse_huge_page() succeeds in NP if and only if it succeeds in DJ. NP 
> knows quickly, when and when not to call collapse_huge_page().
> 
> So the question is, how many iterations of PTE scans does NP save over 
> DJ? We prove a stronger result:
> 
> Let the PTE table consist of 2^x pte entries, where x >= 2 belongs to 
> the set of natural numbers (x >= 2 because anon collapse is not 
> supported for x < 2). Let f(x) = #iterations performed by DJ in the 
> worst case. The worst case is, all orders are enabled, and we have some 
> distribution of the PTEs.
> 
> Lemma: f(x) <= 2^x * (x-1).
> 
> Proof: We perform weak mathematical induction on x. Assume zero- 
> indexing, and assume the worst case that all orders are enabled.
> 
> Base case: Let x = 4. We have 16 entries. NP does 16 iterations. In the 
> worst case, this what DJ may do: it will iterate all 16, and not 
> collapse. Then it will iterate from 0-7 pte entries, and not collapse. 
> Then, it will iterate from 0-3, and may or may not collapse. Here is the 
> worst case (When I write l-r below, I mean the range l-r, both inclusive):
> 
> 0-15 fail -> 0-7 fail -> 0-3 fail/success -> 4-7 fail/success -> 8-15 
> fail -> 8-11 fail/success -> 12-15 fail/success
> 
> #iterations = 16+8+4+4+8+4+4 = 48 = 2^4 * (4-1).
> Convince yourself that f(2) == 4 and f(3) <= 16.
> 
> Inductive hypothesis: Assume the lemma is true for some x > 4.
> 
> We need to prove for x+1. Let X = 2^(x+1) - 1, and Y = 2^x - 1.
> Let DJ start scanning from 0. If 0-X is success, we are done. So, assume 
> 0-X fails. Now, DJ looks at 0-Y. Note that, for any x s.t 0 <= x <= X, 
> if DJ starts scanning from x, there is no way it will cross the scan 
> into the next half, i.e Y+1-X, since the scan length from x will be 
> atmost the highest power-of-two alignment of x. Given this, we scan 0-Y 
> completely, and then start from Y+1. Having established the above 
> argument, we can use the inductive hypothesis on 0-Y and Y+1-X to derive 
> that f(x) <= 2^(x+1) + 2f(x) <= 2^(x+1) + 2(2^x * (x-1)) = 2^(x+1) +

Typo: f(x+1) <= 2^(x+1) + 2f(x).

> 2^(x+1) * (x-1) = 2^(x+1) * (x). Q.E.D
> (You can simulate the proof for x=9; what I mean to say is, we can 
> divide 0-511 into 0-255 and 256-511).
> 
> So, for our case, NP performs 512 iterations, and DJ performs in the 
> worst case, 512 * 8 = 4096 iterations. Hmm...
> 
> ----------------------- Subjective Analysis --------------------------
> 
> [1] The worst case is, well, the worst case; the practical case on arm64 
> machines is, only 2048k and 64k is enabled. So, NP performs 512 
> iterations, and DJ performs 512 + 16 * (number of 64K chunks) = 512 + 
> 512 = 1024 iterations. That is not much difference.
> 
> [2] Both implementations have the creep problem described here:
> https://lore.kernel.org/all/20241216165105.56185-13-dev.jain@arm.com/
> 
> [3] The bitmaps are being created only for pte_none case, whereas we 
> also have the shared and the swap case. In fact, for the none case, if 
> we have PMD-order enabled, we will almost surely collapse to PMD size, 
> given that the common case is khugepaged_max_ptes_none = 511: if we have 
> one PTE filled, we will call collapse_huge_page(), and both DJ and NP 
> will perform 512 iterations. Therefore, the bitmaps also need to be 
> extended to the shared and the swap case so as to get any potential 
> benefit from the idea in a practical scenario.
> 
> [4] NP does not handle scanning VMAs of size less than PMD. Since NP 
> introduces a single entry point of khugepaged_collapse_single_pmd(), I 
> am not sure how difficult it will be to extend the implementation, and 
> given that, MTHP_BITMAP_SIZE is a compile time constant. I have extended 
> this in my v2 and it works.
> 
> [5] In NP, for a bit to be set, the chunk completely needs to be filled/ 
> shared/swapped out. This completely changes the meaning of the sysfs 
> parameters max_ptes_*. It also makes it very hard to debug since it may 
> happen that, distribution D1 has more PTEs filled but less bits in the 
> bitmap set than distribution D2. DJ also changes the meaning of the 
> parameters due to scaling errors, but that is only an off-by-one error, 
> therefore, the behaviour is easier to predict.
> 
> [6] In NP, we have: remember the state of the PTEs -> 
> alloc_charge_folio() -> read_lock(), unlock() -> mmap_write_lock() -> 
> anon_vma_lock_write() -> TLB flush for PMD. There is a significant time 
> difference here, and the remembered PTEs may be vastly different from 
> what we have now. Obviously I cannot pinpoint an exact number as to how 
> bad this is or not for the accuracy of khugepaged. For DJ, since a 
> particular PTE may come into the scan range multiple times, DJ gives the 
> range a chance if the distribution changed recently.
> 
> [7] The last time I tried to save on #iterations of PTE entries, this 
> happened:
> 
> https://lore.kernel.org/all/ZugxqJ-CjEi5lEW_@casper.infradead.org/
> 
> Matthew Wilcox pointed out a potential regression in a patch which was 
> an "obvious optimization" to me on paper; I tested and it turned out he 
> was correct:
> 
> https://lore.kernel.org/all/8700274f-b521-444e-8d17-c06039a1376c@arm.com/
> 
> We could argue whether it is worth to have the bitmap memory 
> initialization, copying, weight checking, and recursion overhead.
> 
> This is the most I can come up with by analyzing from a third person 
> perspective :)
> 
>

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Dev Jain 1 year ago


On 09/01/25 5:01 am, Nico Pache wrote:
> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> mTHP support we use this scan to instead record chunks of fully utilized
> sections of the PMD.
> 
> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> by default we will set this to order 3. The reasoning is that for 4K 512
> PMD size this results in a 64 bit bitmap which has some optimizations.
> For other arches like ARM64 64K, we can set a larger order if needed.
> 
> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> that represents chunks of fully utilized regions. We can then determine
> what mTHP size fits best and in the following patch, we set this bitmap
> while scanning the PMD.
> 
> max_ptes_none is used as a scale to determine how "full" an order must
> be before being considered for collapse.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>   include/linux/khugepaged.h |   4 +-
>   mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
>   2 files changed, 126 insertions(+), 7 deletions(-)
> 

[--snip--]

>   
> +// Recursive function to consume the bitmap
> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> +			int referenced, int unmapped, struct collapse_control *cc,
> +			bool *mmap_locked, unsigned long enabled_orders)
> +{
> +	u8 order, offset;
> +	int num_chunks;
> +	int bits_set, max_percent, threshold_bits;
> +	int next_order, mid_offset;
> +	int top = -1;
> +	int collapsed = 0;
> +	int ret;
> +	struct scan_bit_state state;
> +
> +	cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +		{ HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> +
> +	while (top >= 0) {
> +		state = cc->mthp_bitmap_stack[top--];
> +		order = state.order;
> +		offset = state.offset;
> +		num_chunks = 1 << order;
> +		// Skip mTHP orders that are not enabled
> +		if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
> +			goto next;
> +
> +		// copy the relavant section to a new bitmap
> +		bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> +				  MTHP_BITMAP_SIZE);
> +
> +		bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> +
> +		// Check if the region is "almost full" based on the threshold
> +		max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> +						/ (HPAGE_PMD_NR - 1);
> +		threshold_bits = (max_percent * num_chunks) / 100;
> +
> +		if (bits_set >= threshold_bits) {
> +			ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> +					mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> +			if (ret == SCAN_SUCCEED)
> +				collapsed += (1 << (order + MIN_MTHP_ORDER));
> +			continue;
> +		}

We are going to the lower order when it is not in the allowed mask of 
orders, or when we are below the threshold. What to do when these 
conditions do not happen, and the reason for collapse failure is 
collapse_huge_page()? For example, if you start with a PMD order scan,
and collapse_huge_page() fails, then you hit "continue", and then exit 
the loop because there is nothing else in the stack, so we exit without
trying mTHPs.

> +
> +next:
> +		if (order > 0) {
> +			next_order = order - 1;
> +			mid_offset = offset + (num_chunks / 2);
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, mid_offset };
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, offset };
> +			}
> +	}
> +	return collapsed;
> +}
> +

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Nico Pache 1 year ago

On Fri, Jan 10, 2025 at 7:54 AM Dev Jain <dev.jain@arm.com> wrote:
>
>
>
> On 09/01/25 5:01 am, Nico Pache wrote:
> > khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> > mTHP support we use this scan to instead record chunks of fully utilized
> > sections of the PMD.
> >
> > create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> > by default we will set this to order 3. The reasoning is that for 4K 512
> > PMD size this results in a 64 bit bitmap which has some optimizations.
> > For other arches like ARM64 64K, we can set a larger order if needed.
> >
> > khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> > that represents chunks of fully utilized regions. We can then determine
> > what mTHP size fits best and in the following patch, we set this bitmap
> > while scanning the PMD.
> >
> > max_ptes_none is used as a scale to determine how "full" an order must
> > be before being considered for collapse.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >   include/linux/khugepaged.h |   4 +-
> >   mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
> >   2 files changed, 126 insertions(+), 7 deletions(-)
> >
>
> [--snip--]
>
> >
> > +// Recursive function to consume the bitmap
> > +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > +                     int referenced, int unmapped, struct collapse_control *cc,
> > +                     bool *mmap_locked, unsigned long enabled_orders)
> > +{
> > +     u8 order, offset;
> > +     int num_chunks;
> > +     int bits_set, max_percent, threshold_bits;
> > +     int next_order, mid_offset;
> > +     int top = -1;
> > +     int collapsed = 0;
> > +     int ret;
> > +     struct scan_bit_state state;
> > +
> > +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +             { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> > +
> > +     while (top >= 0) {
> > +             state = cc->mthp_bitmap_stack[top--];
> > +             order = state.order;
> > +             offset = state.offset;
> > +             num_chunks = 1 << order;
> > +             // Skip mTHP orders that are not enabled
> > +             if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
> > +                     goto next;
> > +
> > +             // copy the relavant section to a new bitmap
> > +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> > +                               MTHP_BITMAP_SIZE);
> > +
> > +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> > +
> > +             // Check if the region is "almost full" based on the threshold
> > +             max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> > +                                             / (HPAGE_PMD_NR - 1);
> > +             threshold_bits = (max_percent * num_chunks) / 100;
> > +
> > +             if (bits_set >= threshold_bits) {
> > +                     ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> > +                                     mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> > +                     if (ret == SCAN_SUCCEED)
> > +                             collapsed += (1 << (order + MIN_MTHP_ORDER));
> > +                     continue;
> > +             }
>
> We are going to the lower order when it is not in the allowed mask of
> orders, or when we are below the threshold. What to do when these
> conditions do not happen, and the reason for collapse failure is
> collapse_huge_page()? For example, if you start with a PMD order scan,
> and collapse_huge_page() fails, then you hit "continue", and then exit
> the loop because there is nothing else in the stack, so we exit without
> trying mTHPs.

Thanks for catching that, I introduced that bug when I went from the
recursion to stack based approach.
This should only continue on SCAN_SUCCEED. If not it needs to go next:

I think I also need to handle the case where nothing succeeds in
khugepaged_scan_pmd.


>
> > +
> > +next:
> > +             if (order > 0) {
> > +                     next_order = order - 1;
> > +                     mid_offset = offset + (num_chunks / 2);
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, mid_offset };
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, offset };
> > +                     }
> > +     }
> > +     return collapsed;
> > +}
> > +
>

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Dev Jain 1 year ago


On 09/01/25 5:01 am, Nico Pache wrote:
> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> mTHP support we use this scan to instead record chunks of fully utilized
> sections of the PMD.
> 
> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> by default we will set this to order 3. The reasoning is that for 4K 512
> PMD size this results in a 64 bit bitmap which has some optimizations.
> For other arches like ARM64 64K, we can set a larger order if needed.
> 
> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> that represents chunks of fully utilized regions. We can then determine
> what mTHP size fits best and in the following patch, we set this bitmap
> while scanning the PMD.
> 
> max_ptes_none is used as a scale to determine how "full" an order must
> be before being considered for collapse.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>   include/linux/khugepaged.h |   4 +-
>   mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
>   2 files changed, 126 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index 1f46046080f5..31cff8aeec4a 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -1,7 +1,9 @@
>   /* SPDX-License-Identifier: GPL-2.0 */
>   #ifndef _LINUX_KHUGEPAGED_H
>   #define _LINUX_KHUGEPAGED_H
> -

Nit: I don't think this line needs to be deleted.

> +#define MIN_MTHP_ORDER	3
> +#define MIN_MTHP_NR	(1<<MIN_MTHP_ORDER)

Nit: Insert a space: (1 << MIN_MTHP_ORDER)

> +#define MTHP_BITMAP_SIZE  (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
>   extern unsigned int khugepaged_max_ptes_none __read_mostly;
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>   extern struct attribute_group khugepaged_attr_group;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 9eb161b04ee4..de1dc6ea3c71 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>   
>   static struct kmem_cache *mm_slot_cache __ro_after_init;
>   
> +struct scan_bit_state {
> +	u8 order;
> +	u8 offset;
> +};
> +
>   struct collapse_control {
>   	bool is_khugepaged;
>   
> @@ -102,6 +107,15 @@ struct collapse_control {
>   
>   	/* nodemask for allocation fallback */
>   	nodemask_t alloc_nmask;
> +
> +	/* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
> +	unsigned long *mthp_bitmap;
> +	unsigned long *mthp_bitmap_temp;
> +	struct scan_bit_state *mthp_bitmap_stack;
> +};
> +
> +struct collapse_control khugepaged_collapse_control = {
> +	.is_khugepaged = true,
>   };
>   
>   /**
> @@ -389,6 +403,25 @@ int __init khugepaged_init(void)
>   	if (!mm_slot_cache)
>   		return -ENOMEM;
>   
> +	/*
> +	 * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
> +	 * compile time for some architectures.
> +	 */
> +	khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
> +		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> +	if (!khugepaged_collapse_control.mthp_bitmap)
> +		return -ENOMEM;
> +
> +	khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
> +		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> +	if (!khugepaged_collapse_control.mthp_bitmap_temp)
> +		return -ENOMEM;
> +
> +	khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
> +		MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> +	if (!khugepaged_collapse_control.mthp_bitmap_stack)
> +		return -ENOMEM;
> +
>   	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
>   	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
>   	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
> @@ -400,6 +433,9 @@ int __init khugepaged_init(void)
>   void __init khugepaged_destroy(void)
>   {
>   	kmem_cache_destroy(mm_slot_cache);
> +	kfree(khugepaged_collapse_control.mthp_bitmap);
> +	kfree(khugepaged_collapse_control.mthp_bitmap_temp);
> +	kfree(khugepaged_collapse_control.mthp_bitmap_stack);
>   }
>   
>   static inline int khugepaged_test_exit(struct mm_struct *mm)
> @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
>   	remove_wait_queue(&khugepaged_wait, &wait);
>   }
>   
> -struct collapse_control khugepaged_collapse_control = {
> -	.is_khugepaged = true,
> -};
> -
>   static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
>   {
>   	int i;
> @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>   
>   static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   			      int referenced, int unmapped,
> -			      struct collapse_control *cc)
> +			      struct collapse_control *cc, bool *mmap_locked,
> +				  int order, int offset)
>   {
>   	LIST_HEAD(compound_pagelist);
>   	pmd_t *pmd, _pmd;
> @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	struct mmu_notifier_range range;
>   	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>   
> +	/* if collapsing mTHPs we may have already released the read_lock, and
> +	 * need to reaquire it to keep the proper locking order.
> +	 */
> +	if (!*mmap_locked)
> +		mmap_read_lock(mm);

There is no need to take the read lock again, because we drop it just
after this.

>   	/*
>   	 * Before allocating the hugepage, release the mmap_lock read lock.
>   	 * The allocation can take potentially a long time if it involves
> @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	 * that. We will recheck the vma after taking it again in write mode.
>   	 */
>   	mmap_read_unlock(mm);
> +	*mmap_locked = false;
>   
>   	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>   	if (result != SCAN_SUCCEED)
> @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   out_up_write:
>   	mmap_write_unlock(mm);
>   out_nolock:
> +	*mmap_locked = false;
>   	if (folio)
>   		folio_put(folio);
>   	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
>   	return result;
>   }
>   
> +// Recursive function to consume the bitmap
> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> +			int referenced, int unmapped, struct collapse_control *cc,
> +			bool *mmap_locked, unsigned long enabled_orders)
> +{
> +	u8 order, offset;
> +	int num_chunks;
> +	int bits_set, max_percent, threshold_bits;
> +	int next_order, mid_offset;
> +	int top = -1;
> +	int collapsed = 0;
> +	int ret;
> +	struct scan_bit_state state;
> +
> +	cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +		{ HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> +
> +	while (top >= 0) {
> +		state = cc->mthp_bitmap_stack[top--];
> +		order = state.order;
> +		offset = state.offset;
> +		num_chunks = 1 << order;
> +		// Skip mTHP orders that are not enabled
> +		if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
> +			goto next;
> +
> +		// copy the relavant section to a new bitmap
> +		bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> +				  MTHP_BITMAP_SIZE);
> +
> +		bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> +
> +		// Check if the region is "almost full" based on the threshold
> +		max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> +						/ (HPAGE_PMD_NR - 1);
> +		threshold_bits = (max_percent * num_chunks) / 100;
> +
> +		if (bits_set >= threshold_bits) {
> +			ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> +					mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> +			if (ret == SCAN_SUCCEED)
> +				collapsed += (1 << (order + MIN_MTHP_ORDER));
> +			continue;
> +		}
> +
> +next:
> +		if (order > 0) {
> +			next_order = order - 1;
> +			mid_offset = offset + (num_chunks / 2);
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, mid_offset };
> +			cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> +				{ next_order, offset };
> +			}
> +	}
> +	return collapsed;
> +}
> +
>   static int khugepaged_scan_pmd(struct mm_struct *mm,
>   				   struct vm_area_struct *vma,
>   				   unsigned long address, bool *mmap_locked,
> @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   	pte_unmap_unlock(pte, ptl);
>   	if (result == SCAN_SUCCEED) {
>   		result = collapse_huge_page(mm, address, referenced,
> -					    unmapped, cc);
> +					    unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
>   		/* collapse_huge_page will return with the mmap_lock released */
>   		*mmap_locked = false;
>   	}
> @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>   		return -ENOMEM;
>   	cc->is_khugepaged = false;
>   
> +	cc->mthp_bitmap = kmalloc_array(
> +		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> +	if (!cc->mthp_bitmap)
> +		return -ENOMEM;
> +
> +	cc->mthp_bitmap_temp = kmalloc_array(
> +		BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> +	if (!cc->mthp_bitmap_temp)
> +		return -ENOMEM;
> +
> +	cc->mthp_bitmap_stack = kmalloc_array(
> +		MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> +	if (!cc->mthp_bitmap_stack)
> +		return -ENOMEM;
> +
>   	mmgrab(mm);
>   	lru_add_drain_all();
>   
> @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>   out_nolock:
>   	mmap_assert_locked(mm);
>   	mmdrop(mm);
> +	kfree(cc->mthp_bitmap);
> +	kfree(cc->mthp_bitmap_temp);
> +	kfree(cc->mthp_bitmap_stack);
>   	kfree(cc);
>   
> +
>   	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
>   			: madvise_collapse_errno(last_fail);
>   }

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Nico Pache 1 year ago

On Fri, Jan 10, 2025 at 2:06 AM Dev Jain <dev.jain@arm.com> wrote:
>
>
>
> On 09/01/25 5:01 am, Nico Pache wrote:
> > khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> > mTHP support we use this scan to instead record chunks of fully utilized
> > sections of the PMD.
> >
> > create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> > by default we will set this to order 3. The reasoning is that for 4K 512
> > PMD size this results in a 64 bit bitmap which has some optimizations.
> > For other arches like ARM64 64K, we can set a larger order if needed.
> >
> > khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> > that represents chunks of fully utilized regions. We can then determine
> > what mTHP size fits best and in the following patch, we set this bitmap
> > while scanning the PMD.
> >
> > max_ptes_none is used as a scale to determine how "full" an order must
> > be before being considered for collapse.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >   include/linux/khugepaged.h |   4 +-
> >   mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
> >   2 files changed, 126 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > index 1f46046080f5..31cff8aeec4a 100644
> > --- a/include/linux/khugepaged.h
> > +++ b/include/linux/khugepaged.h
> > @@ -1,7 +1,9 @@
> >   /* SPDX-License-Identifier: GPL-2.0 */
> >   #ifndef _LINUX_KHUGEPAGED_H
> >   #define _LINUX_KHUGEPAGED_H
> > -
>
> Nit: I don't think this line needs to be deleted.
>
> > +#define MIN_MTHP_ORDER       3
> > +#define MIN_MTHP_NR  (1<<MIN_MTHP_ORDER)
>
> Nit: Insert a space: (1 << MIN_MTHP_ORDER)
>
> > +#define MTHP_BITMAP_SIZE  (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
> >   extern unsigned int khugepaged_max_ptes_none __read_mostly;
> >   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >   extern struct attribute_group khugepaged_attr_group;
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 9eb161b04ee4..de1dc6ea3c71 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> >   static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> > +struct scan_bit_state {
> > +     u8 order;
> > +     u8 offset;
> > +};
> > +
> >   struct collapse_control {
> >       bool is_khugepaged;
> >
> > @@ -102,6 +107,15 @@ struct collapse_control {
> >
> >       /* nodemask for allocation fallback */
> >       nodemask_t alloc_nmask;
> > +
> > +     /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
> > +     unsigned long *mthp_bitmap;
> > +     unsigned long *mthp_bitmap_temp;
> > +     struct scan_bit_state *mthp_bitmap_stack;
> > +};
> > +
> > +struct collapse_control khugepaged_collapse_control = {
> > +     .is_khugepaged = true,
> >   };
> >
> >   /**
> > @@ -389,6 +403,25 @@ int __init khugepaged_init(void)
> >       if (!mm_slot_cache)
> >               return -ENOMEM;
> >
> > +     /*
> > +      * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
> > +      * compile time for some architectures.
> > +      */
> > +     khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
> > +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> > +     if (!khugepaged_collapse_control.mthp_bitmap)
> > +             return -ENOMEM;
> > +
> > +     khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
> > +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> > +     if (!khugepaged_collapse_control.mthp_bitmap_temp)
> > +             return -ENOMEM;
> > +
> > +     khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
> > +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> > +     if (!khugepaged_collapse_control.mthp_bitmap_stack)
> > +             return -ENOMEM;
> > +
> >       khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
> >       khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
> >       khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
> > @@ -400,6 +433,9 @@ int __init khugepaged_init(void)
> >   void __init khugepaged_destroy(void)
> >   {
> >       kmem_cache_destroy(mm_slot_cache);
> > +     kfree(khugepaged_collapse_control.mthp_bitmap);
> > +     kfree(khugepaged_collapse_control.mthp_bitmap_temp);
> > +     kfree(khugepaged_collapse_control.mthp_bitmap_stack);
> >   }
> >
> >   static inline int khugepaged_test_exit(struct mm_struct *mm)
> > @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
> >       remove_wait_queue(&khugepaged_wait, &wait);
> >   }
> >
> > -struct collapse_control khugepaged_collapse_control = {
> > -     .is_khugepaged = true,
> > -};
> > -
> >   static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
> >   {
> >       int i;
> > @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> >
> >   static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >                             int referenced, int unmapped,
> > -                           struct collapse_control *cc)
> > +                           struct collapse_control *cc, bool *mmap_locked,
> > +                               int order, int offset)
> >   {
> >       LIST_HEAD(compound_pagelist);
> >       pmd_t *pmd, _pmd;
> > @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >       struct mmu_notifier_range range;
> >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >
> > +     /* if collapsing mTHPs we may have already released the read_lock, and
> > +      * need to reaquire it to keep the proper locking order.
> > +      */
> > +     if (!*mmap_locked)
> > +             mmap_read_lock(mm);
>
> There is no need to take the read lock again, because we drop it just
> after this.

collapse_huge_page expects the mmap_lock to already be taken, and it
returns with it unlocked. If we are collapsing multiple mTHPs under
the same PMD, then I think we need to reacquire the lock before
calling unlock on it.

>
> >       /*
> >        * Before allocating the hugepage, release the mmap_lock read lock.
> >        * The allocation can take potentially a long time if it involves
> > @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >        * that. We will recheck the vma after taking it again in write mode.
> >        */
> >       mmap_read_unlock(mm);
> > +     *mmap_locked = false;
> >
> >       result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> >       if (result != SCAN_SUCCEED)
> > @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >   out_up_write:
> >       mmap_write_unlock(mm);
> >   out_nolock:
> > +     *mmap_locked = false;
> >       if (folio)
> >               folio_put(folio);
> >       trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> >       return result;
> >   }
> >
> > +// Recursive function to consume the bitmap
> > +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > +                     int referenced, int unmapped, struct collapse_control *cc,
> > +                     bool *mmap_locked, unsigned long enabled_orders)
> > +{
> > +     u8 order, offset;
> > +     int num_chunks;
> > +     int bits_set, max_percent, threshold_bits;
> > +     int next_order, mid_offset;
> > +     int top = -1;
> > +     int collapsed = 0;
> > +     int ret;
> > +     struct scan_bit_state state;
> > +
> > +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +             { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> > +
> > +     while (top >= 0) {
> > +             state = cc->mthp_bitmap_stack[top--];
> > +             order = state.order;
> > +             offset = state.offset;
> > +             num_chunks = 1 << order;
> > +             // Skip mTHP orders that are not enabled
> > +             if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
> > +                     goto next;
> > +
> > +             // copy the relavant section to a new bitmap
> > +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> > +                               MTHP_BITMAP_SIZE);
> > +
> > +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> > +
> > +             // Check if the region is "almost full" based on the threshold
> > +             max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> > +                                             / (HPAGE_PMD_NR - 1);
> > +             threshold_bits = (max_percent * num_chunks) / 100;
> > +
> > +             if (bits_set >= threshold_bits) {
> > +                     ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> > +                                     mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> > +                     if (ret == SCAN_SUCCEED)
> > +                             collapsed += (1 << (order + MIN_MTHP_ORDER));
> > +                     continue;
> > +             }
> > +
> > +next:
> > +             if (order > 0) {
> > +                     next_order = order - 1;
> > +                     mid_offset = offset + (num_chunks / 2);
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, mid_offset };
> > +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> > +                             { next_order, offset };
> > +                     }
> > +     }
> > +     return collapsed;
> > +}
> > +
> >   static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                                  struct vm_area_struct *vma,
> >                                  unsigned long address, bool *mmap_locked,
> > @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >       pte_unmap_unlock(pte, ptl);
> >       if (result == SCAN_SUCCEED) {
> >               result = collapse_huge_page(mm, address, referenced,
> > -                                         unmapped, cc);
> > +                                         unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
> >               /* collapse_huge_page will return with the mmap_lock released */
> >               *mmap_locked = false;
> >       }
> > @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >               return -ENOMEM;
> >       cc->is_khugepaged = false;
> >
> > +     cc->mthp_bitmap = kmalloc_array(
> > +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> > +     if (!cc->mthp_bitmap)
> > +             return -ENOMEM;
> > +
> > +     cc->mthp_bitmap_temp = kmalloc_array(
> > +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> > +     if (!cc->mthp_bitmap_temp)
> > +             return -ENOMEM;
> > +
> > +     cc->mthp_bitmap_stack = kmalloc_array(
> > +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> > +     if (!cc->mthp_bitmap_stack)
> > +             return -ENOMEM;
> > +
> >       mmgrab(mm);
> >       lru_add_drain_all();
> >
> > @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >   out_nolock:
> >       mmap_assert_locked(mm);
> >       mmdrop(mm);
> > +     kfree(cc->mthp_bitmap);
> > +     kfree(cc->mthp_bitmap_temp);
> > +     kfree(cc->mthp_bitmap_stack);
> >       kfree(cc);
> >
> > +
> >       return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
> >                       : madvise_collapse_errno(last_fail);
> >   }
>

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Dev Jain 1 year ago


On 11/01/25 3:18 am, Nico Pache wrote:
> On Fri, Jan 10, 2025 at 2:06 AM Dev Jain <dev.jain@arm.com> wrote:
>>
>>
>>
>> On 09/01/25 5:01 am, Nico Pache wrote:
>>> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
>>> mTHP support we use this scan to instead record chunks of fully utilized
>>> sections of the PMD.
>>>
>>> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
>>> by default we will set this to order 3. The reasoning is that for 4K 512
>>> PMD size this results in a 64 bit bitmap which has some optimizations.
>>> For other arches like ARM64 64K, we can set a larger order if needed.
>>>
>>> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
>>> that represents chunks of fully utilized regions. We can then determine
>>> what mTHP size fits best and in the following patch, we set this bitmap
>>> while scanning the PMD.
>>>
>>> max_ptes_none is used as a scale to determine how "full" an order must
>>> be before being considered for collapse.
>>>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>> ---
>>>    include/linux/khugepaged.h |   4 +-
>>>    mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
>>>    2 files changed, 126 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
>>> index 1f46046080f5..31cff8aeec4a 100644
>>> --- a/include/linux/khugepaged.h
>>> +++ b/include/linux/khugepaged.h
>>> @@ -1,7 +1,9 @@
>>>    /* SPDX-License-Identifier: GPL-2.0 */
>>>    #ifndef _LINUX_KHUGEPAGED_H
>>>    #define _LINUX_KHUGEPAGED_H
>>> -
>>
>> Nit: I don't think this line needs to be deleted.
>>
>>> +#define MIN_MTHP_ORDER       3
>>> +#define MIN_MTHP_NR  (1<<MIN_MTHP_ORDER)
>>
>> Nit: Insert a space: (1 << MIN_MTHP_ORDER)
>>
>>> +#define MTHP_BITMAP_SIZE  (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
>>>    extern unsigned int khugepaged_max_ptes_none __read_mostly;
>>>    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>    extern struct attribute_group khugepaged_attr_group;
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 9eb161b04ee4..de1dc6ea3c71 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>>>
>>>    static struct kmem_cache *mm_slot_cache __ro_after_init;
>>>
>>> +struct scan_bit_state {
>>> +     u8 order;
>>> +     u8 offset;
>>> +};
>>> +
>>>    struct collapse_control {
>>>        bool is_khugepaged;
>>>
>>> @@ -102,6 +107,15 @@ struct collapse_control {
>>>
>>>        /* nodemask for allocation fallback */
>>>        nodemask_t alloc_nmask;
>>> +
>>> +     /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
>>> +     unsigned long *mthp_bitmap;
>>> +     unsigned long *mthp_bitmap_temp;
>>> +     struct scan_bit_state *mthp_bitmap_stack;
>>> +};
>>> +
>>> +struct collapse_control khugepaged_collapse_control = {
>>> +     .is_khugepaged = true,
>>>    };
>>>
>>>    /**
>>> @@ -389,6 +403,25 @@ int __init khugepaged_init(void)
>>>        if (!mm_slot_cache)
>>>                return -ENOMEM;
>>>
>>> +     /*
>>> +      * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
>>> +      * compile time for some architectures.
>>> +      */
>>> +     khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
>>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
>>> +     if (!khugepaged_collapse_control.mthp_bitmap)
>>> +             return -ENOMEM;
>>> +
>>> +     khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
>>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
>>> +     if (!khugepaged_collapse_control.mthp_bitmap_temp)
>>> +             return -ENOMEM;
>>> +
>>> +     khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
>>> +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
>>> +     if (!khugepaged_collapse_control.mthp_bitmap_stack)
>>> +             return -ENOMEM;
>>> +
>>>        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
>>>        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
>>>        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
>>> @@ -400,6 +433,9 @@ int __init khugepaged_init(void)
>>>    void __init khugepaged_destroy(void)
>>>    {
>>>        kmem_cache_destroy(mm_slot_cache);
>>> +     kfree(khugepaged_collapse_control.mthp_bitmap);
>>> +     kfree(khugepaged_collapse_control.mthp_bitmap_temp);
>>> +     kfree(khugepaged_collapse_control.mthp_bitmap_stack);
>>>    }
>>>
>>>    static inline int khugepaged_test_exit(struct mm_struct *mm)
>>> @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
>>>        remove_wait_queue(&khugepaged_wait, &wait);
>>>    }
>>>
>>> -struct collapse_control khugepaged_collapse_control = {
>>> -     .is_khugepaged = true,
>>> -};
>>> -
>>>    static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
>>>    {
>>>        int i;
>>> @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>>>
>>>    static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>>                              int referenced, int unmapped,
>>> -                           struct collapse_control *cc)
>>> +                           struct collapse_control *cc, bool *mmap_locked,
>>> +                               int order, int offset)
>>>    {
>>>        LIST_HEAD(compound_pagelist);
>>>        pmd_t *pmd, _pmd;
>>> @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>>        struct mmu_notifier_range range;
>>>        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>>>
>>> +     /* if collapsing mTHPs we may have already released the read_lock, and
>>> +      * need to reaquire it to keep the proper locking order.
>>> +      */
>>> +     if (!*mmap_locked)
>>> +             mmap_read_lock(mm);
>>
>> There is no need to take the read lock again, because we drop it just
>> after this.
> 
> collapse_huge_page expects the mmap_lock to already be taken, and it
> returns with it unlocked. If we are collapsing multiple mTHPs under
> the same PMD, then I think we need to reacquire the lock before
> calling unlock on it.

I cannot figure out a potential place where we drop the lock before 
entering collapse_huge_page(). In any case, wouldn't this be better:
if (*mmap_locked)
	mmap_read_unlock(mm);

Basically, instead of putting the if condition around the lock, you do 
it around the unlock?

> 
>>
>>>        /*
>>>         * Before allocating the hugepage, release the mmap_lock read lock.
>>>         * The allocation can take potentially a long time if it involves
>>> @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>>         * that. We will recheck the vma after taking it again in write mode.
>>>         */
>>>        mmap_read_unlock(mm);
>>> +     *mmap_locked = false;
>>>
>>>        result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>>>        if (result != SCAN_SUCCEED)
>>> @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>>    out_up_write:
>>>        mmap_write_unlock(mm);
>>>    out_nolock:
>>> +     *mmap_locked = false;
>>>        if (folio)
>>>                folio_put(folio);
>>>        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
>>>        return result;
>>>    }
>>>
>>> +// Recursive function to consume the bitmap
>>> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
>>> +                     int referenced, int unmapped, struct collapse_control *cc,
>>> +                     bool *mmap_locked, unsigned long enabled_orders)
>>> +{
>>> +     u8 order, offset;
>>> +     int num_chunks;
>>> +     int bits_set, max_percent, threshold_bits;
>>> +     int next_order, mid_offset;
>>> +     int top = -1;
>>> +     int collapsed = 0;
>>> +     int ret;
>>> +     struct scan_bit_state state;
>>> +
>>> +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
>>> +             { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
>>> +
>>> +     while (top >= 0) {
>>> +             state = cc->mthp_bitmap_stack[top--];
>>> +             order = state.order;
>>> +             offset = state.offset;
>>> +             num_chunks = 1 << order;
>>> +             // Skip mTHP orders that are not enabled
>>> +             if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
>>> +                     goto next;
>>> +
>>> +             // copy the relavant section to a new bitmap
>>> +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
>>> +                               MTHP_BITMAP_SIZE);
>>> +
>>> +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
>>> +
>>> +             // Check if the region is "almost full" based on the threshold
>>> +             max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
>>> +                                             / (HPAGE_PMD_NR - 1);
>>> +             threshold_bits = (max_percent * num_chunks) / 100;
>>> +
>>> +             if (bits_set >= threshold_bits) {
>>> +                     ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
>>> +                                     mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
>>> +                     if (ret == SCAN_SUCCEED)
>>> +                             collapsed += (1 << (order + MIN_MTHP_ORDER));
>>> +                     continue;
>>> +             }
>>> +
>>> +next:
>>> +             if (order > 0) {
>>> +                     next_order = order - 1;
>>> +                     mid_offset = offset + (num_chunks / 2);
>>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
>>> +                             { next_order, mid_offset };
>>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
>>> +                             { next_order, offset };
>>> +                     }
>>> +     }
>>> +     return collapsed;
>>> +}
>>> +
>>>    static int khugepaged_scan_pmd(struct mm_struct *mm,
>>>                                   struct vm_area_struct *vma,
>>>                                   unsigned long address, bool *mmap_locked,
>>> @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>>        pte_unmap_unlock(pte, ptl);
>>>        if (result == SCAN_SUCCEED) {
>>>                result = collapse_huge_page(mm, address, referenced,
>>> -                                         unmapped, cc);
>>> +                                         unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
>>>                /* collapse_huge_page will return with the mmap_lock released */
>>>                *mmap_locked = false;
>>>        }
>>> @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>>>                return -ENOMEM;
>>>        cc->is_khugepaged = false;
>>>
>>> +     cc->mthp_bitmap = kmalloc_array(
>>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
>>> +     if (!cc->mthp_bitmap)
>>> +             return -ENOMEM;
>>> +
>>> +     cc->mthp_bitmap_temp = kmalloc_array(
>>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
>>> +     if (!cc->mthp_bitmap_temp)
>>> +             return -ENOMEM;
>>> +
>>> +     cc->mthp_bitmap_stack = kmalloc_array(
>>> +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
>>> +     if (!cc->mthp_bitmap_stack)
>>> +             return -ENOMEM;
>>> +
>>>        mmgrab(mm);
>>>        lru_add_drain_all();
>>>
>>> @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>>>    out_nolock:
>>>        mmap_assert_locked(mm);
>>>        mmdrop(mm);
>>> +     kfree(cc->mthp_bitmap);
>>> +     kfree(cc->mthp_bitmap_temp);
>>> +     kfree(cc->mthp_bitmap_stack);
>>>        kfree(cc);
>>>
>>> +
>>>        return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
>>>                        : madvise_collapse_errno(last_fail);
>>>    }
>>
>

Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

Posted by Nico Pache 1 year ago

On Sun, Jan 12, 2025 at 4:23 AM Dev Jain <dev.jain@arm.com> wrote:
>
>
>
> On 11/01/25 3:18 am, Nico Pache wrote:
> > On Fri, Jan 10, 2025 at 2:06 AM Dev Jain <dev.jain@arm.com> wrote:
> >>
> >>
> >>
> >> On 09/01/25 5:01 am, Nico Pache wrote:
> >>> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> >>> mTHP support we use this scan to instead record chunks of fully utilized
> >>> sections of the PMD.
> >>>
> >>> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> >>> by default we will set this to order 3. The reasoning is that for 4K 512
> >>> PMD size this results in a 64 bit bitmap which has some optimizations.
> >>> For other arches like ARM64 64K, we can set a larger order if needed.
> >>>
> >>> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> >>> that represents chunks of fully utilized regions. We can then determine
> >>> what mTHP size fits best and in the following patch, we set this bitmap
> >>> while scanning the PMD.
> >>>
> >>> max_ptes_none is used as a scale to determine how "full" an order must
> >>> be before being considered for collapse.
> >>>
> >>> Signed-off-by: Nico Pache <npache@redhat.com>
> >>> ---
> >>>    include/linux/khugepaged.h |   4 +-
> >>>    mm/khugepaged.c            | 129 +++++++++++++++++++++++++++++++++++--
> >>>    2 files changed, 126 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> >>> index 1f46046080f5..31cff8aeec4a 100644
> >>> --- a/include/linux/khugepaged.h
> >>> +++ b/include/linux/khugepaged.h
> >>> @@ -1,7 +1,9 @@
> >>>    /* SPDX-License-Identifier: GPL-2.0 */
> >>>    #ifndef _LINUX_KHUGEPAGED_H
> >>>    #define _LINUX_KHUGEPAGED_H
> >>> -
> >>
> >> Nit: I don't think this line needs to be deleted.
> >>
> >>> +#define MIN_MTHP_ORDER       3
> >>> +#define MIN_MTHP_NR  (1<<MIN_MTHP_ORDER)
> >>
> >> Nit: Insert a space: (1 << MIN_MTHP_ORDER)
> >>
> >>> +#define MTHP_BITMAP_SIZE  (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
> >>>    extern unsigned int khugepaged_max_ptes_none __read_mostly;
> >>>    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >>>    extern struct attribute_group khugepaged_attr_group;
> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>> index 9eb161b04ee4..de1dc6ea3c71 100644
> >>> --- a/mm/khugepaged.c
> >>> +++ b/mm/khugepaged.c
> >>> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >>>
> >>>    static struct kmem_cache *mm_slot_cache __ro_after_init;
> >>>
> >>> +struct scan_bit_state {
> >>> +     u8 order;
> >>> +     u8 offset;
> >>> +};
> >>> +
> >>>    struct collapse_control {
> >>>        bool is_khugepaged;
> >>>
> >>> @@ -102,6 +107,15 @@ struct collapse_control {
> >>>
> >>>        /* nodemask for allocation fallback */
> >>>        nodemask_t alloc_nmask;
> >>> +
> >>> +     /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
> >>> +     unsigned long *mthp_bitmap;
> >>> +     unsigned long *mthp_bitmap_temp;
> >>> +     struct scan_bit_state *mthp_bitmap_stack;
> >>> +};
> >>> +
> >>> +struct collapse_control khugepaged_collapse_control = {
> >>> +     .is_khugepaged = true,
> >>>    };
> >>>
> >>>    /**
> >>> @@ -389,6 +403,25 @@ int __init khugepaged_init(void)
> >>>        if (!mm_slot_cache)
> >>>                return -ENOMEM;
> >>>
> >>> +     /*
> >>> +      * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
> >>> +      * compile time for some architectures.
> >>> +      */
> >>> +     khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
> >>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> +     if (!khugepaged_collapse_control.mthp_bitmap)
> >>> +             return -ENOMEM;
> >>> +
> >>> +     khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
> >>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> +     if (!khugepaged_collapse_control.mthp_bitmap_temp)
> >>> +             return -ENOMEM;
> >>> +
> >>> +     khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
> >>> +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> >>> +     if (!khugepaged_collapse_control.mthp_bitmap_stack)
> >>> +             return -ENOMEM;
> >>> +
> >>>        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
> >>>        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
> >>>        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
> >>> @@ -400,6 +433,9 @@ int __init khugepaged_init(void)
> >>>    void __init khugepaged_destroy(void)
> >>>    {
> >>>        kmem_cache_destroy(mm_slot_cache);
> >>> +     kfree(khugepaged_collapse_control.mthp_bitmap);
> >>> +     kfree(khugepaged_collapse_control.mthp_bitmap_temp);
> >>> +     kfree(khugepaged_collapse_control.mthp_bitmap_stack);
> >>>    }
> >>>
> >>>    static inline int khugepaged_test_exit(struct mm_struct *mm)
> >>> @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
> >>>        remove_wait_queue(&khugepaged_wait, &wait);
> >>>    }
> >>>
> >>> -struct collapse_control khugepaged_collapse_control = {
> >>> -     .is_khugepaged = true,
> >>> -};
> >>> -
> >>>    static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
> >>>    {
> >>>        int i;
> >>> @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> >>>
> >>>    static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>                              int referenced, int unmapped,
> >>> -                           struct collapse_control *cc)
> >>> +                           struct collapse_control *cc, bool *mmap_locked,
> >>> +                               int order, int offset)
> >>>    {
> >>>        LIST_HEAD(compound_pagelist);
> >>>        pmd_t *pmd, _pmd;
> >>> @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>        struct mmu_notifier_range range;
> >>>        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >>>
> >>> +     /* if collapsing mTHPs we may have already released the read_lock, and
> >>> +      * need to reaquire it to keep the proper locking order.
> >>> +      */
> >>> +     if (!*mmap_locked)
> >>> +             mmap_read_lock(mm);
> >>
> >> There is no need to take the read lock again, because we drop it just
> >> after this.
> >
> > collapse_huge_page expects the mmap_lock to already be taken, and it
> > returns with it unlocked. If we are collapsing multiple mTHPs under
> > the same PMD, then I think we need to reacquire the lock before
> > calling unlock on it.
>
> I cannot figure out a potential place where we drop the lock before
> entering collapse_huge_page(). In any case, wouldn't this be better:

Let's say we are collapsing two 1024kB mTHPs in a single PMD region.
We call collapse_huge_page on the first mTHP and during the collapse
the lock is dropped.
When the second mTHP collapse is attempted the lock has already been dropped.

> if (*mmap_locked)
>         mmap_read_unlock(mm);
>
> Basically, instead of putting the if condition around the lock, you do
> it around the unlock?

Yeah that seems much cleaner, Ill give it a try, thanks!

>
> >
> >>
> >>>        /*
> >>>         * Before allocating the hugepage, release the mmap_lock read lock.
> >>>         * The allocation can take potentially a long time if it involves
> >>> @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>         * that. We will recheck the vma after taking it again in write mode.
> >>>         */
> >>>        mmap_read_unlock(mm);
> >>> +     *mmap_locked = false;
> >>>
> >>>        result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> >>>        if (result != SCAN_SUCCEED)
> >>> @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>    out_up_write:
> >>>        mmap_write_unlock(mm);
> >>>    out_nolock:
> >>> +     *mmap_locked = false;
> >>>        if (folio)
> >>>                folio_put(folio);
> >>>        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> >>>        return result;
> >>>    }
> >>>
> >>> +// Recursive function to consume the bitmap
> >>> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> >>> +                     int referenced, int unmapped, struct collapse_control *cc,
> >>> +                     bool *mmap_locked, unsigned long enabled_orders)
> >>> +{
> >>> +     u8 order, offset;
> >>> +     int num_chunks;
> >>> +     int bits_set, max_percent, threshold_bits;
> >>> +     int next_order, mid_offset;
> >>> +     int top = -1;
> >>> +     int collapsed = 0;
> >>> +     int ret;
> >>> +     struct scan_bit_state state;
> >>> +
> >>> +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +             { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> >>> +
> >>> +     while (top >= 0) {
> >>> +             state = cc->mthp_bitmap_stack[top--];
> >>> +             order = state.order;
> >>> +             offset = state.offset;
> >>> +             num_chunks = 1 << order;
> >>> +             // Skip mTHP orders that are not enabled
> >>> +             if (!(enabled_orders >> (order +  MIN_MTHP_ORDER)) & 1)
> >>> +                     goto next;
> >>> +
> >>> +             // copy the relavant section to a new bitmap
> >>> +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> >>> +                               MTHP_BITMAP_SIZE);
> >>> +
> >>> +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> >>> +
> >>> +             // Check if the region is "almost full" based on the threshold
> >>> +             max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> >>> +                                             / (HPAGE_PMD_NR - 1);
> >>> +             threshold_bits = (max_percent * num_chunks) / 100;
> >>> +
> >>> +             if (bits_set >= threshold_bits) {
> >>> +                     ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> >>> +                                     mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> >>> +                     if (ret == SCAN_SUCCEED)
> >>> +                             collapsed += (1 << (order + MIN_MTHP_ORDER));
> >>> +                     continue;
> >>> +             }
> >>> +
> >>> +next:
> >>> +             if (order > 0) {
> >>> +                     next_order = order - 1;
> >>> +                     mid_offset = offset + (num_chunks / 2);
> >>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +                             { next_order, mid_offset };
> >>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +                             { next_order, offset };
> >>> +                     }
> >>> +     }
> >>> +     return collapsed;
> >>> +}
> >>> +
> >>>    static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>>                                   struct vm_area_struct *vma,
> >>>                                   unsigned long address, bool *mmap_locked,
> >>> @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>>        pte_unmap_unlock(pte, ptl);
> >>>        if (result == SCAN_SUCCEED) {
> >>>                result = collapse_huge_page(mm, address, referenced,
> >>> -                                         unmapped, cc);
> >>> +                                         unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
> >>>                /* collapse_huge_page will return with the mmap_lock released */
> >>>                *mmap_locked = false;
> >>>        }
> >>> @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >>>                return -ENOMEM;
> >>>        cc->is_khugepaged = false;
> >>>
> >>> +     cc->mthp_bitmap = kmalloc_array(
> >>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> +     if (!cc->mthp_bitmap)
> >>> +             return -ENOMEM;
> >>> +
> >>> +     cc->mthp_bitmap_temp = kmalloc_array(
> >>> +             BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> +     if (!cc->mthp_bitmap_temp)
> >>> +             return -ENOMEM;
> >>> +
> >>> +     cc->mthp_bitmap_stack = kmalloc_array(
> >>> +             MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> >>> +     if (!cc->mthp_bitmap_stack)
> >>> +             return -ENOMEM;
> >>> +
> >>>        mmgrab(mm);
> >>>        lru_add_drain_all();
> >>>
> >>> @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >>>    out_nolock:
> >>>        mmap_assert_locked(mm);
> >>>        mmdrop(mm);
> >>> +     kfree(cc->mthp_bitmap);
> >>> +     kfree(cc->mthp_bitmap_temp);
> >>> +     kfree(cc->mthp_bitmap_stack);
> >>>        kfree(cc);
> >>>
> >>> +
> >>>        return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
> >>>                        : madvise_collapse_errno(last_fail);
> >>>    }
> >>
> >
>