[PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()

Nico Pache posted 5 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 3 weeks, 5 days ago
The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing. Create collapse_single_pmd
to increase code reuse and create an entry point to these two users.

Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that fixes what is most likely an undiscovered bug. The current implementation of
khugepaged tests collapse_test_exit_or_disable before calling
collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
case. By unifying these two callers madvise_collapse now also performs
this check. We also modify the return value to be SCAN_ANY_PROCESS which
properly indicates that this process is no longer valid to operate on.

By moving the madvise_collapse writeback-retry logic into the helper
function we can also avoid having to revalidate the VMA.

We also guard the khugepaged_pages_collapsed variable to ensure it is only
incremented for khugepaged.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
 1 file changed, 63 insertions(+), 57 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 33ae56e313ed..733c4a42c2ce 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
 	return result;
 }
 
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static enum scan_result collapse_single_pmd(unsigned long addr,
+		struct vm_area_struct *vma, bool *mmap_locked,
+		struct collapse_control *cc)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	bool triggered_wb = false;
+	enum scan_result result;
+	struct file *file;
+	pgoff_t pgoff;
+
+	if (vma_is_anonymous(vma)) {
+		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
+		goto end;
+	}
+
+	file = get_file(vma->vm_file);
+	pgoff = linear_page_index(vma, addr);
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+retry:
+	result = collapse_scan_file(mm, addr, file, pgoff, cc);
+
+	/*
+	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
+	 * then retry the collapse one time.
+	 */
+	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
+		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+		triggered_wb = true;
+		goto retry;
+	}
+	fput(file);
+
+	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
+		mmap_read_lock(mm);
+		if (collapse_test_exit_or_disable(mm))
+			result = SCAN_ANY_PROCESS;
+		else
+			result = try_collapse_pte_mapped_thp(mm, addr,
+							     !cc->is_khugepaged);
+		if (result == SCAN_PMD_MAPPED)
+			result = SCAN_SUCCEED;
+		mmap_read_unlock(mm);
+	}
+end:
+	if (cc->is_khugepaged && result == SCAN_SUCCEED)
+		++khugepaged_pages_collapsed;
+	return result;
+}
+
 static void collapse_scan_mm_slot(unsigned int progress_max,
 		enum scan_result *result, struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
@@ -2479,34 +2538,9 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
 				  hend);
-			if (!vma_is_anonymous(vma)) {
-				struct file *file = get_file(vma->vm_file);
-				pgoff_t pgoff = linear_page_index(vma,
-						khugepaged_scan.address);
-
-				mmap_read_unlock(mm);
-				mmap_locked = false;
-				*result = collapse_scan_file(mm,
-					khugepaged_scan.address, file, pgoff, cc);
-				fput(file);
-				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
-					mmap_read_lock(mm);
-					if (collapse_test_exit_or_disable(mm))
-						goto breakouterloop;
-					*result = try_collapse_pte_mapped_thp(mm,
-						khugepaged_scan.address, false);
-					if (*result == SCAN_PMD_MAPPED)
-						*result = SCAN_SUCCEED;
-					mmap_read_unlock(mm);
-				}
-			} else {
-				*result = collapse_scan_pmd(mm, vma,
-					khugepaged_scan.address, &mmap_locked, cc);
-			}
-
-			if (*result == SCAN_SUCCEED)
-				++khugepaged_pages_collapsed;
 
+			*result = collapse_single_pmd(khugepaged_scan.address,
+						      vma, &mmap_locked, cc);
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
 			if (!mmap_locked)
@@ -2806,9 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 
 	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
 		enum scan_result result = SCAN_FAIL;
-		bool triggered_wb = false;
 
-retry:
 		if (!mmap_locked) {
 			cond_resched();
 			mmap_read_lock(mm);
@@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
 		}
 		mmap_assert_locked(mm);
-		if (!vma_is_anonymous(vma)) {
-			struct file *file = get_file(vma->vm_file);
-			pgoff_t pgoff = linear_page_index(vma, addr);
 
-			mmap_read_unlock(mm);
-			mmap_locked = false;
-			*lock_dropped = true;
-			result = collapse_scan_file(mm, addr, file, pgoff, cc);
-
-			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
-			    mapping_can_writeback(file->f_mapping)) {
-				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
-				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
 
-				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
-				triggered_wb = true;
-				fput(file);
-				goto retry;
-			}
-			fput(file);
-		} else {
-			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
-		}
 		if (!mmap_locked)
 			*lock_dropped = true;
 
-handle_result:
 		switch (result) {
 		case SCAN_SUCCEED:
 		case SCAN_PMD_MAPPED:
 			++thps;
 			break;
-		case SCAN_PTE_MAPPED_HUGEPAGE:
-			BUG_ON(mmap_locked);
-			mmap_read_lock(mm);
-			result = try_collapse_pte_mapped_thp(mm, addr, true);
-			mmap_read_unlock(mm);
-			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
 		case SCAN_NO_PTE_TABLE:
+		case SCAN_PTE_MAPPED_HUGEPAGE:
 		case SCAN_PTE_NON_PRESENT:
 		case SCAN_PTE_UFFD_WP:
 		case SCAN_LACK_REFERENCED_PAGE:
-- 
2.53.0
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Lorenzo Stoakes (Oracle) 3 weeks, 1 day ago
On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing. Create collapse_single_pmd
> to increase code reuse and create an entry point to these two users.

Ah this is nice :) Thanks!

>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
>
> By moving the madvise_collapse writeback-retry logic into the helper
> function we can also avoid having to revalidate the VMA.
>
> We also guard the khugepaged_pages_collapsed variable to ensure its only
> incremented for khugepaged.
>
> Signed-off-by: Nico Pache <npache@redhat.com>

The logic all seems correct to me, just a bunch of nits below really. This is
a really nice refactoring! :)

With them addressed:

Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Cheers, Lorenzo

> ---
>  mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
>  1 file changed, 63 insertions(+), 57 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 33ae56e313ed..733c4a42c2ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
>  	return result;
>  }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static enum scan_result collapse_single_pmd(unsigned long addr,
> +		struct vm_area_struct *vma, bool *mmap_locked,

mmap_locked seems mildly pointless here, and it's a semi-code smell to pass 'is
locked' flags I think.

You never read this, but the parameter implies somebody might pass in mmaplocked
== false, but you know it's always true here.

Anyway I think it makes more sense to pass in lock_dropped and get rid of
mmap_locked in madvise_collapse() and just pass in lock_dropped directly
(setting it false if anon).

Also obviously update collapse_scan_mm_slot() to use lock_dropped instead just
inverted.

That's clearer I think since it makes it a verb rather than a noun and the
function is dictating whether or not the lock is dropped, it also implies the
lock is held on entry.

> +		struct collapse_control *cc)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	bool triggered_wb = false;
> +	enum scan_result result;
> +	struct file *file;
> +	pgoff_t pgoff;
> +

Maybe move the mmap_assert_locked() from madvise_collapse() to here? Then we
assert it in both cases.

> +	if (vma_is_anonymous(vma)) {
> +		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> +		goto end;
> +	}
> +
> +	file = get_file(vma->vm_file);
> +	pgoff = linear_page_index(vma, addr);
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +retry:
> +	result = collapse_scan_file(mm, addr, file, pgoff, cc);
> +
> +	/*
> +	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> +	 * then retry the collapse one time.
> +	 */
> +	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> +	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
> +		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> +		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +
> +		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> +		triggered_wb = true;
> +		goto retry;

Thinking through this logic I do agree that we don't need to revalidate here,
which should be quite a nice win, I just don't know why we previously assumed
we'd have to... or maybe it was just because it became too spaghetti to goto
around it somehow??

> +	}
> +	fput(file);
> +
> +	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
> +		mmap_read_lock(mm);
> +		if (collapse_test_exit_or_disable(mm))
> +			result = SCAN_ANY_PROCESS;
> +		else
> +			result = try_collapse_pte_mapped_thp(mm, addr,
> +							     !cc->is_khugepaged);
> +		if (result == SCAN_PMD_MAPPED)
> +			result = SCAN_SUCCEED;
> +		mmap_read_unlock(mm);
> +	}
> +end:
> +	if (cc->is_khugepaged && result == SCAN_SUCCEED)
> +		++khugepaged_pages_collapsed;
> +	return result;
> +}
> +
>  static void collapse_scan_mm_slot(unsigned int progress_max,
>  		enum scan_result *result, struct collapse_control *cc)
>  	__releases(&khugepaged_mm_lock)
> @@ -2479,34 +2538,9 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
>  			VM_BUG_ON(khugepaged_scan.address < hstart ||
>  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
>  				  hend);

Nice-to-have, but could we convert these VM_BUG_ON()'s to VM_WARN_ON_ONCE()'s
while we're passing?

> -			if (!vma_is_anonymous(vma)) {
> -				struct file *file = get_file(vma->vm_file);
> -				pgoff_t pgoff = linear_page_index(vma,
> -						khugepaged_scan.address);
> -
> -				mmap_read_unlock(mm);
> -				mmap_locked = false;
> -				*result = collapse_scan_file(mm,
> -					khugepaged_scan.address, file, pgoff, cc);
> -				fput(file);
> -				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> -					mmap_read_lock(mm);
> -					if (collapse_test_exit_or_disable(mm))
> -						goto breakouterloop;
> -					*result = try_collapse_pte_mapped_thp(mm,
> -						khugepaged_scan.address, false);
> -					if (*result == SCAN_PMD_MAPPED)
> -						*result = SCAN_SUCCEED;
> -					mmap_read_unlock(mm);
> -				}
> -			} else {
> -				*result = collapse_scan_pmd(mm, vma,
> -					khugepaged_scan.address, &mmap_locked, cc);
> -			}
> -
> -			if (*result == SCAN_SUCCEED)
> -				++khugepaged_pages_collapsed;
>
> +			*result = collapse_single_pmd(khugepaged_scan.address,
> +						      vma, &mmap_locked, cc);
>  			/* move to next address */
>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
>  			if (!mmap_locked)
> @@ -2806,9 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>
>  	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
>  		enum scan_result result = SCAN_FAIL;
> -		bool triggered_wb = false;
>
> -retry:
>  		if (!mmap_locked) {
>  			cond_resched();
>  			mmap_read_lock(mm);
> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>  			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>  		}
>  		mmap_assert_locked(mm);
> -		if (!vma_is_anonymous(vma)) {
> -			struct file *file = get_file(vma->vm_file);
> -			pgoff_t pgoff = linear_page_index(vma, addr);
>
> -			mmap_read_unlock(mm);
> -			mmap_locked = false;
> -			*lock_dropped = true;
> -			result = collapse_scan_file(mm, addr, file, pgoff, cc);
> -
> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> -			    mapping_can_writeback(file->f_mapping)) {
> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>
> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> -				triggered_wb = true;
> -				fput(file);
> -				goto retry;
> -			}
> -			fput(file);
> -		} else {
> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
> -		}
>  		if (!mmap_locked)
>  			*lock_dropped = true;
>
> -handle_result:
>  		switch (result) {
>  		case SCAN_SUCCEED:
>  		case SCAN_PMD_MAPPED:
>  			++thps;
>  			break;
> -		case SCAN_PTE_MAPPED_HUGEPAGE:
> -			BUG_ON(mmap_locked);
> -			mmap_read_lock(mm);
> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
> -			mmap_read_unlock(mm);
> -			goto handle_result;
>  		/* Whitelisted set of results where continuing OK */
>  		case SCAN_NO_PTE_TABLE:
> +		case SCAN_PTE_MAPPED_HUGEPAGE:
>  		case SCAN_PTE_NON_PRESENT:
>  		case SCAN_PTE_UFFD_WP:
>  		case SCAN_LACK_REFERENCED_PAGE:
> --
> 2.53.0
>
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 2 weeks, 6 days ago

On 3/16/26 12:54 PM, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
>> The khugepaged daemon and madvise_collapse have two different
>> implementations that do almost the same thing. Create collapse_single_pmd
>> to increase code reuse and create an entry point to these two users.
> 
> Ah this is nice :) Thanks!

Thanks :) hopefully more khugepaged cleanups to come after these series land.

> 
>>
>> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
>> collapse_single_pmd function. This introduces a minor behavioral change
>> that is most likely an undiscovered bug. The current implementation of
>> khugepaged tests collapse_test_exit_or_disable before calling
>> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
>> case. By unifying these two callers madvise_collapse now also performs
>> this check. We also modify the return value to be SCAN_ANY_PROCESS which
>> properly indicates that this process is no longer valid to operate on.
>>
>> By moving the madvise_collapse writeback-retry logic into the helper
>> function we can also avoid having to revalidate the VMA.
>>
>> We also guard the khugepaged_pages_collapsed variable to ensure its only
>> incremented for khugepaged.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
> 
> The logic all seems correct to me, just a bunch of nits below really. This is
> a really nice refactoring! :)
> 
> With them addressed:
> 
> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Thanks I will address those!

> 
> Cheers, Lorenzo
> 
>> ---
>>  mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
>>  1 file changed, 63 insertions(+), 57 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 33ae56e313ed..733c4a42c2ce 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
>>  	return result;
>>  }
>>
>> +/*
>> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
>> + * the results.
>> + */
>> +static enum scan_result collapse_single_pmd(unsigned long addr,
>> +		struct vm_area_struct *vma, bool *mmap_locked,
> 
> mmap_locked seems mildly pointless here, and it's a semi-code smell to pass 'is
> locked' flags I think.
> 
> You never read this, but the parameter implies somebody might pass in mmaplocked
> == false, but you know it's always true here.
> 
> Anyway I think it makes more sense to pass in lock_dropped and get rid of
> mmap_locked in madvise_collapse() and just pass in lock_dropped directly
> (setting it false if anon).
> 
> Also obviously update collapse_scan_mm_slot() to use lock_dropped instead just
> inverted.
> 
> That's clearer I think since it makes it a verb rather than a noun and the
> function is dictating whether or not the lock is dropped, it also implies the
> lock is held on entry.

Ok I will give this a shot!

> 
>> +		struct collapse_control *cc)
>> +{
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	bool triggered_wb = false;
>> +	enum scan_result result;
>> +	struct file *file;
>> +	pgoff_t pgoff;
>> +
> 
> Maybe move the mmap_assert_locked() from madvise_collapse() to here? Then we
> assert it in both cases.

ack, Sounds like a good idea!

> 
>> +	if (vma_is_anonymous(vma)) {
>> +		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
>> +		goto end;
>> +	}
>> +
>> +	file = get_file(vma->vm_file);
>> +	pgoff = linear_page_index(vma, addr);
>> +
>> +	mmap_read_unlock(mm);
>> +	*mmap_locked = false;
>> +retry:
>> +	result = collapse_scan_file(mm, addr, file, pgoff, cc);
>> +
>> +	/*
>> +	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
>> +	 * then retry the collapse one time.
>> +	 */
>> +	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
>> +	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
>> +		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>> +		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>> +
>> +		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>> +		triggered_wb = true;
>> +		goto retry;
> 
> Thinking through this logic I do agree that we don't need to revalidate here,
> which should be quite a nice win, I just don't know why we previously assumed
> we'd have to... or maybe it was just because it became too spaghetti to goto
> around it somehow??

I believe the latter, the retry went at the top of the loop, and the
revalidation was already being done.

> 
>> +	}
>> +	fput(file);
>> +
>> +	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
>> +		mmap_read_lock(mm);
>> +		if (collapse_test_exit_or_disable(mm))
>> +			result = SCAN_ANY_PROCESS;
>> +		else
>> +			result = try_collapse_pte_mapped_thp(mm, addr,
>> +							     !cc->is_khugepaged);
>> +		if (result == SCAN_PMD_MAPPED)
>> +			result = SCAN_SUCCEED;
>> +		mmap_read_unlock(mm);
>> +	}
>> +end:
>> +	if (cc->is_khugepaged && result == SCAN_SUCCEED)
>> +		++khugepaged_pages_collapsed;
>> +	return result;
>> +}
>> +
>>  static void collapse_scan_mm_slot(unsigned int progress_max,
>>  		enum scan_result *result, struct collapse_control *cc)
>>  	__releases(&khugepaged_mm_lock)
>> @@ -2479,34 +2538,9 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
>>  			VM_BUG_ON(khugepaged_scan.address < hstart ||
>>  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
>>  				  hend);
> 
> Nice-to-have, but could we convert these VM_BUG_ON()'s to VM_WARN_ON_ONCE()'s
> while we're passing?

Yeah sure, I have a question about these, because they do concern me (perhaps
out of ignorance). does a WARN_ON_ONCE stop the daemon? I would be concerned
about a rogue khugepaged instance going through and messing with page tables
when it fails some assertion. Could this not lead to serious memory/file
corruptions?

Thanks for the reviews!

Cheers,
-- Nico

> 
>> -			if (!vma_is_anonymous(vma)) {
>> -				struct file *file = get_file(vma->vm_file);
>> -				pgoff_t pgoff = linear_page_index(vma,
>> -						khugepaged_scan.address);
>> -
>> -				mmap_read_unlock(mm);
>> -				mmap_locked = false;
>> -				*result = collapse_scan_file(mm,
>> -					khugepaged_scan.address, file, pgoff, cc);
>> -				fput(file);
>> -				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
>> -					mmap_read_lock(mm);
>> -					if (collapse_test_exit_or_disable(mm))
>> -						goto breakouterloop;
>> -					*result = try_collapse_pte_mapped_thp(mm,
>> -						khugepaged_scan.address, false);
>> -					if (*result == SCAN_PMD_MAPPED)
>> -						*result = SCAN_SUCCEED;
>> -					mmap_read_unlock(mm);
>> -				}
>> -			} else {
>> -				*result = collapse_scan_pmd(mm, vma,
>> -					khugepaged_scan.address, &mmap_locked, cc);
>> -			}
>> -
>> -			if (*result == SCAN_SUCCEED)
>> -				++khugepaged_pages_collapsed;
>>
>> +			*result = collapse_single_pmd(khugepaged_scan.address,
>> +						      vma, &mmap_locked, cc);
>>  			/* move to next address */
>>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
>>  			if (!mmap_locked)
>> @@ -2806,9 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>
>>  	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
>>  		enum scan_result result = SCAN_FAIL;
>> -		bool triggered_wb = false;
>>
>> -retry:
>>  		if (!mmap_locked) {
>>  			cond_resched();
>>  			mmap_read_lock(mm);
>> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>  			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>>  		}
>>  		mmap_assert_locked(mm);
>> -		if (!vma_is_anonymous(vma)) {
>> -			struct file *file = get_file(vma->vm_file);
>> -			pgoff_t pgoff = linear_page_index(vma, addr);
>>
>> -			mmap_read_unlock(mm);
>> -			mmap_locked = false;
>> -			*lock_dropped = true;
>> -			result = collapse_scan_file(mm, addr, file, pgoff, cc);
>> -
>> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>> -			    mapping_can_writeback(file->f_mapping)) {
>> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>> +		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>>
>> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>> -				triggered_wb = true;
>> -				fput(file);
>> -				goto retry;
>> -			}
>> -			fput(file);
>> -		} else {
>> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
>> -		}
>>  		if (!mmap_locked)
>>  			*lock_dropped = true;
>>
>> -handle_result:
>>  		switch (result) {
>>  		case SCAN_SUCCEED:
>>  		case SCAN_PMD_MAPPED:
>>  			++thps;
>>  			break;
>> -		case SCAN_PTE_MAPPED_HUGEPAGE:
>> -			BUG_ON(mmap_locked);
>> -			mmap_read_lock(mm);
>> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
>> -			mmap_read_unlock(mm);
>> -			goto handle_result;
>>  		/* Whitelisted set of results where continuing OK */
>>  		case SCAN_NO_PTE_TABLE:
>> +		case SCAN_PTE_MAPPED_HUGEPAGE:
>>  		case SCAN_PTE_NON_PRESENT:
>>  		case SCAN_PTE_UFFD_WP:
>>  		case SCAN_LACK_REFERENCED_PAGE:
>> --
>> 2.53.0
>>
>
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Lorenzo Stoakes (Oracle) 2 weeks, 5 days ago
On Wed, Mar 18, 2026 at 11:22:07AM -0600, Nico Pache wrote:
>
>
> On 3/16/26 12:54 PM, Lorenzo Stoakes (Oracle) wrote:
> > On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
> >> The khugepaged daemon and madvise_collapse have two different
> >> implementations that do almost the same thing. Create collapse_single_pmd
> >> to increase code reuse and create an entry point to these two users.
> >
> > Ah this is nice :) Thanks!
>
> Thanks :) hopefully more khugepaged cleanups to come after these series' land.
>
> >
> >>
> >> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> >> collapse_single_pmd function. This introduces a minor behavioral change
> >> that is most likely an undiscovered bug. The current implementation of
> >> khugepaged tests collapse_test_exit_or_disable before calling
> >> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> >> case. By unifying these two callers madvise_collapse now also performs
> >> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> >> properly indicates that this process is no longer valid to operate on.
> >>
> >> By moving the madvise_collapse writeback-retry logic into the helper
> >> function we can also avoid having to revalidate the VMA.
> >>
> >> We also guard the khugepaged_pages_collapsed variable to ensure its only
> >> incremented for khugepaged.
> >>
> >> Signed-off-by: Nico Pache <npache@redhat.com>
> >
> > The logic all seems correct to me, just a bunch of nits below really. This is
> > a really nice refactoring! :)
> >
> > With them addressed:
> >
> > Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
>
> Thanks I will address those!
>
> >
> > Cheers, Lorenzo
> >
> >> ---
> >>  mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
> >>  1 file changed, 63 insertions(+), 57 deletions(-)
> >>
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> index 33ae56e313ed..733c4a42c2ce 100644
> >> --- a/mm/khugepaged.c
> >> +++ b/mm/khugepaged.c
> >> @@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
> >>  	return result;
> >>  }
> >>
> >> +/*
> >> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> >> + * the results.
> >> + */
> >> +static enum scan_result collapse_single_pmd(unsigned long addr,
> >> +		struct vm_area_struct *vma, bool *mmap_locked,
> >
> > mmap_locked seems mildly pointless here, and it's a semi-code smell to pass 'is
> > locked' flags I think.
> >
> > You never read this, but the parameter implies somebody might pass in mmaplocked
> > == false, but you know it's always true here.
> >
> > Anyway I think it makes more sense to pass in lock_dropped and get rid of
> > mmap_locked in madvise_collapse() and just pass in lock_dropped directly
> > (setting it false if anon).
> >
> > Also obviously update collapse_scan_mm_slot() to use lock_dropped instead just
> > inverted.
> >
> > That's clearer I think since it makes it a verb rather than a noun and the
> > function is dictating whether or not the lock is dropped, it also implies the
> > lock is held on entry.
>
> Ok I will give this a shot!
>
> >
> >> +		struct collapse_control *cc)
> >> +{
> >> +	struct mm_struct *mm = vma->vm_mm;
> >> +	bool triggered_wb = false;
> >> +	enum scan_result result;
> >> +	struct file *file;
> >> +	pgoff_t pgoff;
> >> +
> >
> > Maybe move the mmap_assert_locked() from madvise_collapse() to here? Then we
> > assert it in both cases.
>
> ack, Sounds like a good idea!

Thanks + for above! :)

>
> >
> >> +	if (vma_is_anonymous(vma)) {
> >> +		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> >> +		goto end;
> >> +	}
> >> +
> >> +	file = get_file(vma->vm_file);
> >> +	pgoff = linear_page_index(vma, addr);
> >> +
> >> +	mmap_read_unlock(mm);
> >> +	*mmap_locked = false;
> >> +retry:
> >> +	result = collapse_scan_file(mm, addr, file, pgoff, cc);
> >> +
> >> +	/*
> >> +	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> >> +	 * then retry the collapse one time.
> >> +	 */
> >> +	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> >> +	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
> >> +		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> >> +		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> >> +
> >> +		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> >> +		triggered_wb = true;
> >> +		goto retry;
> >
> > Thinking through this logic I do agree that we don't need to revalidate here,
> > which should be quite a nice win, I just don't know why we previously assumed
> > we'd have to... or maybe it was just because it became too spaghetti to goto
> > around it somehow??
>
> I believe the latter, the retry went at the top of the loop, and the
> revalidation was already being done.

Yeah makes sense!

>
> >
> >> +	}
> >> +	fput(file);
> >> +
> >> +	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
> >> +		mmap_read_lock(mm);
> >> +		if (collapse_test_exit_or_disable(mm))
> >> +			result = SCAN_ANY_PROCESS;
> >> +		else
> >> +			result = try_collapse_pte_mapped_thp(mm, addr,
> >> +							     !cc->is_khugepaged);
> >> +		if (result == SCAN_PMD_MAPPED)
> >> +			result = SCAN_SUCCEED;
> >> +		mmap_read_unlock(mm);
> >> +	}
> >> +end:
> >> +	if (cc->is_khugepaged && result == SCAN_SUCCEED)
> >> +		++khugepaged_pages_collapsed;
> >> +	return result;
> >> +}
> >> +
> >>  static void collapse_scan_mm_slot(unsigned int progress_max,
> >>  		enum scan_result *result, struct collapse_control *cc)
> >>  	__releases(&khugepaged_mm_lock)
> >> @@ -2479,34 +2538,9 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
> >>  			VM_BUG_ON(khugepaged_scan.address < hstart ||
> >>  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
> >>  				  hend);
> >
> > Nice-to-have, but could we convert these VM_BUG_ON()'s to VM_WARN_ON_ONCE()'s
> > while we're passing?
>
> Yeah sure, I have a question about these, because they do concern me (perhaps
> out of ignorance). does a WARN_ON_ONCE stop the daemon? I would be concerned
> about a rogue khugepaged instance going through and messing with page tables
> when it fails some assertion. Could this not lead to serious memory/file
> corruptions?

It won't, but all of this is in CONFIG_DEBUG_VM anyway, so it's a kernel bug if
it happens in a release kernel, this is just for visibility when debugging and
those systems should be e.g. VMS etc.

The fact this is VM_xxx and hasn't apparently fired before suggests we're good.

>
> Thanks for the reviews!
>
> Cheers,
> -- Nico
>
> >
> >> -			if (!vma_is_anonymous(vma)) {
> >> -				struct file *file = get_file(vma->vm_file);
> >> -				pgoff_t pgoff = linear_page_index(vma,
> >> -						khugepaged_scan.address);
> >> -
> >> -				mmap_read_unlock(mm);
> >> -				mmap_locked = false;
> >> -				*result = collapse_scan_file(mm,
> >> -					khugepaged_scan.address, file, pgoff, cc);
> >> -				fput(file);
> >> -				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> >> -					mmap_read_lock(mm);
> >> -					if (collapse_test_exit_or_disable(mm))
> >> -						goto breakouterloop;
> >> -					*result = try_collapse_pte_mapped_thp(mm,
> >> -						khugepaged_scan.address, false);
> >> -					if (*result == SCAN_PMD_MAPPED)
> >> -						*result = SCAN_SUCCEED;
> >> -					mmap_read_unlock(mm);
> >> -				}
> >> -			} else {
> >> -				*result = collapse_scan_pmd(mm, vma,
> >> -					khugepaged_scan.address, &mmap_locked, cc);
> >> -			}
> >> -
> >> -			if (*result == SCAN_SUCCEED)
> >> -				++khugepaged_pages_collapsed;
> >>
> >> +			*result = collapse_single_pmd(khugepaged_scan.address,
> >> +						      vma, &mmap_locked, cc);
> >>  			/* move to next address */
> >>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
> >>  			if (!mmap_locked)
> >> @@ -2806,9 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>
> >>  	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> >>  		enum scan_result result = SCAN_FAIL;
> >> -		bool triggered_wb = false;
> >>
> >> -retry:
> >>  		if (!mmap_locked) {
> >>  			cond_resched();
> >>  			mmap_read_lock(mm);
> >> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>  			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> >>  		}
> >>  		mmap_assert_locked(mm);
> >> -		if (!vma_is_anonymous(vma)) {
> >> -			struct file *file = get_file(vma->vm_file);
> >> -			pgoff_t pgoff = linear_page_index(vma, addr);
> >>
> >> -			mmap_read_unlock(mm);
> >> -			mmap_locked = false;
> >> -			*lock_dropped = true;
> >> -			result = collapse_scan_file(mm, addr, file, pgoff, cc);
> >> -
> >> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> >> -			    mapping_can_writeback(file->f_mapping)) {
> >> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> >> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> >> +		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> >>
> >> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> >> -				triggered_wb = true;
> >> -				fput(file);
> >> -				goto retry;
> >> -			}
> >> -			fput(file);
> >> -		} else {
> >> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
> >> -		}
> >>  		if (!mmap_locked)
> >>  			*lock_dropped = true;
> >>
> >> -handle_result:
> >>  		switch (result) {
> >>  		case SCAN_SUCCEED:
> >>  		case SCAN_PMD_MAPPED:
> >>  			++thps;
> >>  			break;
> >> -		case SCAN_PTE_MAPPED_HUGEPAGE:
> >> -			BUG_ON(mmap_locked);
> >> -			mmap_read_lock(mm);
> >> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
> >> -			mmap_read_unlock(mm);
> >> -			goto handle_result;
> >>  		/* Whitelisted set of results where continuing OK */
> >>  		case SCAN_NO_PTE_TABLE:
> >> +		case SCAN_PTE_MAPPED_HUGEPAGE:
> >>  		case SCAN_PTE_NON_PRESENT:
> >>  		case SCAN_PTE_UFFD_WP:
> >>  		case SCAN_LACK_REFERENCED_PAGE:
> >> --
> >> 2.53.0
> >>
> >
>

Cheers, Lorenzo
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Lance Yang 3 weeks, 2 days ago
On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
>The khugepaged daemon and madvise_collapse have two different
>implementations that do almost the same thing. Create collapse_single_pmd
>to increase code reuse and create an entry point to these two users.
>
>Refactor madvise_collapse and collapse_scan_mm_slot to use the new
>collapse_single_pmd function. This introduces a minor behavioral change
>that is most likely an undiscovered bug. The current implementation of
>khugepaged tests collapse_test_exit_or_disable before calling
>collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
>case. By unifying these two callers madvise_collapse now also performs
>this check. We also modify the return value to be SCAN_ANY_PROCESS which
>properly indicates that this process is no longer valid to operate on.
>
>By moving the madvise_collapse writeback-retry logic into the helper
>function we can also avoid having to revalidate the VMA.
>
>We also guard the khugepaged_pages_collapsed variable to ensure it's only
>incremented for khugepaged.
>
>Signed-off-by: Nico Pache <npache@redhat.com>
>---
> mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
> 1 file changed, 63 insertions(+), 57 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 33ae56e313ed..733c4a42c2ce 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
> 	return result;
> }
> 
>+/*
>+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
>+ * the results.
>+ */
>+static enum scan_result collapse_single_pmd(unsigned long addr,
>+		struct vm_area_struct *vma, bool *mmap_locked,
>+		struct collapse_control *cc)
>+{
>+	struct mm_struct *mm = vma->vm_mm;
>+	bool triggered_wb = false;
>+	enum scan_result result;
>+	struct file *file;
>+	pgoff_t pgoff;
>+
>+	if (vma_is_anonymous(vma)) {
>+		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
>+		goto end;
>+	}
>+
>+	file = get_file(vma->vm_file);
>+	pgoff = linear_page_index(vma, addr);
>+
>+	mmap_read_unlock(mm);
>+	*mmap_locked = false;
>+retry:
>+	result = collapse_scan_file(mm, addr, file, pgoff, cc);
>+
>+	/*
>+	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
>+	 * then retry the collapse one time.
>+	 */
>+	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
>+	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
>+		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>+		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>+
>+		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>+		triggered_wb = true;
>+		goto retry;

While the old retry path did go back through hugepage_vma_revalidate(),
the retry itself is not relying on the original VMA remaining unchanged
IIUC.

After dropping mmap_lock, the code still holds a reference to the file,
so no lifetime issue should arise here :)

So, LGTM!
Reviewed-by: Lance Yang <lance.yang@linux.dev>

Cheers,
Lance
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Baolin Wang 3 weeks, 4 days ago

On 3/12/26 5:13 AM, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing. Create collapse_single_pmd
> to increase code reuse and create an entry point to these two users.
> 
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
> 
> By moving the madvise_collapse writeback-retry logic into the helper
> function we can also avoid having to revalidate the VMA.
> 
> We also guard the khugepaged_pages_collapsed variable to ensure it's only
> incremented for khugepaged.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

LGTM.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by David Hildenbrand (Arm) 3 weeks, 5 days ago
On 3/11/26 22:13, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing. Create collapse_single_pmd
> to increase code reuse and create an entry point to these two users.
> 
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
> 
> By moving the madvise_collapse writeback-retry logic into the helper
> function we can also avoid having to revalidate the VMA.
> 
> We also guard the khugepaged_pages_collapsed variable to ensure it's only
> incremented for khugepaged.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 120 +++++++++++++++++++++++++-----------------------
>  1 file changed, 63 insertions(+), 57 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 33ae56e313ed..733c4a42c2ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2409,6 +2409,65 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
>  	return result;
>  }
>  
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static enum scan_result collapse_single_pmd(unsigned long addr,
> +		struct vm_area_struct *vma, bool *mmap_locked,
> +		struct collapse_control *cc)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	bool triggered_wb = false;
> +	enum scan_result result;
> +	struct file *file;
> +	pgoff_t pgoff;
> +
> +	if (vma_is_anonymous(vma)) {
> +		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> +		goto end;
> +	}
> +
> +	file = get_file(vma->vm_file);
> +	pgoff = linear_page_index(vma, addr);
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +retry:
> +	result = collapse_scan_file(mm, addr, file, pgoff, cc);
> +
> +	/*
> +	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> +	 * then retry the collapse one time.
> +	 */
> +	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> +	    !triggered_wb && mapping_can_writeback(file->f_mapping)) {
> +		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> +		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +
> +		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> +		triggered_wb = true;
> +		goto retry;
> +	}
> +	fput(file);
> +
> +	if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
> +		mmap_read_lock(mm);
> +		if (collapse_test_exit_or_disable(mm))
> +			result = SCAN_ANY_PROCESS;
> +		else
> +			result = try_collapse_pte_mapped_thp(mm, addr,
> +							     !cc->is_khugepaged);
> +		if (result == SCAN_PMD_MAPPED)
> +			result = SCAN_SUCCEED;
> +		mmap_read_unlock(mm);
> +	}
> +end:
> +	if (cc->is_khugepaged && result == SCAN_SUCCEED)
> +		++khugepaged_pages_collapsed;
> +	return result;
> +}
> +
>  static void collapse_scan_mm_slot(unsigned int progress_max,
>  		enum scan_result *result, struct collapse_control *cc)
>  	__releases(&khugepaged_mm_lock)
> @@ -2479,34 +2538,9 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
>  			VM_BUG_ON(khugepaged_scan.address < hstart ||
>  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
>  				  hend);
> -			if (!vma_is_anonymous(vma)) {
> -				struct file *file = get_file(vma->vm_file);
> -				pgoff_t pgoff = linear_page_index(vma,
> -						khugepaged_scan.address);
> -
> -				mmap_read_unlock(mm);
> -				mmap_locked = false;
> -				*result = collapse_scan_file(mm,
> -					khugepaged_scan.address, file, pgoff, cc);
> -				fput(file);
> -				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> -					mmap_read_lock(mm);
> -					if (collapse_test_exit_or_disable(mm))
> -						goto breakouterloop;
> -					*result = try_collapse_pte_mapped_thp(mm,
> -						khugepaged_scan.address, false);
> -					if (*result == SCAN_PMD_MAPPED)
> -						*result = SCAN_SUCCEED;
> -					mmap_read_unlock(mm);
> -				}
> -			} else {
> -				*result = collapse_scan_pmd(mm, vma,
> -					khugepaged_scan.address, &mmap_locked, cc);
> -			}
> -
> -			if (*result == SCAN_SUCCEED)
> -				++khugepaged_pages_collapsed;
>  
> +			*result = collapse_single_pmd(khugepaged_scan.address,
> +						      vma, &mmap_locked, cc);
>  			/* move to next address */
>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
>  			if (!mmap_locked)
> @@ -2806,9 +2840,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>  
>  	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
>  		enum scan_result result = SCAN_FAIL;
> -		bool triggered_wb = false;
>  
> -retry:
>  		if (!mmap_locked) {
>  			cond_resched();
>  			mmap_read_lock(mm);
> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>  			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>  		}
>  		mmap_assert_locked(mm);
> -		if (!vma_is_anonymous(vma)) {
> -			struct file *file = get_file(vma->vm_file);
> -			pgoff_t pgoff = linear_page_index(vma, addr);
>  
> -			mmap_read_unlock(mm);
> -			mmap_locked = false;
> -			*lock_dropped = true;

Okay, we can get rid of this because collapse_single_pmd() will never
drop the lock, to retake it, returning with "mmap_locked == true".

If it dropped the lock, even if it relocked, it will always return with
the lock dropped and "mmap_locked == false".

The "if (!mmap_locked)" will properly set "*lock_dropped = true;".

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Wei Yang 3 weeks, 5 days ago
On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
[..]
>@@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> 		}
> 		mmap_assert_locked(mm);
>-		if (!vma_is_anonymous(vma)) {
>-			struct file *file = get_file(vma->vm_file);
>-			pgoff_t pgoff = linear_page_index(vma, addr);
> 
>-			mmap_read_unlock(mm);
>-			mmap_locked = false;
>-			*lock_dropped = true;
>-			result = collapse_scan_file(mm, addr, file, pgoff, cc);
>-
>-			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>-			    mapping_can_writeback(file->f_mapping)) {
>-				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>-				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>+		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> 
>-				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>-				triggered_wb = true;
>-				fput(file);
>-				goto retry;
>-			}
>-			fput(file);
>-		} else {
>-			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
>-		}
> 		if (!mmap_locked)
> 			*lock_dropped = true;
> 
>-handle_result:
> 		switch (result) {
> 		case SCAN_SUCCEED:
> 		case SCAN_PMD_MAPPED:
> 			++thps;
> 			break;
>-		case SCAN_PTE_MAPPED_HUGEPAGE:
>-			BUG_ON(mmap_locked);
>-			mmap_read_lock(mm);
>-			result = try_collapse_pte_mapped_thp(mm, addr, true);
>-			mmap_read_unlock(mm);
>-			goto handle_result;
> 		/* Whitelisted set of results where continuing OK */
> 		case SCAN_NO_PTE_TABLE:
>+		case SCAN_PTE_MAPPED_HUGEPAGE:

It looks we won't have this case after refactor?

Current code flow is like this:

  result = collapse_single_pmd()
      result = collapse_scan_file()
          result = collapse_file()

      if (result == SCAN_PTE_MAPPED_HUGEPAGE) {            --- (1)
          result = SCAN_ANY_PROCESS;
	  or 
	  result = try_collapse_pte_mapped_thp();
      }

Only collapse_scan_file() and collapse_file() may return
SCAN_PTE_MAPPED_HUGEPAGE, and then handled in (1). After this, result is set
to another value to indicate whether we collapse it or not.

So I am afraid we don't expect to see SCAN_PTE_MAPPED_HUGEPAGE here. Do I miss
something?

> 		case SCAN_PTE_NON_PRESENT:
> 		case SCAN_PTE_UFFD_WP:
> 		case SCAN_LACK_REFERENCED_PAGE:
>-- 
>2.53.0
>

-- 
Wei Yang
Help you, Help me
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 2 weeks, 6 days ago

On 3/11/26 8:04 PM, Wei Yang wrote:
> On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
> [..]
>> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>> 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>> 		}
>> 		mmap_assert_locked(mm);
>> -		if (!vma_is_anonymous(vma)) {
>> -			struct file *file = get_file(vma->vm_file);
>> -			pgoff_t pgoff = linear_page_index(vma, addr);
>>
>> -			mmap_read_unlock(mm);
>> -			mmap_locked = false;
>> -			*lock_dropped = true;
>> -			result = collapse_scan_file(mm, addr, file, pgoff, cc);
>> -
>> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>> -			    mapping_can_writeback(file->f_mapping)) {
>> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>> +		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>>
>> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>> -				triggered_wb = true;
>> -				fput(file);
>> -				goto retry;
>> -			}
>> -			fput(file);
>> -		} else {
>> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
>> -		}
>> 		if (!mmap_locked)
>> 			*lock_dropped = true;
>>
>> -handle_result:
>> 		switch (result) {
>> 		case SCAN_SUCCEED:
>> 		case SCAN_PMD_MAPPED:
>> 			++thps;
>> 			break;
>> -		case SCAN_PTE_MAPPED_HUGEPAGE:
>> -			BUG_ON(mmap_locked);
>> -			mmap_read_lock(mm);
>> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
>> -			mmap_read_unlock(mm);
>> -			goto handle_result;
>> 		/* Whitelisted set of results where continuing OK */
>> 		case SCAN_NO_PTE_TABLE:
>> +		case SCAN_PTE_MAPPED_HUGEPAGE:
> 
> It looks we won't have this case after refactor?
> 
> Current code flow is like this:
> 
>   result = collapse_single_pmd()
>       result = collapse_scan_file()
>           result = collapse_file()
> 
>       if (result == SCAN_PTE_MAPPED_HUGEPAGE) {            --- (1)
>           result = SCAN_ANY_PROCESS;
> 	  or 
> 	  result = try_collapse_pte_mapped_thp();
>       }
> 
> Only collapse_scan_file() and collapse_file() may return
> SCAN_PTE_MAPPED_HUGEPAGE, and then handled in (1). After this, result is set
> to another value to indicate whether we collapse it or not.
> 
> So I am afraid we don't expect to see SCAN_PTE_MAPPED_HUGEPAGE here. Do I miss
> something?

No, your assessment is correct — should I remove it from the list? I've been quite
confused about requests to list all the available ENUMs: does that mean we want
all the enums that are reachable, or all the enums that are available as a
result? I'm guessing the former based on your comment.

Cheers,
-- Nico

> 
>> 		case SCAN_PTE_NON_PRESENT:
>> 		case SCAN_PTE_UFFD_WP:
>> 		case SCAN_LACK_REFERENCED_PAGE:
>> -- 
>> 2.53.0
>>
>
Re: [PATCH mm-unstable v3 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Wei Yang 2 weeks, 4 days ago
On Wed, Mar 18, 2026 at 10:54:17AM -0600, Nico Pache wrote:
>
>
>On 3/11/26 8:04 PM, Wei Yang wrote:
>> On Wed, Mar 11, 2026 at 03:13:15PM -0600, Nico Pache wrote:
>> [..]
>>> @@ -2823,46 +2855,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>> 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>>> 		}
>>> 		mmap_assert_locked(mm);
>>> -		if (!vma_is_anonymous(vma)) {
>>> -			struct file *file = get_file(vma->vm_file);
>>> -			pgoff_t pgoff = linear_page_index(vma, addr);
>>>
>>> -			mmap_read_unlock(mm);
>>> -			mmap_locked = false;
>>> -			*lock_dropped = true;
>>> -			result = collapse_scan_file(mm, addr, file, pgoff, cc);
>>> -
>>> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>>> -			    mapping_can_writeback(file->f_mapping)) {
>>> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
>>> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>>> +		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>>>
>>> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
>>> -				triggered_wb = true;
>>> -				fput(file);
>>> -				goto retry;
>>> -			}
>>> -			fput(file);
>>> -		} else {
>>> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
>>> -		}
>>> 		if (!mmap_locked)
>>> 			*lock_dropped = true;
>>>
>>> -handle_result:
>>> 		switch (result) {
>>> 		case SCAN_SUCCEED:
>>> 		case SCAN_PMD_MAPPED:
>>> 			++thps;
>>> 			break;
>>> -		case SCAN_PTE_MAPPED_HUGEPAGE:
>>> -			BUG_ON(mmap_locked);
>>> -			mmap_read_lock(mm);
>>> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
>>> -			mmap_read_unlock(mm);
>>> -			goto handle_result;
>>> 		/* Whitelisted set of results where continuing OK */
>>> 		case SCAN_NO_PTE_TABLE:
>>> +		case SCAN_PTE_MAPPED_HUGEPAGE:
>> 
>> It looks we won't have this case after refactor?
>> 
>> Current code flow is like this:
>> 
>>   result = collapse_single_pmd()
>>       result = collapse_scan_file()
>>           result = collapse_file()
>> 
>>       if (result == SCAN_PTE_MAPPED_HUGEPAGE) {            --- (1)
>>           result = SCAN_ANY_PROCESS;
>> 	  or 
>> 	  result = try_collapse_pte_mapped_thp();
>>       }
>> 
>> Only collapse_scan_file() and collapse_file() may return
>> SCAN_PTE_MAPPED_HUGEPAGE, and then handled in (1). After this, result is set
>> to another value to indicate whether we collapse it or not.
>> 
>> So I am afraid we don't expect to see SCAN_PTE_MAPPED_HUGEPAGE here. Do I miss
>> something?
>
>No your assessment is correct, should I remove it from the list? I've been quite
>confused about requests to list all the available ENUMs, does that mean we want
>all the enums that are reachable or all the enums that are available as a
>result? Im guessing the former based on your comment.
>

If it were me, I would remove it :-) Otherwise, I would think
collapse_single_pmd() may return SCAN_PTE_MAPPED_HUGEPAGE.

But, yeah, this is really trivial.

>Cheers,
>-- Nico
>
>> 
>>> 		case SCAN_PTE_NON_PRESENT:
>>> 		case SCAN_PTE_UFFD_WP:
>>> 		case SCAN_LACK_REFERENCED_PAGE:
>>> -- 
>>> 2.53.0
>>>
>> 

-- 
Wei Yang
Help you, Help me