[PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()

Nico Pache posted 5 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 1 month, 1 week ago
The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing.

Create collapse_single_pmd to increase code reuse and create an entry
point to these two users.

Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that is most likely an undiscovered bug. The current implementation of
khugepaged tests collapse_test_exit_or_disable before calling
collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
case. By unifying these two callers madvise_collapse now also performs
this check. We also modify the return value to be SCAN_ANY_PROCESS which
properly indicates that this process is no longer valid to operate on.

We also guard the khugepaged_pages_collapsed variable to ensure its only
incremented for khugepaged.

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
 1 file changed, 69 insertions(+), 59 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 64086488ca01..0058970d4579 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
 	return result;
 }
 
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static enum scan_result collapse_single_pmd(unsigned long addr,
+		struct vm_area_struct *vma, bool *mmap_locked,
+		unsigned int *cur_progress, struct collapse_control *cc)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	bool triggered_wb = false;
+	enum scan_result result;
+	struct file *file;
+	pgoff_t pgoff;
+
+	if (vma_is_anonymous(vma)) {
+		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
+		goto end;
+	}
+
+	file = get_file(vma->vm_file);
+	pgoff = linear_page_index(vma, addr);
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+retry:
+	result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
+
+	/*
+	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
+	 * then retry the collapse one time.
+	 */
+	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+	    triggered_wb && mapping_can_writeback(file->f_mapping)) {
+		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+		triggered_wb = true;
+		goto retry;
+	}
+	fput(file);
+
+	if (result != SCAN_PTE_MAPPED_HUGEPAGE)
+		goto end;
+
+	mmap_read_lock(mm);
+	*mmap_locked = true;
+	if (collapse_test_exit_or_disable(mm)) {
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+		return SCAN_ANY_PROCESS;
+	}
+	result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
+	if (result == SCAN_PMD_MAPPED)
+		result = SCAN_SUCCEED;
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+end:
+	if (cc->is_khugepaged && result == SCAN_SUCCEED)
+		++khugepaged_pages_collapsed;
+	return result;
+}
+
 static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
 					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
@@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
 				  hend);
-			if (!vma_is_anonymous(vma)) {
-				struct file *file = get_file(vma->vm_file);
-				pgoff_t pgoff = linear_page_index(vma,
-						khugepaged_scan.address);
-
-				mmap_read_unlock(mm);
-				mmap_locked = false;
-				*result = collapse_scan_file(mm,
-					khugepaged_scan.address, file, pgoff,
-					&cur_progress, cc);
-				fput(file);
-				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
-					mmap_read_lock(mm);
-					if (collapse_test_exit_or_disable(mm))
-						goto breakouterloop;
-					*result = try_collapse_pte_mapped_thp(mm,
-						khugepaged_scan.address, false);
-					if (*result == SCAN_PMD_MAPPED)
-						*result = SCAN_SUCCEED;
-					mmap_read_unlock(mm);
-				}
-			} else {
-				*result = collapse_scan_pmd(mm, vma,
-					khugepaged_scan.address, &mmap_locked,
-					&cur_progress, cc);
-			}
-
-			if (*result == SCAN_SUCCEED)
-				++khugepaged_pages_collapsed;
 
+			*result = collapse_single_pmd(khugepaged_scan.address,
+						      vma, &mmap_locked, &cur_progress, cc);
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
 			progress += cur_progress;
@@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 
 	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
 		enum scan_result result = SCAN_FAIL;
-		bool triggered_wb = false;
 
-retry:
 		if (!mmap_locked) {
 			cond_resched();
 			mmap_read_lock(mm);
 			mmap_locked = true;
+			*lock_dropped = true;
 			result = hugepage_vma_revalidate(mm, addr, false, &vma,
 							 cc);
 			if (result  != SCAN_SUCCEED) {
@@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
 		}
 		mmap_assert_locked(mm);
-		if (!vma_is_anonymous(vma)) {
-			struct file *file = get_file(vma->vm_file);
-			pgoff_t pgoff = linear_page_index(vma, addr);
-
-			mmap_read_unlock(mm);
-			mmap_locked = false;
-			*lock_dropped = true;
-			result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
 
-			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
-			    mapping_can_writeback(file->f_mapping)) {
-				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
-				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+		result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
 
-				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
-				triggered_wb = true;
-				fput(file);
-				goto retry;
-			}
-			fput(file);
-		} else {
-			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
-		}
 		if (!mmap_locked)
 			*lock_dropped = true;
 
-handle_result:
 		switch (result) {
 		case SCAN_SUCCEED:
 		case SCAN_PMD_MAPPED:
 			++thps;
 			break;
-		case SCAN_PTE_MAPPED_HUGEPAGE:
-			BUG_ON(mmap_locked);
-			mmap_read_lock(mm);
-			result = try_collapse_pte_mapped_thp(mm, addr, true);
-			mmap_read_unlock(mm);
-			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
 		case SCAN_NO_PTE_TABLE:
+		case SCAN_PTE_MAPPED_HUGEPAGE:
 		case SCAN_PTE_NON_PRESENT:
 		case SCAN_PTE_UFFD_WP:
 		case SCAN_LACK_REFERENCED_PAGE:
-- 
2.53.0
Re: [PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
On 2/26/26 02:29, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
> 
> Create collapse_single_pmd to increase code reuse and create an entry
> point to these two users.
> 
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
> 
> We also guard the khugepaged_pages_collapsed variable to ensure its only
> incremented for khugepaged.
> 
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Probably best to drop Lorenzo's RB after bigger changes.

> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
>  1 file changed, 69 insertions(+), 59 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 64086488ca01..0058970d4579 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
>  	return result;
>  }
>  
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static enum scan_result collapse_single_pmd(unsigned long addr,
> +		struct vm_area_struct *vma, bool *mmap_locked,
> +		unsigned int *cur_progress, struct collapse_control *cc)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	bool triggered_wb = false;
> +	enum scan_result result;
> +	struct file *file;
> +	pgoff_t pgoff;
> +
> +	if (vma_is_anonymous(vma)) {
> +		result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
> +		goto end;
> +	}
> +
> +	file = get_file(vma->vm_file);
> +	pgoff = linear_page_index(vma, addr);
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +retry:
> +	result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
> +
> +	/*
> +	 * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> +	 * then retry the collapse one time.
> +	 */
> +	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> +	    triggered_wb && mapping_can_writeback(file->f_mapping)) {

!triggered_wb, right?


> +		const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> +		const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +
> +		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> +		triggered_wb = true;
> +		goto retry;
> +	}
> +	fput(file);
> +
> +	if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> +		goto end;
> +
> +	mmap_read_lock(mm);
> +	*mmap_locked = true;

On all paths below, you set "*mmap_locked = false". Why even bother about setting the variable?

> +	if (collapse_test_exit_or_disable(mm)) {
> +		mmap_read_unlock(mm);
> +		*mmap_locked = false;
> +		return SCAN_ANY_PROCESS;
> +	}
> +	result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> +	if (result == SCAN_PMD_MAPPED)
> +		result = SCAN_SUCCEED;
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;

This might all read nicer without the goto and without the early return.

/* If we have a THP in the pagecache, try to retract the pagetable. */
if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
	mmap_read_lock(mm);
	if (collapse_test_exit_or_disable(mm))
		result = SCAN_ANY_PROCESS;
	else
		result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
	if (result == SCAN_PMD_MAPPED)
		result = SCAN_SUCCEED
	}
	mmap_read_unlock(mm);
}

> +
> +end:
> +	if (cc->is_khugepaged && result == SCAN_SUCCEED)
> +		++khugepaged_pages_collapsed;
> +	return result;
> +}
> +
>  static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
>  					    struct collapse_control *cc)
>  	__releases(&khugepaged_mm_lock)
> @@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
>  			VM_BUG_ON(khugepaged_scan.address < hstart ||
>  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
>  				  hend);
> -			if (!vma_is_anonymous(vma)) {
> -				struct file *file = get_file(vma->vm_file);
> -				pgoff_t pgoff = linear_page_index(vma,
> -						khugepaged_scan.address);
> -
> -				mmap_read_unlock(mm);
> -				mmap_locked = false;
> -				*result = collapse_scan_file(mm,
> -					khugepaged_scan.address, file, pgoff,
> -					&cur_progress, cc);
> -				fput(file);
> -				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> -					mmap_read_lock(mm);
> -					if (collapse_test_exit_or_disable(mm))
> -						goto breakouterloop;
> -					*result = try_collapse_pte_mapped_thp(mm,
> -						khugepaged_scan.address, false);
> -					if (*result == SCAN_PMD_MAPPED)
> -						*result = SCAN_SUCCEED;
> -					mmap_read_unlock(mm);
> -				}
> -			} else {
> -				*result = collapse_scan_pmd(mm, vma,
> -					khugepaged_scan.address, &mmap_locked,
> -					&cur_progress, cc);
> -			}
> -
> -			if (*result == SCAN_SUCCEED)
> -				++khugepaged_pages_collapsed;
>  
> +			*result = collapse_single_pmd(khugepaged_scan.address,
> +						      vma, &mmap_locked, &cur_progress, cc);
>  			/* move to next address */
>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
>  			progress += cur_progress;
> @@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>  
>  	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
>  		enum scan_result result = SCAN_FAIL;
> -		bool triggered_wb = false;
>  
> -retry:
>  		if (!mmap_locked) {
>  			cond_resched();
>  			mmap_read_lock(mm);
>  			mmap_locked = true;
> +			*lock_dropped = true;

Hm, is this change here required at all? Shouldn't we instead need to know from
collapse_single_pmd() whether it dropped the lock?


-- 
Cheers,

David
Re: [PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 1 month, 1 week ago
On Thu, Feb 26, 2026 at 2:41 AM David Hildenbrand (Arm)
<david@kernel.org> wrote:
>
> On 2/26/26 02:29, Nico Pache wrote:
> > The khugepaged daemon and madvise_collapse have two different
> > implementations that do almost the same thing.
> >
> > Create collapse_single_pmd to increase code reuse and create an entry
> > point to these two users.
> >
> > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > collapse_single_pmd function. This introduces a minor behavioral change
> > that is most likely an undiscovered bug. The current implementation of
> > khugepaged tests collapse_test_exit_or_disable before calling
> > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > case. By unifying these two callers madvise_collapse now also performs
> > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > properly indicates that this process is no longer valid to operate on.
> >
> > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > incremented for khugepaged.
> >
> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Probably best to drop Lorenzo's RB after bigger changes.
>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
> >  1 file changed, 69 insertions(+), 59 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 64086488ca01..0058970d4579 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
> >       return result;
> >  }
> >
> > +/*
> > + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> > + * the results.
> > + */
> > +static enum scan_result collapse_single_pmd(unsigned long addr,
> > +             struct vm_area_struct *vma, bool *mmap_locked,
> > +             unsigned int *cur_progress, struct collapse_control *cc)
> > +{
> > +     struct mm_struct *mm = vma->vm_mm;
> > +     bool triggered_wb = false;
> > +     enum scan_result result;
> > +     struct file *file;
> > +     pgoff_t pgoff;
> > +
> > +     if (vma_is_anonymous(vma)) {
> > +             result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
> > +             goto end;
> > +     }
> > +
> > +     file = get_file(vma->vm_file);
> > +     pgoff = linear_page_index(vma, addr);
> > +
> > +     mmap_read_unlock(mm);
> > +     *mmap_locked = false;
> > +retry:
> > +     result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
> > +
> > +     /*
> > +      * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> > +      * then retry the collapse one time.
> > +      */
> > +     if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> > +         triggered_wb && mapping_can_writeback(file->f_mapping)) {
>
> !triggered_wb, right?
>
>
> > +             const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > +             const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > +
> > +             filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > +             triggered_wb = true;
> > +             goto retry;
> > +     }
> > +     fput(file);
> > +
> > +     if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> > +             goto end;
> > +
> > +     mmap_read_lock(mm);
> > +     *mmap_locked = true;
>
> On all paths below, you set "*mmap_locked = false". Why even bother about setting the variable?

Yeah I believe someone (Lorenzo?) pointed that out during the last
review cycle. I forgot to look into it :<

As you state, I believe we can drop the repetitive mmap_locked (iirc
this was introduced in an earlier version before `lock_dropped`) and
move it into the single_pmd() function.

>
> > +     if (collapse_test_exit_or_disable(mm)) {
> > +             mmap_read_unlock(mm);
> > +             *mmap_locked = false;
> > +             return SCAN_ANY_PROCESS;
> > +     }
> > +     result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> > +     if (result == SCAN_PMD_MAPPED)
> > +             result = SCAN_SUCCEED;
> > +     mmap_read_unlock(mm);
> > +     *mmap_locked = false;
>
> This might all read nicer without the goto and without the early return.

I'll see what I can do!

>
> /* If we have a THP in the pagecache, try to retract the pagetable. */
> if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
>         mmap_read_lock(mm);
>         if (collapse_test_exit_or_disable(mm))
>                 result = SCAN_ANY_PROCESS;
>         else
>                 result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
>         if (result == SCAN_PMD_MAPPED)
>                 result = SCAN_SUCCEED
>         }
>         mmap_read_unlock(mm);
> }

Oh thanks! I'll try this

>
> > +
> > +end:
> > +     if (cc->is_khugepaged && result == SCAN_SUCCEED)
> > +             ++khugepaged_pages_collapsed;
> > +     return result;
> > +}
> > +
> >  static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
> >                                           struct collapse_control *cc)
> >       __releases(&khugepaged_mm_lock)
> > @@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
> >                       VM_BUG_ON(khugepaged_scan.address < hstart ||
> >                                 khugepaged_scan.address + HPAGE_PMD_SIZE >
> >                                 hend);
> > -                     if (!vma_is_anonymous(vma)) {
> > -                             struct file *file = get_file(vma->vm_file);
> > -                             pgoff_t pgoff = linear_page_index(vma,
> > -                                             khugepaged_scan.address);
> > -
> > -                             mmap_read_unlock(mm);
> > -                             mmap_locked = false;
> > -                             *result = collapse_scan_file(mm,
> > -                                     khugepaged_scan.address, file, pgoff,
> > -                                     &cur_progress, cc);
> > -                             fput(file);
> > -                             if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> > -                                     mmap_read_lock(mm);
> > -                                     if (collapse_test_exit_or_disable(mm))
> > -                                             goto breakouterloop;
> > -                                     *result = try_collapse_pte_mapped_thp(mm,
> > -                                             khugepaged_scan.address, false);
> > -                                     if (*result == SCAN_PMD_MAPPED)
> > -                                             *result = SCAN_SUCCEED;
> > -                                     mmap_read_unlock(mm);
> > -                             }
> > -                     } else {
> > -                             *result = collapse_scan_pmd(mm, vma,
> > -                                     khugepaged_scan.address, &mmap_locked,
> > -                                     &cur_progress, cc);
> > -                     }
> > -
> > -                     if (*result == SCAN_SUCCEED)
> > -                             ++khugepaged_pages_collapsed;
> >
> > +                     *result = collapse_single_pmd(khugepaged_scan.address,
> > +                                                   vma, &mmap_locked, &cur_progress, cc);
> >                       /* move to next address */
> >                       khugepaged_scan.address += HPAGE_PMD_SIZE;
> >                       progress += cur_progress;
> > @@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >
> >       for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> >               enum scan_result result = SCAN_FAIL;
> > -             bool triggered_wb = false;
> >
> > -retry:
> >               if (!mmap_locked) {
> >                       cond_resched();
> >                       mmap_read_lock(mm);
> >                       mmap_locked = true;
> > +                     *lock_dropped = true;
>
> Hm, is this change here required at all? Shouldn't we instead need to know from
> collapse_single_pmd() whether it dropped the lock?

I'll verify all this locking and post a fixup! This 'lock dropped'
feature was introduced mid series. And I think it makes mmap_locked
redundant. I verified this once before but forgot most of the details
ATM.

Cheers,
-- Nico

>
>
> --
> Cheers,
>
> David
>
Re: [PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Baolin Wang 1 month, 1 week ago

On 2/26/26 9:29 AM, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
> 
> Create collapse_single_pmd to increase code reuse and create an entry
> point to these two users.
> 
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
> 
> We also guard the khugepaged_pages_collapsed variable to ensure its only
> incremented for khugepaged.
> 
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---

[snip]

>   	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
>   		enum scan_result result = SCAN_FAIL;
> -		bool triggered_wb = false;
>   
> -retry:
>   		if (!mmap_locked) {
>   			cond_resched();
>   			mmap_read_lock(mm);
>   			mmap_locked = true;
> +			*lock_dropped = true;
IIUC, this should be '*lock_dropped = false', right?

>   			result = hugepage_vma_revalidate(mm, addr, false, &vma,
>   							 cc);
>   			if (result  != SCAN_SUCCEED) {
> @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>   			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>   		}
>   		mmap_assert_locked(mm);
> -		if (!vma_is_anonymous(vma)) {
> -			struct file *file = get_file(vma->vm_file);
> -			pgoff_t pgoff = linear_page_index(vma, addr);
> -
> -			mmap_read_unlock(mm);
> -			mmap_locked = false;
> -			*lock_dropped = true;
> -			result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
>   
> -			if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> -			    mapping_can_writeback(file->f_mapping)) {
> -				loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> -				loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +		result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
>   
> -				filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> -				triggered_wb = true;
> -				fput(file);
> -				goto retry;
> -			}
> -			fput(file);
> -		} else {
> -			result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> -		}
>   		if (!mmap_locked)
>   			*lock_dropped = true;
>   
> -handle_result:
>   		switch (result) {
>   		case SCAN_SUCCEED:
>   		case SCAN_PMD_MAPPED:
>   			++thps;
>   			break;
> -		case SCAN_PTE_MAPPED_HUGEPAGE:
> -			BUG_ON(mmap_locked);
> -			mmap_read_lock(mm);
> -			result = try_collapse_pte_mapped_thp(mm, addr, true);
> -			mmap_read_unlock(mm);
> -			goto handle_result;
>   		/* Whitelisted set of results where continuing OK */
>   		case SCAN_NO_PTE_TABLE:
> +		case SCAN_PTE_MAPPED_HUGEPAGE:
>   		case SCAN_PTE_NON_PRESENT:
>   		case SCAN_PTE_UFFD_WP:
>   		case SCAN_LACK_REFERENCED_PAGE:
Re: [PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 1 month, 1 week ago
On Thu, Feb 26, 2026 at 2:24 AM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2/26/26 9:29 AM, Nico Pache wrote:
> > The khugepaged daemon and madvise_collapse have two different
> > implementations that do almost the same thing.
> >
> > Create collapse_single_pmd to increase code reuse and create an entry
> > point to these two users.
> >
> > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > collapse_single_pmd function. This introduces a minor behavioral change
> > that is most likely an undiscovered bug. The current implementation of
> > khugepaged tests collapse_test_exit_or_disable before calling
> > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > case. By unifying these two callers madvise_collapse now also performs
> > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > properly indicates that this process is no longer valid to operate on.
> >
> > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > incremented for khugepaged.
> >
> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
>
> [snip]
>
> >       for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> >               enum scan_result result = SCAN_FAIL;
> > -             bool triggered_wb = false;
> >
> > -retry:
> >               if (!mmap_locked) {
> >                       cond_resched();
> >                       mmap_read_lock(mm);
> >                       mmap_locked = true;
> > +                     *lock_dropped = true;
> IIUC, this should be '*lock_dropped = false', right?

Yes! Thanks for catching that :) As David and others have pointed out,
this lock handling here might be unnecessary and better placed in
collapse_single_pmd(). I meant to look into that before posting this
but it slipped my mind.

>
> >                       result = hugepage_vma_revalidate(mm, addr, false, &vma,
> >                                                        cc);
> >                       if (result  != SCAN_SUCCEED) {
> > @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >                       hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> >               }
> >               mmap_assert_locked(mm);
> > -             if (!vma_is_anonymous(vma)) {
> > -                     struct file *file = get_file(vma->vm_file);
> > -                     pgoff_t pgoff = linear_page_index(vma, addr);
> > -
> > -                     mmap_read_unlock(mm);
> > -                     mmap_locked = false;
> > -                     *lock_dropped = true;
> > -                     result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
> >
> > -                     if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > -                         mapping_can_writeback(file->f_mapping)) {
> > -                             loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > -                             loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > +             result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
> >
> > -                             filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > -                             triggered_wb = true;
> > -                             fput(file);
> > -                             goto retry;
> > -                     }
> > -                     fput(file);
> > -             } else {
> > -                     result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> > -             }
> >               if (!mmap_locked)
> >                       *lock_dropped = true;
> >
> > -handle_result:
> >               switch (result) {
> >               case SCAN_SUCCEED:
> >               case SCAN_PMD_MAPPED:
> >                       ++thps;
> >                       break;
> > -             case SCAN_PTE_MAPPED_HUGEPAGE:
> > -                     BUG_ON(mmap_locked);
> > -                     mmap_read_lock(mm);
> > -                     result = try_collapse_pte_mapped_thp(mm, addr, true);
> > -                     mmap_read_unlock(mm);
> > -                     goto handle_result;
> >               /* Whitelisted set of results where continuing OK */
> >               case SCAN_NO_PTE_TABLE:
> > +             case SCAN_PTE_MAPPED_HUGEPAGE:
> >               case SCAN_PTE_NON_PRESENT:
> >               case SCAN_PTE_UFFD_WP:
> >               case SCAN_LACK_REFERENCED_PAGE:
>
Re: [PATCH mm-unstable v2 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Posted by Nico Pache 3 weeks, 5 days ago
On Thu, Feb 26, 2026 at 1:20 PM Nico Pache <npache@redhat.com> wrote:
>
> On Thu, Feb 26, 2026 at 2:24 AM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
> >
> >
> >
> > On 2/26/26 9:29 AM, Nico Pache wrote:
> > > The khugepaged daemon and madvise_collapse have two different
> > > implementations that do almost the same thing.
> > >
> > > Create collapse_single_pmd to increase code reuse and create an entry
> > > point to these two users.
> > >
> > > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > > collapse_single_pmd function. This introduces a minor behavioral change
> > > that is most likely an undiscovered bug. The current implementation of
> > > khugepaged tests collapse_test_exit_or_disable before calling
> > > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > > case. By unifying these two callers madvise_collapse now also performs
> > > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > > properly indicates that this process is no longer valid to operate on.
> > >
> > > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > > incremented for khugepaged.
> > >
> > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> >
> > [snip]
> >
> > >       for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> > >               enum scan_result result = SCAN_FAIL;
> > > -             bool triggered_wb = false;
> > >
> > > -retry:
> > >               if (!mmap_locked) {
> > >                       cond_resched();
> > >                       mmap_read_lock(mm);
> > >                       mmap_locked = true;
> > > +                     *lock_dropped = true;
> > IIUC, this should be '*lock_dropped = false', right?
>
> Yes! Thanks for catching that :) As David and others have pointed out,
> this lock handling here might be unnecessary and better placed in
> collapse_single_pmd(). I meant to look into that before posting this
> but it slipped my mind.

On second pass, no, I think we should drop this line altogether. If
(!mmap_locked) -> we have either just completed a collapse, or we
tried file collapse on a 2MB region. Collapse_single_pmd would report
this, and we would have already set lock_dropped.

>
> >
> > >                       result = hugepage_vma_revalidate(mm, addr, false, &vma,
> > >                                                        cc);
> > >                       if (result  != SCAN_SUCCEED) {
> > > @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > >                       hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > >               }
> > >               mmap_assert_locked(mm);
> > > -             if (!vma_is_anonymous(vma)) {
> > > -                     struct file *file = get_file(vma->vm_file);
> > > -                     pgoff_t pgoff = linear_page_index(vma, addr);
> > > -
> > > -                     mmap_read_unlock(mm);
> > > -                     mmap_locked = false;
> > > -                     *lock_dropped = true;
> > > -                     result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
> > >
> > > -                     if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > > -                         mapping_can_writeback(file->f_mapping)) {
> > > -                             loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > > -                             loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > > +             result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
> > >
> > > -                             filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > > -                             triggered_wb = true;
> > > -                             fput(file);
> > > -                             goto retry;
> > > -                     }
> > > -                     fput(file);
> > > -             } else {
> > > -                     result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> > > -             }
> > >               if (!mmap_locked)
> > >                       *lock_dropped = true;
> > >
> > > -handle_result:
> > >               switch (result) {
> > >               case SCAN_SUCCEED:
> > >               case SCAN_PMD_MAPPED:
> > >                       ++thps;
> > >                       break;
> > > -             case SCAN_PTE_MAPPED_HUGEPAGE:
> > > -                     BUG_ON(mmap_locked);
> > > -                     mmap_read_lock(mm);
> > > -                     result = try_collapse_pte_mapped_thp(mm, addr, true);
> > > -                     mmap_read_unlock(mm);
> > > -                     goto handle_result;
> > >               /* Whitelisted set of results where continuing OK */
> > >               case SCAN_NO_PTE_TABLE:
> > > +             case SCAN_PTE_MAPPED_HUGEPAGE:
> > >               case SCAN_PTE_NON_PRESENT:
> > >               case SCAN_PTE_UFFD_WP:
> > >               case SCAN_LACK_REFERENCED_PAGE:
> >