The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing.
Create collapse_single_pmd to increase code reuse and create an entry
point to these two users.
Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that is most likely an undiscovered bug. The current implementation of
khugepaged tests collapse_test_exit_or_disable before calling
collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
case. By unifying these two callers madvise_collapse now also performs
this check. We also modify the return value to be SCAN_ANY_PROCESS which
properly indicates that this process is no longer valid to operate on.
We also guard the khugepaged_pages_collapsed variable to ensure its only
incremented for khugepaged.
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
1 file changed, 69 insertions(+), 59 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 64086488ca01..0058970d4579 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
return result;
}
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static enum scan_result collapse_single_pmd(unsigned long addr,
+ struct vm_area_struct *vma, bool *mmap_locked,
+ unsigned int *cur_progress, struct collapse_control *cc)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ bool triggered_wb = false;
+ enum scan_result result;
+ struct file *file;
+ pgoff_t pgoff;
+
+ if (vma_is_anonymous(vma)) {
+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
+ goto end;
+ }
+
+ file = get_file(vma->vm_file);
+ pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+retry:
+ result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
+
+ /*
+ * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
+ * then retry the collapse one time.
+ */
+ if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+ triggered_wb && mapping_can_writeback(file->f_mapping)) {
+ const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+ triggered_wb = true;
+ goto retry;
+ }
+ fput(file);
+
+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
+ goto end;
+
+ mmap_read_lock(mm);
+ *mmap_locked = true;
+ if (collapse_test_exit_or_disable(mm)) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ return SCAN_ANY_PROCESS;
+ }
+ result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
+ if (result == SCAN_PMD_MAPPED)
+ result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+
+end:
+ if (cc->is_khugepaged && result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
+ return result;
+}
+
static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
@@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma,
- khugepaged_scan.address);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *result = collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff,
- &cur_progress, cc);
- fput(file);
- if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
- mmap_read_lock(mm);
- if (collapse_test_exit_or_disable(mm))
- goto breakouterloop;
- *result = try_collapse_pte_mapped_thp(mm,
- khugepaged_scan.address, false);
- if (*result == SCAN_PMD_MAPPED)
- *result = SCAN_SUCCEED;
- mmap_read_unlock(mm);
- }
- } else {
- *result = collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked,
- &cur_progress, cc);
- }
-
- if (*result == SCAN_SUCCEED)
- ++khugepaged_pages_collapsed;
+ *result = collapse_single_pmd(khugepaged_scan.address,
+ vma, &mmap_locked, &cur_progress, cc);
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += cur_progress;
@@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
enum scan_result result = SCAN_FAIL;
- bool triggered_wb = false;
-retry:
if (!mmap_locked) {
cond_resched();
mmap_read_lock(mm);
mmap_locked = true;
+ *lock_dropped = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
cc);
if (result != SCAN_SUCCEED) {
@@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *lock_dropped = true;
- result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
- if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
- mapping_can_writeback(file->f_mapping)) {
- loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
- loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+ result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
- filemap_write_and_wait_range(file->f_mapping, lstart, lend);
- triggered_wb = true;
- fput(file);
- goto retry;
- }
- fput(file);
- } else {
- result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
- }
if (!mmap_locked)
*lock_dropped = true;
-handle_result:
switch (result) {
case SCAN_SUCCEED:
case SCAN_PMD_MAPPED:
++thps;
break;
- case SCAN_PTE_MAPPED_HUGEPAGE:
- BUG_ON(mmap_locked);
- mmap_read_lock(mm);
- result = try_collapse_pte_mapped_thp(mm, addr, true);
- mmap_read_unlock(mm);
- goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_NO_PTE_TABLE:
+ case SCAN_PTE_MAPPED_HUGEPAGE:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
case SCAN_LACK_REFERENCED_PAGE:
--
2.53.0
On 2/26/26 02:29, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and create an entry
> point to these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure its only
> incremented for khugepaged.
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Probably best to drop Lorenzo's RB after bigger changes.
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
> 1 file changed, 69 insertions(+), 59 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 64086488ca01..0058970d4579 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
> return result;
> }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static enum scan_result collapse_single_pmd(unsigned long addr,
> + struct vm_area_struct *vma, bool *mmap_locked,
> + unsigned int *cur_progress, struct collapse_control *cc)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + bool triggered_wb = false;
> + enum scan_result result;
> + struct file *file;
> + pgoff_t pgoff;
> +
> + if (vma_is_anonymous(vma)) {
> + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
> + goto end;
> + }
> +
> + file = get_file(vma->vm_file);
> + pgoff = linear_page_index(vma, addr);
> +
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> +retry:
> + result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
> +
> + /*
> + * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> + * then retry the collapse one time.
> + */
> + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> + triggered_wb && mapping_can_writeback(file->f_mapping)) {
!triggered_wb, right?
> + const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> + const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +
> + filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> + triggered_wb = true;
> + goto retry;
> + }
> + fput(file);
> +
> + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> + goto end;
> +
> + mmap_read_lock(mm);
> + *mmap_locked = true;
On all paths below, you set "*mmap_locked = false". Why even bother about setting the variable?
> + if (collapse_test_exit_or_disable(mm)) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + return SCAN_ANY_PROCESS;
> + }
> + result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> + if (result == SCAN_PMD_MAPPED)
> + result = SCAN_SUCCEED;
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
This might all read nicer without the goto and without the early return.
/* If we have a THP in the pagecache, try to retract the pagetable. */
if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
if (collapse_test_exit_or_disable(mm))
result = SCAN_ANY_PROCESS;
else
result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
if (result == SCAN_PMD_MAPPED)
result = SCAN_SUCCEED
}
mmap_read_unlock(mm);
}
> +
> +end:
> + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> + ++khugepaged_pages_collapsed;
> + return result;
> +}
> +
> static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
> struct collapse_control *cc)
> __releases(&khugepaged_mm_lock)
> @@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> khugepaged_scan.address + HPAGE_PMD_SIZE >
> hend);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma,
> - khugepaged_scan.address);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *result = collapse_scan_file(mm,
> - khugepaged_scan.address, file, pgoff,
> - &cur_progress, cc);
> - fput(file);
> - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> - mmap_read_lock(mm);
> - if (collapse_test_exit_or_disable(mm))
> - goto breakouterloop;
> - *result = try_collapse_pte_mapped_thp(mm,
> - khugepaged_scan.address, false);
> - if (*result == SCAN_PMD_MAPPED)
> - *result = SCAN_SUCCEED;
> - mmap_read_unlock(mm);
> - }
> - } else {
> - *result = collapse_scan_pmd(mm, vma,
> - khugepaged_scan.address, &mmap_locked,
> - &cur_progress, cc);
> - }
> -
> - if (*result == SCAN_SUCCEED)
> - ++khugepaged_pages_collapsed;
>
> + *result = collapse_single_pmd(khugepaged_scan.address,
> + vma, &mmap_locked, &cur_progress, cc);
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> progress += cur_progress;
> @@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>
> for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> enum scan_result result = SCAN_FAIL;
> - bool triggered_wb = false;
>
> -retry:
> if (!mmap_locked) {
> cond_resched();
> mmap_read_lock(mm);
> mmap_locked = true;
> + *lock_dropped = true;
Hm, is this change here required at all? Shouldn't we instead need to know from
collapse_single_pmd() whether it dropped the lock?
--
Cheers,
David
On Thu, Feb 26, 2026 at 2:41 AM David Hildenbrand (Arm)
<david@kernel.org> wrote:
>
> On 2/26/26 02:29, Nico Pache wrote:
> > The khugepaged daemon and madvise_collapse have two different
> > implementations that do almost the same thing.
> >
> > Create collapse_single_pmd to increase code reuse and create an entry
> > point to these two users.
> >
> > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > collapse_single_pmd function. This introduces a minor behavioral change
> > that is most likely an undiscovered bug. The current implementation of
> > khugepaged tests collapse_test_exit_or_disable before calling
> > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > case. By unifying these two callers madvise_collapse now also performs
> > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > properly indicates that this process is no longer valid to operate on.
> >
> > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > incremented for khugepaged.
> >
> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Probably best to drop Lorenzo's RB after bigger changes.
>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 128 ++++++++++++++++++++++++++----------------------
> > 1 file changed, 69 insertions(+), 59 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 64086488ca01..0058970d4579 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -2417,6 +2417,70 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
> > return result;
> > }
> >
> > +/*
> > + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> > + * the results.
> > + */
> > +static enum scan_result collapse_single_pmd(unsigned long addr,
> > + struct vm_area_struct *vma, bool *mmap_locked,
> > + unsigned int *cur_progress, struct collapse_control *cc)
> > +{
> > + struct mm_struct *mm = vma->vm_mm;
> > + bool triggered_wb = false;
> > + enum scan_result result;
> > + struct file *file;
> > + pgoff_t pgoff;
> > +
> > + if (vma_is_anonymous(vma)) {
> > + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cur_progress, cc);
> > + goto end;
> > + }
> > +
> > + file = get_file(vma->vm_file);
> > + pgoff = linear_page_index(vma, addr);
> > +
> > + mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > +retry:
> > + result = collapse_scan_file(mm, addr, file, pgoff, cur_progress, cc);
> > +
> > + /*
> > + * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
> > + * then retry the collapse one time.
> > + */
> > + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> > + triggered_wb && mapping_can_writeback(file->f_mapping)) {
>
> !triggered_wb, right?
>
>
> > + const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > + const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > +
> > + filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > + triggered_wb = true;
> > + goto retry;
> > + }
> > + fput(file);
> > +
> > + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> > + goto end;
> > +
> > + mmap_read_lock(mm);
> > + *mmap_locked = true;
>
> On all paths below, you set "*mmap_locked = false". Why even bother about setting the variable?
Yeah I believe someone (Lorenzo?) pointed that out during the last
review cycle. I forgot to look into it :<
As you state, I believe we can drop the repetitive mmap_locked (iirc
this was introduced in an earlier version before `lock_dropped`) and
move it into the single_pmd() function.
>
> > + if (collapse_test_exit_or_disable(mm)) {
> > + mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > + return SCAN_ANY_PROCESS;
> > + }
> > + result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> > + if (result == SCAN_PMD_MAPPED)
> > + result = SCAN_SUCCEED;
> > + mmap_read_unlock(mm);
> > + *mmap_locked = false;
>
> This might all read nicer without the goto and without the early return.
I'll see what I can do!
>
> /* If we have a THP in the pagecache, try to retract the pagetable. */
> if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
> mmap_read_lock(mm);
> if (collapse_test_exit_or_disable(mm))
> result = SCAN_ANY_PROCESS;
> else
> result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> if (result == SCAN_PMD_MAPPED)
> result = SCAN_SUCCEED
> }
> mmap_read_unlock(mm);
> }
Oh thanks! I'll try this
>
> > +
> > +end:
> > + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> > + ++khugepaged_pages_collapsed;
> > + return result;
> > +}
> > +
> > static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
> > struct collapse_control *cc)
> > __releases(&khugepaged_mm_lock)
> > @@ -2489,36 +2553,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
> > VM_BUG_ON(khugepaged_scan.address < hstart ||
> > khugepaged_scan.address + HPAGE_PMD_SIZE >
> > hend);
> > - if (!vma_is_anonymous(vma)) {
> > - struct file *file = get_file(vma->vm_file);
> > - pgoff_t pgoff = linear_page_index(vma,
> > - khugepaged_scan.address);
> > -
> > - mmap_read_unlock(mm);
> > - mmap_locked = false;
> > - *result = collapse_scan_file(mm,
> > - khugepaged_scan.address, file, pgoff,
> > - &cur_progress, cc);
> > - fput(file);
> > - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> > - mmap_read_lock(mm);
> > - if (collapse_test_exit_or_disable(mm))
> > - goto breakouterloop;
> > - *result = try_collapse_pte_mapped_thp(mm,
> > - khugepaged_scan.address, false);
> > - if (*result == SCAN_PMD_MAPPED)
> > - *result = SCAN_SUCCEED;
> > - mmap_read_unlock(mm);
> > - }
> > - } else {
> > - *result = collapse_scan_pmd(mm, vma,
> > - khugepaged_scan.address, &mmap_locked,
> > - &cur_progress, cc);
> > - }
> > -
> > - if (*result == SCAN_SUCCEED)
> > - ++khugepaged_pages_collapsed;
> >
> > + *result = collapse_single_pmd(khugepaged_scan.address,
> > + vma, &mmap_locked, &cur_progress, cc);
> > /* move to next address */
> > khugepaged_scan.address += HPAGE_PMD_SIZE;
> > progress += cur_progress;
> > @@ -2819,13 +2856,12 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >
> > for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> > enum scan_result result = SCAN_FAIL;
> > - bool triggered_wb = false;
> >
> > -retry:
> > if (!mmap_locked) {
> > cond_resched();
> > mmap_read_lock(mm);
> > mmap_locked = true;
> > + *lock_dropped = true;
>
> Hm, is this change here required at all? Shouldn't we instead need to know from
> collapse_single_pmd() whether it dropped the lock?
I'll verify all this locking and post a fixup! This 'lock dropped'
feature was introduced mid series. And I think it makes mmap_locked
redundant. I verified this once before but forgot most of the details
ATM.
Cheers,
-- Nico
>
>
> --
> Cheers,
>
> David
>
On 2/26/26 9:29 AM, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and create an entry
> point to these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that is most likely an undiscovered bug. The current implementation of
> khugepaged tests collapse_test_exit_or_disable before calling
> collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> case. By unifying these two callers madvise_collapse now also performs
> this check. We also modify the return value to be SCAN_ANY_PROCESS which
> properly indicates that this process is no longer valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure its only
> incremented for khugepaged.
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
[snip]
> for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> enum scan_result result = SCAN_FAIL;
> - bool triggered_wb = false;
>
> -retry:
> if (!mmap_locked) {
> cond_resched();
> mmap_read_lock(mm);
> mmap_locked = true;
> + *lock_dropped = true;
IIUC, this should be '*lock_dropped = false', right?
> result = hugepage_vma_revalidate(mm, addr, false, &vma,
> cc);
> if (result != SCAN_SUCCEED) {
> @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> }
> mmap_assert_locked(mm);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *lock_dropped = true;
> - result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
>
> - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> - mapping_can_writeback(file->f_mapping)) {
> - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> - loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> + result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
>
> - filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> - triggered_wb = true;
> - fput(file);
> - goto retry;
> - }
> - fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> - }
> if (!mmap_locked)
> *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = try_collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> case SCAN_NO_PTE_TABLE:
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
> case SCAN_LACK_REFERENCED_PAGE:
On Thu, Feb 26, 2026 at 2:24 AM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2/26/26 9:29 AM, Nico Pache wrote:
> > The khugepaged daemon and madvise_collapse have two different
> > implementations that do almost the same thing.
> >
> > Create collapse_single_pmd to increase code reuse and create an entry
> > point to these two users.
> >
> > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > collapse_single_pmd function. This introduces a minor behavioral change
> > that is most likely an undiscovered bug. The current implementation of
> > khugepaged tests collapse_test_exit_or_disable before calling
> > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > case. By unifying these two callers madvise_collapse now also performs
> > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > properly indicates that this process is no longer valid to operate on.
> >
> > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > incremented for khugepaged.
> >
> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
>
> [snip]
>
> > for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> > enum scan_result result = SCAN_FAIL;
> > - bool triggered_wb = false;
> >
> > -retry:
> > if (!mmap_locked) {
> > cond_resched();
> > mmap_read_lock(mm);
> > mmap_locked = true;
> > + *lock_dropped = true;
> IIUC, this should be '*lock_dropped = false', right?
Yes! Thanks for catching that :) As David and others have pointed out,
this lock handling here might be unnecessary and better placed in
collapse_single_pmd(). I meant to look into that before posting this
but it slipped my mind.
>
> > result = hugepage_vma_revalidate(mm, addr, false, &vma,
> > cc);
> > if (result != SCAN_SUCCEED) {
> > @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > }
> > mmap_assert_locked(mm);
> > - if (!vma_is_anonymous(vma)) {
> > - struct file *file = get_file(vma->vm_file);
> > - pgoff_t pgoff = linear_page_index(vma, addr);
> > -
> > - mmap_read_unlock(mm);
> > - mmap_locked = false;
> > - *lock_dropped = true;
> > - result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
> >
> > - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > - mapping_can_writeback(file->f_mapping)) {
> > - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > - loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > + result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
> >
> > - filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > - triggered_wb = true;
> > - fput(file);
> > - goto retry;
> > - }
> > - fput(file);
> > - } else {
> > - result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> > - }
> > if (!mmap_locked)
> > *lock_dropped = true;
> >
> > -handle_result:
> > switch (result) {
> > case SCAN_SUCCEED:
> > case SCAN_PMD_MAPPED:
> > ++thps;
> > break;
> > - case SCAN_PTE_MAPPED_HUGEPAGE:
> > - BUG_ON(mmap_locked);
> > - mmap_read_lock(mm);
> > - result = try_collapse_pte_mapped_thp(mm, addr, true);
> > - mmap_read_unlock(mm);
> > - goto handle_result;
> > /* Whitelisted set of results where continuing OK */
> > case SCAN_NO_PTE_TABLE:
> > + case SCAN_PTE_MAPPED_HUGEPAGE:
> > case SCAN_PTE_NON_PRESENT:
> > case SCAN_PTE_UFFD_WP:
> > case SCAN_LACK_REFERENCED_PAGE:
>
On Thu, Feb 26, 2026 at 1:20 PM Nico Pache <npache@redhat.com> wrote:
>
> On Thu, Feb 26, 2026 at 2:24 AM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
> >
> >
> >
> > On 2/26/26 9:29 AM, Nico Pache wrote:
> > > The khugepaged daemon and madvise_collapse have two different
> > > implementations that do almost the same thing.
> > >
> > > Create collapse_single_pmd to increase code reuse and create an entry
> > > point to these two users.
> > >
> > > Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> > > collapse_single_pmd function. This introduces a minor behavioral change
> > > that is most likely an undiscovered bug. The current implementation of
> > > khugepaged tests collapse_test_exit_or_disable before calling
> > > collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse
> > > case. By unifying these two callers madvise_collapse now also performs
> > > this check. We also modify the return value to be SCAN_ANY_PROCESS which
> > > properly indicates that this process is no longer valid to operate on.
> > >
> > > We also guard the khugepaged_pages_collapsed variable to ensure its only
> > > incremented for khugepaged.
> > >
> > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> >
> > [snip]
> >
> > > for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
> > > enum scan_result result = SCAN_FAIL;
> > > - bool triggered_wb = false;
> > >
> > > -retry:
> > > if (!mmap_locked) {
> > > cond_resched();
> > > mmap_read_lock(mm);
> > > mmap_locked = true;
> > > + *lock_dropped = true;
> > IIUC, this should be '*lock_dropped = false', right?
>
> Yes! Thanks for catching that :) As David and others have pointed out,
> this lock handling here might be unnecessary and better placed in
> collapse_single_pmd(). I meant to look into that before posting this
> but it slipped my mind.
On second pass, no, I think we should drop this line altogether. If
(!mmap_locked) -> we have either just completed a collapse, or we
tried file collapse on a 2MB region. Collapse_single_pmd would report
this, and we would have already set lock_dropped.
>
> >
> > > result = hugepage_vma_revalidate(mm, addr, false, &vma,
> > > cc);
> > > if (result != SCAN_SUCCEED) {
> > > @@ -2836,46 +2872,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > > hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > > }
> > > mmap_assert_locked(mm);
> > > - if (!vma_is_anonymous(vma)) {
> > > - struct file *file = get_file(vma->vm_file);
> > > - pgoff_t pgoff = linear_page_index(vma, addr);
> > > -
> > > - mmap_read_unlock(mm);
> > > - mmap_locked = false;
> > > - *lock_dropped = true;
> > > - result = collapse_scan_file(mm, addr, file, pgoff, NULL, cc);
> > >
> > > - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > > - mapping_can_writeback(file->f_mapping)) {
> > > - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> > > - loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> > > + result = collapse_single_pmd(addr, vma, &mmap_locked, NULL, cc);
> > >
> > > - filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> > > - triggered_wb = true;
> > > - fput(file);
> > > - goto retry;
> > > - }
> > > - fput(file);
> > > - } else {
> > > - result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, NULL, cc);
> > > - }
> > > if (!mmap_locked)
> > > *lock_dropped = true;
> > >
> > > -handle_result:
> > > switch (result) {
> > > case SCAN_SUCCEED:
> > > case SCAN_PMD_MAPPED:
> > > ++thps;
> > > break;
> > > - case SCAN_PTE_MAPPED_HUGEPAGE:
> > > - BUG_ON(mmap_locked);
> > > - mmap_read_lock(mm);
> > > - result = try_collapse_pte_mapped_thp(mm, addr, true);
> > > - mmap_read_unlock(mm);
> > > - goto handle_result;
> > > /* Whitelisted set of results where continuing OK */
> > > case SCAN_NO_PTE_TABLE:
> > > + case SCAN_PTE_MAPPED_HUGEPAGE:
> > > case SCAN_PTE_NON_PRESENT:
> > > case SCAN_PTE_UFFD_WP:
> > > case SCAN_LACK_REFERENCED_PAGE:
> >
© 2016 - 2026 Red Hat, Inc.