The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing.

Create collapse_single_pmd to increase code reuse and provide a single
entry point for these two users.

Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that fixes what is most likely an undiscovered bug: the current
implementation of khugepaged tests collapse_test_exit_or_disable before
calling collapse_pte_mapped_thp, but we weren't doing it in the
madvise_collapse case. By unifying these two callers, madvise_collapse
now also performs this check. We also modify the return value to
SCAN_ANY_PROCESS, which properly indicates that this process is no
longer valid to operate on.

We also guard the khugepaged_pages_collapsed variable to ensure it's
only incremented for khugepaged.
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 106 +++++++++++++++++++++++++++---------------------
1 file changed, 60 insertions(+), 46 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fefcbdca4510..59e5a5588d85 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2394,6 +2394,54 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long a
return result;
}
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static enum scan_result collapse_single_pmd(unsigned long addr,
+ struct vm_area_struct *vma, bool *mmap_locked,
+ struct collapse_control *cc)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ enum scan_result result;
+ struct file *file;
+ pgoff_t pgoff;
+
+ if (vma_is_anonymous(vma)) {
+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
+ goto end;
+ }
+
+ file = get_file(vma->vm_file);
+ pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
+ fput(file);
+
+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
+ goto end;
+
+ mmap_read_lock(mm);
+ *mmap_locked = true;
+ if (collapse_test_exit_or_disable(mm)) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ return SCAN_ANY_PROCESS;
+ }
+ result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
+ if (result == SCAN_PMD_MAPPED)
+ result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+
+end:
+ if (cc->is_khugepaged && result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
+ return result;
+}
+
static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
@@ -2466,34 +2514,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma,
- khugepaged_scan.address);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *result = collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff, cc);
- fput(file);
- if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
- mmap_read_lock(mm);
- if (collapse_test_exit_or_disable(mm))
- goto breakouterloop;
- *result = try_collapse_pte_mapped_thp(mm,
- khugepaged_scan.address, false);
- if (*result == SCAN_PMD_MAPPED)
- *result = SCAN_SUCCEED;
- mmap_read_unlock(mm);
- }
- } else {
- *result = collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked, cc);
- }
-
- if (*result == SCAN_SUCCEED)
- ++khugepaged_pages_collapsed;
+ *result = collapse_single_pmd(khugepaged_scan.address,
+ vma, &mmap_locked, cc);
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
@@ -2799,6 +2822,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
cond_resched();
mmap_read_lock(mm);
mmap_locked = true;
+ *lock_dropped = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
cc);
if (result != SCAN_SUCCEED) {
@@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
- mmap_read_unlock(mm);
- mmap_locked = false;
+ result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
+
+ if (!mmap_locked)
*lock_dropped = true;
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
- if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
- mapping_can_writeback(file->f_mapping)) {
+ if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma, addr);
+
+ if (mapping_can_writeback(file->f_mapping)) {
loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
@@ -2829,26 +2853,16 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
goto retry;
}
fput(file);
- } else {
- result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
}
- if (!mmap_locked)
- *lock_dropped = true;
-handle_result:
switch (result) {
case SCAN_SUCCEED:
case SCAN_PMD_MAPPED:
++thps;
break;
- case SCAN_PTE_MAPPED_HUGEPAGE:
- BUG_ON(mmap_locked);
- mmap_read_lock(mm);
- result = try_collapse_pte_mapped_thp(mm, addr, true);
- mmap_read_unlock(mm);
- goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_NO_PTE_TABLE:
+ case SCAN_PTE_MAPPED_HUGEPAGE:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
case SCAN_LACK_REFERENCED_PAGE:
--
2.52.0
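For readers following the refactor, note the contract this helper introduces:
collapse_single_pmd() may drop and retake mmap_lock internally, and it reports
the final lock state back through *mmap_locked. A condensed sketch of the two
call sites after this patch (simplified from the diff above, not the literal
resulting code):

	/* khugepaged daemon, in collapse_scan_mm_slot(): */
	*result = collapse_single_pmd(khugepaged_scan.address, vma,
				      &mmap_locked, cc);

	/* madvise(MADV_COLLAPSE), in madvise_collapse(): */
	result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
	if (!mmap_locked)
		*lock_dropped = true;	/* lock was dropped; vma may be stale */

Once *mmap_locked comes back false, the caller must not touch vma again
without revalidating it - which is exactly the pitfall discussed later in
this thread.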
Another trivial point on this one: you're missing the prefix in the subject
here. In general I think it's better to say mm/khugepaged: rather than
khugepaged: also.

Cheers, Lorenzo
Hi Andrew,
could you please apply the following fixup to avoid potentially using a stale
VMA in the new writeback-retry logic for madvise collapse.
Thank you!
-- Nico
----8<----
commit a9ac3b1bfa926dd707ac3a785583f8d7a0579578
Author: Nico Pache <npache@redhat.com>
Date: Fri Jan 23 16:32:42 2026 -0700
madvise writeback retry logic fix
Signed-off-by: Nico Pache <npache@redhat.com>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 59e5a5588d85..2b054f7d9753 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2418,6 +2418,14 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
mmap_read_unlock(mm);
*mmap_locked = false;
result = collapse_scan_file(mm, addr, file, pgoff, cc);
+
+ if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+ mapping_can_writeback(file->f_mapping)) {
+ const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+ }
fput(file);
if (result != SCAN_PTE_MAPPED_HUGEPAGE)
@@ -2840,19 +2848,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
*lock_dropped = true;
if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
-
- if (mapping_can_writeback(file->f_mapping)) {
- loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
- loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
-
- filemap_write_and_wait_range(file->f_mapping, lstart, lend);
- triggered_wb = true;
- fput(file);
- goto retry;
- }
- fput(file);
+ triggered_wb = true;
+ goto retry;
}
switch (result) {
--
2.52.0
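With this fixup, the writeback for the madvise case happens inside
collapse_single_pmd() while that helper still holds its own file reference,
so madvise_collapse() no longer dereferences vma after the lock may have been
dropped. The retry path then reduces to roughly the following shape (a sketch
of the resulting code, not a verbatim excerpt):

	/* madvise_collapse(), per-PMD loop, after the fixup (sketch): */
	retry:
		/* ... mmap_lock is retaken and the VMA revalidated here ... */
		result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
		/*
		 * For the madvise caller, collapse_single_pmd() has already run
		 * filemap_write_and_wait_range() under its own file reference
		 * if the scan hit dirty or writeback pages.
		 */
		if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
			triggered_wb = true;	/* retry at most once */
			goto retry;		/* revalidation happens above */
		}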
On Wed, Jan 28, 2026 at 09:38:37AM -0700, Nico Pache wrote:
> Hi Andrew,
>
> could you please apply the following fixup to avoid potentially using a stale
> VMA in the new writeback-retry logic for madvise collapse.
>
> Thank you!
> -- Nico
[...]
LGTM, with this in place you can add back my tag to the patch.
It'd be good to reference this in the commit message, but you can do that
on a respin.
So:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
On 2026/1/23 03:28, Nico Pache wrote:
[...]
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
I think this patch introduces some functional changes compared to previous
version[1] ...
Maybe we should drop the r-b tags and let folks take another look?
There might be an issue with the vma access in madvise_collapse(). See
below:
[1]
https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
[...]
> @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> }
> mmap_assert_locked(mm);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> +
> + if (!mmap_locked)
> *lock_dropped = true;
> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
>
> - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> - mapping_can_writeback(file->f_mapping)) {
> + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> + struct file *file = get_file(vma->vm_file);
> + pgoff_t pgoff = linear_page_index(vma, addr);
After collapse_single_pmd() returns, mmap_lock might have been released.
Between that unlock and here, another thread could unmap/remap the VMA,
making the vma pointer stale when we access vma->vm_file?

Would it be safer to get the file reference before calling
collapse_single_pmd()? Or do we need to revalidate the VMA after getting
the lock back?
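To make the window concrete, here is one hypothetical interleaving (thread
labels are purely illustrative):

	/*
	 *  madvise_collapse() caller               another thread
	 *  -------------------------               --------------
	 *  collapse_single_pmd()
	 *    mmap_read_unlock(mm)
	 *    ... scan runs unlocked ...            munmap() the range
	 *                                          -> vma may be freed
	 *  returns SCAN_PAGE_DIRTY_OR_WRITEBACK
	 *  get_file(vma->vm_file)                  <- stale vma dereference
	 */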
Thanks,
Lance
> +
> + if (mapping_can_writeback(file->f_mapping)) {
> loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
>
> @@ -2829,26 +2853,16 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> goto retry;
> }
> fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
> }
> - if (!mmap_locked)
> - *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = try_collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> case SCAN_NO_PTE_TABLE:
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
> case SCAN_LACK_REFERENCED_PAGE:
Andrew - when this goes into mm-new if there isn't a respin between, please
drop all tags except any obviously sent re: the fix-patch.
Thanks!
On Fri, Jan 23, 2026 at 01:07:16PM +0800, Lance Yang wrote:
>
>
> On 2026/1/23 03:28, Nico Pache wrote:
[...]
> I think this patch introduces some functional changes compared to previous
> version[1] ...
>
> Maybe we should drop the r-b tags and let folks take another look?
Yes thanks Lance, absolutely this should happen.
Especially on a small-iteration respin (I really wanted to get to v13 but the
rebase issue killed that).
I know it wasn't intentional, not suggesting that of course :) just obviously as
a process thing - it's _very_ important to make clear what you've changed and
what you haven't. For truly minor changes no need to drop the tags, but often my
workflow is:
- Check which patches I haven't reviewed yet.
- Go review those.
So I might well have missed that.
I often try to do a git range-diff, but in this case I probably wouldn't
have on the basis of the v13 having merge conflicts.
But obviously given the above I went and fixed them up and applied v13 locally
so I could check everything :)
>
> There might be an issue with the vma access in madvise_collapse(). See
> below:
>
> [1]
> https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
>
[...]
> > @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > }
> > mmap_assert_locked(mm);
> > - if (!vma_is_anonymous(vma)) {
> > - struct file *file = get_file(vma->vm_file);
> > - pgoff_t pgoff = linear_page_index(vma, addr);
> > - mmap_read_unlock(mm);
> > - mmap_locked = false;
> > + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> > +
> > + if (!mmap_locked)
> > *lock_dropped = true;
> > - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> > - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > - mapping_can_writeback(file->f_mapping)) {
> > + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> > + struct file *file = get_file(vma->vm_file);
> > + pgoff_t pgoff = linear_page_index(vma, addr);
>
>
> After collapse_single_pmd() returns, mmap_lock might have been released.
> Between that unlock and here, another thread could unmap/remap the VMA,
> making the vma pointer stale when we access vma->vm_file?
Yeah, yikes.
The locking logic around this code is horrifying... but that's one for a
future series I guess.
>
> Would it be safer to get the file reference before calling
> collapse_single_pmd()? Or do we need to revalidate the VMA after getting
> the lock back?
Also obviously the pgoff.
I know Nico suggested a patch in a response, will check.
Cheers, Lorenzo
On Mon, 26 Jan 2026 11:40:21 +0000 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> Andrew - when this goes into mm-new if there isn't a respin between, please
> drop all tags except any obviously sent re: the fix-patch.

I've been believing this is next -rc1 material. Was that mistaken?
On Mon, Jan 26, 2026 at 07:09:18AM -0800, Andrew Morton wrote:
> On Mon, 26 Jan 2026 11:40:21 +0000 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > Andrew - when this goes into mm-new if there isn't a respin between, please
> > drop all tags except any obviously sent re: the fix-patch.
>
> I've been believing this is next -rc1 material. Was that mistaken?

Yeah this isn't ready yet, sorry. I did hope we could get this in this
cycle but there's too much to check (esp. given this change, for instance)
and we need more time to stabilise it, so please keep this out of
mm-(un)stable for now.

It's a really huge change to THP so we need to take our time with it.

So we're aiming for 6.21-rc1 / 7.1-rc1 or whatever it ends up being :)
i.e. next+1-rc1.

Cheers, Lorenzo
On Thu, Jan 22, 2026 at 10:08 PM Lance Yang <lance.yang@linux.dev> wrote:
>
>
>
> On 2026/1/23 03:28, Nico Pache wrote:
[...]
>
> I think this patch introduces some functional changes compared to previous
> version[1] ...
>
> Maybe we should drop the r-b tags and let folks take another look?
>
> There might be an issue with the vma access in madvise_collapse(). See
> below:
>
> [1]
> https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
>
[...]
> > @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > }
> > mmap_assert_locked(mm);
> > - if (!vma_is_anonymous(vma)) {
> > - struct file *file = get_file(vma->vm_file);
> > - pgoff_t pgoff = linear_page_index(vma, addr);
> >
> > - mmap_read_unlock(mm);
> > - mmap_locked = false;
> > + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> > +
> > + if (!mmap_locked)
> > *lock_dropped = true;
> > - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> >
> > - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > - mapping_can_writeback(file->f_mapping)) {
> > + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> > + struct file *file = get_file(vma->vm_file);
> > + pgoff_t pgoff = linear_page_index(vma, addr);
>
>
> > After collapse_single_pmd() returns, mmap_lock might have been released.
> > Between that unlock and here, another thread could unmap/remap the VMA,
> > making the vma pointer stale when we access vma->vm_file?
+ Shivank, I thought they were on the CC list.
Hey! I thought of this case, but then figured it was no different than
what is currently implemented for the writeback-retry logic, since the
mmap lock is dropped and not revalidated. BUT I failed to consider
that the file reference is held throughout that time.
I thought of moving the functionality into collapse_single_pmd(), but
figured I'd keep it in madvise_collapse() as it's the sole user of
that functionality. Given the potential file ref issue, that may be
the best solution, and I don't think it should be too difficult. I'll
queue that up, and also drop the r-b tags as you suggested.
Ok, here's my solution, does this look like the right approach?:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 59e5a5588d85..dda9fdc35767 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2418,6 +2418,14 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
mmap_read_unlock(mm);
*mmap_locked = false;
result = collapse_scan_file(mm, addr, file, pgoff, cc);
+
+ if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+ mapping_can_writeback(file->f_mapping)) {
+ loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+ }
fput(file);
if (result != SCAN_PTE_MAPPED_HUGEPAGE)
@@ -2840,19 +2848,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
*lock_dropped = true;
if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
-
- if (mapping_can_writeback(file->f_mapping)) {
- loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
- loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
-
- filemap_write_and_wait_range(file->f_mapping, lstart, lend);
- triggered_wb = true;
- fput(file);
- goto retry;
- }
- fput(file);
+ triggered_wb = true;
+ goto retry;
}
switch (result) {
-- Nico
>
> Would it be safer to get the file reference before calling
> collapse_single_pmd()? Or do we need to revalidate the VMA after getting
> the lock back?
>
>
> Thanks,
> Lance
>
[...]
On Fri, Jan 23, 2026 at 04:26:09PM -0700, Nico Pache wrote:
> On Thu, Jan 22, 2026 at 10:08 PM Lance Yang <lance.yang@linux.dev> wrote:
> >
> >
> >
> > On 2026/1/23 03:28, Nico Pache wrote:
[...]
> > I think this patch introduces some functional changes compared to previous
> > version[1] ...
> >
> > Maybe we should drop the r-b tags and let folks take another look?
> >
> > There might be an issue with the vma access in madvise_collapse(). See
> > below:
> >
> > [1]
> > https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
> >
[...]
> > > @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > > hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
> > > }
> > > mmap_assert_locked(mm);
> > > - if (!vma_is_anonymous(vma)) {
> > > - struct file *file = get_file(vma->vm_file);
> > > - pgoff_t pgoff = linear_page_index(vma, addr);
> > >
> > > - mmap_read_unlock(mm);
> > > - mmap_locked = false;
> > > + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> > > +
> > > + if (!mmap_locked)
> > > *lock_dropped = true;
> > > - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> > >
> > > - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> > > - mapping_can_writeback(file->f_mapping)) {
> > > + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> > > + struct file *file = get_file(vma->vm_file);
> > > + pgoff_t pgoff = linear_page_index(vma, addr);
> >
> >
> > After collapse_single_pmd() returns, mmap_lock might have been released.
> > Between that unlock and here, another thread could unmap/remap the VMA,
> > making the vma pointer stale when we access vma->vm_file?
>
> + Shivank, I thought they were on the CC list.
>
> Hey! I thought of this case, but then figured it was no different than
> what is currently implemented for the writeback-retry logic, since the
> mmap lock is dropped and not revalidated. BUT I failed to consider
> that the file reference is held throughout that time.
You obviously can't manipulate or reference a pointer to a VMA in any way
if it is no longer stabilised, that'd be a potential UAF.
>
> I thought of moving the functionality into collapse_single_pmd(), but
> figured I'd keep it in madvise_collapse() as it's the sole user of
> that functionality. Given the potential file ref issue, that may be
> the best solution, and I don't think it should be too difficult. I'll
> queue that up, and also drop the r-b tags as you suggested.
>
> Ok, here's my solution, does this look like the right approach?:
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 59e5a5588d85..dda9fdc35767 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2418,6 +2418,14 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
> mmap_read_unlock(mm);
> *mmap_locked = false;
> result = collapse_scan_file(mm, addr, file, pgoff, cc);
> +
> + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> + mapping_can_writeback(file->f_mapping)) {
> + loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> + loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
NIT, but let's const-ify these.

Also credit to Baolin for having suggested taking the approach of putting
it here! :)
> +
> + filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> + }
> fput(file);
>
> if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> @@ -2840,19 +2848,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> *lock_dropped = true;
>
> if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
> -
> - if (mapping_can_writeback(file->f_mapping)) {
> - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> - loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> -
> - filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> - triggered_wb = true;
> - fput(file);
> - goto retry;
> - }
> - fput(file);
> + triggered_wb = true;
> + goto retry;
OK, this looks correct, I agree with Lance.

Could you send this in reply to the parent, i.e. [0], as a fix-patch and
ask Andrew to apply it?

Can then review that there.

[0]: https://lore.kernel.org/all/20260122192841.128719-4-npache@redhat.com/
> }
>
> switch (result) {
>
>
>
> -- Nico
>
Cheers, Lorenzo
On 2026/1/24 07:26, Nico Pache wrote:
> On Thu, Jan 22, 2026 at 10:08 PM Lance Yang <lance.yang@linux.dev> wrote:
>>
>>
>>
>> On 2026/1/23 03:28, Nico Pache wrote:
[...]
>> I think this patch introduces some functional changes compared to previous
>> version[1] ...
>>
>> Maybe we should drop the r-b tags and let folks take another look?
>>
>> There might be an issue with the vma access in madvise_collapse(). See
>> below:
>>
>> [1]
>> https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
>>
[...]
>>> @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>> hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>>> }
>>> mmap_assert_locked(mm);
>>> - if (!vma_is_anonymous(vma)) {
>>> - struct file *file = get_file(vma->vm_file);
>>> - pgoff_t pgoff = linear_page_index(vma, addr);
>>>
>>> - mmap_read_unlock(mm);
>>> - mmap_locked = false;
>>> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>>> +
>>> + if (!mmap_locked)
>>> *lock_dropped = true;
>>> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
>>>
>>> - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>>> - mapping_can_writeback(file->f_mapping)) {
>>> + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
>>> + struct file *file = get_file(vma->vm_file);
>>> + pgoff_t pgoff = linear_page_index(vma, addr);
>>
>>
>> After collapse_single_pmd() returns, mmap_lock might have been released.
>> Between that unlock and here, another thread could unmap/remap the VMA,
>> making the vma pointer stale when we access vma->vm_file?
>
> + Shivank, I thought they were on the CC list.
>
> Hey! I thought of this case, but then figured it was no different than
> what is currently implemented for the writeback-retry logic, since the
> mmap lock is dropped and not revalidated. BUT I failed to consider
> that the file reference is held throughout that time.
>
> I thought of moving the functionality into collapse_single_pmd(), but
> figured I'd keep it in madvise_collapse() as it's the sole user of
> that functionality. Given the potential file ref issue, that may be
> the best solution, and I don't think it should be too difficult. I'll
> queue that up, and also drop the r-b tags as you suggested.
>
> Ok, here's my solution. Does this look like the right approach?
Hey! Thanks for the quick fix!
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 59e5a5588d85..dda9fdc35767 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2418,6 +2418,14 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
> mmap_read_unlock(mm);
> *mmap_locked = false;
> result = collapse_scan_file(mm, addr, file, pgoff, cc);
> +
> + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
> + mapping_can_writeback(file->f_mapping)) {
> + loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> + loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> +
> + filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> + }
> fput(file);
>
> if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> @@ -2840,19 +2848,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> *lock_dropped = true;
>
> if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
> -
> - if (mapping_can_writeback(file->f_mapping)) {
> - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
> - loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
> -
> - filemap_write_and_wait_range(file->f_mapping, lstart, lend);
> - triggered_wb = true;
> - fput(file);
> - goto retry;
> - }
> - fput(file);
> + triggered_wb = true;
> + goto retry;
> }
>
> switch (result) {
>
>
>
> -- Nico
From a quick glance, that looks good to me ;)
Only madvise needs to write back and then retry once; khugepaged just
skips dirty pages and moves on.
Now, we grab the file reference before dropping mmap_lock, then only
use the file pointer during writeback - no vma access after unlock.
So even if the VMA gets unmapped, we're safe, IIUC.
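Putting the two diffs together, the file-backed path of
collapse_single_pmd() would then read roughly like this. This is a sketch
assembled from the patch and the proposed fix above, not final committed
code; the anonymous path and local declarations are elided:

	/*
	 * The file reference is taken while mmap_lock is still held; after
	 * mmap_read_unlock() only 'file' and 'pgoff' are used, so a
	 * concurrent unmap of the VMA cannot leave a stale pointer behind.
	 */
	file = get_file(vma->vm_file);
	pgoff = linear_page_index(vma, addr);

	mmap_read_unlock(mm);
	*mmap_locked = false;
	result = collapse_scan_file(mm, addr, file, pgoff, cc);

	/* madvise path only: write dirty pages back so one retry can succeed */
	if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
	    mapping_can_writeback(file->f_mapping)) {
		loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
		loff_t lend = lstart + HPAGE_PMD_SIZE - 1;

		filemap_write_and_wait_range(file->f_mapping, lstart, lend);
	}
	fput(file);	/* last use of the file, safe regardless of the VMA */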
[...]
On 1/23/26 1:07 PM, Lance Yang wrote:
>
>
> On 2026/1/23 03:28, Nico Pache wrote:
>> [...]
>
> I think this patch introduces some functional changes compared to the
> previous version [1] ...
>
> Maybe we should drop the r-b tags and let folks take another look?
>
> There might be an issue with the vma access in madvise_collapse(). See
> below:
>
> [1] https://lore.kernel.org/linux-mm/20251201174627.23295-3-npache@redhat.com/
>
>> [...]
>> @@ -2809,17 +2833,17 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>> hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
>> }
>> mmap_assert_locked(mm);
>> - if (!vma_is_anonymous(vma)) {
>> - struct file *file = get_file(vma->vm_file);
>> - pgoff_t pgoff = linear_page_index(vma, addr);
>> - mmap_read_unlock(mm);
>> - mmap_locked = false;
>> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
>> +
>> + if (!mmap_locked)
>> *lock_dropped = true;
>> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
>> - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>> - mapping_can_writeback(file->f_mapping)) {
>> + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
>> + struct file *file = get_file(vma->vm_file);
>> + pgoff_t pgoff = linear_page_index(vma, addr);
>
>
> After collapse_single_pmd() returns, mmap_lock might have been released.
> Between that unlock and here, another thread could unmap/remap the VMA,
> making the vma pointer stale when we access vma->vm_file?
>
> Would it be safer to get the file reference before calling
> collapse_single_pmd()? Or do we need to revalidate the VMA after
> getting the lock back?
Good catch. I think we can move the filemap_write_and_wait_range()
related logic into collapse_single_pmd(), after we get a file reference.
On Fri, Jan 23, 2026 at 05:31:17PM +0800, Baolin Wang wrote:
> On 1/23/26 1:07 PM, Lance Yang wrote:
> > After collapse_single_pmd() returns, mmap_lock might have been released.
> > Between that unlock and here, another thread could unmap/remap the VMA,
> > making the vma pointer stale when we access vma->vm_file?
> >
> > Would it be safer to get the file reference before calling
> > collapse_single_pmd()? Or do we need to revalidate the VMA after
> > getting the lock back?
>
> Good catch. I think we can move the filemap_write_and_wait_range() related
> logic into collapse_single_pmd(), after we get a file reference.

Good suggestion, and it's what Nico did in the suggested patch :)

Agreed, better there.

Thanks,
Lorenzo
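For completeness, the madvise_collapse() side then reduces to roughly the
following under the proposed fold-in. Again a sketch, assuming the retry
label and triggered_wb flag from the patched function:

	result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
	if (!mmap_locked)
		*lock_dropped = true;

	/*
	 * Writeback already happened inside collapse_single_pmd() while the
	 * file reference was held; here we only decide whether to retry once.
	 */
	if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb) {
		triggered_wb = true;
		goto retry;	/* re-takes mmap_lock and revalidates the VMA */
	}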