Currently, each scan always increases "progress" by HPAGE_PMD_NR,
even if only a single PTE is scanned.

This patch does not change the original semantics of "progress"; it
simply replaces HPAGE_PMD_NR with the exact number of PTEs scanned.

Let me provide a detailed example:
static enum scan_result hpage_collapse_scan_pmd()
{
	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		...
		if (pte_uffd_wp(pteval)) {	<-- first scan hit
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}
	}
}
During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
immediately: only one PTE is actually scanned before termination.
Here "progress += 1" reflects the actual number of PTEs scanned,
whereas previously "progress" was always incremented by HPAGE_PMD_NR.
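
For context, "progress" is the budget consumed against
khugepaged_pages_to_scan in khugepaged's main loop. A simplified
sketch of that consumer (loosely following mm/khugepaged.c; locking,
sleeping and exit handling elided):

static void khugepaged_do_scan(struct collapse_control *cc)
{
	unsigned int progress = 0;
	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
	enum scan_result result = SCAN_SUCCEED;

	while (progress < pages) {
		cond_resched();
		/* Each slot scan adds the amount of work it performed. */
		progress += khugepaged_scan_mm_slot(pages - progress,
						    &result, cc);
	}
}

So an inflated per-scan increment makes khugepaged exhaust its scan
budget earlier than it should.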
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
mm/khugepaged.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2e570f83778c..5c6015ac7b5e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1249,6 +1249,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
						struct vm_area_struct *vma,
						unsigned long start_addr, bool *mmap_locked,
+						int *cur_progress,
						struct collapse_control *cc)
 {
	pmd_t *pmd;
@@ -1264,19 +1265,27 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
-	if (result != SCAN_SUCCEED)
+	if (result != SCAN_SUCCEED) {
+		if (cur_progress)
+			*cur_progress = HPAGE_PMD_NR;
		goto out;
+	}
 
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
	if (!pte) {
+		if (cur_progress)
+			*cur_progress = HPAGE_PMD_NR;
		result = SCAN_NO_PTE_TABLE;
		goto out;
	}
 
	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
+		if (cur_progress)
+			*cur_progress += 1;
+
		pte_t pteval = ptep_get(_pte);
		if (pte_none_or_zero(pteval)) {
			++none_or_zero;
@@ -2297,6 +2306,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
 
 static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
					struct file *file, pgoff_t start,
+					int *cur_progress,
					struct collapse_control *cc)
 {
	struct folio *folio = NULL;
@@ -2337,6 +2347,9 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
			continue;
		}
 
+		if (cur_progress)
+			*cur_progress += folio_nr_pages(folio);
+
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
@@ -2466,6 +2479,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
 
	while (khugepaged_scan.address < hend) {
		bool mmap_locked = true;
+		int cur_progress = 0;
 
		cond_resched();
		if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
@@ -2482,7 +2496,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
-					khugepaged_scan.address, file, pgoff, cc);
+					khugepaged_scan.address, file, pgoff,
+					&cur_progress, cc);
				fput(file);
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
@@ -2496,7 +2511,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
-					khugepaged_scan.address, &mmap_locked, cc);
+					khugepaged_scan.address, &mmap_locked,
+					&cur_progress, cc);
			}
 
			if (*result == SCAN_SUCCEED)
@@ -2504,7 +2520,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
 
			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
-			progress += HPAGE_PMD_NR;
+			progress += cur_progress;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop. Note
@@ -2826,11 +2842,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
-							  cc);
+							  NULL, cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
-							 &mmap_locked, cc);
+							 &mmap_locked, NULL, cc);
		}
		if (!mmap_locked)
			*lock_dropped = true;
--
2.51.0
On 1/11/26 13:19, Vernon Yang wrote:
> [...]
> @@ -2297,6 +2306,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
>  
>  static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> 					struct file *file, pgoff_t start,
> +					int *cur_progress,
> 					struct collapse_control *cc)
>  {
> 	struct folio *folio = NULL;
> @@ -2337,6 +2347,9 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> 			continue;
> 		}
>  
> +		if (cur_progress)
> +			*cur_progress += folio_nr_pages(folio);
> +

Okay, I had another look and I think the file path is confusing. We're
scanning xarray entries. But then, we only count some entries and not
others.
Can we just leave that alone in this patch? That is, always indicate a
progress of HPAGE_PMD_NR right at the start of the function?
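
Something like the following (untested sketch; the signature is the one
from this patch, only the up-front accounting is new):

static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
					struct file *file, pgoff_t start,
					int *cur_progress,
					struct collapse_control *cc)
{
	struct folio *folio = NULL;
	...
	/* File path: always report a full PMD range per scan. */
	if (cur_progress)
		*cur_progress = HPAGE_PMD_NR;
	...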
--
Cheers
David
On Wed, Jan 14, 2026 at 12:38:46PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/11/26 13:19, Vernon Yang wrote:
> > [...]
> > @@ -2337,6 +2347,9 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> > 			continue;
> > 		}
> >  
> > +		if (cur_progress)
> > +			*cur_progress += folio_nr_pages(folio);
> > +
>
> Okay, I had another look and I think the file path is confusing. We're
> scanning xarray entries. But then, we only count some entries and not
> others.
Thank you for the review.
Moving your PATCH #3 comments here:
> Assume we found a single 4k folio in the xarray, but then collapse a 2M THP.
> Is the progress really "1" ?
This example is indeed an issue; I will use "xas->xa_index" to fix it.
> What about shmem swap entries (xa_is_value)?
Sorry, I missed it. I will add "1 << xas_get_order(&xas)" to
"cur_progress".
> Can we just leave that alone in this patch? That is, always indicate a
> progress of HPAGE_PMD_NR right at the start of the function?
Studying the xarray implementation, I found that these issues can be
fixed by using "xas->xa_index": once the scan loop stops, it holds the
index of the last entry inspected, so "xa_index - start" tells us how
many slots have already been covered.

I am sending the code below (patches #2 and #3 squashed). Let's see if
it works; if not, please let me know. Thanks!
--
Thanks,
Vernon
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index be850174e802..f77d97d7b957 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
	xas->xa_node = XAS_RESTART;
 }
 
+/**
+ * xas_get_index() - Get the current index of the XArray operation state.
+ * @xas: XArray operation state.
+ */
+static inline unsigned long xas_get_index(struct xa_state *xas)
+{
+	return xas->xa_index;
+}
+
 /**
  * xas_advance() - Skip over sibling entries.
  * @xas: XArray operation state.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2e570f83778c..7d42035ece5b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -68,7 +68,10 @@ enum scan_result {
 static struct task_struct *khugepaged_thread __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 
-/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
+/*
+ * default scan 8*HPAGE_PMD_NR ptes (or pmd-mapped/no-pte-table ranges,
+ * or vmas) every 10 seconds.
+ */
 static unsigned int khugepaged_pages_to_scan __read_mostly;
 static unsigned int khugepaged_pages_collapsed;
 static unsigned int khugepaged_full_scans;
@@ -1249,6 +1252,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
						struct vm_area_struct *vma,
						unsigned long start_addr, bool *mmap_locked,
+						unsigned int *cur_progress,
						struct collapse_control *cc)
 {
	pmd_t *pmd;
@@ -1263,6 +1267,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
 
	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
 
+	if (cur_progress)
+		*cur_progress += 1;
+
	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
	if (result != SCAN_SUCCEED)
		goto out;
@@ -1403,6 +1410,8 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
	} else {
		result = SCAN_SUCCEED;
	}
+	if (cur_progress)
+		*cur_progress += _pte - pte;
 out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
@@ -2297,6 +2306,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
 
 static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
					struct file *file, pgoff_t start,
+					unsigned int *cur_progress,
					struct collapse_control *cc)
 {
	struct folio *folio = NULL;
@@ -2386,6 +2396,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
			cond_resched_rcu();
		}
	}
+	if (cur_progress) {
+		unsigned long idx = xas_get_index(&xas) - start;
+
+		if (folio == NULL)
+			*cur_progress += HPAGE_PMD_NR;
+		else if (xa_is_value(folio))
+			*cur_progress += idx + (1 << xas_get_order(&xas));
+		else if (folio_order(folio) == HPAGE_PMD_ORDER)
+			*cur_progress += idx + 1;
+		else
+			*cur_progress += idx + folio_nr_pages(folio);
+	}
	rcu_read_unlock();
 
	if (result == SCAN_SUCCEED) {
@@ -2466,6 +2488,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
 
	while (khugepaged_scan.address < hend) {
		bool mmap_locked = true;
+		unsigned int cur_progress = 0;
 
		cond_resched();
		if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
@@ -2482,7 +2505,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
-					khugepaged_scan.address, file, pgoff, cc);
+					khugepaged_scan.address, file, pgoff,
+					&cur_progress, cc);
				fput(file);
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
@@ -2496,7 +2520,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
-					khugepaged_scan.address, &mmap_locked, cc);
+					khugepaged_scan.address, &mmap_locked,
+					&cur_progress, cc);
			}
 
			if (*result == SCAN_SUCCEED)
@@ -2504,7 +2529,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
 
			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
-			progress += HPAGE_PMD_NR;
+			progress += cur_progress;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop. Note
@@ -2826,11 +2851,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
-							  cc);
+							  NULL, cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
-							 &mmap_locked, cc);
+							 &mmap_locked, NULL, cc);
		}
		if (!mmap_locked)
			*lock_dropped = true;
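
To illustrate the file-path accounting above with a hypothetical case:
suppose the scan stops at a 16-page (order-4) folio whose head index is
start + 32. Then xas_get_index(&xas) - start == 32, and cur_progress
grows by 32 + folio_nr_pages(folio) == 48 of the HPAGE_PMD_NR (512 with
4K pages and a 2M THP) slots. If the loop instead walks the whole
range, xas_for_each() terminates with folio == NULL and the full
HPAGE_PMD_NR is charged.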
On Sat, Jan 17, 2026 at 12:18:20PM +0800, Vernon Yang wrote:
> [...]
> @@ -1403,6 +1410,8 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> 	} else {
> 		result = SCAN_SUCCEED;
> 	}
> +	if (cur_progress)
> +		*cur_progress += _pte - pte;
>  out_unmap:
> 	pte_unmap_unlock(pte, ptl);
Checking it over once more carefully: the "cur_progress" accounting
should be placed under the "out_unmap" label, not above it, so that the
early-exit paths that jump to out_unmap also have their scanned PTEs
counted. As shown below:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c92c7e34ef6f..a1b4fdbee8e1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1421,9 +1421,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
	} else {
		result = SCAN_SUCCEED;
	}
+out_unmap:
	if (cur_progress)
		*cur_progress += _pte - pte;
-out_unmap:
	pte_unmap_unlock(pte, ptl);
> [...]
On 1/11/26 13:19, Vernon Yang wrote:

> 			mmap_locked = false;
> 			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
> -							  cc);
> +							  NULL, cc);
> 			fput(file);
> 		} else {
> 			result = hpage_collapse_scan_pmd(mm, vma, addr,
> -							 &mmap_locked, cc);
> +							 &mmap_locked, NULL, cc);
> 		}
> 		if (!mmap_locked)
> 			*lock_dropped = true;
Seeing the "NULL" tells me that this patch does effectively nothing and
should likely get squashed into #3 ?
--
Cheers
David
On 1/14/26 12:23, David Hildenbrand (Red Hat) wrote:
>
>> [...]
>
> Seeing the "NULL" tells me that this patch does effectively nothing and
> should likely get squashed into #3 ?
Ah, sorry, I missed the other instances where you pass something.
From another glimpse, LGTM (although I think the end result might look
better when both patches are squashed; anyhow, patch #3 fixes these
instances up)
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
--
Cheers
David