From: Vernon Yang <yanglincheng@kylinos.cn>
Currently, each scan always increases "progress" by HPAGE_PMD_NR,
even if only scanning a single PTE/PMD entry.
- When only scanning a single PTE entry, let me provide a detailed
example:
static int hpage_collapse_scan_pmd()
{
for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
...
if (pte_uffd_wp(pteval)) { <-- first scan hit
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
}
}
During the first iteration, if pte_uffd_wp(pteval) is true, the loop
exits directly, so in practice only one PTE is scanned before
termination. Here "progress += 1" reflects the actual number of PTEs
scanned, whereas previously "progress" was always incremented by
HPAGE_PMD_NR (e.g. 512 with 4K pages).
- When the memory has been collapsed to PMD, let me provide a detailed
example:
The following data is traced by bpftrace on a desktop system. After
the system has been left idle for 10 minutes upon booting, a lot of
SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
by khugepaged.
@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
@scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
@scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
@scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
total progress size: 674 MB
Total time : 419 seconds ## includes khugepaged_scan_sleep_millisecs
The khugepaged_scan list saves every task that supports collapsing into
hugepages; as long as a task is not destroyed, khugepaged will not
remove it from the khugepaged_scan list. This leads to a phenomenon
where a task has already collapsed all of its memory regions into
hugepages, yet khugepaged continues to scan it, wasting CPU time for no
benefit. Moreover, because of khugepaged_scan_sleep_millisecs (default
10s), rescanning a large number of these already-collapsed tasks causes
a long wait, so tasks that really still need scanning are reached later.
After applying this patch, when the memory is either SCAN_PMD_MAPPED or
SCAN_NO_PTE_TABLE, it is simply skipped, as follows:
@scan_pmd_status[6]: 2
@scan_pmd_status[3]: 147
@scan_pmd_status[2]: 173
total progress size: 45 MB
Total time : 20 seconds
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
include/linux/xarray.h | 9 ++++++++
mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
2 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index be850174e802..f77d97d7b957 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
xas->xa_node = XAS_RESTART;
}
+/**
+ * xas_get_index() - Get the current index of the XArray operation state.
+ * @xas: XArray operation state.
+ */
+static inline unsigned long xas_get_index(struct xa_state *xas)
+{
+ return xas->xa_index;
+}
+
/**
* xas_advance() - Skip over sibling entries.
* @xas: XArray operation state.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6f0f05148765..de95029e3763 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -68,7 +68,10 @@ enum scan_result {
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
-/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
+/*
+ * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
+ * every 10 seconds.
+ */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
@@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
}
static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ bool *mmap_locked, unsigned int *cur_progress,
struct collapse_control *cc)
{
pmd_t *pmd;
@@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
+ if (cur_progress)
+ *cur_progress += 1;
+
result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED)
goto out;
@@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_SUCCEED;
}
out_unmap:
+ if (cur_progress) {
+ if (_pte >= pte + HPAGE_PMD_NR)
+ *cur_progress += HPAGE_PMD_NR - 1;
+ else
+ *cur_progress += _pte - pte;
+ }
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
result = collapse_huge_page(mm, start_addr, referenced,
@@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start, struct collapse_control *cc)
+static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
+ unsigned long addr, struct file *file, pgoff_t start,
+ unsigned int *cur_progress, struct collapse_control *cc)
{
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
@@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
cond_resched_rcu();
}
}
+ if (cur_progress) {
+ unsigned long idx = xas_get_index(&xas) - start;
+
+ if (folio == NULL)
+ *cur_progress += HPAGE_PMD_NR;
+ else if (xa_is_value(folio))
+ *cur_progress += idx + (1 << xas_get_order(&xas));
+ else if (folio_order(folio) == HPAGE_PMD_ORDER)
+ *cur_progress += idx + 1;
+ else
+ *cur_progress += idx + folio_nr_pages(folio);
+ }
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
@@ -2456,6 +2482,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
while (khugepaged_scan.address < hend) {
bool mmap_locked = true;
+ unsigned int cur_progress = 0;
cond_resched();
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
@@ -2472,7 +2499,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
mmap_read_unlock(mm);
mmap_locked = false;
*result = hpage_collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff, cc);
+ khugepaged_scan.address, file, pgoff,
+ &cur_progress, cc);
fput(file);
if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
@@ -2486,7 +2514,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
}
} else {
*result = hpage_collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked, cc);
+ khugepaged_scan.address, &mmap_locked,
+ &cur_progress, cc);
}
if (*result == SCAN_SUCCEED)
@@ -2494,7 +2523,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
- progress += HPAGE_PMD_NR;
+ progress += cur_progress;
if (!mmap_locked)
/*
* We released mmap_lock so break loop. Note
@@ -2817,7 +2846,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_locked = false;
*lock_dropped = true;
result = hpage_collapse_scan_file(mm, addr, file, pgoff,
- cc);
+ NULL, cc);
if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
mapping_can_writeback(file->f_mapping)) {
@@ -2832,7 +2861,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
fput(file);
} else {
result = hpage_collapse_scan_pmd(mm, vma, addr,
- &mmap_locked, cc);
+ &mmap_locked, NULL, cc);
}
if (!mmap_locked)
*lock_dropped = true;
--
2.51.0
On 23/01/26 1:52 pm, Vernon Yang wrote:
> From: Vernon Yang <yanglincheng@kylinos.cn>
>
> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
> even if only scanning a single PTE/PMD entry.
[...]
> - When the memory has been collapsed to PMD, let me provide a detailed
> example:
>
> The following data is traced by bpftrace on a desktop system. After
> the system has been left idle for 10 minutes upon booting, a lot of
> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
> by khugepaged.
>
> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
Could you elaborate on what the [1], [6], etc. and the 1, 2, 142, etc. are?
[...]
> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> + struct vm_area_struct *vma, unsigned long start_addr,
> + bool *mmap_locked, unsigned int *cur_progress,
> struct collapse_control *cc)
> {
> pmd_t *pmd;
> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>
> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>
> + if (cur_progress)
> + *cur_progress += 1;
Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
idea as to why on function entry we do a +1 right away. Please do add some comments too.
[...]
> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> cond_resched_rcu();
> }
> }
> + if (cur_progress) {
> + unsigned long idx = xas_get_index(&xas) - start;
> +
> + if (folio == NULL)
> + *cur_progress += HPAGE_PMD_NR;
I think this whole block needs some comments. Can you explain why you
do a particular increment in each case?
> + else if (xa_is_value(folio))
> + *cur_progress += idx + (1 << xas_get_order(&xas));
> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
> + *cur_progress += idx + 1;
> + else
> + *cur_progress += idx + folio_nr_pages(folio);
> + }
[...]
On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
>
> On 23/01/26 1:52 pm, Vernon Yang wrote:
[...]
> > The following data is traced by bpftrace on a desktop system. After
> > the system has been left idle for 10 minutes upon booting, a lot of
> > SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
> > by khugepaged.
> >
> > @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> > @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
> > @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
> > @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
>
> Could you elaborate on what the [1], [6], etc. and the 1, 2, 142, etc. are?
The 1, 6, etc. in brackets are the values of "enum scan_result"; you can
refer directly to the notes on the right.
The 1, 2, 142, 178 are the counts of the different "enum scan_result"
values, coming from trace_mm_khugepaged_scan_pmd and
trace_mm_khugepaged_scan_file.
As an example, SCAN_PMD_MAPPED was hit 142 times during a full scan by
khugepaged.
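For reference, counters in that form can be collected with a short
bpftrace program along these lines (a minimal sketch, assuming the
huge_memory:mm_khugepaged_scan_pmd tracepoint and its "status" field;
the mm_khugepaged_scan_file tracepoint can be counted the same way):

	tracepoint:huge_memory:mm_khugepaged_scan_pmd
	{
		/* one count() bucket per "enum scan_result" value */
		@scan_pmd_status[args->status] = count();
	}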
[...]
> > @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> > }
> >
> > static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> > + struct vm_area_struct *vma, unsigned long start_addr,
> > + bool *mmap_locked, unsigned int *cur_progress,
> > struct collapse_control *cc)
> > {
> > pmd_t *pmd;
> > @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >
> > VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
> >
> > + if (cur_progress)
> > + *cur_progress += 1;
>
> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
> idea as to why on function entry we do a +1 right away. Please do add some comments too.
If this way is not clear enough, we can add the 1 directly where
find_pmd_or_thp_or_none() etc. fail, BUT it's a bit redundant.
Please take a look at which one is better.
case 1:
as in the V4 PATCH #2 [1] and #3 [2]; only hpage_collapse_scan_pmd() is shown.
[1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
[2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *mmap_locked, unsigned int *cur_progress,
struct collapse_control *cc)
{
...
result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED) {
if (cur_progress)
*cur_progress += 1; // here
goto out;
}
...
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
if (cur_progress)
*cur_progress += 1; // here
result = SCAN_NO_PTE_TABLE;
goto out;
}
for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
if (cur_progress)
*cur_progress += 1; // here
...
}
}
case 2:
static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *mmap_locked, unsigned int *cur_progress,
struct collapse_control *cc)
{
...
result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED) {
if (cur_progress)
*cur_progress += 1; // here
goto out;
}
...
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
if (cur_progress)
*cur_progress += 1; // here
result = SCAN_NO_PTE_TABLE;
goto out;
}
for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
...
}
...
out_unmap:
if (cur_progress) {
if (_pte >= pte + HPAGE_PMD_NR)
*cur_progress += HPAGE_PMD_NR; // here
else
*cur_progress += _pte - pte + 1; // here
}
}
case 3:
keep the current patch, and add more comments to make it clearer.
[...]
> > @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> > cond_resched_rcu();
> > }
> > }
> > + if (cur_progress) {
> > + unsigned long idx = xas_get_index(&xas) - start;
> > +
> > + if (folio == NULL)
> > + *cur_progress += HPAGE_PMD_NR;
>
> I think this whole block needs some comments. Can you explain why you
> do a particular increment in each case?
>
> > + else if (xa_is_value(folio))
> > + *cur_progress += idx + (1 << xas_get_order(&xas));
> > + else if (folio_order(folio) == HPAGE_PMD_ORDER)
> > + *cur_progress += idx + 1;
> > + else
> > + *cur_progress += idx + folio_nr_pages(folio);
> > + }
The "idx" represent PTEs number already scanned when exiting
xas_for_each().
However, the last valid folio size was not counted in "idx" (except
folio == NULL, "idx" equal to HPAGE_PMD_NR), which can be further
divided into three cases:
1. shmem swap entries (xa_is_value), add folio size.
2. the folio is HPAGE_PMD_ORDER, the memory has been collapsed
to PMD, so add 1 only.
3. Normal folio, add folio size.
Is the version below more readable?
if (cur_progress) {
*cur_progress += xas.xa_index - start;
if (folio) {
if (xa_is_value(folio))
*cur_progress += 1 << xas_get_order(&xas);
else if (folio_order(folio) == HPAGE_PMD_ORDER)
*cur_progress += 1;
else
*cur_progress += folio_nr_pages(folio);
}
}
[...]
--
Thanks,
Vernon
On 28/01/26 8:04 pm, Vernon Yang wrote:
> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
>> On 23/01/26 1:52 pm, Vernon Yang wrote:
[...]
>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
>>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
>>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
>> Could you elaborate on what the [1], [6], etc. and the 1, 2, 142, etc. are?
> The 1, 6, etc. in brackets are the values of "enum scan_result"; you can
> refer directly to the notes on the right.
>
> The 1, 2, 142, 178 are the counts of the different "enum scan_result"
> values, coming from trace_mm_khugepaged_scan_pmd and
> trace_mm_khugepaged_scan_file.
>
> As an example, SCAN_PMD_MAPPED was hit 142 times during a full scan by
> khugepaged.
Thanks. Can you please mention this in the patch description? You can simply
write it like this:
"From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
following statuses were observed, with frequency mentioned next to them:
SCAN_SUCCEED: 1
SCAN_PMD_MAPPED: 142
....."
and so on.
[...]
>>> + if (cur_progress)
>>> + *cur_progress += 1;
>> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
>> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
>> idea as to why on function entry we do a +1 right away. Please do add some comments too.
> If this way is not clear enough, we can add the 1 directly where
> find_pmd_or_thp_or_none() etc. fail, BUT it's a bit redundant.
> Please take a look at which one is better.
>
> case 1:
> as in the V4 PATCH #2 [1] and #3 [2]; only hpage_collapse_scan_pmd() is shown.
> [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
> [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
>
> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma, unsigned long start_addr,
> bool *mmap_locked, unsigned int *cur_progress,
> struct collapse_control *cc)
> {
> ...
> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> if (result != SCAN_SUCCEED) {
> if (cur_progress)
> *cur_progress += 1; // here
> goto out;
> }
> ...
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> if (cur_progress)
> *cur_progress += 1; // here
> result = SCAN_NO_PTE_TABLE;
> goto out;
> }
>
> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> _pte++, addr += PAGE_SIZE) {
> if (cur_progress)
> *cur_progress += 1; // here
> ...
> }
> }
>
> case 2:
>
> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma, unsigned long start_addr,
> bool *mmap_locked, unsigned int *cur_progress,
> struct collapse_control *cc)
> {
> ...
> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> if (result != SCAN_SUCCEED) {
> if (cur_progress)
> *cur_progress += 1; // here
Let us be more explicit and set this equal to 1, instead of adding 1.
> goto out;
> }
> ...
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> if (cur_progress)
> *cur_progress += 1; // here
Same comment as above.
> result = SCAN_NO_PTE_TABLE;
> goto out;
> }
>
> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> _pte++, addr += PAGE_SIZE) {
> ...
> }
> ...
> out_unmap:
> if (cur_progress) {
> if (_pte >= pte + HPAGE_PMD_NR)
> *cur_progress += HPAGE_PMD_NR; // here
> else
> *cur_progress += _pte - pte + 1; // here
> }
> }
I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
branch is checked on each iteration - and I don't think the compiler can
optimize this, since the body of the loop is complex, so the check cannot
be hoisted out of the loop.
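For illustration only (a hypothetical sketch, not from the patch): case 1
could keep per-PTE accounting while doing the NULL check once, by counting
into a local and publishing it after the loop; case 2's "_pte - pte"
arithmetic achieves the same without the extra variable:

	unsigned int scanned = 0;	/* hypothetical local counter */
	...
	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, addr += PAGE_SIZE) {
		scanned++;	/* count the PTE being examined */
		...
	}
	...
out_unmap:
	if (cur_progress)
		*cur_progress += scanned;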
>
> case 3:
> keep the current patch, and add more comments to make it clearer.
>
>>> +
>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>> if (result != SCAN_SUCCEED)
>>> goto out;
>>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>> result = SCAN_SUCCEED;
>>> }
>>> out_unmap:
>>> + if (cur_progress) {
>>> + if (_pte >= pte + HPAGE_PMD_NR)
>>> + *cur_progress += HPAGE_PMD_NR - 1;
>>> + else
>>> + *cur_progress += _pte - pte;
>>> + }
>>> pte_unmap_unlock(pte, ptl);
>>> if (result == SCAN_SUCCEED) {
>>> result = collapse_huge_page(mm, start_addr, referenced,
>>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
>>> return result;
>>> }
>>>
>>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>>> - struct file *file, pgoff_t start, struct collapse_control *cc)
>>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
>>> + unsigned long addr, struct file *file, pgoff_t start,
>>> + unsigned int *cur_progress, struct collapse_control *cc)
>>> {
>>> struct folio *folio = NULL;
>>> struct address_space *mapping = file->f_mapping;
>>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
>>> cond_resched_rcu();
>>> }
>>> }
>>> + if (cur_progress) {
>>> + unsigned long idx = xas_get_index(&xas) - start;
>>> +
>>> + if (folio == NULL)
>>> + *cur_progress += HPAGE_PMD_NR;
>> I think this whole block needs some comments. Can you explain why you
>> do a particular increment in each case?
>>
>>> + else if (xa_is_value(folio))
>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>> + *cur_progress += idx + 1;
>>> + else
>>> + *cur_progress += idx + folio_nr_pages(folio);
>>> + }
> The "idx" represent PTEs number already scanned when exiting
> xas_for_each().
>
> However, the last valid folio size was not counted in "idx" (except
> folio == NULL, "idx" equal to HPAGE_PMD_NR), which can be further
> divided into three cases:
But the number of PTEs you account for in these three cases was *not*
actually scanned, right? So we can simply drop these 3 cases.
>
> 1. shmem swap entries (xa_is_value): add the entry size.
> 2. the folio is HPAGE_PMD_ORDER, i.e. the memory has already been
> collapsed to PMD: add 1 only.
> 3. a normal folio: add the folio size.
>
> Is the version below more readable?
>
> if (cur_progress) {
> *cur_progress += xas.xa_index - start;
>
> if (folio) {
> if (xa_is_value(folio))
> *cur_progress += 1 << xas_get_order(&xas);
> else if (folio_order(folio) == HPAGE_PMD_ORDER)
> *cur_progress += 1;
> else
> *cur_progress += folio_nr_pages(folio);
> }
> }
Yep, this is unneeded complexity. This looks really ugly and the benefits of
this are not clear. You can simply do
if (cur_progress)
*cur_progress = xas.xa_index - start;
[...]
On 2026/1/29 13:35, Dev Jain wrote:
>
> On 28/01/26 8:04 pm, Vernon Yang wrote:
>> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
>>> On 23/01/26 1:52 pm, Vernon Yang wrote:
[...]
>>>> + if (cur_progress) {
>>>> + unsigned long idx = xas_get_index(&xas) - start;
>>>> +
>>>> + if (folio == NULL)
>>>> + *cur_progress += HPAGE_PMD_NR;
>>> I think this whole block needs some comments. Can you explain, why you
>>> do a particular increment in each case?
>>>
>>>> + else if (xa_is_value(folio))
>>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
>>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>>> + *cur_progress += idx + 1;
>>>> + else
>>>> + *cur_progress += idx + folio_nr_pages(folio);
>>>> + }
>> The "idx" represent PTEs number already scanned when exiting
>> xas_for_each().
>>
>> However, the last valid folio size was not counted in "idx" (except
>> folio == NULL, "idx" equal to HPAGE_PMD_NR), which can be further
>> divided into three cases:
>
> But the number of PTEs you account for in these three cases was *not*
> actually scanned, right? So we can simply drop these 3 cases.
>
>>
>> 1. shmem swap entries (xa_is_value): add the entry size.
>> 2. the folio is HPAGE_PMD_ORDER, i.e. the memory has already been
>> collapsed to PMD: add 1 only.
>> 3. a normal folio: add the folio size.
>>
>> Is the version below more readable?
>>
>> if (cur_progress) {
>> *cur_progress += xas.xa_index - start;
>>
>> if (folio) {
>> if (xa_is_value(folio))
>> *cur_progress += 1 << xas_get_order(&xas);
>> else if (folio_order(folio) == HPAGE_PMD_ORDER)
>> *cur_progress += 1;
>> else
>> *cur_progress += folio_nr_pages(folio);
>> }
>> }
>
> Yep, this is unneeded complexity. This looks really ugly and the benefits of
> this are not clear. You can simply do
>
> if (cur_progress)
> *cur_progress = xas.xa_index - start;
>
I agree with Dev here. The extra complexity in hpage_collapse_scan_file()
doesn't seem worth it.
Suggestion:
	if (cur_progress)
		*cur_progress = max(xas.xa_index - start, 1UL);
That keeps it simple, and handles the idx == 0 case you mentioned as well.
[...]
On Thu, Jan 29, 2026 at 5:18 PM Lance Yang <lance.yang@linux.dev> wrote:
>
> On 2026/1/29 13:35, Dev Jain wrote:
> >
> > On 28/01/26 8:04 pm, Vernon Yang wrote:
> >> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
> >>> On 23/01/26 1:52 pm, Vernon Yang wrote:
> >>>> From: Vernon Yang <yanglincheng@kylinos.cn>
> >>>>
> >>>> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
> >>>> even if only scanning a single PTE/PMD entry.
> >>>>
> >>>> - When only scanning a sigle PTE entry, let me provide a detailed
> >>>> example:
> >>>>
> >>>> static int hpage_collapse_scan_pmd()
> >>>> {
> >>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >>>> _pte++, addr += PAGE_SIZE) {
> >>>> pte_t pteval = ptep_get(_pte);
> >>>> ...
> >>>> if (pte_uffd_wp(pteval)) { <-- first scan hit
> >>>> result = SCAN_PTE_UFFD_WP;
> >>>> goto out_unmap;
> >>>> }
> >>>> }
> >>>> }
> >>>>
> >>>> During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
> >>>> directly. In practice, only one PTE is scanned before termination.
> >>>> Here, "progress += 1" reflects the actual number of PTEs scanned, but
> >>>> previously "progress += HPAGE_PMD_NR" always.
> >>>>
> >>>> - When the memory has been collapsed to PMD, let me provide a detailed
> >>>> example:
> >>>>
> >>>> The following data is traced by bpftrace on a desktop system. After
> >>>> the system has been left idle for 10 minutes upon booting, a lot of
> >>>> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
> >>>> by khugepaged.
> >>>>
> >>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >>>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
> >>>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
> >>>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
> >>> Could you elaborate what is [1], [6] etc and 1,2,142, etc?
> >> These 1,6 are value of "enum scan_result", you can directly refer to the
> >> notes on the right.
> >>
> >> These 1,2,142,178 are number of different "enum scan_result" from
> >> trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file.
> >>
> >> as example, SCAN_PMD_MAPPED has 142 times during a full scan by
> >> khugepaged.
> >
> > Thanks. Can you please mention this in the patch description. You can simply
> > right it like this:
> >
> > "From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
> > following statuses were observed, with frequency mentioned next to them:
> >
> > SCAN_SUCCEED: 1
> > SCAN_PMD_MAPPED: 142
> > ....."
> >
> > and so on.
> >
> >>
> >>>> total progress size: 674 MB
> >>>> Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs
> >>>>
> >>>> The khugepaged_scan list save all task that support collapse into hugepage,
> >>>> as long as the task is not destroyed, khugepaged will not remove it from
> >>>> the khugepaged_scan list. This exist a phenomenon where task has already
> >>>> collapsed all memory regions into hugepage, but khugepaged continues to
> >>>> scan it, which wastes CPU time and invalid, and due to
> >>>> khugepaged_scan_sleep_millisecs (default 10s) causes a long wait for
> >>>> scanning a large number of invalid task, so scanning really valid task
> >>>> is later.
> >>>>
> >>>> After applying this patch, when the memory is either SCAN_PMD_MAPPED or
> >>>> SCAN_NO_PTE_TABLE, just skip it, as follow:
> >>>>
> >>>> @scan_pmd_status[6]: 2
> >>>> @scan_pmd_status[3]: 147
> >>>> @scan_pmd_status[2]: 173
> >>>> total progress size: 45 MB
> >>>> Total time : 20 seconds
> >>>>
> >>>> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> >>>> ---
> >>>> include/linux/xarray.h | 9 ++++++++
> >>>> mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
> >>>> 2 files changed, 47 insertions(+), 9 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> >>>> index be850174e802..f77d97d7b957 100644
> >>>> --- a/include/linux/xarray.h
> >>>> +++ b/include/linux/xarray.h
> >>>> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
> >>>> xas->xa_node = XAS_RESTART;
> >>>> }
> >>>>
> >>>> +/**
> >>>> + * xas_get_index() - Get XArray operation state for a different index.
> >>>> + * @xas: XArray operation state.
> >>>> + */
> >>>> +static inline unsigned long xas_get_index(struct xa_state *xas)
> >>>> +{
> >>>> + return xas->xa_index;
> >>>> +}
> >>>> +
> >>>> /**
> >>>> * xas_advance() - Skip over sibling entries.
> >>>> * @xas: XArray operation state.
> >>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>>> index 6f0f05148765..de95029e3763 100644
> >>>> --- a/mm/khugepaged.c
> >>>> +++ b/mm/khugepaged.c
> >>>> @@ -68,7 +68,10 @@ enum scan_result {
> >>>> static struct task_struct *khugepaged_thread __read_mostly;
> >>>> static DEFINE_MUTEX(khugepaged_mutex);
> >>>>
> >>>> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
> >>>> +/*
> >>>> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> >>>> + * every 10 second.
> >>>> + */
> >>>> static unsigned int khugepaged_pages_to_scan __read_mostly;
> >>>> static unsigned int khugepaged_pages_collapsed;
> >>>> static unsigned int khugepaged_full_scans;
> >>>> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> >>>> }
> >>>>
> >>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> >>>> + struct vm_area_struct *vma, unsigned long start_addr,
> >>>> + bool *mmap_locked, unsigned int *cur_progress,
> >>>> struct collapse_control *cc)
> >>>> {
> >>>> pmd_t *pmd;
> >>>> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>>
> >>>> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
> >>>>
> >>>> + if (cur_progress)
> >>>> + *cur_progress += 1;
> >>> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
> >>> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
> >>> idea as to why on function entry we do a +1 right away. Please do add some comments too.
> >> If this way is not clear enough, we can directly add 1 in
> >> find_pmd_or_thp_or_none() etc, BUT it's a bit redundant.
> >> Please take a look at which one is better.
> >>
> >> case 1:
> >> as in the V4 PATCH #2 [1] and #3 [2], only in hpage_collapse_scan_pmd().
> >> [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
> >> [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
> >>
> >> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >> struct vm_area_struct *vma, unsigned long start_addr,
> >> bool *mmap_locked, unsigned int *cur_progress,
> >> struct collapse_control *cc)
> >> {
> >> ...
> >> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >> if (result != SCAN_SUCCEED) {
> >> if (cur_progress)
> >> *cur_progress += 1; // here
> >> goto out;
> >> }
> >> ...
> >> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> >> if (!pte) {
> >> if (cur_progress)
> >> *cur_progress += 1; // here
> >> result = SCAN_NO_PTE_TABLE;
> >> goto out;
> >> }
> >>
> >> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >> _pte++, addr += PAGE_SIZE) {
> >> if (cur_progress)
> >> *cur_progress += 1; // here
> >> ...
> >> }
> >> }
> >>
> >> case 2:
> >>
> >> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >> struct vm_area_struct *vma, unsigned long start_addr,
> >> bool *mmap_locked, unsigned int *cur_progress,
> >> struct collapse_control *cc)
> >> {
> >> ...
> >> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >> if (result != SCAN_SUCCEED) {
> >> if (cur_progress)
> >> *cur_progress += 1; // here
> >
> > Let us be more explicit and set this equal to 1, instead of adding 1.
> >
> >> goto out;
> >> }
> >> ...
> >> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> >> if (!pte) {
> >> if (cur_progress)
> >> *cur_progress += 1; // here
> >
> > Same comment as above.
> >
> >> result = SCAN_NO_PTE_TABLE;
> >> goto out;
> >> }
> >>
> >> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >> _pte++, addr += PAGE_SIZE) {
> >> ...
> >> }
> >> ...
> >> out_unmap:
> >> if (cur_progress) {
> >> if (_pte >= pte + HPAGE_PMD_NR)
> >> *cur_progress += HPAGE_PMD_NR; // here
> >> else
> >> *cur_progress += _pte - pte + 1; // here
> >> }
> >> }
> >
> > I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
> > branch will be checked on each iteration - and I don't think the compiler
> > can optimize this, since the body of the loop is complex, so the check
> > cannot be hoisted out of the loop.
> >
> >
> >>
> >> case 3:
> >> the current patch, with more comments added to make it clearer.
> >>
> >>>> +
> >>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >>>> if (result != SCAN_SUCCEED)
> >>>> goto out;
> >>>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>> result = SCAN_SUCCEED;
> >>>> }
> >>>> out_unmap:
> >>>> + if (cur_progress) {
> >>>> + if (_pte >= pte + HPAGE_PMD_NR)
> >>>> + *cur_progress += HPAGE_PMD_NR - 1;
> >>>> + else
> >>>> + *cur_progress += _pte - pte;
> >>>> + }
> >>>> pte_unmap_unlock(pte, ptl);
> >>>> if (result == SCAN_SUCCEED) {
> >>>> result = collapse_huge_page(mm, start_addr, referenced,
> >>>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
> >>>> return result;
> >>>> }
> >>>>
> >>>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> >>>> - struct file *file, pgoff_t start, struct collapse_control *cc)
> >>>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
> >>>> + unsigned long addr, struct file *file, pgoff_t start,
> >>>> + unsigned int *cur_progress, struct collapse_control *cc)
> >>>> {
> >>>> struct folio *folio = NULL;
> >>>> struct address_space *mapping = file->f_mapping;
> >>>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> >>>> cond_resched_rcu();
> >>>> }
> >>>> }
> >>>> + if (cur_progress) {
> >>>> + unsigned long idx = xas_get_index(&xas) - start;
> >>>> +
> >>>> + if (folio == NULL)
> >>>> + *cur_progress += HPAGE_PMD_NR;
> >>> I think this whole block needs some comments. Can you explain, why you
> >>> do a particular increment in each case?
> >>>
> >>>> + else if (xa_is_value(folio))
> >>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
> >>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
> >>>> + *cur_progress += idx + 1;
> >>>> + else
> >>>> + *cur_progress += idx + folio_nr_pages(folio);
> >>>> + }
> >> The "idx" represents the number of PTEs already scanned when exiting
> >> xas_for_each().
> >>
> >> However, the size of the last valid folio is not counted in "idx" (except
> >> when folio == NULL, where "idx" equals HPAGE_PMD_NR), which can be further
> >> divided into three cases:
> >
> > But the PTEs you account for in these three cases are *not* actually
> > scanned, right? So we can simply drop these 3 cases.
> >
> >>
> >> 1. shmem swap entries (xa_is_value), add folio size.
> >> 2. the folio is HPAGE_PMD_ORDER, the memory has been collapsed
> >> to PMD, so add 1 only.
> >> 3. Normal folio, add folio size.
> >>
> >> Is the version below more readable?
> >>
> >> if (cur_progress) {
> >> *cur_progress += xas.xa_index - start;
> >>
> >> if (folio) {
> >> if (xa_is_value(folio))
> >> *cur_progress += 1 << xas_get_order(&xas);
> >> else if (folio_order(folio) == HPAGE_PMD_ORDER)
> >> *cur_progress += 1;
> >> else
> >> *cur_progress += folio_nr_pages(folio);
> >> }
> >> }
> >
> > Yep, this is unneeded complexity. This looks really ugly and the benefits of
> > this are not clear. You can simply do
> >
> > if (cur_progress)
> > *cur_progress = xas.xa_index - start;
> >
>
> I agree with Dev here. The extra complexity in hpage_collapse_scan_file()
> doesn't seem worth it.
>
> Suggest:
>
> if (cur_progress)
> *cur_progress = max(xas.xa_index - start, 1UL);
>
> Just keeps it simple, and handles the idx=0 case you mentioned as well.
Thank you for your suggestion, but two scenarios are still missing. For a
detailed explanation, please refer to the link below.
https://lore.kernel.org/linux-mm/20260123082232.16413-1-vernon2gm@gmail.com/T/#mc3969cb0d52e28ffea2fb96260f0880a5f005848
On Thu, Jan 29, 2026 at 11:05:36AM +0530, Dev Jain wrote:
>
> On 28/01/26 8:04 pm, Vernon Yang wrote:
> > On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
> >> On 23/01/26 1:52 pm, Vernon Yang wrote:
> >>> From: Vernon Yang <yanglincheng@kylinos.cn>
> >>>
> >>> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
> >>> even if only scanning a single PTE/PMD entry.
> >>>
> >>> - When only scanning a single PTE entry, let me provide a detailed
> >>> example:
> >>>
> >>> static int hpage_collapse_scan_pmd()
> >>> {
> >>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >>> _pte++, addr += PAGE_SIZE) {
> >>> pte_t pteval = ptep_get(_pte);
> >>> ...
> >>> if (pte_uffd_wp(pteval)) { <-- first scan hit
> >>> result = SCAN_PTE_UFFD_WP;
> >>> goto out_unmap;
> >>> }
> >>> }
> >>> }
> >>>
> >>> During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
> >>> directly. In practice, only one PTE is scanned before termination.
> >>> Here, "progress += 1" reflects the actual number of PTEs scanned,
> >>> whereas previously it was always "progress += HPAGE_PMD_NR".
> >>>
> >>> - When the memory has been collapsed to PMD, let me provide a detailed
> >>> example:
> >>>
> >>> The following data is traced by bpftrace on a desktop system. After
> >>> the system has been left idle for 10 minutes upon booting, a lot of
> >>> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
> >>> by khugepaged.
> >>>
> >>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
> >>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
> >>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
> >> Could you elaborate on what [1], [6], etc. and 1, 2, 142, etc. are?
> > These 1, 6 are values of "enum scan_result"; you can directly refer to
> > the notes on the right.
> >
> > These 1, 2, 142, 178 are the counts of the different "enum scan_result"
> > values from trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file.
> >
> > As an example, SCAN_PMD_MAPPED occurred 142 times during a full scan by
> > khugepaged.
>
> Thanks. Can you please mention this in the patch description? You can
> simply write it like this:
>
> "From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
> following statuses were observed, with frequency mentioned next to them:
>
> SCAN_SUCCEED: 1
> SCAN_PMD_MAPPED: 142
> ....."
>
> and so on.
LGTM, I will do it in the next version. Thanks!
> >
> >>> total progress size: 674 MB
> >>> Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs
> >>>
> >>> The khugepaged_scan list saves all tasks that are eligible for collapse
> >>> into hugepages; as long as a task is not destroyed, khugepaged will not
> >>> remove it from the khugepaged_scan list. This leads to a phenomenon where
> >>> a task has already collapsed all of its memory regions into hugepages, yet
> >>> khugepaged keeps scanning it, which wastes CPU time for no benefit. Due to
> >>> khugepaged_scan_sleep_millisecs (default 10s), scanning a large number of
> >>> such invalid tasks causes a long wait, so genuinely valid tasks are
> >>> scanned later.
> >>>
> >>> After applying this patch, when the memory is either SCAN_PMD_MAPPED or
> >>> SCAN_NO_PTE_TABLE, just skip it, as follows:
> >>>
> >>> @scan_pmd_status[6]: 2
> >>> @scan_pmd_status[3]: 147
> >>> @scan_pmd_status[2]: 173
> >>> total progress size: 45 MB
> >>> Total time : 20 seconds
> >>>
> >>> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> >>> ---
> >>> include/linux/xarray.h | 9 ++++++++
> >>> mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
> >>> 2 files changed, 47 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> >>> index be850174e802..f77d97d7b957 100644
> >>> --- a/include/linux/xarray.h
> >>> +++ b/include/linux/xarray.h
> >>> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
> >>> xas->xa_node = XAS_RESTART;
> >>> }
> >>>
> >>> +/**
> >>> + * xas_get_index() - Get the current index of the XArray operation state.
> >>> + * @xas: XArray operation state.
> >>> + */
> >>> +static inline unsigned long xas_get_index(struct xa_state *xas)
> >>> +{
> >>> + return xas->xa_index;
> >>> +}
> >>> +
> >>> /**
> >>> * xas_advance() - Skip over sibling entries.
> >>> * @xas: XArray operation state.
> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>> index 6f0f05148765..de95029e3763 100644
> >>> --- a/mm/khugepaged.c
> >>> +++ b/mm/khugepaged.c
> >>> @@ -68,7 +68,10 @@ enum scan_result {
> >>> static struct task_struct *khugepaged_thread __read_mostly;
> >>> static DEFINE_MUTEX(khugepaged_mutex);
> >>>
> >>> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
> >>> +/*
> >>> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> >>> + * every 10 seconds.
> >>> + */
> >>> static unsigned int khugepaged_pages_to_scan __read_mostly;
> >>> static unsigned int khugepaged_pages_collapsed;
> >>> static unsigned int khugepaged_full_scans;
> >>> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> >>> }
> >>>
> >>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> >>> + struct vm_area_struct *vma, unsigned long start_addr,
> >>> + bool *mmap_locked, unsigned int *cur_progress,
> >>> struct collapse_control *cc)
> >>> {
> >>> pmd_t *pmd;
> >>> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>
> >>> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
> >>>
> >>> + if (cur_progress)
> >>> + *cur_progress += 1;
> >> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
> >> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
> >> idea as to why on function entry we do a +1 right away. Please do add some comments too.
> > If this way is not clear enough, we can directly add 1 in
> > find_pmd_or_thp_or_none() etc, BUT it's a bit redundant.
> > Please take a look at which one is better.
> >
> > case 1:
> > as in the V4 PATCH #2 [1] and #3 [2], only in hpage_collapse_scan_pmd().
> > [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
> > [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
> >
> > static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma, unsigned long start_addr,
> > bool *mmap_locked, unsigned int *cur_progress,
> > struct collapse_control *cc)
> > {
> > ...
> > result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> > if (result != SCAN_SUCCEED) {
> > if (cur_progress)
> > *cur_progress += 1; // here
> > goto out;
> > }
> > ...
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > if (cur_progress)
> > *cur_progress += 1; // here
> > result = SCAN_NO_PTE_TABLE;
> > goto out;
> > }
> >
> > for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > _pte++, addr += PAGE_SIZE) {
> > if (cur_progress)
> > *cur_progress += 1; // here
> > ...
> > }
> > }
> >
> > case 2:
> >
> > static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma, unsigned long start_addr,
> > bool *mmap_locked, unsigned int *cur_progress,
> > struct collapse_control *cc)
> > {
> > ...
> > result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> > if (result != SCAN_SUCCEED) {
> > if (cur_progress)
> > *cur_progress += 1; // here
>
> Let us be more explicit and set this equal to 1, instead of adding 1.
LGTM, I will do it in the next version. Thanks!
> > goto out;
> > }
> > ...
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > if (cur_progress)
> > *cur_progress += 1; // here
>
> Same comment as above.
>
> > result = SCAN_NO_PTE_TABLE;
> > goto out;
> > }
> >
> > for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > _pte++, addr += PAGE_SIZE) {
> > ...
> > }
> > ...
> > out_unmap:
> > if (cur_progress) {
> > if (_pte >= pte + HPAGE_PMD_NR)
> > *cur_progress += HPAGE_PMD_NR; // here
> > else
> > *cur_progress += _pte - pte + 1; // here
> > }
> > }
>
> I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
> branch will be checked on each iteration - and I don't think the compiler
> can optimize this, since the body of the loop is complex, so the check
> cannot be hoisted out of the loop.
>
>
> >
> > case 3:
> > the current patch, with more comments added to make it clearer.
> >
> >>> +
> >>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >>> if (result != SCAN_SUCCEED)
> >>> goto out;
> >>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>> result = SCAN_SUCCEED;
> >>> }
> >>> out_unmap:
> >>> + if (cur_progress) {
> >>> + if (_pte >= pte + HPAGE_PMD_NR)
> >>> + *cur_progress += HPAGE_PMD_NR - 1;
> >>> + else
> >>> + *cur_progress += _pte - pte;
> >>> + }
> >>> pte_unmap_unlock(pte, ptl);
> >>> if (result == SCAN_SUCCEED) {
> >>> result = collapse_huge_page(mm, start_addr, referenced,
> >>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
> >>> return result;
> >>> }
> >>>
> >>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> >>> - struct file *file, pgoff_t start, struct collapse_control *cc)
> >>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
> >>> + unsigned long addr, struct file *file, pgoff_t start,
> >>> + unsigned int *cur_progress, struct collapse_control *cc)
> >>> {
> >>> struct folio *folio = NULL;
> >>> struct address_space *mapping = file->f_mapping;
> >>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> >>> cond_resched_rcu();
> >>> }
> >>> }
> >>> + if (cur_progress) {
> >>> + unsigned long idx = xas_get_index(&xas) - start;
> >>> +
> >>> + if (folio == NULL)
> >>> + *cur_progress += HPAGE_PMD_NR;
> >> I think this whole block needs some comments. Can you explain, why you
> >> do a particular increment in each case?
> >>
> >>> + else if (xa_is_value(folio))
> >>> + *cur_progress += idx + (1 << xas_get_order(&xas));
> >>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
> >>> + *cur_progress += idx + 1;
> >>> + else
> >>> + *cur_progress += idx + folio_nr_pages(folio);
> >>> + }
> > The "idx" represents the number of PTEs already scanned when exiting
> > xas_for_each().
> >
> > However, the size of the last valid folio is not counted in "idx" (except
> > when folio == NULL, where "idx" equals HPAGE_PMD_NR), which can be further
> > divided into three cases:
>
> But the PTEs you account for in these three cases are *not* actually
> scanned, right? So we can simply drop these 3 cases.
No, in these three cases the last scanned folio is the one that triggered
the break, so I think we need to add them. Imagine that the first folio we
hit is of HPAGE_PMD_ORDER; then "idx" is equal to 0.
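To make that concrete, a hypothetical sketch (reusing the xas/folio
variables of hpage_collapse_scan_file(); this is not code from the patch):

	/* Suppose the very first entry at index "start" is a PMD-order
	 * folio, and the scan stops on its first iteration: */
	xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
		if (folio_order(folio) == HPAGE_PMD_ORDER)
			break;	/* xas.xa_index is still == start */
		...
	}
	/* here "xas.xa_index - start" is 0, although one entry was
	 * scanned, so progress would not advance at all */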
> >
> > 1. shmem swap entries (xa_is_value), add folio size.
> > 2. the folio is HPAGE_PMD_ORDER, the memory has been collapsed
> > to PMD, so add 1 only.
> > 3. Normal folio, add folio size.
> >
> > Is the version below more readable?
> >
> > if (cur_progress) {
> > *cur_progress += xas.xa_index - start;
> >
> > if (folio) {
> > if (xa_is_value(folio))
> > *cur_progress += 1 << xas_get_order(&xas);
> > else if (folio_order(folio) == HPAGE_PMD_ORDER)
> > *cur_progress += 1;
> > else
> > *cur_progress += folio_nr_pages(folio);
> > }
> > }
>
> Yep, this is unneeded complexity. This looks really ugly and the benefits of
> this are not clear. You can simply do
>
> if (cur_progress)
> *cur_progress = xas.xa_index - start;
>
> >
> >>> rcu_read_unlock();
> >>>
> >>> if (result == SCAN_SUCCEED) {
> >>> @@ -2456,6 +2482,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>
> >>> while (khugepaged_scan.address < hend) {
> >>> bool mmap_locked = true;
> >>> + unsigned int cur_progress = 0;
> >>>
> >>> cond_resched();
> >>> if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> >>> @@ -2472,7 +2499,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>> mmap_read_unlock(mm);
> >>> mmap_locked = false;
> >>> *result = hpage_collapse_scan_file(mm,
> >>> - khugepaged_scan.address, file, pgoff, cc);
> >>> + khugepaged_scan.address, file, pgoff,
> >>> + &cur_progress, cc);
> >>> fput(file);
> >>> if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> >>> mmap_read_lock(mm);
> >>> @@ -2486,7 +2514,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>> }
> >>> } else {
> >>> *result = hpage_collapse_scan_pmd(mm, vma,
> >>> - khugepaged_scan.address, &mmap_locked, cc);
> >>> + khugepaged_scan.address, &mmap_locked,
> >>> + &cur_progress, cc);
> >>> }
> >>>
> >>> if (*result == SCAN_SUCCEED)
> >>> @@ -2494,7 +2523,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>
> >>> /* move to next address */
> >>> khugepaged_scan.address += HPAGE_PMD_SIZE;
> >>> - progress += HPAGE_PMD_NR;
> >>> + progress += cur_progress;
> >>> if (!mmap_locked)
> >>> /*
> >>> * We released mmap_lock so break loop. Note
> >>> @@ -2817,7 +2846,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>> mmap_locked = false;
> >>> *lock_dropped = true;
> >>> result = hpage_collapse_scan_file(mm, addr, file, pgoff,
> >>> - cc);
> >>> + NULL, cc);
> >>>
> >>> if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> >>> mapping_can_writeback(file->f_mapping)) {
> >>> @@ -2832,7 +2861,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>> fput(file);
> >>> } else {
> >>> result = hpage_collapse_scan_pmd(mm, vma, addr,
> >>> - &mmap_locked, cc);
> >>> + &mmap_locked, NULL, cc);
> >>> }
> >>> if (!mmap_locked)
> >>> *lock_dropped = true;
> > --
> > Thanks,
> > Vernon
>
On 29/01/26 1:29 pm, Vernon Yang wrote:
> On Thu, Jan 29, 2026 at 11:05:36AM +0530, Dev Jain wrote:
>> On 28/01/26 8:04 pm, Vernon Yang wrote:
>>> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
>>>> On 23/01/26 1:52 pm, Vernon Yang wrote:
>>>>> From: Vernon Yang <yanglincheng@kylinos.cn>
>>>>>
>>>>> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
>>>>> even if only scanning a single PTE/PMD entry.
>>>>>
>>>>> - When only scanning a single PTE entry, let me provide a detailed
>>>>> example:
>>>>>
>>>>> static int hpage_collapse_scan_pmd()
>>>>> {
>>>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>>>> _pte++, addr += PAGE_SIZE) {
>>>>> pte_t pteval = ptep_get(_pte);
>>>>> ...
>>>>> if (pte_uffd_wp(pteval)) { <-- first scan hit
>>>>> result = SCAN_PTE_UFFD_WP;
>>>>> goto out_unmap;
>>>>> }
>>>>> }
>>>>> }
>>>>>
>>>>> During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
>>>>> directly. In practice, only one PTE is scanned before termination.
>>>>> Here, "progress += 1" reflects the actual number of PTEs scanned,
>>>>> whereas previously it was always "progress += HPAGE_PMD_NR".
>>>>>
>>>>> - When the memory has been collapsed to PMD, let me provide a detailed
>>>>> example:
>>>>>
>>>>> The following data is traced by bpftrace on a desktop system. After
>>>>> the system has been left idle for 10 minutes upon booting, a lot of
>>>>> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
>>>>> by khugepaged.
>>>>>
>>>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>>>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
>>>>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
>>>>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
>>>> Could you elaborate on what [1], [6], etc. and 1, 2, 142, etc. are?
>>> These 1, 6 are values of "enum scan_result"; you can directly refer to
>>> the notes on the right.
>>>
>>> These 1, 2, 142, 178 are the counts of the different "enum scan_result"
>>> values from trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file.
>>>
>>> As an example, SCAN_PMD_MAPPED occurred 142 times during a full scan by
>>> khugepaged.
>> Thanks. Can you please mention this in the patch description? You can
>> simply write it like this:
>>
>> "From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
>> following statuses were observed, with frequency mentioned next to them:
>>
>> SCAN_SUCCEED: 1
>> SCAN_PMD_MAPPED: 142
>> ....."
>>
>> and so on.
> LGTM, I will do it in the next version. Thanks!
>
>>>>> total progress size: 674 MB
>>>>> Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs
>>>>>
>>>>> The khugepaged_scan list saves all tasks that are eligible for collapse
>>>>> into hugepages; as long as a task is not destroyed, khugepaged will not
>>>>> remove it from the khugepaged_scan list. This leads to a phenomenon where
>>>>> a task has already collapsed all of its memory regions into hugepages, yet
>>>>> khugepaged keeps scanning it, which wastes CPU time for no benefit. Due to
>>>>> khugepaged_scan_sleep_millisecs (default 10s), scanning a large number of
>>>>> such invalid tasks causes a long wait, so genuinely valid tasks are
>>>>> scanned later.
>>>>>
>>>>> After applying this patch, when the memory is either SCAN_PMD_MAPPED or
>>>>> SCAN_NO_PTE_TABLE, just skip it, as follows:
>>>>>
>>>>> @scan_pmd_status[6]: 2
>>>>> @scan_pmd_status[3]: 147
>>>>> @scan_pmd_status[2]: 173
>>>>> total progress size: 45 MB
>>>>> Total time : 20 seconds
>>>>>
>>>>> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
>>>>> ---
>>>>> include/linux/xarray.h | 9 ++++++++
>>>>> mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
>>>>> 2 files changed, 47 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
>>>>> index be850174e802..f77d97d7b957 100644
>>>>> --- a/include/linux/xarray.h
>>>>> +++ b/include/linux/xarray.h
>>>>> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
>>>>> xas->xa_node = XAS_RESTART;
>>>>> }
>>>>>
>>>>> +/**
>>>>> + * xas_get_index() - Get the current index of the XArray operation state.
>>>>> + * @xas: XArray operation state.
>>>>> + */
>>>>> +static inline unsigned long xas_get_index(struct xa_state *xas)
>>>>> +{
>>>>> + return xas->xa_index;
>>>>> +}
>>>>> +
>>>>> /**
>>>>> * xas_advance() - Skip over sibling entries.
>>>>> * @xas: XArray operation state.
>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>> index 6f0f05148765..de95029e3763 100644
>>>>> --- a/mm/khugepaged.c
>>>>> +++ b/mm/khugepaged.c
>>>>> @@ -68,7 +68,10 @@ enum scan_result {
>>>>> static struct task_struct *khugepaged_thread __read_mostly;
>>>>> static DEFINE_MUTEX(khugepaged_mutex);
>>>>>
>>>>> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
>>>>> +/*
>>>>> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
>>>>> + * every 10 seconds.
>>>>> + */
>>>>> static unsigned int khugepaged_pages_to_scan __read_mostly;
>>>>> static unsigned int khugepaged_pages_collapsed;
>>>>> static unsigned int khugepaged_full_scans;
>>>>> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>>>>> }
>>>>>
>>>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
>>>>> + struct vm_area_struct *vma, unsigned long start_addr,
>>>>> + bool *mmap_locked, unsigned int *cur_progress,
>>>>> struct collapse_control *cc)
>>>>> {
>>>>> pmd_t *pmd;
>>>>> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>>
>>>>> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>>>>>
>>>>> + if (cur_progress)
>>>>> + *cur_progress += 1;
>>>> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
>>>> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
>>>> idea as to why on function entry we do a +1 right away. Please do add some comments too.
>>> If this way is not clear enough, we can directly add 1 in
>>> find_pmd_or_thp_or_none() etc, BUT it's a bit redundant.
>>> Please take a look at which one is better.
>>>
>>> case 1:
>>> as in the V4 PATCH #2 [1] and #3 [2], only in hpage_collapse_scan_pmd().
>>> [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
>>> [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
>>>
>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>> struct vm_area_struct *vma, unsigned long start_addr,
>>> bool *mmap_locked, unsigned int *cur_progress,
>>> struct collapse_control *cc)
>>> {
>>> ...
>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>> if (result != SCAN_SUCCEED) {
>>> if (cur_progress)
>>> *cur_progress += 1; // here
>>> goto out;
>>> }
>>> ...
>>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>>> if (!pte) {
>>> if (cur_progress)
>>> *cur_progress += 1; // here
>>> result = SCAN_NO_PTE_TABLE;
>>> goto out;
>>> }
>>>
>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>> _pte++, addr += PAGE_SIZE) {
>>> if (cur_progress)
>>> *cur_progress += 1; // here
>>> ...
>>> }
>>> }
>>>
>>> case 2:
>>>
>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>> struct vm_area_struct *vma, unsigned long start_addr,
>>> bool *mmap_locked, unsigned int *cur_progress,
>>> struct collapse_control *cc)
>>> {
>>> ...
>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>> if (result != SCAN_SUCCEED) {
>>> if (cur_progress)
>>> *cur_progress += 1; // here
>> Let us be more explicit and set this equal to 1, instead of adding 1.
> LGTM, I will do it in the next version. Thanks!
>
>>> goto out;
>>> }
>>> ...
>>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>>> if (!pte) {
>>> if (cur_progress)
>>> *cur_progress += 1; // here
>> Same comment as above.
>>
>>> result = SCAN_NO_PTE_TABLE;
>>> goto out;
>>> }
>>>
>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>> _pte++, addr += PAGE_SIZE) {
>>> ...
>>> }
>>> ...
>>> out_unmap:
>>> if (cur_progress) {
>>> if (_pte >= pte + HPAGE_PMD_NR)
>>> *cur_progress += HPAGE_PMD_NR; // here
>>> else
>>> *cur_progress += _pte - pte + 1; // here
>>> }
>>> }
>> I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
>> branch will be checked on each iteration - and I don't think the compiler
>> can optimize this, since the body of the loop is complex, so the check
>> cannot be hoisted out of the loop.
>>
>>
>>> case 3:
>>> the current patch, with more comments added to make it clearer.
>>>
>>>>> +
>>>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>>>> if (result != SCAN_SUCCEED)
>>>>> goto out;
>>>>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>> result = SCAN_SUCCEED;
>>>>> }
>>>>> out_unmap:
>>>>> + if (cur_progress) {
>>>>> + if (_pte >= pte + HPAGE_PMD_NR)
>>>>> + *cur_progress += HPAGE_PMD_NR - 1;
>>>>> + else
>>>>> + *cur_progress += _pte - pte;
>>>>> + }
>>>>> pte_unmap_unlock(pte, ptl);
>>>>> if (result == SCAN_SUCCEED) {
>>>>> result = collapse_huge_page(mm, start_addr, referenced,
>>>>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
>>>>> return result;
>>>>> }
>>>>>
>>>>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>>>>> - struct file *file, pgoff_t start, struct collapse_control *cc)
>>>>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
>>>>> + unsigned long addr, struct file *file, pgoff_t start,
>>>>> + unsigned int *cur_progress, struct collapse_control *cc)
>>>>> {
>>>>> struct folio *folio = NULL;
>>>>> struct address_space *mapping = file->f_mapping;
>>>>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
>>>>> cond_resched_rcu();
>>>>> }
>>>>> }
>>>>> + if (cur_progress) {
>>>>> + unsigned long idx = xas_get_index(&xas) - start;
>>>>> +
>>>>> + if (folio == NULL)
>>>>> + *cur_progress += HPAGE_PMD_NR;
>>>> I think this whole block needs some comments. Can you explain, why you
>>>> do a particular increment in each case?
>>>>
>>>>> + else if (xa_is_value(folio))
>>>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
>>>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>>>> + *cur_progress += idx + 1;
>>>>> + else
>>>>> + *cur_progress += idx + folio_nr_pages(folio);
>>>>> + }
>>> The "idx" represents the number of PTEs already scanned when exiting
>>> xas_for_each().
>>>
>>> However, the size of the last valid folio is not counted in "idx" (except
>>> when folio == NULL, where "idx" equals HPAGE_PMD_NR), which can be further
>>> divided into three cases:
>> But the PTEs you account for in these three cases are *not* actually
>> scanned, right? So we can simply drop these 3 cases.
> No, in these three cases the last scanned folio is the one that triggered
> the break, so I think we need to add them. Imagine that the first folio we
> hit is of HPAGE_PMD_ORDER; then "idx" is equal to 0.
If you hit a large folio and break, you don't consume any extra iterations.
The number of iterations is completely determined by xa_index. This is kind
of analogous to SCAN_PMD_MAPPED - in one "iteration", you decide to stop
scanning, and set progress to 1.
In any case, the problem you describe in the patch description is
completely solved by setting progress to 1 upon failure of
find_pmd_or_thp_or_none(). So anything else you do (like counting
iterations in hpage_collapse_scan_pmd() or hpage_collapse_scan_file()) is
extra, and it is not clear whether it has a practical benefit. The patch
benefits us because there are a lot of SCAN_PMD_MAPPED and SCAN_PMD_NONE.
Will we practically also encounter a large number of SCAN_EXCEED_SWAP_PTE,
PTE_NONE, etc.?
I lean towards keeping the other bits of the patch, if they can be
simplified, because this patch is relatively harmless. Just as you count
the number of iterations in hpage_collapse_scan_pmd(), you can do the same
here using xa_index.
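Concretely, a minimal sketch of that (reusing the cur_progress parameter
from your patch; untested):

	result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
	if (result != SCAN_SUCCEED) {
		/* SCAN_PMD_MAPPED, SCAN_NO_PTE_TABLE etc.: only a single
		 * PMD entry was inspected, so report 1, not HPAGE_PMD_NR */
		if (cur_progress)
			*cur_progress = 1;
		goto out;
	}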
>
>>> 1. shmem swap entries (xa_is_value), add folio size.
>>> 2. the folio is HPAGE_PMD_ORDER, the memory has been collapsed
>>> to PMD, so add 1 only.
>>> 3. Normal folio, add folio size.
>>>
>>> Is the version below more readable?
>>>
>>> if (cur_progress) {
>>> *cur_progress += xas.xa_index - start;
>>>
>>> if (folio) {
>>> if (xa_is_value(folio))
>>> *cur_progress += 1 << xas_get_order(&xas);
>>> else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>> *cur_progress += 1;
>>> else
>>> *cur_progress += folio_nr_pages(folio);
>>> }
>>> }
>> Yep, this is unneeded complexity. This looks really ugly and the benefits of
>> this are not clear. You can simply do
>>
>> if (cur_progress)
>> *cur_progress = xas.xa_index - start;
>>
>>>>> rcu_read_unlock();
>>>>>
>>>>> if (result == SCAN_SUCCEED) {
>>>>> @@ -2456,6 +2482,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
>>>>>
>>>>> while (khugepaged_scan.address < hend) {
>>>>> bool mmap_locked = true;
>>>>> + unsigned int cur_progress = 0;
>>>>>
>>>>> cond_resched();
>>>>> if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
>>>>> @@ -2472,7 +2499,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
>>>>> mmap_read_unlock(mm);
>>>>> mmap_locked = false;
>>>>> *result = hpage_collapse_scan_file(mm,
>>>>> - khugepaged_scan.address, file, pgoff, cc);
>>>>> + khugepaged_scan.address, file, pgoff,
>>>>> + &cur_progress, cc);
>>>>> fput(file);
>>>>> if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
>>>>> mmap_read_lock(mm);
>>>>> @@ -2486,7 +2514,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
>>>>> }
>>>>> } else {
>>>>> *result = hpage_collapse_scan_pmd(mm, vma,
>>>>> - khugepaged_scan.address, &mmap_locked, cc);
>>>>> + khugepaged_scan.address, &mmap_locked,
>>>>> + &cur_progress, cc);
>>>>> }
>>>>>
>>>>> if (*result == SCAN_SUCCEED)
>>>>> @@ -2494,7 +2523,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
>>>>>
>>>>> /* move to next address */
>>>>> khugepaged_scan.address += HPAGE_PMD_SIZE;
>>>>> - progress += HPAGE_PMD_NR;
>>>>> + progress += cur_progress;
>>>>> if (!mmap_locked)
>>>>> /*
>>>>> * We released mmap_lock so break loop. Note
>>>>> @@ -2817,7 +2846,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>>>> mmap_locked = false;
>>>>> *lock_dropped = true;
>>>>> result = hpage_collapse_scan_file(mm, addr, file, pgoff,
>>>>> - cc);
>>>>> + NULL, cc);
>>>>>
>>>>> if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
>>>>> mapping_can_writeback(file->f_mapping)) {
>>>>> @@ -2832,7 +2861,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
>>>>> fput(file);
>>>>> } else {
>>>>> result = hpage_collapse_scan_pmd(mm, vma, addr,
>>>>> - &mmap_locked, cc);
>>>>> + &mmap_locked, NULL, cc);
>>>>> }
>>>>> if (!mmap_locked)
>>>>> *lock_dropped = true;
>>> --
>>> Thanks,
>>> Vernon
On Thu, Jan 29, 2026 at 4:32 PM Dev Jain <dev.jain@arm.com> wrote:
>
> On 29/01/26 1:29 pm, Vernon Yang wrote:
> > On Thu, Jan 29, 2026 at 11:05:36AM +0530, Dev Jain wrote:
> >> On 28/01/26 8:04 pm, Vernon Yang wrote:
> >>> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
> >>>> On 23/01/26 1:52 pm, Vernon Yang wrote:
> >>>>> From: Vernon Yang <yanglincheng@kylinos.cn>
> >>>>>
> >>>>> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
> >>>>> even if only scanning a single PTE/PMD entry.
> >>>>>
> >>>>> - When only scanning a single PTE entry, let me provide a detailed
> >>>>> example:
> >>>>>
> >>>>> static int hpage_collapse_scan_pmd()
> >>>>> {
> >>>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >>>>> _pte++, addr += PAGE_SIZE) {
> >>>>> pte_t pteval = ptep_get(_pte);
> >>>>> ...
> >>>>> if (pte_uffd_wp(pteval)) { <-- first scan hit
> >>>>> result = SCAN_PTE_UFFD_WP;
> >>>>> goto out_unmap;
> >>>>> }
> >>>>> }
> >>>>> }
> >>>>>
> >>>>> During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
> >>>>> directly. In practice, only one PTE is scanned before termination.
> >>>>> Here, "progress += 1" reflects the actual number of PTEs scanned,
> >>>>> whereas previously it was always "progress += HPAGE_PMD_NR".
> >>>>>
> >>>>> - When the memory has been collapsed to PMD, let me provide a detailed
> >>>>> example:
> >>>>>
> >>>>> The following data is traced by bpftrace on a desktop system. After
> >>>>> the system has been left idle for 10 minutes upon booting, a lot of
> >>>>> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
> >>>>> by khugepaged.
> >>>>>
> >>>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >>>>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
> >>>>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
> >>>>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
> >>>> Could you elaborate on what [1], [6], etc. and 1, 2, 142, etc. are?
> >>> These 1, 6 are values of "enum scan_result"; you can directly refer to
> >>> the notes on the right.
> >>>
> >>> These 1, 2, 142, 178 are the counts of the different "enum scan_result"
> >>> values from trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file.
> >>>
> >>> As an example, SCAN_PMD_MAPPED occurred 142 times during a full scan by
> >>> khugepaged.
> >> Thanks. Can you please mention this in the patch description? You can
> >> simply write it like this:
> >>
> >> "From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
> >> following statuses were observed, with frequency mentioned next to them:
> >>
> >> SCAN_SUCCEED: 1
> >> SCAN_PMD_MAPPED: 142
> >> ....."
> >>
> >> and so on.
> > LGTM, I will do it in the next version. Thanks!
> >
> >>>>> total progress size: 674 MB
> >>>>> Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs
> >>>>>
> >>>>> The khugepaged_scan list saves all tasks that are eligible for collapse
> >>>>> into hugepages; as long as a task is not destroyed, khugepaged will not
> >>>>> remove it from the khugepaged_scan list. This leads to a phenomenon where
> >>>>> a task has already collapsed all of its memory regions into hugepages, yet
> >>>>> khugepaged keeps scanning it, which wastes CPU time for no benefit. Due to
> >>>>> khugepaged_scan_sleep_millisecs (default 10s), scanning a large number of
> >>>>> such invalid tasks causes a long wait, so genuinely valid tasks are
> >>>>> scanned later.
> >>>>>
> >>>>> After applying this patch, when the memory is either SCAN_PMD_MAPPED or
> >>>>> SCAN_NO_PTE_TABLE, just skip it, as follows:
> >>>>>
> >>>>> @scan_pmd_status[6]: 2
> >>>>> @scan_pmd_status[3]: 147
> >>>>> @scan_pmd_status[2]: 173
> >>>>> total progress size: 45 MB
> >>>>> Total time : 20 seconds
> >>>>>
> >>>>> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> >>>>> ---
> >>>>> include/linux/xarray.h | 9 ++++++++
> >>>>> mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
> >>>>> 2 files changed, 47 insertions(+), 9 deletions(-)
> >>>>>
> >>>>> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> >>>>> index be850174e802..f77d97d7b957 100644
> >>>>> --- a/include/linux/xarray.h
> >>>>> +++ b/include/linux/xarray.h
> >>>>> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
> >>>>> xas->xa_node = XAS_RESTART;
> >>>>> }
> >>>>>
> >>>>> +/**
> >>>>> + * xas_get_index() - Get the current index of the XArray operation state.
> >>>>> + * @xas: XArray operation state.
> >>>>> + */
> >>>>> +static inline unsigned long xas_get_index(struct xa_state *xas)
> >>>>> +{
> >>>>> + return xas->xa_index;
> >>>>> +}
> >>>>> +
> >>>>> /**
> >>>>> * xas_advance() - Skip over sibling entries.
> >>>>> * @xas: XArray operation state.
> >>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>>>> index 6f0f05148765..de95029e3763 100644
> >>>>> --- a/mm/khugepaged.c
> >>>>> +++ b/mm/khugepaged.c
> >>>>> @@ -68,7 +68,10 @@ enum scan_result {
> >>>>> static struct task_struct *khugepaged_thread __read_mostly;
> >>>>> static DEFINE_MUTEX(khugepaged_mutex);
> >>>>>
> >>>>> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
> >>>>> +/*
> >>>>> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> >>>>> + * every 10 seconds.
> >>>>> + */
> >>>>> static unsigned int khugepaged_pages_to_scan __read_mostly;
> >>>>> static unsigned int khugepaged_pages_collapsed;
> >>>>> static unsigned int khugepaged_full_scans;
> >>>>> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> >>>>> }
> >>>>>
> >>>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>>> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> >>>>> + struct vm_area_struct *vma, unsigned long start_addr,
> >>>>> + bool *mmap_locked, unsigned int *cur_progress,
> >>>>> struct collapse_control *cc)
> >>>>> {
> >>>>> pmd_t *pmd;
> >>>>> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>>>
> >>>>> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
> >>>>>
> >>>>> + if (cur_progress)
> >>>>> + *cur_progress += 1;
> >>>> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
> >>>> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
> >>>> idea as to why on function entry we do a +1 right away. Please do add some comments too.
> >>> If this way is not clear enough, we can directly add 1 in
> >>> find_pmd_or_thp_or_none() etc, BUT it's a bit redundant.
> >>> Please take a look at which one is better.
> >>>
> >>> case 1:
> >>> as in the V4 PATCH #2 [1] and #3 [2], only in hpage_collapse_scan_pmd().
> >>> [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
> >>> [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
> >>>
> >>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>> struct vm_area_struct *vma, unsigned long start_addr,
> >>> bool *mmap_locked, unsigned int *cur_progress,
> >>> struct collapse_control *cc)
> >>> {
> >>> ...
> >>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >>> if (result != SCAN_SUCCEED) {
> >>> if (cur_progress)
> >>> *cur_progress += 1; // here
> >>> goto out;
> >>> }
> >>> ...
> >>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> >>> if (!pte) {
> >>> if (cur_progress)
> >>> *cur_progress += 1; // here
> >>> result = SCAN_NO_PTE_TABLE;
> >>> goto out;
> >>> }
> >>>
> >>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >>> _pte++, addr += PAGE_SIZE) {
> >>> if (cur_progress)
> >>> *cur_progress += 1; // here
> >>> ...
> >>> }
> >>> }
> >>>
> >>> case 2:
> >>>
> >>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>> struct vm_area_struct *vma, unsigned long start_addr,
> >>> bool *mmap_locked, unsigned int *cur_progress,
> >>> struct collapse_control *cc)
> >>> {
> >>> ...
> >>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >>> if (result != SCAN_SUCCEED) {
> >>> if (cur_progress)
> >>> *cur_progress += 1; // here
> >> Let us be more explicit and set this equal to 1, instead of adding 1.
> > LGTM, I will do it in the next version. Thanks!
> >
> >>> goto out;
> >>> }
> >>> ...
> >>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> >>> if (!pte) {
> >>> if (cur_progress)
> >>> *cur_progress += 1; // here
> >> Same comment as above.
> >>
> >>> result = SCAN_NO_PTE_TABLE;
> >>> goto out;
> >>> }
> >>>
> >>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >>> _pte++, addr += PAGE_SIZE) {
> >>> ...
> >>> }
> >>> ...
> >>> out_unmap:
> >>> if (cur_progress) {
> >>> if (_pte >= pte + HPAGE_PMD_NR)
> >>> *cur_progress += HPAGE_PMD_NR; // here
> >>> else
> >>> *cur_progress += _pte - pte + 1; // here
> >>> }
> >>> }
> >> I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
> >> branch will be checked on each iteration - and I don't think the compiler
> >> can optimize this, since the body of the loop is complex, so the check
> >> cannot be hoisted out of the loop.
> >>
> >>
> >>> case 3:
> >>> the current patch, with more comments added to make it clearer.
> >>>
> >>>>> +
> >>>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> >>>>> if (result != SCAN_SUCCEED)
> >>>>> goto out;
> >>>>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >>>>> result = SCAN_SUCCEED;
> >>>>> }
> >>>>> out_unmap:
> >>>>> + if (cur_progress) {
> >>>>> + if (_pte >= pte + HPAGE_PMD_NR)
> >>>>> + *cur_progress += HPAGE_PMD_NR - 1;
> >>>>> + else
> >>>>> + *cur_progress += _pte - pte;
> >>>>> + }
> >>>>> pte_unmap_unlock(pte, ptl);
> >>>>> if (result == SCAN_SUCCEED) {
> >>>>> result = collapse_huge_page(mm, start_addr, referenced,
> >>>>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
> >>>>> return result;
> >>>>> }
> >>>>>
> >>>>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> >>>>> - struct file *file, pgoff_t start, struct collapse_control *cc)
> >>>>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
> >>>>> + unsigned long addr, struct file *file, pgoff_t start,
> >>>>> + unsigned int *cur_progress, struct collapse_control *cc)
> >>>>> {
> >>>>> struct folio *folio = NULL;
> >>>>> struct address_space *mapping = file->f_mapping;
> >>>>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
> >>>>> cond_resched_rcu();
> >>>>> }
> >>>>> }
> >>>>> + if (cur_progress) {
> >>>>> + unsigned long idx = xas_get_index(&xas) - start;
> >>>>> +
> >>>>> + if (folio == NULL)
> >>>>> + *cur_progress += HPAGE_PMD_NR;
> >>>> I think this whole block needs some comments. Can you explain, why you
> >>>> do a particular increment in each case?
> >>>>
> >>>>> + else if (xa_is_value(folio))
> >>>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
> >>>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
> >>>>> + *cur_progress += idx + 1;
> >>>>> + else
> >>>>> + *cur_progress += idx + folio_nr_pages(folio);
> >>>>> + }
> >>> The "idx" represents the number of PTEs already scanned when exiting
> >>> xas_for_each().
> >>>
> >>> However, the size of the last valid folio is not counted in "idx" (except
> >>> when folio == NULL, where "idx" equals HPAGE_PMD_NR), which can be further
> >>> divided into three cases:
> >> But the PTEs you account for in these three cases are *not* actually
> >> scanned, right? So we can simply drop these 3 cases.
> > No, in these three cases the last scanned folio is the one that triggered
> > the break, so I think we need to add them. Imagine that the first folio we
> > hit is of HPAGE_PMD_ORDER; then "idx" is equal to 0.
>
> If you hit a large folio and break, you don't consume any extra iterations.
> The number of iterations is completely determined by xa_index. This is kind
> of analogous to SCAN_PMD_MAPPED - in one "iteration", you decide to stop
> scanning, and set progress to 1.
>
> In any case, the problem you describe in the patch description is
> completely solved by setting progress to 1 upon failure of
> find_pmd_or_thp_or_none(). So anything else you do (like counting
> iterations in hpage_collapse_scan_pmd() or hpage_collapse_scan_file()) is
> extra, and it is not clear whether it has a practical benefit. The patch
> benefits us because there are a lot of SCAN_PMD_MAPPED and SCAN_PMD_NONE.
> Will we practically also encounter a large number of SCAN_EXCEED_SWAP_PTE,
> PTE_NONE, etc.?
>
> I lean towards keeping the other bits of the patch, if they can be
> simplified, because this patch is relatively harmless. Just as you count
> the number of iterations in hpage_collapse_scan_pmd(), you can do the same
> here using xa_index.
The semantics of _pte in hpage_collapse_scan_pmd() and of xas.xa_index in
hpage_collapse_scan_file() are different.
Let me give a detailed example :)
xas->xa_index represents the starting index of the last folio visited when
exiting xas_for_each().
Assuming each folio is 32KB, when we iterate "folio1 -> folio2" and break,
xas->xa_index equals the starting index of folio2, so "idx = 8". However,
folio2 itself is not counted in "idx".
In reality, folio2 has been scanned, so we need to add the size of folio2,
making "idx = 16" the correct value.
There are two possible cases for folio2:
1. shmem swap entries (xa_is_value), breaking out of the xas_for_each loop
due to SCAN_EXCEED_SWAP_PTE.
2. A normal folio, breaking out of the xas_for_each loop due to
SCAN_SCAN_ABORT/SCAN_PAGE_LRU/SCAN_PAGE_COUNT.
Moving Lance's suggestion here:
> if (cur_progress)
> *cur_progress = max(xas.xa_index - start, 1UL);
If we use the code suggested by Lance, we will miss the size of folio2,
and the result is "idx = 8".
Is this the result we want? If yes, Lance's suggested code is perfect.
Another more specific scenario is when the first iterated folio is of
HPAGE_PMD_ORDER, giving "idx = 0". Yep, there we can directly set
"cur_progress = 1"; it's simple enough.
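Putting the walk-through above into comment form (hypothetical 32KB, i.e.
8-page, folios; the numbers are illustrative, not traced from a real run):

	/*
	 * start == 0, folio1 and folio2 are 8 pages each:
	 *
	 *   iterating folio1 consumes indices 0..7,
	 *   folio2 at index 8 triggers the break (e.g. SCAN_PAGE_LRU),
	 *   so xas.xa_index == 8 on exit.
	 *
	 *   xas.xa_index - start         == 8   <- misses folio2
	 *   ... + folio_nr_pages(folio2) == 16  <- what was really scanned
	 */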
> >
> >>> 1. shmem swap entries (xa_is_value), add folio size.
> >>> 2. the folio is HPAGE_PMD_ORDER, the memory has been collapsed
> >>> to PMD, so add 1 only.
> >>> 3. Normal folio, add folio size.
> >>>
> >>> Is the version below more readable?
> >>>
> >>> if (cur_progress) {
> >>> *cur_progress += xas.xa_index - start;
> >>>
> >>> if (folio) {
> >>> if (xa_is_value(folio))
> >>> *cur_progress += 1 << xas_get_order(&xas);
> >>> else if (folio_order(folio) == HPAGE_PMD_ORDER)
> >>> *cur_progress += 1;
> >>> else
> >>> *cur_progress += folio_nr_pages(folio);
> >>> }
> >>> }
> >> Yep, this is unneeded complexity. This looks really ugly and the benefits of
> >> this are not clear. You can simply do
> >>
> >> if (cur_progress)
> >> *cur_progress = xas.xa_index - start;
> >>
> >>>>> rcu_read_unlock();
> >>>>>
> >>>>> if (result == SCAN_SUCCEED) {
> >>>>> @@ -2456,6 +2482,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>>>
> >>>>> while (khugepaged_scan.address < hend) {
> >>>>> bool mmap_locked = true;
> >>>>> + unsigned int cur_progress = 0;
> >>>>>
> >>>>> cond_resched();
> >>>>> if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> >>>>> @@ -2472,7 +2499,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>>> mmap_read_unlock(mm);
> >>>>> mmap_locked = false;
> >>>>> *result = hpage_collapse_scan_file(mm,
> >>>>> - khugepaged_scan.address, file, pgoff, cc);
> >>>>> + khugepaged_scan.address, file, pgoff,
> >>>>> + &cur_progress, cc);
> >>>>> fput(file);
> >>>>> if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> >>>>> mmap_read_lock(mm);
> >>>>> @@ -2486,7 +2514,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>>> }
> >>>>> } else {
> >>>>> *result = hpage_collapse_scan_pmd(mm, vma,
> >>>>> - khugepaged_scan.address, &mmap_locked, cc);
> >>>>> + khugepaged_scan.address, &mmap_locked,
> >>>>> + &cur_progress, cc);
> >>>>> }
> >>>>>
> >>>>> if (*result == SCAN_SUCCEED)
> >>>>> @@ -2494,7 +2523,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result
> >>>>>
> >>>>> /* move to next address */
> >>>>> khugepaged_scan.address += HPAGE_PMD_SIZE;
> >>>>> - progress += HPAGE_PMD_NR;
> >>>>> + progress += cur_progress;
> >>>>> if (!mmap_locked)
> >>>>> /*
> >>>>> * We released mmap_lock so break loop. Note
> >>>>> @@ -2817,7 +2846,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>>>> mmap_locked = false;
> >>>>> *lock_dropped = true;
> >>>>> result = hpage_collapse_scan_file(mm, addr, file, pgoff,
> >>>>> - cc);
> >>>>> + NULL, cc);
> >>>>>
> >>>>> if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
> >>>>> mapping_can_writeback(file->f_mapping)) {
> >>>>> @@ -2832,7 +2861,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> >>>>> fput(file);
> >>>>> } else {
> >>>>> result = hpage_collapse_scan_pmd(mm, vma, addr,
> >>>>> - &mmap_locked, cc);
> >>>>> + &mmap_locked, NULL, cc);
> >>>>> }
> >>>>> if (!mmap_locked)
> >>>>> *lock_dropped = true;
> >>> --
> >>> Thanks,
> >>> Vernon
On 29/01/26 5:54 pm, Vernon Yang wrote:
> On Thu, Jan 29, 2026 at 4:32 PM Dev Jain <dev.jain@arm.com> wrote:
>> On 29/01/26 1:29 pm, Vernon Yang wrote:
>>> On Thu, Jan 29, 2026 at 11:05:36AM +0530, Dev Jain wrote:
>>>> On 28/01/26 8:04 pm, Vernon Yang wrote:
>>>>> On Wed, Jan 28, 2026 at 01:59:33PM +0530, Dev Jain wrote:
>>>>>> On 23/01/26 1:52 pm, Vernon Yang wrote:
>>>>>>> From: Vernon Yang <yanglincheng@kylinos.cn>
>>>>>>>
>>>>>>> Currently, each scan always increases "progress" by HPAGE_PMD_NR,
>>>>>>> even if only scanning a single PTE/PMD entry.
>>>>>>>
>>>>>>> - When only scanning a single PTE entry, let me provide a detailed
>>>>>>> example:
>>>>>>>
>>>>>>> static int hpage_collapse_scan_pmd()
>>>>>>> {
>>>>>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>>>>>> _pte++, addr += PAGE_SIZE) {
>>>>>>> pte_t pteval = ptep_get(_pte);
>>>>>>> ...
>>>>>>> if (pte_uffd_wp(pteval)) { <-- first scan hit
>>>>>>> result = SCAN_PTE_UFFD_WP;
>>>>>>> goto out_unmap;
>>>>>>> }
>>>>>>> }
>>>>>>> }
>>>>>>>
>>>>>>> During the first scan, if pte_uffd_wp(pteval) is true, the loop exits
>>>>>>> directly. In practice, only one PTE is scanned before termination.
>>>>>>> Here, "progress += 1" reflects the actual number of PTEs scanned,
>>>>>>> whereas previously it was always "progress += HPAGE_PMD_NR".
>>>>>>>
>>>>>>> - When the memory has been collapsed to PMD, let me provide a detailed
>>>>>>> example:
>>>>>>>
>>>>>>> The following data is traced by bpftrace on a desktop system. After
>>>>>>> the system has been left idle for 10 minutes upon booting, a lot of
>>>>>>> SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan
>>>>>>> by khugepaged.
>>>>>>>
>>>>>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>>>>>> @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE
>>>>>>> @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED
>>>>>>> @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE
>>>>>> Could you elaborate on what [1], [6], etc. and 1, 2, 142, etc. are?
>>>>> These 1, 6 are values of "enum scan_result"; you can directly refer to
>>>>> the notes on the right.
>>>>>
>>>>> These 1, 2, 142, 178 are the counts of the different "enum scan_result"
>>>>> values from trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file.
>>>>>
>>>>> As an example, SCAN_PMD_MAPPED occurred 142 times during a full scan by
>>>>> khugepaged.
>>>> Thanks. Can you please mention this in the patch description? You can
>>>> simply write it like this:
>>>>
>>>> "From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the
>>>> following statuses were observed, with frequency mentioned next to them:
>>>>
>>>> SCAN_SUCCEED: 1
>>>> SCAN_PMD_MAPPED: 142
>>>> ....."
>>>>
>>>> and so on.
>>> LGTM, I will do it in the next version. Thanks!
>>>
>>>>>>> total progress size: 674 MB
>>>>>>> Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs
>>>>>>>
>>>>>>> The khugepaged_scan list saves all tasks that are eligible for collapse
>>>>>>> into hugepages; as long as a task is not destroyed, khugepaged will not
>>>>>>> remove it from the khugepaged_scan list. This leads to a phenomenon where
>>>>>>> a task has already collapsed all of its memory regions into hugepages, yet
>>>>>>> khugepaged keeps scanning it, which wastes CPU time for no benefit. Due to
>>>>>>> khugepaged_scan_sleep_millisecs (default 10s), scanning a large number of
>>>>>>> such invalid tasks causes a long wait, so genuinely valid tasks are
>>>>>>> scanned later.
>>>>>>>
>>>>>>> After applying this patch, when the memory is either SCAN_PMD_MAPPED or
>>>>>>> SCAN_NO_PTE_TABLE, just skip it, as follows:
>>>>>>>
>>>>>>> @scan_pmd_status[6]: 2
>>>>>>> @scan_pmd_status[3]: 147
>>>>>>> @scan_pmd_status[2]: 173
>>>>>>> total progress size: 45 MB
>>>>>>> Total time : 20 seconds
>>>>>>>
>>>>>>> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
>>>>>>> ---
>>>>>>> include/linux/xarray.h | 9 ++++++++
>>>>>>> mm/khugepaged.c | 47 ++++++++++++++++++++++++++++++++++--------
>>>>>>> 2 files changed, 47 insertions(+), 9 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
>>>>>>> index be850174e802..f77d97d7b957 100644
>>>>>>> --- a/include/linux/xarray.h
>>>>>>> +++ b/include/linux/xarray.h
>>>>>>> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
>>>>>>> xas->xa_node = XAS_RESTART;
>>>>>>> }
>>>>>>>
>>>>>>> +/**
>>>>>>> + * xas_get_index() - Get XArray operation state for a different index.
>>>>>>> + * @xas: XArray operation state.
>>>>>>> + */
>>>>>>> +static inline unsigned long xas_get_index(struct xa_state *xas)
>>>>>>> +{
>>>>>>> + return xas->xa_index;
>>>>>>> +}
>>>>>>> +
>>>>>>> /**
>>>>>>> * xas_advance() - Skip over sibling entries.
>>>>>>> * @xas: XArray operation state.
>>>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>>>> index 6f0f05148765..de95029e3763 100644
>>>>>>> --- a/mm/khugepaged.c
>>>>>>> +++ b/mm/khugepaged.c
>>>>>>> @@ -68,7 +68,10 @@ enum scan_result {
>>>>>>> static struct task_struct *khugepaged_thread __read_mostly;
>>>>>>> static DEFINE_MUTEX(khugepaged_mutex);
>>>>>>>
>>>>>>> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
>>>>>>> +/*
>>>>>>> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
>>>>>>> + * every 10 second.
>>>>>>> + */
>>>>>>> static unsigned int khugepaged_pages_to_scan __read_mostly;
>>>>>>> static unsigned int khugepaged_pages_collapsed;
>>>>>>> static unsigned int khugepaged_full_scans;
>>>>>>> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>>>>>>> }
>>>>>>>
>>>>>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>>>> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
>>>>>>> + struct vm_area_struct *vma, unsigned long start_addr,
>>>>>>> + bool *mmap_locked, unsigned int *cur_progress,
>>>>>>> struct collapse_control *cc)
>>>>>>> {
>>>>>>> pmd_t *pmd;
>>>>>>> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>>>>
>>>>>>> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>>>>>>>
>>>>>>> + if (cur_progress)
>>>>>>> + *cur_progress += 1;
>>>>>> Why not be a little more explicit, and do this addition if find_pmd_or_thp_or_none fails,
>>>>>> or pte_offset_map_lock fails? The way you do it right now is not readable - it gives no
>>>>>> idea as to why on function entry we do a +1 right away. Please do add some comments too.
>>>>> If this way is not clear enough, we can add 1 directly in
>>>>> find_pmd_or_thp_or_none() etc., but that is a bit redundant.
>>>>> Please take a look at which one is better.
>>>>>
>>>>> case 1:
>>>>> as in the V4 PATCH #2 [1] and #3 [2]; showing only hpage_collapse_scan_pmd().
>>>>> [1] https://lore.kernel.org/linux-mm/20260111121909.8410-3-yanglincheng@kylinos.cn
>>>>> [2] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
>>>>>
>>>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>> struct vm_area_struct *vma, unsigned long start_addr,
>>>>> bool *mmap_locked, unsigned int *cur_progress,
>>>>> struct collapse_control *cc)
>>>>> {
>>>>> ...
>>>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>>>> if (result != SCAN_SUCCEED) {
>>>>> if (cur_progress)
>>>>> *cur_progress += 1; // here
>>>>> goto out;
>>>>> }
>>>>> ...
>>>>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>>>>> if (!pte) {
>>>>> if (cur_progress)
>>>>> *cur_progress += 1; // here
>>>>> result = SCAN_NO_PTE_TABLE;
>>>>> goto out;
>>>>> }
>>>>>
>>>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>>>> _pte++, addr += PAGE_SIZE) {
>>>>> if (cur_progress)
>>>>> *cur_progress += 1; // here
>>>>> ...
>>>>> }
>>>>> }
>>>>>
>>>>> case 2:
>>>>>
>>>>> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>> struct vm_area_struct *vma, unsigned long start_addr,
>>>>> bool *mmap_locked, unsigned int *cur_progress,
>>>>> struct collapse_control *cc)
>>>>> {
>>>>> ...
>>>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>>>> if (result != SCAN_SUCCEED) {
>>>>> if (cur_progress)
>>>>> *cur_progress += 1; // here
>>>> Let us be more explicit and set this equal to 1, instead of adding 1.
>>> LGTM, I will do it in the next version. Thanks!
>>>
>>>>> goto out;
>>>>> }
>>>>> ...
>>>>> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
>>>>> if (!pte) {
>>>>> if (cur_progress)
>>>>> *cur_progress += 1; // here
>>>> Same comment as above.
>>>>
>>>>> result = SCAN_NO_PTE_TABLE;
>>>>> goto out;
>>>>> }
>>>>>
>>>>> for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>>>> _pte++, addr += PAGE_SIZE) {
>>>>> ...
>>>>> }
>>>>> ...
>>>>> out_unmap:
>>>>> if (cur_progress) {
>>>>> if (_pte >= pte + HPAGE_PMD_NR)
>>>>> *cur_progress += HPAGE_PMD_NR; // here
>>>>> else
>>>>> *cur_progress += _pte - pte + 1; // here
>>>>> }
>>>>> }
>>>> I vote for case 2. In case 1 I don't like the fact that the if (cur_progress)
>>>> branch will be checked each iteration - and I don't think the compiler can
>>>> optimize this since the body of the loop is complex, so this check cannot
>>>> be hoisted out of the loop.
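
For illustration, a minimal sketch of the two patterns (simplified, not the
real loop body):

	/* case 1: the branch sits inside a complex loop body */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		if (cur_progress)	/* evaluated on every iteration */
			*cur_progress += 1;
		/* ... complex per-PTE scan work ... */
	}

	/* case 2: account once, when the loop exits or breaks */
	if (cur_progress)
		*cur_progress += _pte - pte;	/* PTEs actually visited */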
>>>>
>>>>
>>>>> case 3:
>>>>> keep the current patch, and add more comments to make it clearer.
>>>>>
>>>>>>> +
>>>>>>> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
>>>>>>> if (result != SCAN_SUCCEED)
>>>>>>> goto out;
>>>>>>> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>>>>>>> result = SCAN_SUCCEED;
>>>>>>> }
>>>>>>> out_unmap:
>>>>>>> + if (cur_progress) {
>>>>>>> + if (_pte >= pte + HPAGE_PMD_NR)
>>>>>>> + *cur_progress += HPAGE_PMD_NR - 1;
>>>>>>> + else
>>>>>>> + *cur_progress += _pte - pte;
>>>>>>> + }
>>>>>>> pte_unmap_unlock(pte, ptl);
>>>>>>> if (result == SCAN_SUCCEED) {
>>>>>>> result = collapse_huge_page(mm, start_addr, referenced,
>>>>>>> @@ -2286,8 +2299,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
>>>>>>> return result;
>>>>>>> }
>>>>>>>
>>>>>>> -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>>>>>>> - struct file *file, pgoff_t start, struct collapse_control *cc)
>>>>>>> +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm,
>>>>>>> + unsigned long addr, struct file *file, pgoff_t start,
>>>>>>> + unsigned int *cur_progress, struct collapse_control *cc)
>>>>>>> {
>>>>>>> struct folio *folio = NULL;
>>>>>>> struct address_space *mapping = file->f_mapping;
>>>>>>> @@ -2376,6 +2390,18 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned
>>>>>>> cond_resched_rcu();
>>>>>>> }
>>>>>>> }
>>>>>>> + if (cur_progress) {
>>>>>>> + unsigned long idx = xas_get_index(&xas) - start;
>>>>>>> +
>>>>>>> + if (folio == NULL)
>>>>>>> + *cur_progress += HPAGE_PMD_NR;
>>>>>> I think this whole block needs some comments. Can you explain why you
>>>>>> do a particular increment in each case?
>>>>>>
>>>>>>> + else if (xa_is_value(folio))
>>>>>>> + *cur_progress += idx + (1 << xas_get_order(&xas));
>>>>>>> + else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>>>>>> + *cur_progress += idx + 1;
>>>>>>> + else
>>>>>>> + *cur_progress += idx + folio_nr_pages(folio);
>>>>>>> + }
>>>>> The "idx" represent PTEs number already scanned when exiting
>>>>> xas_for_each().
>>>>>
>>>>> However, the size of the last valid folio is not counted in "idx"
>>>>> (except when folio == NULL, where "idx" equals HPAGE_PMD_NR), which can
>>>>> be further divided into three cases:
>>>> But the PTEs you account for in these three cases are *not* actually
>>>> scanned, right? So we can simply drop these 3 cases.
>>> No, these three cases cover the last folio scanned before the break, so I
>>> think we need to add them. Imagine that the first folio we hit is of
>>> HPAGE_PMD_ORDER; then "idx" is equal to 0.
>> If you hit a large folio and break, you don't consume any extra iterations.
>> The number of iterations is completely determined by xa_index. This is kind
>> of analogous to SCAN_PMD_MAPPED - in one "iteration", you decide to stop
>> scanning, and set progress to 1.
>>
>> In any case, the problem which you describe in the patch description is
>> completely solved by setting progress to 1 upon failure of find_pmd_or_thp_or_none.
>> So anything else which you do (like counting iterations in hpage_collapse_scan_pmd
>> or hpage_collapse_scan_file) is extra and it is not clear if it has a practical
>> benefit. The patch benefits us because there are a lot of SCAN_PMD_MAPPED and SCAN_PMD_NONE.
>> Will we practically also encounter a large number of SCAN_EXCEED_PTE_SWAP, PTE_NONE, etc?
>>
>> I tilt towards keeping the other bits of the patch, if they can be simplified, and
>> because this patch is relatively harmless. Just like you count the number of iterations
>> in hpage_collapse_scan_pmd(), you can do the same here using xa_index.
> The semantics of _pte in hpage_collapse_scan_pmd() and of xas.xa_index in
> hpage_collapse_scan_file() are different.
>
> Let me give a detailed example :)
>
> xas->xa_index holds the starting index of the last folio when exiting
> xas_for_each().
>
> Assuming each folio is 32KB, when we iterate "folio1 -> folio2" and break,
> xas->xa_index equals the starting index of folio2, so "idx = 8". However,
> folio2 is not counted in "idx".
>
> In reality, folio2 has been scanned, so we need to add folio2's size,
> making "idx = 16" the correct value.
The "scanning" of folio2 is O(1). It is *not* proportional to the number of ptes
covered by folio2 (which is 8), as we break immediately.
As I said above, you will be contradicting the patch's objective here - if you
see a PMD_ORDER folio and set progress = HPAGE_PMD_NR, that exactly corresponds
to seeing a PMD mapping in the anon collapse path, and setting progress = HPAGE_PMD_NR.
But that is not appropriate - deducing that the PMD maps a PMD folio is an
O(1) operation w.r.t. the number of PTEs.
>
> There are two possibilities for folio2:
> 1. shmem swap entries (xa_is_value), breaking out of the xas_for_each loop
> due to SCAN_EXCEED_SWAP_PTE.
> 2. Normal folio, breaking out of the xas_for_each loop due to
> SCAN_SCAN_ABORT/SCAN_PAGE_LRU/SCAN_PAGE_COUNT.
>
> Moving Lance's suggestion here:
>> if (cur_progress)
>> *cur_progress = max(xas.xa_index - start, 1UL);
> If we use the code suggested by Lance, we will miss the folio2 size, and
> the result is "idx = 8".
> Is this the result we want? If yes, Lance's suggested code is perfect.
>
> Another, more specific scenario is when the first iterated folio is of
> HPAGE_PMD_ORDER, giving "idx = 0". Yep, we can directly set
> "cur_progress = 1"; it's simple enough.
Yes, let us do that.
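
i.e. something along these lines for the file path (a sketch only, assuming
the cur_progress parameter introduced by this patch; untested):

	/* after the xas_for_each() loop in hpage_collapse_scan_file() */
	if (cur_progress)
		/*
		 * xa_index determines how many entries were iterated; if we
		 * broke on the very first entry (idx == 0, e.g. a PMD-order
		 * folio), still charge one unit of work.
		 */
		*cur_progress = max(xas.xa_index - start, 1UL);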
>
>>>>> 1. shmem swap entries (xa_is_value): add the folio size.
>>>>> 2. the folio is of HPAGE_PMD_ORDER: the memory has been collapsed
>>>>> to PMD, so add 1 only.
>>>>> 3. a normal folio: add the folio size.
>>>>>
>>>>> Is the version below more readable?
>>>>>
>>>>> if (cur_progress) {
>>>>> *cur_progress += xas.xa_index - start;
>>>>>
>>>>> if (folio) {
>>>>> if (xa_is_value(folio))
>>>>> *cur_progress += 1 << xas_get_order(&xas);
>>>>> else if (folio_order(folio) == HPAGE_PMD_ORDER)
>>>>> *cur_progress += 1;
>>>>> else
>>>>> *cur_progress += folio_nr_pages(folio);
>>>>> }
>>>>> }
>>>> Yep, this is unneeded complexity. This looks really ugly and the benefits of
>>>> this are not clear. You can simply do
>>>>
>>>> if (cur_progress)
>>>> *cur_progress = xas.xa_index - start;
>>>>
On Fri, Jan 23, 2026 at 04:22:29PM +0800, Vernon Yang wrote:
> + if (cur_progress) {
> + unsigned long idx = xas_get_index(&xas) - start;
I agree with Dev. This is less readable than:
unsigned long idx = xas.xa_index - start;
On Fri, Jan 23, 2026 at 11:19 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Jan 23, 2026 at 04:22:29PM +0800, Vernon Yang wrote:
> > + if (cur_progress) {
> > + unsigned long idx = xas_get_index(&xas) - start;
>
> I agree with Dev. This is less readable than:
>
> unsigned long idx = xas.xa_index - start;
LGTM, I will use "xas->xa_index" directly in the next version. Thanks!
On 23/01/26 1:52 pm, Vernon Yang wrote:
> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> index be850174e802..f77d97d7b957 100644
> --- a/include/linux/xarray.h
> +++ b/include/linux/xarray.h
> @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
> xas->xa_node = XAS_RESTART;
> }
>
> +/**
> + * xas_get_index() - Get XArray operation state for a different index.
> + * @xas: XArray operation state.
> + */
> +static inline unsigned long xas_get_index(struct xa_state *xas)
> +{
> + return xas->xa_index;
> +}
> +
Why is this needed?
> /**
> * xas_advance() - Skip over sibling entries.
> * @xas: XArray operation state.
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6f0f05148765..de95029e3763 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -68,7 +68,10 @@ enum scan_result {
> static struct task_struct *khugepaged_thread __read_mostly;
> static DEFINE_MUTEX(khugepaged_mutex);
>
> -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
> +/*
> + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> + * every 10 second.
> + */
> static unsigned int khugepaged_pages_to_scan __read_mostly;
> static unsigned int khugepaged_pages_collapsed;
> static unsigned int khugepaged_full_scans;
> @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> }
>
> static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> + struct vm_area_struct *vma, unsigned long start_addr,
> + bool *mmap_locked, unsigned int *cur_progress,
> struct collapse_control *cc)
> {
> pmd_t *pmd;
> @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
>
> VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>
> + if (cur_progress)
> + *cur_progress += 1;
> +
1. Why do we need the if (cur_progress) check? Isn't it guaranteed that the
pointer will never be NULL?
2. Why do we increment this on function entry?
> result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> if (result != SCAN_SUCCEED)
> goto out;
> @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> result = SCAN_SUCCEED;
> }
> out_unmap:
> + if (cur_progress) {
> + if (_pte >= pte + HPAGE_PMD_NR)
> + *cur_progress += HPAGE_PMD_NR - 1;
> + else
> + *cur_progress += _pte - pte;
> + }
Why are two cases required here - shouldn't it just be _pte - pte?
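
i.e. why not simply (sketch):

	if (cur_progress)
		*cur_progress += _pte - pte;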
On Fri, Jan 23, 2026 at 6:46 PM Dev Jain <dev.jain@arm.com> wrote:
>
> On 23/01/26 1:52 pm, Vernon Yang wrote:
> > diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> > index be850174e802..f77d97d7b957 100644
> > --- a/include/linux/xarray.h
> > +++ b/include/linux/xarray.h
> > @@ -1646,6 +1646,15 @@ static inline void xas_set(struct xa_state *xas, unsigned long index)
> > xas->xa_node = XAS_RESTART;
> > }
> >
> > +/**
> > + * xas_get_index() - Get XArray operation state for a different index.
> > + * @xas: XArray operation state.
> > + */
> > +static inline unsigned long xas_get_index(struct xa_state *xas)
> > +{
> > + return xas->xa_index;
> > +}
> > +
>
> Why is this needed?
Please refer to hpage_collapse_scan_file(), which uses it to obtain the
starting index of the last folio when exiting xas_for_each().
Using "xas->xa_index" directly is also acceptable.
> > /**
> > * xas_advance() - Skip over sibling entries.
> > * @xas: XArray operation state.
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 6f0f05148765..de95029e3763 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -68,7 +68,10 @@ enum scan_result {
> > static struct task_struct *khugepaged_thread __read_mostly;
> > static DEFINE_MUTEX(khugepaged_mutex);
> >
> > -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
> > +/*
> > + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> > + * every 10 second.
> > + */
> > static unsigned int khugepaged_pages_to_scan __read_mostly;
> > static unsigned int khugepaged_pages_collapsed;
> > static unsigned int khugepaged_full_scans;
> > @@ -1240,7 +1243,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> > }
> >
> > static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
> > + struct vm_area_struct *vma, unsigned long start_addr,
> > + bool *mmap_locked, unsigned int *cur_progress,
> > struct collapse_control *cc)
> > {
> > pmd_t *pmd;
> > @@ -1255,6 +1259,9 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> >
> > VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
> >
> > + if (cur_progress)
> > + *cur_progress += 1;
> > +
>
> 1. Why do we need to do if (cur_progress). Isn't it guaranteed that the pointer will never be NULL.
cur_progress is NULL when called from madvise_collapse().
> 2. Why do we increment this on function entry?
When the memory has already been collapsed to a PMD, we increment by one
only, the same as in the previous PATCH #3 [1].
[1] https://lore.kernel.org/linux-mm/20260111121909.8410-4-yanglincheng@kylinos.cn
>
> > result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
> > if (result != SCAN_SUCCEED)
> > goto out;
> > @@ -1396,6 +1403,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > result = SCAN_SUCCEED;
> > }
> > out_unmap:
> > + if (cur_progress) {
> > + if (_pte >= pte + HPAGE_PMD_NR)
> > + *cur_progress += HPAGE_PMD_NR - 1;
> > + else
> > + *cur_progress += _pte - pte;
> > + }
>
> Why are two cases required here - shouldn't it just be _pte - pte?
At function entry we do "cur_progress += 1". If this PMD is SCAN_PMD_MAPPED
or SCAN_NO_PTE_TABLE, we return directly. Otherwise:
When the loop exits with "_pte == pte + HPAGE_PMD_NR", HPAGE_PMD_NR
(i.e. "_pte - pte") PTEs have been scanned, so "cur_progress" needs to be
incremented by "HPAGE_PMD_NR - 1".
When the loop breaks with "_pte < pte + HPAGE_PMD_NR", "_pte - pte + 1" PTEs
have been scanned, so "cur_progress" needs to be incremented by "_pte - pte".
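
A worked trace of the accounting, assuming HPAGE_PMD_NR == 512 (illustrative
only):

	Case A: find_pmd_or_thp_or_none() fails (e.g. SCAN_PMD_MAPPED)
		entry:     cur_progress += 1            -> total 1

	Case B: loop exits with "_pte == pte + HPAGE_PMD_NR"
		entry:     cur_progress += 1
		out_unmap: cur_progress += 512 - 1      -> total 512

	Case C: loop breaks at the k-th PTE, "_pte == pte + k" (k < 512)
		entry:     cur_progress += 1
		out_unmap: cur_progress += k            -> total k + 1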