[PATCH mm-unstable v2 1/5] mm: consolidate anonymous folio PTE mapping into helpers
Posted by Nico Pache 1 month, 1 week ago
The anonymous page fault handler in do_anonymous_page() open-codes the
sequence to map a newly allocated anonymous folio at the PTE level:
	- construct the PTE entry
	- add rmap
	- add to LRU
	- set the PTEs
	- update the MMU cache.

Introduce two helpers to consolidate this duplicated logic, mirroring the
existing map_anon_folio_pmd_nopf() pattern for PMD-level mappings:

	map_anon_folio_pte_nopf(): constructs the PTE entry, takes the folio
	references, and adds the anon rmap and LRU entries. This function also
	applies the uffd-wp marker, which can be required in the pf variant.

	map_anon_folio_pte_pf(): extends the nopf variant with the MM_ANONPAGES
	counter update and the mTHP fault allocation statistics needed on the
	page fault path.

The zero-page read path in do_anonymous_page() is also untangled from the
shared setpte label, since it does not allocate a folio and should not
share the same mapping sequence as the write path. Pass a literal
nr_pages of 1 there rather than relying on the variable, making it
clearer that this path operates on the zero page only.

This refactoring will also help reduce code duplication between mm/memory.c
and mm/khugepaged.c, and provides a clean API for PTE-level anonymous folio
mapping that can be reused by future callers.
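
As a rough sketch only (not part of this patch), a future non-fault
caller that has already allocated and charged an order-aligned folio,
holds the PTE lock, and has pte pointing at the first entry of the range
might use the helper as follows; the bookkeeping shown is a hypothetical
example, not a prescribed call sequence:

	/*
	 * Hypothetical caller: PTL held, PTEs known to be none, folio
	 * already charged to the mm.
	 */
	map_anon_folio_pte_nopf(folio, pte, vma, addr, /* uffd_wp */ false);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, folio_nr_pages(folio));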

Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/mm.h |  4 ++++
 mm/memory.c        | 59 +++++++++++++++++++++++++++++++---------------
 2 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13336340612e..3ebf143c7502 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4901,4 +4901,8 @@ static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
 
 void snapshot_page(struct page_snapshot *ps, const struct page *page);
 
+void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte,
+		struct vm_area_struct *vma, unsigned long addr,
+		bool uffd_wp);
+
 #endif /* _LINUX_MM_H */
diff --git a/mm/memory.c b/mm/memory.c
index 9385842c3503..a1a364e1fdcd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5189,6 +5189,36 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
 }
 
+void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte,
+		struct vm_area_struct *vma, unsigned long addr,
+		bool uffd_wp)
+{
+	unsigned int nr_pages = folio_nr_pages(folio);
+	pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
+
+	entry = pte_sw_mkyoung(entry);
+
+	if (vma->vm_flags & VM_WRITE)
+		entry = pte_mkwrite(pte_mkdirty(entry), vma);
+	if (uffd_wp)
+		entry = pte_mkuffd_wp(entry);
+
+	folio_ref_add(folio, nr_pages - 1);
+	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+	folio_add_lru_vma(folio, vma);
+	set_ptes(vma->vm_mm, addr, pte, entry, nr_pages);
+	update_mmu_cache_range(NULL, vma, addr, pte, nr_pages);
+}
+
+static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
+		struct vm_area_struct *vma, unsigned long addr,
+		unsigned int nr_pages, bool uffd_wp)
+{
+	map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -5235,7 +5265,14 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
 			return handle_userfault(vmf, VM_UFFD_MISSING);
 		}
-		goto setpte;
+		if (vmf_orig_pte_uffd_wp(vmf))
+			entry = pte_mkuffd_wp(entry);
+		set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+
+		/* No need to invalidate - it was non-present before */
+		update_mmu_cache_range(vmf, vma, addr, vmf->pte,
+				       /*nr_pages=*/ 1);
+		goto unlock;
 	}
 
 	/* Allocate our own private page. */
@@ -5259,11 +5296,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	 */
 	__folio_mark_uptodate(folio);
 
-	entry = folio_mk_pte(folio, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
-	if (vma->vm_flags & VM_WRITE)
-		entry = pte_mkwrite(pte_mkdirty(entry), vma);
-
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	if (!vmf->pte)
 		goto release;
@@ -5285,19 +5317,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		folio_put(folio);
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
-
-	folio_ref_add(folio, nr_pages - 1);
-	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
-	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
-	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
-	folio_add_lru_vma(folio, vma);
-setpte:
-	if (vmf_orig_pte_uffd_wp(vmf))
-		entry = pte_mkuffd_wp(entry);
-	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
-
-	/* No need to invalidate - it was non-present before */
-	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
+	map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, nr_pages,
+			      vmf_orig_pte_uffd_wp(vmf));
 unlock:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
-- 
2.53.0
Re: [PATCH mm-unstable v2 1/5] mm: consolidate anonymous folio PTE mapping into helpers
Posted by Dev Jain 1 month, 1 week ago

On 26/02/26 6:59 am, Nico Pache wrote:
> [...]

Reviewed-by: Dev Jain <dev.jain@arm.com>

Should we make map_anon_folio_pte_pf() inline, since it has only one caller?
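
E.g., just a sketch, keeping the body exactly as posted and only adding
the inline keyword:

	static inline void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
			struct vm_area_struct *vma, unsigned long addr,
			unsigned int nr_pages, bool uffd_wp)
	{
		map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
		count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
	}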

Re: [PATCH mm-unstable v2 1/5] mm: consolidate anonymous folio PTE mapping into helpers
Posted by Lance Yang 1 month, 1 week ago

On 2026/2/26 09:29, Nico Pache wrote:
> [...]

The refactoring looks good to me.

With the change David suggested (removing the nr_pages parameter from
map_anon_folio_pte_pf() and using folio_order() internally instead),

Reviewed-by: Lance Yang <lance.yang@linux.dev>
Re: [PATCH mm-unstable v2 1/5] mm: consolidate anonymous folio PTE mapping into helpers
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
On 2/26/26 02:29, Nico Pache wrote:
> [...]
> +static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
> +		struct vm_area_struct *vma, unsigned long addr,
> +		unsigned int nr_pages, bool uffd_wp)
> +{
> +	map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
> +}

One thing:

You can also avoid passing in "nr_pages" here, especially since you
already query the order below, and simply do

unsigned int order = folio_order(folio);

map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1U << order);
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);


Apart from that

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David