Add PMD-level huge page support to remap_pfn_range(), automatically
creating huge mappings when prerequisites are satisfied (size, alignment,
architecture support, etc.) and falling back to normal page mappings
otherwise.

Implement special huge PMD splitting by utilizing the pgtable deposit/
withdraw mechanism. When splitting is needed, the deposited pgtable is
withdrawn and populated with individual PTEs created from the original
huge mapping, using pte_clrhuge() to clear huge page attributes.

Update arch_needs_pgtable_deposit() to return true when PMD pfnmap
support is enabled, ensuring proper pgtable management for huge
pfnmap operations.

Introduce pfnmap_max_page_shift parameter to control maximum page
size and "nohugepfnmap" boot option to disable huge pfnmap entirely.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
include/linux/pgtable.h | 6 +++-
mm/huge_memory.c | 22 ++++++++----
mm/memory.c | 74 ++++++++++++++++++++++++++++++++++++-----
3 files changed, 85 insertions(+), 17 deletions(-)
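
For reviewers, a usage sketch (not part of the patch): a hypothetical
driver ->mmap handler that ends up on the new PMD path. my_dev_mmap() and
MY_DEV_BUF_PHYS are illustrative names only; the remap_pfn_range() call
itself is unchanged by this series.

	#include <linux/fs.h>
	#include <linux/mm.h>

	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;
		unsigned long pfn = MY_DEV_BUF_PHYS >> PAGE_SHIFT;

		/*
		 * With this patch, each PMD_SIZE chunk of the range whose
		 * virtual address and pfn are both PMD-aligned is mapped
		 * with a single special huge PMD (when the arch selects
		 * CONFIG_ARCH_SUPPORTS_PMD_PFNMAP and "nohugepfnmap" is
		 * not given); everything else falls back to
		 * remap_pte_range() as before.
		 */
		return remap_pfn_range(vma, vma->vm_start, pfn, size,
				       vma->vm_page_prot);
	}

The diff below adds the PMD path behind this unchanged interface.
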
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 4c035637eeb7..4028318552ca 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1025,7 +1025,11 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif
#ifndef arch_needs_pgtable_deposit
-#define arch_needs_pgtable_deposit() (false)
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP);
+}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..9f20adcbbb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2857,14 +2857,22 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
- /*
- * We are going to unmap this huge page. So
- * just go ahead and zap it
- */
- if (arch_needs_pgtable_deposit())
- zap_deposited_table(mm, pmd);
- if (!vma_is_dax(vma) && vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
+ pte_t entry;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ if (unlikely(!pgtable))
+ return;
+ pmd_populate(mm, &_pmd, pgtable);
+ pte = pte_offset_map(&_pmd, haddr);
+ entry = pte_clrhuge(pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd)));
+ set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
+ pte_unmap(pte);
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
return;
+ }
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..c4aaf3bd9cad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2674,6 +2674,19 @@ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
return __vm_insert_mixed(vma, addr, pfn, true);
}
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+static unsigned int __ro_after_init pfnmap_max_page_shift = BITS_PER_LONG - 1;
+
+static int __init set_nohugepfnmap(char *str)
+{
+ pfnmap_max_page_shift = PAGE_SHIFT;
+ return 0;
+}
+early_param("nohugepfnmap", set_nohugepfnmap);
+#else /* CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP */
+static const unsigned int pfnmap_max_page_shift = PAGE_SHIFT;
+#endif /* CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP */
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2705,9 +2718,47 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
return err;
}
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot,
+ unsigned int page_shift)
+{
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+
+ if (page_shift < PMD_SHIFT)
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(pfn, 1 << (PMD_SHIFT - PAGE_SHIFT)))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot))));
+
+ pgtable = pte_alloc_one(mm);
+ if (unlikely(!pgtable))
+ return 1;
+ mm_inc_nr_ptes(mm);
+ ptl = pmd_lock(mm, pmd);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ spin_unlock(ptl);
+
+ return 1;
+}
+#endif
+
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
- unsigned long pfn, pgprot_t prot)
+ unsigned long pfn, pgprot_t prot, unsigned int max_page_shift)
{
pmd_t *pmd;
unsigned long next;
@@ -2720,6 +2771,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+ if (remap_try_huge_pmd(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot, max_page_shift)) {
+ continue;
+ }
+#endif
err = remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
@@ -2730,7 +2787,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
- unsigned long pfn, pgprot_t prot)
+ unsigned long pfn, pgprot_t prot, unsigned int max_page_shift)
{
pud_t *pud;
unsigned long next;
@@ -2743,7 +2800,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
do {
next = pud_addr_end(addr, end);
err = remap_pmd_range(mm, pud, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot);
+ pfn + (addr >> PAGE_SHIFT), prot, max_page_shift);
if (err)
return err;
} while (pud++, addr = next, addr != end);
@@ -2752,7 +2809,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
- unsigned long pfn, pgprot_t prot)
+ unsigned long pfn, pgprot_t prot, unsigned int max_page_shift)
{
p4d_t *p4d;
unsigned long next;
@@ -2765,7 +2822,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
do {
next = p4d_addr_end(addr, end);
err = remap_pud_range(mm, p4d, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot);
+ pfn + (addr >> PAGE_SHIFT), prot, max_page_shift);
if (err)
return err;
} while (p4d++, addr = next, addr != end);
@@ -2773,7 +2830,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+ unsigned long pfn, unsigned long size, pgprot_t prot, unsigned int max_page_shift)
{
pgd_t *pgd;
unsigned long next;
@@ -2817,7 +2874,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
do {
next = pgd_addr_end(addr, end);
err = remap_p4d_range(mm, pgd, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot);
+ pfn + (addr >> PAGE_SHIFT), prot, max_page_shift);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -2832,8 +2889,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
- int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
-
+ int error = remap_pfn_range_internal(vma, addr, pfn, size, prot, pfnmap_max_page_shift);
if (!error)
return 0;
--
2.43.0
On 23.09.25 15:31, Yin Tirui wrote:
> Add PMD-level huge page support to remap_pfn_range(), automatically
> creating huge mappings when prerequisites are satisfied (size, alignment,
> architecture support, etc.) and falling back to normal page mappings
> otherwise.
> 
> Implement special huge PMD splitting by utilizing the pgtable deposit/
> withdraw mechanism. When splitting is needed, the deposited pgtable is
> withdrawn and populated with individual PTEs created from the original
> huge mapping, using pte_clrhuge() to clear huge page attributes.
> 
> Update arch_needs_pgtable_deposit() to return true when PMD pfnmap
> support is enabled, ensuring proper pgtable management for huge
> pfnmap operations.
> 
> Introduce pfnmap_max_page_shift parameter to control maximum page
> size and "nohugepfnmap" boot option to disable huge pfnmap entirely.

Why? If an arch supports it we should just do it. Or what's the reason
behind that?

> 
> Signed-off-by: Yin Tirui <yintirui@huawei.com>
> ---
>  include/linux/pgtable.h |  6 +++-
>  mm/huge_memory.c        | 22 ++++++++----
>  mm/memory.c             | 74 ++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 85 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 4c035637eeb7..4028318552ca 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1025,7 +1025,11 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
>  #endif
> 
>  #ifndef arch_needs_pgtable_deposit
> -#define arch_needs_pgtable_deposit() (false)
> +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
> +static inline bool arch_needs_pgtable_deposit(void)
> +{
> +	return IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP);
> +}
>  #endif
> 
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9c38a95e9f09..9f20adcbbb55 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2857,14 +2857,22 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> 
>  	if (!vma_is_anonymous(vma)) {
>  		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
> -		/*
> -		 * We are going to unmap this huge page. So
> -		 * just go ahead and zap it
> -		 */
> -		if (arch_needs_pgtable_deposit())
> -			zap_deposited_table(mm, pmd);

Are you sure we can just entirely remove this block for
!vma_is_anonymous(vma)?

-- 
Cheers

David / dhildenb
On Tue, Sep 23, 2025 at 09:31:04PM +0800, Yin Tirui wrote:
> +			entry = pte_clrhuge(pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd)));

This doesn't make sense. And I'm not saying you got this wrong; I suspect
in terms of how things work today it's actually necessary. But the way we
handle this stuff is so insane.

pte_clrhuge() should not exist. If we have a PTE, it can't have the huge
bit set, by definition (don't anybody mention hugetlbfs because that is an
entirely separate pile of broken horrors).

I understand what you're trying to do here. You want to construct a PTE
that points to the same address as the first page of the PMD and has the
same permissions. But that *should* be written as:

	entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));

right?

Now, pmd_pgprot() might or might not want to return the huge bit set. I'm
not sure. Perhaps you could have a look through and figure it out. But
pfn_pte() should never return a PTE with the huge bit set. So if it is
set in the pgprot on entry, it should filter it out.

There are going to be consequences to this. Maybe there's code somewhere
that relies on pfn_pte() returning a PTE with the huge bit set. Perhaps
it's hugetlbfs. But we have to start cleaning this garbage up. I did some
work with e3981db444a0 and the commits leading up to that. See
https://lkml.kernel.org/r/20250402181709.2386022-12-willy@infradead.org

I'd like pte_clrhuge() to be deleted from x86, not added to arm and riscv.
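
To make that direction concrete, a rough sketch (an assumption about where
this could go, not code from this series or from x86 as it stands): if
pfn_pte() masked the huge bit itself, the pfnmap split in
__split_huge_pmd_locked() would not need pte_clrhuge() at all.

	/*
	 * Hypothetical x86-style pfn_pte() that filters the PSE/huge bit;
	 * the real pfn_pte() does additional pgprot massaging that is
	 * omitted here.  _PAGE_PSE is x86's huge-page bit.
	 */
	static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
	{
		pteval_t val = ((pteval_t)page_nr << PAGE_SHIFT) |
			       (pgprot_val(pgprot) & ~(pteval_t)_PAGE_PSE);

		return __pte(val);
	}

	/* ... and the split path from this patch would simply become: */
	entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));
	set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);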