[PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
Posted by Samuel Holland 3 weeks, 5 days ago
Currently, some functions such as pte_offset_map() are passed both
pointers to hardware page tables, and pointers to previously-read PMD
entries on the stack. To ensure correctness in the first case, these
functions must use the page table accessor function (pmdp_get()) to
dereference the supplied pointer. However, this means pmdp_get() is
called twice in the second case. This double call must be avoided if
pmdp_get() applies some non-idempotent transformation to the value.

Avoid the double transformation by calling set_pmd() on the stack
variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
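
As an illustration, consider a minimal sketch of a non-idempotent accessor
pair (hypothetical names and transformation, not the actual RISC-V code),
where entries are stored in the page table XORed with a constant and
set_pmd()/pmdp_get() each apply the XOR:

#define EXAMPLE_PMD_XFORM	0xa5a5a5a5a5a5a5a5UL

/* Encode the value into its in-table representation. */
static inline void example_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	WRITE_ONCE(*pmdp, __pmd(pmd_val(pmd) ^ EXAMPLE_PMD_XFORM));
}

/* Decode the in-table representation back into a usable value. */
static inline pmd_t example_pmdp_get(pmd_t *pmdp)
{
	return __pmd(pmd_val(READ_ONCE(*pmdp)) ^ EXAMPLE_PMD_XFORM);
}

With such a pair, passing &pmd for an already-decoded stack variable to
pte_offset_map() would decode it a second time and corrupt it; calling
set_pmd(&pmd, pmd) first re-encodes the value so that the pmdp_get()
inside pte_offset_map() yields the expected result.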

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

(no changes since v2)

Changes in v2:
 - New patch for v2

 kernel/events/core.c  | 2 ++
 mm/gup.c              | 3 +++
 mm/khugepaged.c       | 6 ++++--
 mm/page_table_check.c | 3 +++
 mm/pgtable-generic.c  | 2 ++
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa4f9165bd94..7969b060bf2d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8154,6 +8154,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 	if (pmd_leaf(pmd))
 		return pmd_leaf_size(pmd);
 
+	/* transform pmd as if &pmd pointed to a hardware page table */
+	set_pmd(&pmd, pmd);
 	ptep = pte_offset_map(&pmd, addr);
 	if (!ptep)
 		goto again;
diff --git a/mm/gup.c b/mm/gup.c
index 549f9e868311..aba61704049e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2844,7 +2844,10 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 	int ret = 0;
 	pte_t *ptep, *ptem;
 
+	/* transform pmd as if &pmd pointed to a hardware page table */
+	set_pmd(&pmd, pmd);
 	ptem = ptep = pte_offset_map(&pmd, addr);
+	pmd = pmdp_get(&pmd);
 	if (!ptep)
 		return 0;
 	do {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1bff8ade751a..ab1f68a7bc83 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1724,7 +1724,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		struct mmu_notifier_range range;
 		struct mm_struct *mm;
 		unsigned long addr;
-		pmd_t *pmd, pgt_pmd;
+		pmd_t *pmd, pgt_pmd, pmdval;
 		spinlock_t *pml;
 		spinlock_t *ptl;
 		bool success = false;
@@ -1777,7 +1777,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (check_pmd_state(pmd) != SCAN_SUCCEED)
 			goto drop_pml;
-		ptl = pte_lockptr(mm, pmd);
+		/* pte_lockptr() needs a value, not a pointer to a page table */
+		pmdval = pmdp_get(pmd);
+		ptl = pte_lockptr(mm, &pmdval);
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 31f4c39d20ef..77d6688db0de 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -260,7 +260,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
 		return;
 
 	if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+		/* transform pmd as if &pmd pointed to a hardware page table */
+		set_pmd(&pmd, pmd);
 		pte_t *ptep = pte_offset_map(&pmd, addr);
+		pmd = pmdp_get(&pmd);
 		unsigned long i;
 
 		if (WARN_ON(!ptep))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 63a573306bfa..6602deb002f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -299,6 +299,8 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 		pmd_clear_bad(pmd);
 		goto nomap;
 	}
+	/* transform pmdval as if &pmdval pointed to a hardware page table */
+	set_pmd(&pmdval, pmdval);
 	return __pte_map(&pmdval, addr);
 nomap:
 	rcu_read_unlock();
-- 
2.47.2
Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
Posted by Ryan Roberts 1 week, 4 days ago
On 13/11/2025 01:45, Samuel Holland wrote:
> Currently, some functions such as pte_offset_map() are passed both
> pointers to hardware page tables, and pointers to previously-read PMD
> entries on the stack. To ensure correctness in the first case, these
> functions must use the page table accessor function (pmdp_get()) to
> dereference the supplied pointer. However, this means pmdp_get() is
> called twice in the second case. This double call must be avoided if
> pmdp_get() applies some non-idempotent transformation to the value.
> 
> Avoid the double transformation by calling set_pmd() on the stack
> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.

I don't think this is a good solution.

arm64, at least, expects and requires that only pointers to entries in pgtables
are passed to the arch helpers (e.g. set_pte(), ptep_get(), etc). For PTEs,
arm64 accesses adjacent entries within the page table to manage contiguous
mappings. If it is passed a pointer to a stack variable, it may erroneously
access other stuff on the stack thinking it is an entry in a page table.
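
As a rough illustration (heavily simplified, hypothetical names, not the
actual arm64 code), the contpte read path folds in state from the
neighbouring entries of the contiguous block:

static inline pte_t example_contpte_get(pte_t *ptep)
{
	/* Start of the contiguous block containing this entry. */
	pte_t *first = PTR_ALIGN_DOWN(ptep, CONT_PTES * sizeof(pte_t));
	pte_t orig = __ptep_get(ptep);
	int i;

	/* Accumulate access/dirty bits from every entry in the block. */
	for (i = 0; i < CONT_PTES; i++) {
		pte_t pte = __ptep_get(first + i);

		if (pte_dirty(pte))
			orig = pte_mkdirty(orig);
		if (pte_young(pte))
			orig = pte_mkyoung(orig);
	}

	return orig;
}

If ptep points at a local variable instead of into a page table, that loop
walks over whatever happens to sit next to it on the stack.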

I think we should formalize this as a clear requirement for all these functions;
all pte/pmd/pud/p4d/pgd pointers passed to the arch pgtable helpers must always
point to entries in pgtables.

arm64 will very likely take advantage of this in future in the pmd/pud/...
helpers as it does today for the pte level. But even today, arm64's set_pmd()
will emit barriers which are totally unnecessary when operating on a stack
variable that the HW PTW will never see.

Thanks,
Ryan

> 
> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> ---
> 
> (no changes since v2)
> 
> Changes in v2:
>  - New patch for v2
> 
>  kernel/events/core.c  | 2 ++
>  mm/gup.c              | 3 +++
>  mm/khugepaged.c       | 6 ++++--
>  mm/page_table_check.c | 3 +++
>  mm/pgtable-generic.c  | 2 ++
>  5 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index fa4f9165bd94..7969b060bf2d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8154,6 +8154,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
>  	if (pmd_leaf(pmd))
>  		return pmd_leaf_size(pmd);
>  
> +	/* transform pmd as if &pmd pointed to a hardware page table */
> +	set_pmd(&pmd, pmd);
>  	ptep = pte_offset_map(&pmd, addr);
>  	if (!ptep)
>  		goto again;
> diff --git a/mm/gup.c b/mm/gup.c
> index 549f9e868311..aba61704049e 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2844,7 +2844,10 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
>  	int ret = 0;
>  	pte_t *ptep, *ptem;
>  
> +	/* transform pmd as if &pmd pointed to a hardware page table */
> +	set_pmd(&pmd, pmd);
>  	ptem = ptep = pte_offset_map(&pmd, addr);
> +	pmd = pmdp_get(&pmd);
>  	if (!ptep)
>  		return 0;
>  	do {
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 1bff8ade751a..ab1f68a7bc83 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1724,7 +1724,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		struct mmu_notifier_range range;
>  		struct mm_struct *mm;
>  		unsigned long addr;
> -		pmd_t *pmd, pgt_pmd;
> +		pmd_t *pmd, pgt_pmd, pmdval;
>  		spinlock_t *pml;
>  		spinlock_t *ptl;
>  		bool success = false;
> @@ -1777,7 +1777,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		 */
>  		if (check_pmd_state(pmd) != SCAN_SUCCEED)
>  			goto drop_pml;
> -		ptl = pte_lockptr(mm, pmd);
> +		/* pte_lockptr() needs a value, not a pointer to a page table */
> +		pmdval = pmdp_get(pmd);
> +		ptl = pte_lockptr(mm, &pmdval);
>  		if (ptl != pml)
>  			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>  
> diff --git a/mm/page_table_check.c b/mm/page_table_check.c
> index 31f4c39d20ef..77d6688db0de 100644
> --- a/mm/page_table_check.c
> +++ b/mm/page_table_check.c
> @@ -260,7 +260,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
>  		return;
>  
>  	if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
> +		/* transform pmd as if &pmd pointed to a hardware page table */
> +		set_pmd(&pmd, pmd);
>  		pte_t *ptep = pte_offset_map(&pmd, addr);
> +		pmd = pmdp_get(&pmd);
>  		unsigned long i;
>  
>  		if (WARN_ON(!ptep))
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 63a573306bfa..6602deb002f1 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -299,6 +299,8 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>  		pmd_clear_bad(pmd);
>  		goto nomap;
>  	}
> +	/* transform pmdval as if &pmdval pointed to a hardware page table */
> +	set_pmd(&pmdval, pmdval);
>  	return __pte_map(&pmdval, addr);
>  nomap:
>  	rcu_read_unlock();
Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
Posted by David Hildenbrand (Red Hat) 1 week, 4 days ago
On 11/27/25 17:57, Ryan Roberts wrote:
> On 13/11/2025 01:45, Samuel Holland wrote:
>> Currently, some functions such as pte_offset_map() are passed both
>> pointers to hardware page tables, and pointers to previously-read PMD
>> entries on the stack. To ensure correctness in the first case, these
>> functions must use the page table accessor function (pmdp_get()) to
>> dereference the supplied pointer. However, this means pmdp_get() is
>> called twice in the second case. This double call must be avoided if
>> pmdp_get() applies some non-idempotent transformation to the value.
>>
>> Avoid the double transformation by calling set_pmd() on the stack
>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
> 
> I don't think this is a good solution.

Agreed,

	set_pmd(&pmd, pmd);

is rather horrible.

-- 
Cheers

David
Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
Posted by kernel test robot 3 weeks, 4 days ago
Hi Samuel,

kernel test robot noticed the following build errors:

[auto build test ERROR on 24172e0d79900908cf5ebf366600616d29c9b417]

url:    https://github.com/intel-lab-lkp/linux/commits/Samuel-Holland/mm-ptdump-replace-READ_ONCE-with-standard-page-table-accessors/20251113-095117
base:   24172e0d79900908cf5ebf366600616d29c9b417
patch link:    https://lore.kernel.org/r/20251113014656.2605447-9-samuel.holland%40sifive.com
patch subject: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511131448.ZCsuBlBE-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/gup.c: In function 'gup_fast_pte_range':
>> mm/gup.c:2848:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
    2848 |         set_pmd(&pmd, pmd);
         |         ^~~~~~~
         |         set_p4d
--
   mm/pgtable-generic.c: In function '___pte_offset_map':
>> mm/pgtable-generic.c:303:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
     303 |         set_pmd(&pmdval, pmdval);
         |         ^~~~~~~
         |         set_p4d


vim +2848 mm/gup.c

  2819	
  2820	#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
  2821	/*
  2822	 * GUP-fast relies on pte change detection to avoid concurrent pgtable
  2823	 * operations.
  2824	 *
  2825	 * To pin the page, GUP-fast needs to do below in order:
  2826	 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
  2827	 *
  2828	 * For the rest of pgtable operations where pgtable updates can be racy
  2829	 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
  2830	 * is pinned.
  2831	 *
  2832	 * Above will work for all pte-level operations, including THP split.
  2833	 *
  2834	 * For THP collapse, it's a bit more complicated because GUP-fast may be
  2835	 * walking a pgtable page that is being freed (pte is still valid but pmd
  2836	 * can be cleared already).  To avoid race in such condition, we need to
  2837	 * also check pmd here to make sure pmd doesn't change (corresponds to
  2838	 * pmdp_collapse_flush() in the THP collapse code path).
  2839	 */
  2840	static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
  2841			unsigned long end, unsigned int flags, struct page **pages,
  2842			int *nr)
  2843	{
  2844		int ret = 0;
  2845		pte_t *ptep, *ptem;
  2846	
  2847		/* transform pmd as if &pmd pointed to a hardware page table */
> 2848		set_pmd(&pmd, pmd);
  2849		ptem = ptep = pte_offset_map(&pmd, addr);
  2850		pmd = pmdp_get(&pmd);
  2851		if (!ptep)
  2852			return 0;
  2853		do {
  2854			pte_t pte = ptep_get_lockless(ptep);
  2855			struct page *page;
  2856			struct folio *folio;
  2857	
  2858			/*
  2859			 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
  2860			 * pte_access_permitted() better should reject these pages
  2861			 * either way: otherwise, GUP-fast might succeed in
  2862			 * cases where ordinary GUP would fail due to VMA access
  2863			 * permissions.
  2864			 */
  2865			if (pte_protnone(pte))
  2866				goto pte_unmap;
  2867	
  2868			if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  2869				goto pte_unmap;
  2870	
  2871			if (pte_special(pte))
  2872				goto pte_unmap;
  2873	
  2874			/* If it's not marked as special it must have a valid memmap. */
  2875			VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
  2876			page = pte_page(pte);
  2877	
  2878			folio = try_grab_folio_fast(page, 1, flags);
  2879			if (!folio)
  2880				goto pte_unmap;
  2881	
  2882			if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get(pmdp))) ||
  2883			    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
  2884				gup_put_folio(folio, 1, flags);
  2885				goto pte_unmap;
  2886			}
  2887	
  2888			if (!gup_fast_folio_allowed(folio, flags)) {
  2889				gup_put_folio(folio, 1, flags);
  2890				goto pte_unmap;
  2891			}
  2892	
  2893			if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
  2894				gup_put_folio(folio, 1, flags);
  2895				goto pte_unmap;
  2896			}
  2897	
  2898			/*
  2899			 * We need to make the page accessible if and only if we are
  2900			 * going to access its content (the FOLL_PIN case).  Please
  2901			 * see Documentation/core-api/pin_user_pages.rst for
  2902			 * details.
  2903			 */
  2904			if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
  2905				gup_put_folio(folio, 1, flags);
  2906				goto pte_unmap;
  2907			}
  2908			folio_set_referenced(folio);
  2909			pages[*nr] = page;
  2910			(*nr)++;
  2911		} while (ptep++, addr += PAGE_SIZE, addr != end);
  2912	
  2913		ret = 1;
  2914	
  2915	pte_unmap:
  2916		pte_unmap(ptem);
  2917		return ret;
  2918	}
  2919	#else
  2920	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki