Currently, some functions such as pte_offset_map() are passed both
pointers to hardware page tables, and pointers to previously-read PMD
entries on the stack. To ensure correctness in the first case, these
functions must use the page table accessor function (pmdp_get()) to
dereference the supplied pointer. However, this means pmdp_get() is
called twice in the second case. This double call must be avoided if
pmdp_get() applies some non-idempotent transformation to the value.

Avoid the double transformation by calling set_pmd() on the stack
variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
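
For example, on an architecture where the accessors are not plain loads
and stores (purely hypothetical code below, not any real architecture's
implementation), pmdp_get() might decode a transformed in-memory format
and set_pmd() might encode it, with PMD_XFRM standing in for some
made-up transformation constant:

	/* hypothetical non-idempotent accessors, for illustration only */
	static inline pmd_t pmdp_get(pmd_t *pmdp)
	{
		/* decode the stored entry */
		return __pmd(READ_ONCE(pmd_val(*pmdp)) ^ PMD_XFRM);
	}

	static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
	{
		/* encode the entry before storing it */
		WRITE_ONCE(pmd_val(*pmdp), pmd_val(pmd) ^ PMD_XFRM);
	}

If pte_offset_map() is handed &pmd, where pmd is a stack variable that
already holds the decoded value, the pmdp_get() inside it would decode a
second time and produce a bogus entry. Calling set_pmd(&pmd, pmd) on the
stack variable first re-encodes it, so the later decode yields the
original value again.
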
Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

(no changes since v2)

Changes in v2:
- New patch for v2

 kernel/events/core.c  | 2 ++
 mm/gup.c              | 3 +++
 mm/khugepaged.c       | 6 ++++--
 mm/page_table_check.c | 3 +++
 mm/pgtable-generic.c  | 2 ++
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa4f9165bd94..7969b060bf2d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8154,6 +8154,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
         if (pmd_leaf(pmd))
                 return pmd_leaf_size(pmd);
 
+        /* transform pmd as if &pmd pointed to a hardware page table */
+        set_pmd(&pmd, pmd);
         ptep = pte_offset_map(&pmd, addr);
         if (!ptep)
                 goto again;
diff --git a/mm/gup.c b/mm/gup.c
index 549f9e868311..aba61704049e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2844,7 +2844,10 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
         int ret = 0;
         pte_t *ptep, *ptem;
 
+        /* transform pmd as if &pmd pointed to a hardware page table */
+        set_pmd(&pmd, pmd);
         ptem = ptep = pte_offset_map(&pmd, addr);
+        pmd = pmdp_get(&pmd);
         if (!ptep)
                 return 0;
         do {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1bff8ade751a..ab1f68a7bc83 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1724,7 +1724,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                 struct mmu_notifier_range range;
                 struct mm_struct *mm;
                 unsigned long addr;
-                pmd_t *pmd, pgt_pmd;
+                pmd_t *pmd, pgt_pmd, pmdval;
                 spinlock_t *pml;
                 spinlock_t *ptl;
                 bool success = false;
@@ -1777,7 +1777,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                  */
                 if (check_pmd_state(pmd) != SCAN_SUCCEED)
                         goto drop_pml;
-                ptl = pte_lockptr(mm, pmd);
+                /* pte_lockptr() needs a value, not a pointer to a page table */
+                pmdval = pmdp_get(pmd);
+                ptl = pte_lockptr(mm, &pmdval);
                 if (ptl != pml)
                         spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 31f4c39d20ef..77d6688db0de 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -260,7 +260,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
                 return;
 
         if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+                /* transform pmd as if &pmd pointed to a hardware page table */
+                set_pmd(&pmd, pmd);
                 pte_t *ptep = pte_offset_map(&pmd, addr);
+                pmd = pmdp_get(&pmd);
                 unsigned long i;
 
                 if (WARN_ON(!ptep))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 63a573306bfa..6602deb002f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -299,6 +299,8 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
                 pmd_clear_bad(pmd);
                 goto nomap;
         }
+        /* transform pmdval as if &pmdval pointed to a hardware page table */
+        set_pmd(&pmdval, pmdval);
         return __pte_map(&pmdval, addr);
 nomap:
         rcu_read_unlock();
--
2.47.2
On 13/11/2025 01:45, Samuel Holland wrote:
> Currently, some functions such as pte_offset_map() are passed both
> pointers to hardware page tables, and pointers to previously-read PMD
> entries on the stack. To ensure correctness in the first case, these
> functions must use the page table accessor function (pmdp_get()) to
> dereference the supplied pointer. However, this means pmdp_get() is
> called twice in the second case. This double call must be avoided if
> pmdp_get() applies some non-idempotent transformation to the value.
>
> Avoid the double transformation by calling set_pmd() on the stack
> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.

I don't think this is a good solution.

arm64, at least, expects and requires that only pointers to entries in pgtables
are passed to the arch helpers (e.g. set_pte(), ptep_get(), etc). For PTEs,
arm64 accesses adjacent entries within the page table to manage contiguous
mappings. If it is passed a pointer to a stack variable, it may erroneously
access other stuff on the stack thinking it is an entry in a page table.
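
For illustration, consider a helper loosely modelled on arm64's
contpte_ptep_get() (a simplified sketch, not the real code): it folds the
access/dirty bits of every entry in a contiguous block into the returned
pte, which is only safe if ptep really points into a page table:

	static inline pte_t sketch_contpte_get(pte_t *ptep)
	{
		/* step back to the first entry of the contiguous block */
		pte_t *first = PTR_ALIGN_DOWN(ptep, CONT_PTES * sizeof(pte_t));
		pte_t pte = __ptep_get(ptep);
		int i;

		for (i = 0; i < CONT_PTES; i++, first++) {
			pte_t entry = __ptep_get(first);

			/* with a stack variable, these reads hit unrelated stack memory */
			if (pte_dirty(entry))
				pte = pte_mkdirty(pte);
			if (pte_young(entry))
				pte = pte_mkyoung(pte);
		}

		return pte;
	}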

I think we should formalize this as a clear requirement for all these functions;
all pte/pmd/pud/p4d/pgd pointers passed to the arch pgtable helpers must always
point to entries in pgtables.

arm64 will very likely take advantage of this in future in the pmd/pud/...
helpers as it does today for the pte level. But even today, arm64's set_pmd()
will emit barriers which are totally unnecessary when operating on a stack
variable that the HW PTW will never see.
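
For reference, arm64's set_pmd() is roughly the following (simplified,
omitting the swapper pgdir special case), so every call on a stack
variable pays for a dsb/isb that serves no purpose:

	static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
	{
		WRITE_ONCE(*pmdp, pmd);

		/* make the new valid entry observable to the HW walker */
		if (pmd_valid(pmd)) {
			dsb(ishst);
			isb();
		}
	}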

Thanks,
Ryan

On 11/27/25 17:57, Ryan Roberts wrote:
> On 13/11/2025 01:45, Samuel Holland wrote:
>> Currently, some functions such as pte_offset_map() are passed both
>> pointers to hardware page tables, and pointers to previously-read PMD
>> entries on the stack. To ensure correctness in the first case, these
>> functions must use the page table accessor function (pmdp_get()) to
>> dereference the supplied pointer. However, this means pmdp_get() is
>> called twice in the second case. This double call must be avoided if
>> pmdp_get() applies some non-idempotent transformation to the value.
>>
>> Avoid the double transformation by calling set_pmd() on the stack
>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
>
> I don't think this is a good solution.

Agreed, set_pmd(&pmd, pmd); is rather horrible.

-- 
Cheers

David
Hi Samuel,
kernel test robot noticed the following build errors:
[auto build test ERROR on 24172e0d79900908cf5ebf366600616d29c9b417]
url: https://github.com/intel-lab-lkp/linux/commits/Samuel-Holland/mm-ptdump-replace-READ_ONCE-with-standard-page-table-accessors/20251113-095117
base: 24172e0d79900908cf5ebf366600616d29c9b417
patch link: https://lore.kernel.org/r/20251113014656.2605447-9-samuel.holland%40sifive.com
patch subject: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511131448.ZCsuBlBE-lkp@intel.com/
All errors (new ones prefixed by >>):
   mm/gup.c: In function 'gup_fast_pte_range':
>> mm/gup.c:2848:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
    2848 |         set_pmd(&pmd, pmd);
         |         ^~~~~~~
         |         set_p4d
--
   mm/pgtable-generic.c: In function '___pte_offset_map':
>> mm/pgtable-generic.c:303:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
     303 |         set_pmd(&pmdval, pmdval);
         |         ^~~~~~~
         |         set_p4d
vim +2848 mm/gup.c

  2819
  2820  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
  2821  /*
  2822   * GUP-fast relies on pte change detection to avoid concurrent pgtable
  2823   * operations.
  2824   *
  2825   * To pin the page, GUP-fast needs to do below in order:
  2826   * (1) pin the page (by prefetching pte), then (2) check pte not changed.
  2827   *
  2828   * For the rest of pgtable operations where pgtable updates can be racy
  2829   * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
  2830   * is pinned.
  2831   *
  2832   * Above will work for all pte-level operations, including THP split.
  2833   *
  2834   * For THP collapse, it's a bit more complicated because GUP-fast may be
  2835   * walking a pgtable page that is being freed (pte is still valid but pmd
  2836   * can be cleared already). To avoid race in such condition, we need to
  2837   * also check pmd here to make sure pmd doesn't change (corresponds to
  2838   * pmdp_collapse_flush() in the THP collapse code path).
  2839   */
  2840  static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
  2841                  unsigned long end, unsigned int flags, struct page **pages,
  2842                  int *nr)
  2843  {
  2844          int ret = 0;
  2845          pte_t *ptep, *ptem;
  2846
  2847          /* transform pmd as if &pmd pointed to a hardware page table */
> 2848          set_pmd(&pmd, pmd);
  2849          ptem = ptep = pte_offset_map(&pmd, addr);
  2850          pmd = pmdp_get(&pmd);
  2851          if (!ptep)
  2852                  return 0;
  2853          do {
  2854                  pte_t pte = ptep_get_lockless(ptep);
  2855                  struct page *page;
  2856                  struct folio *folio;
  2857
  2858                  /*
  2859                   * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
  2860                   * pte_access_permitted() better should reject these pages
  2861                   * either way: otherwise, GUP-fast might succeed in
  2862                   * cases where ordinary GUP would fail due to VMA access
  2863                   * permissions.
  2864                   */
  2865                  if (pte_protnone(pte))
  2866                          goto pte_unmap;
  2867
  2868                  if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  2869                          goto pte_unmap;
  2870
  2871                  if (pte_special(pte))
  2872                          goto pte_unmap;
  2873
  2874                  /* If it's not marked as special it must have a valid memmap. */
  2875                  VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
  2876                  page = pte_page(pte);
  2877
  2878                  folio = try_grab_folio_fast(page, 1, flags);
  2879                  if (!folio)
  2880                          goto pte_unmap;
  2881
  2882                  if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get(pmdp))) ||
  2883                      unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
  2884                          gup_put_folio(folio, 1, flags);
  2885                          goto pte_unmap;
  2886                  }
  2887
  2888                  if (!gup_fast_folio_allowed(folio, flags)) {
  2889                          gup_put_folio(folio, 1, flags);
  2890                          goto pte_unmap;
  2891                  }
  2892
  2893                  if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
  2894                          gup_put_folio(folio, 1, flags);
  2895                          goto pte_unmap;
  2896                  }
  2897
  2898                  /*
  2899                   * We need to make the page accessible if and only if we are
  2900                   * going to access its content (the FOLL_PIN case). Please
  2901                   * see Documentation/core-api/pin_user_pages.rst for
  2902                   * details.
  2903                   */
  2904                  if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
  2905                          gup_put_folio(folio, 1, flags);
  2906                          goto pte_unmap;
  2907                  }
  2908                  folio_set_referenced(folio);
  2909                  pages[*nr] = page;
  2910                  (*nr)++;
  2911          } while (ptep++, addr += PAGE_SIZE, addr != end);
  2912
  2913          ret = 1;
  2914
  2915  pte_unmap:
  2916          pte_unmap(ptem);
  2917          return ret;
  2918  }
  2919  #else
  2920
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki