[v3] riscv: Memory type control for platforms with physical memory aliases

[PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Samuel Holland 2 months, 4 weeks ago

Currently, some functions such as pte_offset_map() are passed both
pointers to hardware page tables, and pointers to previously-read PMD
entries on the stack. To ensure correctness in the first case, these
functions must use the page table accessor function (pmdp_get()) to
dereference the supplied pointer. However, this means pmdp_get() is
called twice in the second case. This double call must be avoided if
pmdp_get() applies some non-idempotent transformation to the value.

Avoid the double transformation by calling set_pmd() on the stack
variables where necessary to keep set_pmd()/pmdp_get() calls balanced.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---

(no changes since v2)

Changes in v2:
 - New patch for v2

 kernel/events/core.c  | 2 ++
 mm/gup.c              | 3 +++
 mm/khugepaged.c       | 6 ++++--
 mm/page_table_check.c | 3 +++
 mm/pgtable-generic.c  | 2 ++
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa4f9165bd94..7969b060bf2d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8154,6 +8154,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 	if (pmd_leaf(pmd))
 		return pmd_leaf_size(pmd);
 
+	/* transform pmd as if &pmd pointed to a hardware page table */
+	set_pmd(&pmd, pmd);
 	ptep = pte_offset_map(&pmd, addr);
 	if (!ptep)
 		goto again;
diff --git a/mm/gup.c b/mm/gup.c
index 549f9e868311..aba61704049e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2844,7 +2844,10 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 	int ret = 0;
 	pte_t *ptep, *ptem;
 
+	/* transform pmd as if &pmd pointed to a hardware page table */
+	set_pmd(&pmd, pmd);
 	ptem = ptep = pte_offset_map(&pmd, addr);
+	pmd = pmdp_get(&pmd);
 	if (!ptep)
 		return 0;
 	do {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1bff8ade751a..ab1f68a7bc83 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1724,7 +1724,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		struct mmu_notifier_range range;
 		struct mm_struct *mm;
 		unsigned long addr;
-		pmd_t *pmd, pgt_pmd;
+		pmd_t *pmd, pgt_pmd, pmdval;
 		spinlock_t *pml;
 		spinlock_t *ptl;
 		bool success = false;
@@ -1777,7 +1777,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (check_pmd_state(pmd) != SCAN_SUCCEED)
 			goto drop_pml;
-		ptl = pte_lockptr(mm, pmd);
+		/* pte_lockptr() needs a value, not a pointer to a page table */
+		pmdval = pmdp_get(pmd);
+		ptl = pte_lockptr(mm, &pmdval);
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 31f4c39d20ef..77d6688db0de 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -260,7 +260,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
 		return;
 
 	if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+		/* transform pmd as if &pmd pointed to a hardware page table */
+		set_pmd(&pmd, pmd);
 		pte_t *ptep = pte_offset_map(&pmd, addr);
+		pmd = pmdp_get(&pmd);
 		unsigned long i;
 
 		if (WARN_ON(!ptep))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 63a573306bfa..6602deb002f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -299,6 +299,8 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 		pmd_clear_bad(pmd);
 		goto nomap;
 	}
+	/* transform pmdval as if &pmdval pointed to a hardware page table */
+	set_pmd(&pmdval, pmdval);
 	return __pte_map(&pmdval, addr);
 nomap:
 	rcu_read_unlock();
-- 
2.47.2

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Ryan Roberts 2 months, 1 week ago

On 13/11/2025 01:45, Samuel Holland wrote:
> Currently, some functions such as pte_offset_map() are passed both
> pointers to hardware page tables, and pointers to previously-read PMD
> entries on the stack. To ensure correctness in the first case, these
> functions must use the page table accessor function (pmdp_get()) to
> dereference the supplied pointer. However, this means pmdp_get() is
> called twice in the second case. This double call must be avoided if
> pmdp_get() applies some non-idempotent transformation to the value.
> 
> Avoid the double transformation by calling set_pmd() on the stack
> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.

I don't think this is a good solution.

arm64, at least, expects and requires that only pointers to entries in pgtables
are passed to the arch helpers (e.g. set_pte(), ptep_get(), etc). For PTEs,
arm64 accesses adjacent entries within the page table to manage contiguous
mappings. If it is passed a pointer to a stack variable, it may erroneously
access other stuff on the stack thinking it is an entry in a page table.

I think we should formalize this as a clear requirement for all these functions;
all pte/pmd/pud/p4d/pgd pointers passed to the arch pgtable helpers must always
point to entries in pgtables.

arm64 will very likely take advantage of this in future in the pmd/pud/...
helpers as it does today for the pte level. But even today, arm64's set_pmd()
will emit barriers which are totally unnecessary when operating on a stack
variable that the HW PTW will never see.

Thanks,
Ryan

> 
> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> ---
> 
> (no changes since v2)
> 
> Changes in v2:
>  - New patch for v2
> 
>  kernel/events/core.c  | 2 ++
>  mm/gup.c              | 3 +++
>  mm/khugepaged.c       | 6 ++++--
>  mm/page_table_check.c | 3 +++
>  mm/pgtable-generic.c  | 2 ++
>  5 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index fa4f9165bd94..7969b060bf2d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8154,6 +8154,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
>  	if (pmd_leaf(pmd))
>  		return pmd_leaf_size(pmd);
>  
> +	/* transform pmd as if &pmd pointed to a hardware page table */
> +	set_pmd(&pmd, pmd);
>  	ptep = pte_offset_map(&pmd, addr);
>  	if (!ptep)
>  		goto again;
> diff --git a/mm/gup.c b/mm/gup.c
> index 549f9e868311..aba61704049e 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2844,7 +2844,10 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
>  	int ret = 0;
>  	pte_t *ptep, *ptem;
>  
> +	/* transform pmd as if &pmd pointed to a hardware page table */
> +	set_pmd(&pmd, pmd);
>  	ptem = ptep = pte_offset_map(&pmd, addr);
> +	pmd = pmdp_get(&pmd);
>  	if (!ptep)
>  		return 0;
>  	do {
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 1bff8ade751a..ab1f68a7bc83 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1724,7 +1724,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		struct mmu_notifier_range range;
>  		struct mm_struct *mm;
>  		unsigned long addr;
> -		pmd_t *pmd, pgt_pmd;
> +		pmd_t *pmd, pgt_pmd, pmdval;
>  		spinlock_t *pml;
>  		spinlock_t *ptl;
>  		bool success = false;
> @@ -1777,7 +1777,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		 */
>  		if (check_pmd_state(pmd) != SCAN_SUCCEED)
>  			goto drop_pml;
> -		ptl = pte_lockptr(mm, pmd);
> +		/* pte_lockptr() needs a value, not a pointer to a page table */
> +		pmdval = pmdp_get(pmd);
> +		ptl = pte_lockptr(mm, &pmdval);
>  		if (ptl != pml)
>  			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>  
> diff --git a/mm/page_table_check.c b/mm/page_table_check.c
> index 31f4c39d20ef..77d6688db0de 100644
> --- a/mm/page_table_check.c
> +++ b/mm/page_table_check.c
> @@ -260,7 +260,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
>  		return;
>  
>  	if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
> +		/* transform pmd as if &pmd pointed to a hardware page table */
> +		set_pmd(&pmd, pmd);
>  		pte_t *ptep = pte_offset_map(&pmd, addr);
> +		pmd = pmdp_get(&pmd);
>  		unsigned long i;
>  
>  		if (WARN_ON(!ptep))
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 63a573306bfa..6602deb002f1 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -299,6 +299,8 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>  		pmd_clear_bad(pmd);
>  		goto nomap;
>  	}
> +	/* transform pmdval as if &pmdval pointed to a hardware page table */
> +	set_pmd(&pmdval, pmdval);
>  	return __pte_map(&pmdval, addr);
>  nomap:
>  	rcu_read_unlock();

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by David Hildenbrand (Red Hat) 2 months, 1 week ago

On 11/27/25 17:57, Ryan Roberts wrote:
> On 13/11/2025 01:45, Samuel Holland wrote:
>> Currently, some functions such as pte_offset_map() are passed both
>> pointers to hardware page tables, and pointers to previously-read PMD
>> entries on the stack. To ensure correctness in the first case, these
>> functions must use the page table accessor function (pmdp_get()) to
>> dereference the supplied pointer. However, this means pmdp_get() is
>> called twice in the second case. This double call must be avoided if
>> pmdp_get() applies some non-idempotent transformation to the value.
>>
>> Avoid the double transformation by calling set_pmd() on the stack
>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
> 
> I don't think this is a good solution.

Agreed,

	set_pmd(&pmd, pmd);

is rather horrible.

-- 
Cheers

David

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Samuel Holland 2 months ago

On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
> On 11/27/25 17:57, Ryan Roberts wrote:
>> On 13/11/2025 01:45, Samuel Holland wrote:
>>> Currently, some functions such as pte_offset_map() are passed both
>>> pointers to hardware page tables, and pointers to previously-read PMD
>>> entries on the stack. To ensure correctness in the first case, these
>>> functions must use the page table accessor function (pmdp_get()) to
>>> dereference the supplied pointer. However, this means pmdp_get() is
>>> called twice in the second case. This double call must be avoided if
>>> pmdp_get() applies some non-idempotent transformation to the value.
>>>
>>> Avoid the double transformation by calling set_pmd() on the stack
>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
>>
>> I don't think this is a good solution.
> 
> Agreed,
> 
>     set_pmd(&pmd, pmd);
> 
> is rather horrible.
I agree that this patch is ugly. The only way I see to avoid code like this is
to refactor (or duplicate) the functions so no function takes pointers to both
hardware page tables and on-stack page table entries. Is that sort of
refactoring the right direction to go for v4?

Regards,
Samuel

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Ryan Roberts 2 months ago

On 11/12/2025 00:33, Samuel Holland wrote:
> On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
>> On 11/27/25 17:57, Ryan Roberts wrote:
>>> On 13/11/2025 01:45, Samuel Holland wrote:
>>>> Currently, some functions such as pte_offset_map() are passed both
>>>> pointers to hardware page tables, and pointers to previously-read PMD
>>>> entries on the stack. To ensure correctness in the first case, these
>>>> functions must use the page table accessor function (pmdp_get()) to
>>>> dereference the supplied pointer. However, this means pmdp_get() is
>>>> called twice in the second case. This double call must be avoided if
>>>> pmdp_get() applies some non-idempotent transformation to the value.
>>>>
>>>> Avoid the double transformation by calling set_pmd() on the stack
>>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
>>>
>>> I don't think this is a good solution.
>>
>> Agreed,
>>
>>     set_pmd(&pmd, pmd);
>>
>> is rather horrible.
> I agree that this patch is ugly. The only way I see to avoid code like this is
> to refactor (or duplicate) the functions so no function takes pointers to both
> hardware page tables and on-stack page table entries. Is that sort of
> refactoring the right direction to go for v4?

From a quick look at the code, I think that some cases are solvable by
refactoring to pass the value instead of the pointer, and leave it to the higher
level decide how to read the value from the pointer - it knows if it is pointing
to HW pgtable or if it's a (e.g) stack value.

But the more I look at the code, the more instances I find where pointers to
stack variables are being passed to arch pgtable helpers as if they are HW
pgtable entry pointers. (Mainly pmd level).

I wonder if we need to bite the bullet and explicitly separate the types? At
each level, we have:

 1. page table entry value
 2. pointer to page table entry _value_ (e.g. pointer to pXX_t on stack)
 3. pointer to page table entry in HW pgtable

Today, 1 is represented by pte_t, pmd_t, etc. 2 and 3 are represented by the
same type; pte_t*, pmd_t*, etc.

If we create a new type for 3, it will both document and enforce when type 2 or
type 3 is required.

e.g:

// pte_t: defined by arch.
typedef unsigned long pte_t;

// ptep_t: new opaque type that can't be dereferenced.
struct __ptep_t;
typedef struct __ptep_t *ptep_t;

// getter/setter responsible for cast & deref as appropriate.
pte_t ptep_get(ptep_t ptep)
{
	return READ_ONCE(*(pte_t *)ptep);
}

int do_stuff(void)
{
	// value on stack: ok
	pte_t mypte;

	// pointer to value on stack: ok
	pte_t *pmypte = &mypte;

	// handle to entry on stack: not allowed by compiler!
	ptep_t myptep = &mypte;

	// handle to entry in pgtable: ok
	ptep_t myptep = pte_offset_kernel(...);

	// read value of pgtable entry: ok
	pte_t val = ptep_get(myptep);

	// attempt to pass pointer to stack variable: not allowed by compiler!
	pte_t val = ptep_get(&mypte);

	// attempt to directly dereference ptep: not allowed by compiler!
	pte_t val = *myptep;
}

We could do this incrementally by initially typedefing ptep_t to be:
typedef pte_t *ptep_t;

Then we could flip the switch arch-by-arch to enable the stronger checking. We
likely wouldn't need to convert arches that don't care.

I think by doing this, it will expose all the current issues and force us to fix
them properly.

On the related subject of conversion to pXXp_get(); I've been looking into this
and personally, I think we should have 2 helper flavours at each level:

 - pXXd_get()      optimizable by compiler; defaults to C dereference
 - pXXd_get_once() single-copy-atomic and unmovable by compiler

It simplifies the converstion process, and reduces the risk of bugs
significantly (go read about the arm32 issues discussed in Anshuman's series if
you haven't done already).

I appreciate this is all probably a lot more work than you would prefer to sign
up for, I'd be happy to collaborate if we get concensus that this approach makes
sense. What do you think?

Thanks,
Ryan

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by David Hildenbrand (Red Hat) 1 month, 3 weeks ago

On 12/11/25 14:59, Ryan Roberts wrote:
> On 11/12/2025 00:33, Samuel Holland wrote:
>> On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
>>> On 11/27/25 17:57, Ryan Roberts wrote:
>>>> On 13/11/2025 01:45, Samuel Holland wrote:
>>>>> Currently, some functions such as pte_offset_map() are passed both
>>>>> pointers to hardware page tables, and pointers to previously-read PMD
>>>>> entries on the stack. To ensure correctness in the first case, these
>>>>> functions must use the page table accessor function (pmdp_get()) to
>>>>> dereference the supplied pointer. However, this means pmdp_get() is
>>>>> called twice in the second case. This double call must be avoided if
>>>>> pmdp_get() applies some non-idempotent transformation to the value.
>>>>>
>>>>> Avoid the double transformation by calling set_pmd() on the stack
>>>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
>>>>
>>>> I don't think this is a good solution.
>>>
>>> Agreed,
>>>
>>>      set_pmd(&pmd, pmd);
>>>
>>> is rather horrible.
>> I agree that this patch is ugly. The only way I see to avoid code like this is
>> to refactor (or duplicate) the functions so no function takes pointers to both
>> hardware page tables and on-stack page table entries. Is that sort of
>> refactoring the right direction to go for v4?
> 
>  From a quick look at the code, I think that some cases are solvable by
> refactoring to pass the value instead of the pointer, and leave it to the higher
> level decide how to read the value from the pointer - it knows if it is pointing
> to HW pgtable or if it's a (e.g) stack value.
> 
> But the more I look at the code, the more instances I find where pointers to
> stack variables are being passed to arch pgtable helpers as if they are HW
> pgtable entry pointers. (Mainly pmd level).
> 
> I wonder if we need to bite the bullet and explicitly separate the types? At
> each level, we have:
> 
>   1. page table entry value
>   2. pointer to page table entry _value_ (e.g. pointer to pXX_t on stack)
>   3. pointer to page table entry in HW pgtable
> 
> Today, 1 is represented by pte_t, pmd_t, etc. 2 and 3 are represented by the
> same type; pte_t*, pmd_t*, etc.
> 
> If we create a new type for 3, it will both document and enforce when type 2 or
> type 3 is required.
> 
> e.g:
> 
> // pte_t: defined by arch.
> typedef unsigned long pte_t;
> 
> // ptep_t: new opaque type that can't be dereferenced.
> struct __ptep_t;
> typedef struct __ptep_t *ptep_t;

This is what I had in mind when we last discussed this topic and I 
suggested a way forward to not play whack-a-mole with new users that do 
*ptep showing up.

Agreed that we ideally indicate that this is a HW PTE pointer that must 
be de-referenced through ptep_get() or similar. (maybe we'd find a new 
name for this set of functions).

I talked to Samuel at LPC, pointing him at the way XEN-pv implemented 
support for changing PFNs in ptes. We might not need all of that rework 
to move forward with the risc-v change.

Having that said, I also agree that this would be a cleanup worth having 
(which will result in quite a bit of churn :) ).

So if someone wants to bump up the patch count, speak now.

-- 
Cheers

David

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Lorenzo Stoakes 1 month, 3 weeks ago

On Thu, Dec 11, 2025 at 01:59:53PM +0000, Ryan Roberts wrote:
> On 11/12/2025 00:33, Samuel Holland wrote:
> > On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
> >> On 11/27/25 17:57, Ryan Roberts wrote:
> >>> On 13/11/2025 01:45, Samuel Holland wrote:
> >>>> Currently, some functions such as pte_offset_map() are passed both
> >>>> pointers to hardware page tables, and pointers to previously-read PMD
> >>>> entries on the stack. To ensure correctness in the first case, these
> >>>> functions must use the page table accessor function (pmdp_get()) to
> >>>> dereference the supplied pointer. However, this means pmdp_get() is
> >>>> called twice in the second case. This double call must be avoided if
> >>>> pmdp_get() applies some non-idempotent transformation to the value.
> >>>>
> >>>> Avoid the double transformation by calling set_pmd() on the stack
> >>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
> >>>
> >>> I don't think this is a good solution.
> >>
> >> Agreed,
> >>
> >>     set_pmd(&pmd, pmd);
> >>
> >> is rather horrible.
> > I agree that this patch is ugly. The only way I see to avoid code like this is
> > to refactor (or duplicate) the functions so no function takes pointers to both
> > hardware page tables and on-stack page table entries. Is that sort of
> > refactoring the right direction to go for v4?
>
> From a quick look at the code, I think that some cases are solvable by
> refactoring to pass the value instead of the pointer, and leave it to the higher
> level decide how to read the value from the pointer - it knows if it is pointing
> to HW pgtable or if it's a (e.g) stack value.
>
> But the more I look at the code, the more instances I find where pointers to
> stack variables are being passed to arch pgtable helpers as if they are HW
> pgtable entry pointers. (Mainly pmd level).

Ugh. We do need to nip this in the bud I think!

>
> I wonder if we need to bite the bullet and explicitly separate the types? At
> each level, we have:
>
>  1. page table entry value
>  2. pointer to page table entry _value_ (e.g. pointer to pXX_t on stack)
>  3. pointer to page table entry in HW pgtable
>
> Today, 1 is represented by pte_t, pmd_t, etc. 2 and 3 are represented by the
> same type; pte_t*, pmd_t*, etc.
>
> If we create a new type for 3, it will both document and enforce when type 2 or
> type 3 is required.
>
> e.g:
>
> // pte_t: defined by arch.
> typedef unsigned long pte_t;
>
> // ptep_t: new opaque type that can't be dereferenced.
> struct __ptep_t;
> typedef struct __ptep_t *ptep_t;
>
> // getter/setter responsible for cast & deref as appropriate.
> pte_t ptep_get(ptep_t ptep)
> {
> 	return READ_ONCE(*(pte_t *)ptep);
> }

I think we've got ourselves in a jumble with pte vs ptep vs pteval
vs. etc. etc. and we aren't always consistent with it.

So I think we ought to put 'hw' somewhere in the name.

And we can also now get away from the overly abbreviated names given we have the
chance :)

So something like 'pte_hw_t' perhaps?

I like the general concept, though I think it's kinda gross to hide the fact
that it's a pointer in the typedef _that_ directly.

So perhaps be less horrid if it was something like:

typedef struct {
	pte_t *ptr;
} pte_hw_t;

Perhaps?

Then could have:

pte_t hw_pte_get(pte_hw_t pte_hw)
{
	return READ_ONCE(*pte_hw.ptr);
}

>
> int do_stuff(void)
> {
> 	// value on stack: ok
> 	pte_t mypte;
>
> 	// pointer to value on stack: ok
> 	pte_t *pmypte = &mypte;
>
> 	// handle to entry on stack: not allowed by compiler!
> 	ptep_t myptep = &mypte;
>
> 	// handle to entry in pgtable: ok
> 	ptep_t myptep = pte_offset_kernel(...);
>
> 	// read value of pgtable entry: ok
> 	pte_t val = ptep_get(myptep);
>
> 	// attempt to pass pointer to stack variable: not allowed by compiler!
> 	pte_t val = ptep_get(&mypte);
>
> 	// attempt to directly dereference ptep: not allowed by compiler!
> 	pte_t val = *myptep;
> }
>
>
> We could do this incrementally by initially typedefing ptep_t to be:
> typedef pte_t *ptep_t;

Hm yeah still hate the idea of typedef'ing this as a ptr so directly.

Obviously this would make an incremental thing a little harder, but could have
this stuff on top of existing logic and make it incremental by changing logic
bit-by-bit?

>
> Then we could flip the switch arch-by-arch to enable the stronger checking. We
> likely wouldn't need to convert arches that don't care.
>
> I think by doing this, it will expose all the current issues and force us to fix
> them properly.

Yeah this is nice.

>
> On the related subject of conversion to pXXp_get(); I've been looking into this
> and personally, I think we should have 2 helper flavours at each level:
>
>  - pXXd_get()      optimizable by compiler; defaults to C dereference
>  - pXXd_get_once() single-copy-atomic and unmovable by compiler

Yes!

There is some _real_ confusion about when and when we don't need to do this.

Though wouldn't having a typdef separating out a hw entry imply that hw entry ->
get once, sw entry -> get?

>
> It simplifies the converstion process, and reduces the risk of bugs
> significantly (go read about the arm32 issues discussed in Anshuman's series if
> you haven't done already).

Though I haven't read this series so can't remember actually if there were cases
where even with hw entries we wanted to sometimes READ_ONCE() and sometimes not?

>
> I appreciate this is all probably a lot more work than you would prefer to sign
> up for, I'd be happy to collaborate if we get concensus that this approach makes
> sense. What do you think?
>
> Thanks,
> Ryan
>

Thanks for this concept, overall very much in agreement!

Cheers, Lorenzo

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Ryan Roberts 1 month, 3 weeks ago

On 16/12/2025 10:29, Lorenzo Stoakes wrote:
> On Thu, Dec 11, 2025 at 01:59:53PM +0000, Ryan Roberts wrote:
>> On 11/12/2025 00:33, Samuel Holland wrote:
>>> On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
>>>> On 11/27/25 17:57, Ryan Roberts wrote:
>>>>> On 13/11/2025 01:45, Samuel Holland wrote:
>>>>>> Currently, some functions such as pte_offset_map() are passed both
>>>>>> pointers to hardware page tables, and pointers to previously-read PMD
>>>>>> entries on the stack. To ensure correctness in the first case, these
>>>>>> functions must use the page table accessor function (pmdp_get()) to
>>>>>> dereference the supplied pointer. However, this means pmdp_get() is
>>>>>> called twice in the second case. This double call must be avoided if
>>>>>> pmdp_get() applies some non-idempotent transformation to the value.
>>>>>>
>>>>>> Avoid the double transformation by calling set_pmd() on the stack
>>>>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
>>>>>
>>>>> I don't think this is a good solution.
>>>>
>>>> Agreed,
>>>>
>>>>     set_pmd(&pmd, pmd);
>>>>
>>>> is rather horrible.
>>> I agree that this patch is ugly. The only way I see to avoid code like this is
>>> to refactor (or duplicate) the functions so no function takes pointers to both
>>> hardware page tables and on-stack page table entries. Is that sort of
>>> refactoring the right direction to go for v4?
>>
>> From a quick look at the code, I think that some cases are solvable by
>> refactoring to pass the value instead of the pointer, and leave it to the higher
>> level decide how to read the value from the pointer - it knows if it is pointing
>> to HW pgtable or if it's a (e.g) stack value.
>>
>> But the more I look at the code, the more instances I find where pointers to
>> stack variables are being passed to arch pgtable helpers as if they are HW
>> pgtable entry pointers. (Mainly pmd level).
> 
> Ugh. We do need to nip this in the bud I think!
> 
>>
>> I wonder if we need to bite the bullet and explicitly separate the types? At
>> each level, we have:
>>
>>  1. page table entry value
>>  2. pointer to page table entry _value_ (e.g. pointer to pXX_t on stack)
>>  3. pointer to page table entry in HW pgtable
>>
>> Today, 1 is represented by pte_t, pmd_t, etc. 2 and 3 are represented by the
>> same type; pte_t*, pmd_t*, etc.
>>
>> If we create a new type for 3, it will both document and enforce when type 2 or
>> type 3 is required.
>>
>> e.g:
>>
>> // pte_t: defined by arch.
>> typedef unsigned long pte_t;
>>
>> // ptep_t: new opaque type that can't be dereferenced.
>> struct __ptep_t;
>> typedef struct __ptep_t *ptep_t;
>>
>> // getter/setter responsible for cast & deref as appropriate.
>> pte_t ptep_get(ptep_t ptep)
>> {
>> 	return READ_ONCE(*(pte_t *)ptep);
>> }
> 
> I think we've got ourselves in a jumble with pte vs ptep vs pteval
> vs. etc. etc. and we aren't always consistent with it.
> 
> So I think we ought to put 'hw' somewhere in the name.

Yeah that would probably give the best end result. I was thinking that ptep/pXdp
might reduce churn since we already use it in function names. Stack pointers are
definitely a small minority so easier to change their convention.

But having an explicit "hw" would probably be best.

> 
> And we can also now get away from the overly abbreviated names given we have the
> chance :)
> 
> So something like 'pte_hw_t' perhaps?

sounds good. Or hw_pte_t? Then "pte_t" is still greppable and it better matches
the hw_pte_get() that you have below.

> 
> I like the general concept, though I think it's kinda gross to hide the fact
> that it's a pointer in the typedef _that_ directly.

My aim was to have a type that you can't dereference, can't accidentally pass as
a pte_t* and also can't increment/decrement/index.

This gives us a framework where the architecture can completely virtualize the
pgtable if it likes; and the compiler can enforce that higher level code can't
accidentally work around it.

I have a couple of use cases where these properties will come in handy; arm64
now defines 128-bit pgtables. We have a prototype adding it to the kernel, but
at the moment it has to be a compile time descision so that all the generic code
knows the size of the entries. It would be preferable to make it a boot time
decision, based on the HW capabilities (or cmdline, ...). But for that, we need
to better insulate the physical pgtables from the core-mm. This concept solves that.

> 
> So perhaps be less horrid if it was something like:
> 
> typedef struct {
> 	pte_t *ptr;
> } pte_hw_t;
> 
> Perhaps?

I _think_ that works for the above, and it is certainly nicer.

pte_hw_t pte_table = pte_alloc(...);
for (i = 0; i < PTRS_PER_PTE; i++)
	hw_pte_get(pte_table[i]);

I think that would fail to compile, right? Which is what we want. Instead we
need something like:

pte_hw_t hwpte = pte_alloc(...);
for (i = 0; i < PTRS_PER_PTE; i++, hwpte = hw_pte_next(hwpte))
	hw_pte_get(hwpte);

> 
> Then could have:
> 
> pte_t hw_pte_get(pte_hw_t pte_hw)
> {
> 	return READ_ONCE(*pte_hw.ptr);
> }
> 
>>
>> int do_stuff(void)
>> {
>> 	// value on stack: ok
>> 	pte_t mypte;
>>
>> 	// pointer to value on stack: ok
>> 	pte_t *pmypte = &mypte;
>>
>> 	// handle to entry on stack: not allowed by compiler!
>> 	ptep_t myptep = &mypte;
>>
>> 	// handle to entry in pgtable: ok
>> 	ptep_t myptep = pte_offset_kernel(...);
>>
>> 	// read value of pgtable entry: ok
>> 	pte_t val = ptep_get(myptep);
>>
>> 	// attempt to pass pointer to stack variable: not allowed by compiler!
>> 	pte_t val = ptep_get(&mypte);
>>
>> 	// attempt to directly dereference ptep: not allowed by compiler!
>> 	pte_t val = *myptep;
>> }
>>
>>
>> We could do this incrementally by initially typedefing ptep_t to be:
>> typedef pte_t *ptep_t;
> 
> Hm yeah still hate the idea of typedef'ing this as a ptr so directly.

What's the problem with it?

> 
> Obviously this would make an incremental thing a little harder, but could have
> this stuff on top of existing logic and make it incremental by changing logic
> bit-by-bit?

I'm not sure... sounds to me like we would need to convert absolutely everything
including arches that are difficult for most people to test. With the "typedef
pte_t *hw_pte_t;" approach, you can go bit-by-bit and leave those unloved arches
alone entirely.

> 
>>
>> Then we could flip the switch arch-by-arch to enable the stronger checking. We
>> likely wouldn't need to convert arches that don't care.
>>
>> I think by doing this, it will expose all the current issues and force us to fix
>> them properly.
> 
> Yeah this is nice.
> 
>>
>> On the related subject of conversion to pXXp_get(); I've been looking into this
>> and personally, I think we should have 2 helper flavours at each level:
>>
>>  - pXXd_get()      optimizable by compiler; defaults to C dereference
>>  - pXXd_get_once() single-copy-atomic and unmovable by compiler
> 
> Yes!
> 
> There is some _real_ confusion about when and when we don't need to do this.

I think the requirements are basically this:

"_ONCE" gives 2 properties; single-copy-atomicity and compiler can't move/remove
instructions.

SCA is important to prevent tearing if the HW can update the PTE concurrently
with SW reading it (i.e. HW access/dirty) or if SW is reading without holding
the PTL. But there are times when it doesn't matter if the PTE gets torn because
you are only comparing the result using pte_none() or pte_same().

The "compiler can't move/remove" property (I'm sure there is a formal name for
this?) is important for lockless walkers. Probably you could achieve similar
results by using barrier()?

> 
> Though wouldn't having a typdef separating out a hw entry imply that hw entry ->
> get once, sw entry -> get?

No; you should never use the accessor methods for sw entries - they are just
variables - read them normally. If all HW accesses use "once", then the compiler
can't optimize away the accesses for the folded pgtable case.

> 
>>
>> It simplifies the converstion process, and reduces the risk of bugs
>> significantly (go read about the arm32 issues discussed in Anshuman's series if
>> you haven't done already).
> 
> Though I haven't read this series so can't remember actually if there were cases
> where even with hw entries we wanted to sometimes READ_ONCE() and sometimes not?

Yes exactly that.

> 
>>
>> I appreciate this is all probably a lot more work than you would prefer to sign
>> up for, I'd be happy to collaborate if we get concensus that this approach makes
>> sense. What do you think?
>>
>> Thanks,
>> Ryan
>>
> 
> Thanks for this concept, overall very much in agreement!

I think it would work but it's going to be a lot of churn. But if you think it's
worth it and we'll be able to get it in, then I'm up for working on it in the
background.

> 
> Cheers, Lorenzo

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by Lorenzo Stoakes 1 month, 3 weeks ago

On Tue, Dec 16, 2025 at 05:46:57PM +0000, Ryan Roberts wrote:
> On 16/12/2025 10:29, Lorenzo Stoakes wrote:
> > On Thu, Dec 11, 2025 at 01:59:53PM +0000, Ryan Roberts wrote:
> >> On 11/12/2025 00:33, Samuel Holland wrote:
> >>> On 2025-11-28 2:47 AM, David Hildenbrand (Red Hat) wrote:
> >>>> On 11/27/25 17:57, Ryan Roberts wrote:
> >>>>> On 13/11/2025 01:45, Samuel Holland wrote:
> >>>>>> Currently, some functions such as pte_offset_map() are passed both
> >>>>>> pointers to hardware page tables, and pointers to previously-read PMD
> >>>>>> entries on the stack. To ensure correctness in the first case, these
> >>>>>> functions must use the page table accessor function (pmdp_get()) to
> >>>>>> dereference the supplied pointer. However, this means pmdp_get() is
> >>>>>> called twice in the second case. This double call must be avoided if
> >>>>>> pmdp_get() applies some non-idempotent transformation to the value.
> >>>>>>
> >>>>>> Avoid the double transformation by calling set_pmd() on the stack
> >>>>>> variables where necessary to keep set_pmd()/pmdp_get() calls balanced.
> >>>>>
> >>>>> I don't think this is a good solution.
> >>>>
> >>>> Agreed,
> >>>>
> >>>>     set_pmd(&pmd, pmd);
> >>>>
> >>>> is rather horrible.
> >>> I agree that this patch is ugly. The only way I see to avoid code like this is
> >>> to refactor (or duplicate) the functions so no function takes pointers to both
> >>> hardware page tables and on-stack page table entries. Is that sort of
> >>> refactoring the right direction to go for v4?
> >>
> >> From a quick look at the code, I think that some cases are solvable by
> >> refactoring to pass the value instead of the pointer, and leave it to the higher
> >> level decide how to read the value from the pointer - it knows if it is pointing
> >> to HW pgtable or if it's a (e.g) stack value.
> >>
> >> But the more I look at the code, the more instances I find where pointers to
> >> stack variables are being passed to arch pgtable helpers as if they are HW
> >> pgtable entry pointers. (Mainly pmd level).
> >
> > Ugh. We do need to nip this in the bud I think!
> >
> >>
> >> I wonder if we need to bite the bullet and explicitly separate the types? At
> >> each level, we have:
> >>
> >>  1. page table entry value
> >>  2. pointer to page table entry _value_ (e.g. pointer to pXX_t on stack)
> >>  3. pointer to page table entry in HW pgtable
> >>
> >> Today, 1 is represented by pte_t, pmd_t, etc. 2 and 3 are represented by the
> >> same type; pte_t*, pmd_t*, etc.
> >>
> >> If we create a new type for 3, it will both document and enforce when type 2 or
> >> type 3 is required.
> >>
> >> e.g:
> >>
> >> // pte_t: defined by arch.
> >> typedef unsigned long pte_t;
> >>
> >> // ptep_t: new opaque type that can't be dereferenced.
> >> struct __ptep_t;
> >> typedef struct __ptep_t *ptep_t;
> >>
> >> // getter/setter responsible for cast & deref as appropriate.
> >> pte_t ptep_get(ptep_t ptep)
> >> {
> >> 	return READ_ONCE(*(pte_t *)ptep);
> >> }
> >
> > I think we've got ourselves in a jumble with pte vs ptep vs pteval
> > vs. etc. etc. and we aren't always consistent with it.
> >
> > So I think we ought to put 'hw' somewhere in the name.
>
> Yeah that would probably give the best end result. I was thinking that ptep/pXdp
> might reduce churn since we already use it in function names. Stack pointers are
> definitely a small minority so easier to change their convention.
>
> But having an explicit "hw" would probably be best.

Yeah, unfortunately I think the existing inconsistencies call for a clean
break.

>
> >
> > And we can also now get away from the overly abbreviated names given we have the
> > chance :)
> >
> > So something like 'pte_hw_t' perhaps?
>
> sounds good. Or hw_pte_t? Then "pte_t" is still greppable and it better matches
> the hw_pte_get() that you have below.

Sure hw_pte_t works!

>
> >
> > I like the general concept, though I think it's kinda gross to hide the fact
> > that it's a pointer in the typedef _that_ directly.
>
> My aim was to have a type that you can't dereference, can't accidentally pass as
> a pte_t* and also can't increment/decrement/index.

Right.

>
> This gives us a framework where the architecture can completely virtualize the
> pgtable if it likes; and the compiler can enforce that higher level code can't
> accidentally work around it.

Yeah it's nice to be able to hide properties like this, I did think the use
of an incomplete type pointer was quite nice there :)

>
> I have a couple of use cases where these properties will come in handy; arm64
> now defines 128-bit pgtables. We have a prototype adding it to the kernel, but
> at the moment it has to be a compile time descision so that all the generic code
> knows the size of the entries. It would be preferable to make it a boot time
> decision, based on the HW capabilities (or cmdline, ...). But for that, we need
> to better insulate the physical pgtables from the core-mm. This concept solves that.

NIce.

>
> >
> > So perhaps be less horrid if it was something like:
> >
> > typedef struct {
> > 	pte_t *ptr;
> > } pte_hw_t;
> >
> > Perhaps?
>
> I _think_ that works for the above, and it is certainly nicer.
>
> pte_hw_t pte_table = pte_alloc(...);
> for (i = 0; i < PTRS_PER_PTE; i++)
> 	hw_pte_get(pte_table[i]);
>
> I think that would fail to compile, right? Which is what we want. Instead we
> need something like:

Yeah that wouldn't work of course.

But wouldn't it also not work if you had an opaque type? As you'd then be
dereferencing it? So it'd be something to solve for either approach right?

(An aside - this brings me back to the whole thing of differentiating
between page _tables_ and _entries_, another thing we do badly at.)

>
> pte_hw_t hwpte = pte_alloc(...);
> for (i = 0; i < PTRS_PER_PTE; i++, hwpte = hw_pte_next(hwpte))
> 	hw_pte_get(hwpte);
>
> >
> > Then could have:
> >
> > pte_t hw_pte_get(pte_hw_t pte_hw)
> > {
> > 	return READ_ONCE(*pte_hw.ptr);
> > }
> >

Yeah could do something like that.

It might actually be nice to abstract the iterator-like behaviour too to
the arch for e.g. your 128-bit entry stuff?

> >>
> >> int do_stuff(void)
> >> {
> >> 	// value on stack: ok
> >> 	pte_t mypte;
> >>
> >> 	// pointer to value on stack: ok
> >> 	pte_t *pmypte = &mypte;
> >>
> >> 	// handle to entry on stack: not allowed by compiler!
> >> 	ptep_t myptep = &mypte;
> >>
> >> 	// handle to entry in pgtable: ok
> >> 	ptep_t myptep = pte_offset_kernel(...);
> >>
> >> 	// read value of pgtable entry: ok
> >> 	pte_t val = ptep_get(myptep);
> >>
> >> 	// attempt to pass pointer to stack variable: not allowed by compiler!
> >> 	pte_t val = ptep_get(&mypte);
> >>
> >> 	// attempt to directly dereference ptep: not allowed by compiler!
> >> 	pte_t val = *myptep;
> >> }
> >>
> >>
> >> We could do this incrementally by initially typedefing ptep_t to be:
> >> typedef pte_t *ptep_t;
> >
> > Hm yeah still hate the idea of typedef'ing this as a ptr so directly.
>
> What's the problem with it?

It's the fact you are making what looks like not-a-pointer into a pointer.

It's even more egregious for the non-opaque default thing of typedef pte_t
*hw_pte_t - as then you truly are just making hw_pte_t not look like a
pointer but behave like one.


>
> >
> > Obviously this would make an incremental thing a little harder, but could have
> > this stuff on top of existing logic and make it incremental by changing logic
> > bit-by-bit?
>
> I'm not sure... sounds to me like we would need to convert absolutely everything
> including arches that are difficult for most people to test. With the "typedef
> pte_t *hw_pte_t;" approach, you can go bit-by-bit and leave those unloved arches
> alone entirely.

Couldn't we have a generic thing for arches that don't yet use the type
potentially? Might be a pain.

>
> >
> >>
> >> Then we could flip the switch arch-by-arch to enable the stronger checking. We
> >> likely wouldn't need to convert arches that don't care.
> >>
> >> I think by doing this, it will expose all the current issues and force us to fix
> >> them properly.
> >
> > Yeah this is nice.
> >
> >>
> >> On the related subject of conversion to pXXp_get(); I've been looking into this
> >> and personally, I think we should have 2 helper flavours at each level:
> >>
> >>  - pXXd_get()      optimizable by compiler; defaults to C dereference
> >>  - pXXd_get_once() single-copy-atomic and unmovable by compiler
> >
> > Yes!
> >
> > There is some _real_ confusion about when and when we don't need to do this.
>
> I think the requirements are basically this:
>
> "_ONCE" gives 2 properties; single-copy-atomicity and compiler can't move/remove
> instructions.
>
> SCA is important to prevent tearing if the HW can update the PTE concurrently
> with SW reading it (i.e. HW access/dirty) or if SW is reading without holding
> the PTL. But there are times when it doesn't matter if the PTE gets torn because
> you are only comparing the result using pte_none() or pte_same().
>
> The "compiler can't move/remove" property (I'm sure there is a formal name for
> this?) is important for lockless walkers. Probably you could achieve similar
> results by using barrier()?

OK makes sense.

>
> >
> > Though wouldn't having a typdef separating out a hw entry imply that hw entry ->
> > get once, sw entry -> get?
>
> No; you should never use the accessor methods for sw entries - they are just
> variables - read them normally. If all HW accesses use "once", then the compiler
> can't optimize away the accesses for the folded pgtable case.

Right, yes makes sense.

>
> >
> >>
> >> It simplifies the converstion process, and reduces the risk of bugs
> >> significantly (go read about the arm32 issues discussed in Anshuman's series if
> >> you haven't done already).
> >
> > Though I haven't read this series so can't remember actually if there were cases
> > where even with hw entries we wanted to sometimes READ_ONCE() and sometimes not?
>
> Yes exactly that.

Yeah I guess based on whether the properties desired match the above or
not.

>
> >
> >>
> >> I appreciate this is all probably a lot more work than you would prefer to sign
> >> up for, I'd be happy to collaborate if we get concensus that this approach makes
> >> sense. What do you think?
> >>
> >> Thanks,
> >> Ryan
> >>
> >
> > Thanks for this concept, overall very much in agreement!
>
> I think it would work but it's going to be a lot of churn. But if you think it's
> worth it and we'll be able to get it in, then I'm up for working on it in the
> background.

Well I am never averse to churn :)

I think at this point we have so much of a confusing mess of things going
on in the page table logic that it makes sense to pay the churn price for this.

>
> >
> > Cheers, Lorenzo
>

Thanks for looking into this,

Cheers, Lorenzo

Re: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent

Posted by kernel test robot 2 months, 4 weeks ago

Hi Samuel,

kernel test robot noticed the following build errors:

[auto build test ERROR on 24172e0d79900908cf5ebf366600616d29c9b417]

url:    https://github.com/intel-lab-lkp/linux/commits/Samuel-Holland/mm-ptdump-replace-READ_ONCE-with-standard-page-table-accessors/20251113-095117
base:   24172e0d79900908cf5ebf366600616d29c9b417
patch link:    https://lore.kernel.org/r/20251113014656.2605447-9-samuel.holland%40sifive.com
patch subject: [PATCH v3 08/22] mm: Allow page table accessors to be non-idempotent
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131448.ZCsuBlBE-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511131448.ZCsuBlBE-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/gup.c: In function 'gup_fast_pte_range':
>> mm/gup.c:2848:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
    2848 |         set_pmd(&pmd, pmd);
         |         ^~~~~~~
         |         set_p4d
--
   mm/pgtable-generic.c: In function '___pte_offset_map':
>> mm/pgtable-generic.c:303:9: error: implicit declaration of function 'set_pmd'; did you mean 'set_p4d'? [-Wimplicit-function-declaration]
     303 |         set_pmd(&pmdval, pmdval);
         |         ^~~~~~~
         |         set_p4d


vim +2848 mm/gup.c

  2819	
  2820	#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
  2821	/*
  2822	 * GUP-fast relies on pte change detection to avoid concurrent pgtable
  2823	 * operations.
  2824	 *
  2825	 * To pin the page, GUP-fast needs to do below in order:
  2826	 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
  2827	 *
  2828	 * For the rest of pgtable operations where pgtable updates can be racy
  2829	 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
  2830	 * is pinned.
  2831	 *
  2832	 * Above will work for all pte-level operations, including THP split.
  2833	 *
  2834	 * For THP collapse, it's a bit more complicated because GUP-fast may be
  2835	 * walking a pgtable page that is being freed (pte is still valid but pmd
  2836	 * can be cleared already).  To avoid race in such condition, we need to
  2837	 * also check pmd here to make sure pmd doesn't change (corresponds to
  2838	 * pmdp_collapse_flush() in the THP collapse code path).
  2839	 */
  2840	static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
  2841			unsigned long end, unsigned int flags, struct page **pages,
  2842			int *nr)
  2843	{
  2844		int ret = 0;
  2845		pte_t *ptep, *ptem;
  2846	
  2847		/* transform pmd as if &pmd pointed to a hardware page table */
> 2848		set_pmd(&pmd, pmd);
  2849		ptem = ptep = pte_offset_map(&pmd, addr);
  2850		pmd = pmdp_get(&pmd);
  2851		if (!ptep)
  2852			return 0;
  2853		do {
  2854			pte_t pte = ptep_get_lockless(ptep);
  2855			struct page *page;
  2856			struct folio *folio;
  2857	
  2858			/*
  2859			 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
  2860			 * pte_access_permitted() better should reject these pages
  2861			 * either way: otherwise, GUP-fast might succeed in
  2862			 * cases where ordinary GUP would fail due to VMA access
  2863			 * permissions.
  2864			 */
  2865			if (pte_protnone(pte))
  2866				goto pte_unmap;
  2867	
  2868			if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  2869				goto pte_unmap;
  2870	
  2871			if (pte_special(pte))
  2872				goto pte_unmap;
  2873	
  2874			/* If it's not marked as special it must have a valid memmap. */
  2875			VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
  2876			page = pte_page(pte);
  2877	
  2878			folio = try_grab_folio_fast(page, 1, flags);
  2879			if (!folio)
  2880				goto pte_unmap;
  2881	
  2882			if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get(pmdp))) ||
  2883			    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
  2884				gup_put_folio(folio, 1, flags);
  2885				goto pte_unmap;
  2886			}
  2887	
  2888			if (!gup_fast_folio_allowed(folio, flags)) {
  2889				gup_put_folio(folio, 1, flags);
  2890				goto pte_unmap;
  2891			}
  2892	
  2893			if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
  2894				gup_put_folio(folio, 1, flags);
  2895				goto pte_unmap;
  2896			}
  2897	
  2898			/*
  2899			 * We need to make the page accessible if and only if we are
  2900			 * going to access its content (the FOLL_PIN case).  Please
  2901			 * see Documentation/core-api/pin_user_pages.rst for
  2902			 * details.
  2903			 */
  2904			if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
  2905				gup_put_folio(folio, 1, flags);
  2906				goto pte_unmap;
  2907			}
  2908			folio_set_referenced(folio);
  2909			pages[*nr] = page;
  2910			(*nr)++;
  2911		} while (ptep++, addr += PAGE_SIZE, addr != end);
  2912	
  2913		ret = 1;
  2914	
  2915	pte_unmap:
  2916		pte_unmap(ptem);
  2917		return ret;
  2918	}
  2919	#else
  2920	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki