[PATCH v2 1/3] arm64: mm: support batch clearing of the young flag for large folios

Baolin Wang posted 3 patches 5 days, 8 hours ago
[PATCH v2 1/3] arm64: mm: support batch clearing of the young flag for large folios
Posted by Baolin Wang 5 days, 8 hours ago
Currently, contpte_ptep_test_and_clear_young() and contpte_ptep_clear_flush_young()
only clear the young flag and flush TLBs for PTEs within the contiguous range.
To support batch PTE operations for other sized large folios in the following
patches, adding a new parameter to specify the number of PTEs.

While we are at it, rename the functions to maintain consistency with other
contpte_*() functions.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 arch/arm64/include/asm/pgtable.h | 12 ++++-----
 arch/arm64/mm/contpte.c          | 44 ++++++++++++++++++++++----------
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0944e296dd4a..e03034683156 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1679,10 +1679,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
 extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
 				unsigned long addr, pte_t *ptep,
 				unsigned int nr, int full);
-extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
-				unsigned long addr, pte_t *ptep);
-extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
-				unsigned long addr, pte_t *ptep);
+extern int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
+				unsigned long addr, pte_t *ptep, unsigned int nr);
+extern int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
+				unsigned long addr, pte_t *ptep, unsigned int nr);
 extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
 				pte_t *ptep, unsigned int nr);
 extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -1854,7 +1854,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	if (likely(!pte_valid_cont(orig_pte)))
 		return __ptep_test_and_clear_young(vma, addr, ptep);
 
-	return contpte_ptep_test_and_clear_young(vma, addr, ptep);
+	return contpte_test_and_clear_young_ptes(vma, addr, ptep, CONT_PTES);
 }
 
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -1866,7 +1866,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 	if (likely(!pte_valid_cont(orig_pte)))
 		return __ptep_clear_flush_young(vma, addr, ptep);
 
-	return contpte_ptep_clear_flush_young(vma, addr, ptep);
+	return contpte_clear_flush_young_ptes(vma, addr, ptep, CONT_PTES);
 }
 
 #define wrprotect_ptes wrprotect_ptes
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index c0557945939c..19b122441be3 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -488,8 +488,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
 
-int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
-					unsigned long addr, pte_t *ptep)
+int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
+					unsigned long addr, pte_t *ptep,
+					unsigned int nr)
 {
 	/*
 	 * ptep_clear_flush_young() technically requires us to clear the access
@@ -500,39 +501,56 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
 	 * having to unfold.
 	 */
 
+	unsigned long start = addr;
+	unsigned long end = start + nr * PAGE_SIZE;
 	int young = 0;
 	int i;
 
-	ptep = contpte_align_down(ptep);
-	addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+	if (pte_cont(__ptep_get(ptep + nr - 1)))
+		end = ALIGN(end, CONT_PTE_SIZE);
 
-	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
-		young |= __ptep_test_and_clear_young(vma, addr, ptep);
+	if (pte_cont(__ptep_get(ptep))) {
+		start = ALIGN_DOWN(start, CONT_PTE_SIZE);
+		ptep = contpte_align_down(ptep);
+	}
+
+	nr = (end - start) / PAGE_SIZE;
+	for (i = 0; i < nr; i++, ptep++, start += PAGE_SIZE)
+		young |= __ptep_test_and_clear_young(vma, start, ptep);
 
 	return young;
 }
-EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
+EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes);
 
-int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
-					unsigned long addr, pte_t *ptep)
+int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
+				unsigned long addr, pte_t *ptep,
+				unsigned int nr)
 {
 	int young;
 
-	young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+	young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr);
 
 	if (young) {
+		unsigned long start = addr;
+		unsigned long end = start + nr * PAGE_SIZE;
+
+		if (pte_cont(__ptep_get(ptep + nr - 1)))
+			end = ALIGN(end, CONT_PTE_SIZE);
+
+		if (pte_cont(__ptep_get(ptep)))
+			start = ALIGN_DOWN(start, CONT_PTE_SIZE);
+
 		/*
 		 * See comment in __ptep_clear_flush_young(); same rationale for
 		 * eliding the trailing DSB applies here.
 		 */
-		addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-		__flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE,
+		__flush_tlb_range_nosync(vma->vm_mm, start, end,
 					 PAGE_SIZE, true, 3);
 	}
 
 	return young;
 }
-EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
+EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes);
 
 void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
 					pte_t *ptep, unsigned int nr)
-- 
2.47.3
Re: [PATCH v2 1/3] arm64: mm: support batch clearing of the young flag for large folios
Posted by Lorenzo Stoakes 1 day, 5 hours ago
On Thu, Dec 11, 2025 at 04:16:54PM +0800, Baolin Wang wrote:
> Currently, contpte_ptep_test_and_clear_young() and contpte_ptep_clear_flush_young()
> only clear the young flag and flush TLBs for PTEs within the contiguous range.
> To support batch PTE operations for other sized large folios in the following
> patches, adding a new parameter to specify the number of PTEs.
>
> While we are at it, rename the functions to maintain consistency with other
> contpte_*() functions.
>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  arch/arm64/include/asm/pgtable.h | 12 ++++-----
>  arch/arm64/mm/contpte.c          | 44 ++++++++++++++++++++++----------
>  2 files changed, 37 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 0944e296dd4a..e03034683156 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1679,10 +1679,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
>  extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
>  				unsigned long addr, pte_t *ptep,
>  				unsigned int nr, int full);
> -extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> -				unsigned long addr, pte_t *ptep);
> -extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> -				unsigned long addr, pte_t *ptep);
> +extern int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
> +				unsigned long addr, pte_t *ptep, unsigned int nr);
> +extern int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
> +				unsigned long addr, pte_t *ptep, unsigned int nr);

In core mm at least, as a convention, we strip 'extern' from header declarations
as we go as they're not necessary.

I wonder about this naming convention also. The contpte_xxx_() prefix
obviously implies we are operating upon PTEs in the contiguous range, now
we are doing something... different and 'nr' is a bit vague.

Is it the nr of contiguous ranges? Well in reality it's nr_pages right? But
now we're not saying they're necessarily contiguous in the sense of contpte...

I wonder whether we'd be better off introducing a new function that
explicitly has 'batch' or 'batched' in the name, and have the usual
contpte_***() stuff and callers that need batching using a new underlying helper?

Obviously I defer to Ryan on all this, just my thoughts...

>  extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
>  				pte_t *ptep, unsigned int nr);
>  extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> @@ -1854,7 +1854,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
>  	if (likely(!pte_valid_cont(orig_pte)))
>  		return __ptep_test_and_clear_young(vma, addr, ptep);
>
> -	return contpte_ptep_test_and_clear_young(vma, addr, ptep);
> +	return contpte_test_and_clear_young_ptes(vma, addr, ptep, CONT_PTES);
>  }
>
>  #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
> @@ -1866,7 +1866,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
>  	if (likely(!pte_valid_cont(orig_pte)))
>  		return __ptep_clear_flush_young(vma, addr, ptep);
>
> -	return contpte_ptep_clear_flush_young(vma, addr, ptep);
> +	return contpte_clear_flush_young_ptes(vma, addr, ptep, CONT_PTES);
>  }
>
>  #define wrprotect_ptes wrprotect_ptes
> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> index c0557945939c..19b122441be3 100644
> --- a/arch/arm64/mm/contpte.c
> +++ b/arch/arm64/mm/contpte.c
> @@ -488,8 +488,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
>
> -int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> -					unsigned long addr, pte_t *ptep)
> +int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
> +					unsigned long addr, pte_t *ptep,
> +					unsigned int nr)
>  {
>  	/*
>  	 * ptep_clear_flush_young() technically requires us to clear the access
> @@ -500,39 +501,56 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>  	 * having to unfold.
>  	 */

Hmm shouldn't you need to update this comment now 'nr' is a thing? E.g.:

"And since we only create a contig range when the range is covered by a single
folio, we can get away with clearing young for the whole contig range here, so
we avoid having to unfold."

However now you're allowing for large folios that are not contpte?

Overall feeling pretty iffy about implementing this this way.

>
> +	unsigned long start = addr;
> +	unsigned long end = start + nr * PAGE_SIZE;

So now [addr, addr + nr * PAGE_SIZE) must for sure be within a single PTE
table?

>  	int young = 0;
>  	int i;
>
> -	ptep = contpte_align_down(ptep);
> -	addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
> +	if (pte_cont(__ptep_get(ptep + nr - 1)))
> +		end = ALIGN(end, CONT_PTE_SIZE);

OK so I guess for PTE_CONT to be set, it must be aligned to CONT_PTE_SIZE,
with other PTE entries present there not having PTE_CONT set (Ryan - do
correct me if I'm wrong!)

I guess then no worry about running off the end of the PTE table here.

I wonder about using pte_cont_addr_end() here, but I guess you are not
intending to limit to a range here but rather capture the entire contpte
range of the end.

I don't love that now a function prefixed with contpte_... can have a
condition where part of the input range is _not_ a contpte entry, or is
specified partially...


>
> -	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
> -		young |= __ptep_test_and_clear_young(vma, addr, ptep);
> +	if (pte_cont(__ptep_get(ptep))) {
> +		start = ALIGN_DOWN(start, CONT_PTE_SIZE);
> +		ptep = contpte_align_down(ptep);
> +	}
> +
> +	nr = (end - start) / PAGE_SIZE;

Hm don't love this reuse of input param.

> +	for (i = 0; i < nr; i++, ptep++, start += PAGE_SIZE)
> +		young |= __ptep_test_and_clear_young(vma, start, ptep);

Again, overall find it a bit iffy that we are essentially overloading the
code with non-contpte behaviour.

>
>  	return young;
>  }
> -EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
> +EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes);
>
> -int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> -					unsigned long addr, pte_t *ptep)
> +int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
> +				unsigned long addr, pte_t *ptep,
> +				unsigned int nr)
>  {
>  	int young;
>
> -	young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
> +	young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr);
>
>  	if (young) {
> +		unsigned long start = addr;
> +		unsigned long end = start + nr * PAGE_SIZE;
> +
> +		if (pte_cont(__ptep_get(ptep + nr - 1)))
> +			end = ALIGN(end, CONT_PTE_SIZE);
> +
> +		if (pte_cont(__ptep_get(ptep)))
> +			start = ALIGN_DOWN(start, CONT_PTE_SIZE);
> +
>  		/*
>  		 * See comment in __ptep_clear_flush_young(); same rationale for
>  		 * eliding the trailing DSB applies here.
>  		 */
> -		addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
> -		__flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE,
> +		__flush_tlb_range_nosync(vma->vm_mm, start, end,
>  					 PAGE_SIZE, true, 3);

Similar comments to above.

Am curious as to Ryan's opinion.

>  	}
>
>  	return young;
>  }
> -EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
> +EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes);
>
>  void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
>  					pte_t *ptep, unsigned int nr)
> --
> 2.47.3
>
Re: [PATCH v2 1/3] arm64: mm: support batch clearing of the young flag for large folios
Posted by Baolin Wang 13 hours ago

On 2025/12/15 19:36, Lorenzo Stoakes wrote:
> On Thu, Dec 11, 2025 at 04:16:54PM +0800, Baolin Wang wrote:
>> Currently, contpte_ptep_test_and_clear_young() and contpte_ptep_clear_flush_young()
>> only clear the young flag and flush TLBs for PTEs within the contiguous range.
>> To support batch PTE operations for other sized large folios in the following
>> patches, adding a new parameter to specify the number of PTEs.
>>
>> While we are at it, rename the functions to maintain consistency with other
>> contpte_*() functions.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   arch/arm64/include/asm/pgtable.h | 12 ++++-----
>>   arch/arm64/mm/contpte.c          | 44 ++++++++++++++++++++++----------
>>   2 files changed, 37 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 0944e296dd4a..e03034683156 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -1679,10 +1679,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
>>   extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
>>   				unsigned long addr, pte_t *ptep,
>>   				unsigned int nr, int full);
>> -extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>> -				unsigned long addr, pte_t *ptep);
>> -extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>> -				unsigned long addr, pte_t *ptep);
>> +extern int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
>> +				unsigned long addr, pte_t *ptep, unsigned int nr);
>> +extern int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
>> +				unsigned long addr, pte_t *ptep, unsigned int nr);
> 
> In core mm at least, as a convention, we strip 'extern' from header declarations
> as we go as they're not necessary.

Sure. I can remove the 'extern'.

> I wonder about this naming convention also. The contpte_xxx_() prefix
> obviously implies we are operating upon PTEs in the contiguous range, now
> we are doing something... different and 'nr' is a bit vague.
> 
> Is it the nr of contiguous ranges? Well in reality it's nr_pages right? But

Yes, 'nr' here means nr_pages, which follows the convention of the 
parameters in contpte_xxx_() functions.

> now we're not saying they're necessarily contiguous in the sense of contpte...
> 
> I wonder whether we'd be better off introducing a new function that
> explicitly has 'batch' or 'batched' in the name, and have the usual
> contpte_***() stuff and callers that need batching using a new underlying helper?

OK. I get your point. However, this is outside the scope of my patchset.

Perhaps we can clean up all the contpte_xxx_() functions if everyone 
agrees that this is confusing.

> Obviously I defer to Ryan on all this, just my thoughts...
> 
>>   extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
>>   				pte_t *ptep, unsigned int nr);
>>   extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
>> @@ -1854,7 +1854,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
>>   	if (likely(!pte_valid_cont(orig_pte)))
>>   		return __ptep_test_and_clear_young(vma, addr, ptep);
>>
>> -	return contpte_ptep_test_and_clear_young(vma, addr, ptep);
>> +	return contpte_test_and_clear_young_ptes(vma, addr, ptep, CONT_PTES);
>>   }
>>
>>   #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
>> @@ -1866,7 +1866,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
>>   	if (likely(!pte_valid_cont(orig_pte)))
>>   		return __ptep_clear_flush_young(vma, addr, ptep);
>>
>> -	return contpte_ptep_clear_flush_young(vma, addr, ptep);
>> +	return contpte_clear_flush_young_ptes(vma, addr, ptep, CONT_PTES);
>>   }
>>
>>   #define wrprotect_ptes wrprotect_ptes
>> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
>> index c0557945939c..19b122441be3 100644
>> --- a/arch/arm64/mm/contpte.c
>> +++ b/arch/arm64/mm/contpte.c
>> @@ -488,8 +488,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
>>   }
>>   EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
>>
>> -int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>> -					unsigned long addr, pte_t *ptep)
>> +int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
>> +					unsigned long addr, pte_t *ptep,
>> +					unsigned int nr)
>>   {
>>   	/*
>>   	 * ptep_clear_flush_young() technically requires us to clear the access
>> @@ -500,39 +501,56 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>>   	 * having to unfold.
>>   	 */
> 
> Hmm shouldn't you need to update this comment now 'nr' is a thing? E.g.:
> 
> "And since we only create a contig range when the range is covered by a single
> folio, we can get away with clearing young for the whole contig range here, so
> we avoid having to unfold."

I think the original comments are still clear:
"
/*
  * ptep_clear_flush_young() technically requires us to clear the access
  * flag for a _single_ pte. However, the core-mm code actually tracks
  * access/dirty per folio, not per page. And since we only create a
  * contig range when the range is covered by a single folio, we can get
  * away with clearing young for the whole contig range here, so we avoid
  * having to unfold.
  */
"

> However now you're allowing for large folios that are not contpte?
> 
> Overall feeling pretty iffy about implementing this this way.

Please see the above comments, which explain why we should do that.

>> +	unsigned long start = addr;
>> +	unsigned long end = start + nr * PAGE_SIZE;
> 
> So now [addr, addr + nr * PAGE_SIZE) must for sure be within a single PTE
> table?

Caller has made sure that (see folio_referenced_one()).

>>   	int young = 0;
>>   	int i;
>>
>> -	ptep = contpte_align_down(ptep);
>> -	addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
>> +	if (pte_cont(__ptep_get(ptep + nr - 1)))
>> +		end = ALIGN(end, CONT_PTE_SIZE);
> 
> OK so I guess for PTE_CONT to be set, it must be aligned to CONT_PTE_SIZE,
> with other PTE entries present there not having PTE_CONT set (Ryan - do
> correct me if I'm wrong!)
> 
> I guess then no worry about running off the end of the PTE table here.
> 
> I wonder about using pte_cont_addr_end() here, but I guess you are not
> intending to limit to a range here but rather capture the entire contpte
> range of the end.

Right.

> I don't love that now a function prefixed with contpte_... can have a
> condition where part of the input range is _not_ a contpte entry, or is
> specified partially...

Like I said above, this is outside the scope of my patchset. If Ryan 
also thinks we need to do some cleanups for all the contpte_xxx() 
functions in the contpte.c file, we can send a follow-up patchset to 
rename all the contpte_xxx() functions to make it clear.

>> -	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
>> -		young |= __ptep_test_and_clear_young(vma, addr, ptep);
>> +	if (pte_cont(__ptep_get(ptep))) {
>> +		start = ALIGN_DOWN(start, CONT_PTE_SIZE);
>> +		ptep = contpte_align_down(ptep);
>> +	}
>> +
>> +	nr = (end - start) / PAGE_SIZE;
> 
> Hm don't love this reuse of input param.

OK. Will use another local variable instead.

> 
>> +	for (i = 0; i < nr; i++, ptep++, start += PAGE_SIZE)
>> +		young |= __ptep_test_and_clear_young(vma, start, ptep);
> 
> Again, overall find it a bit iffy that we are essentially overloading the
> code with non-contpte behaviour.

Let me clarify further: this is the handling logic of the contpte_xxx() 
functions in the contpte.c file. For example, the function 
contpte_set_ptes() has a parameter 'nr', which may be greater than 
CONT_PTES, meaning that it can handle multiple CONT_PTES range sizes of 
a large folio.

It might be a bit confusing, and I understand this is why you want to 
change the 'contpte_' prefix to the 'batch_' prefix. Of course, we can 
hope Ryan can provide some input on this.

Thanks for reviewing.
Re: [PATCH v2 1/3] arm64: mm: support batch clearing of the young flag for large folios
Posted by Lorenzo Stoakes 5 hours ago
On Tue, Dec 16, 2025 at 11:32:58AM +0800, Baolin Wang wrote:
>
>
> On 2025/12/15 19:36, Lorenzo Stoakes wrote:
> > On Thu, Dec 11, 2025 at 04:16:54PM +0800, Baolin Wang wrote:
> > > Currently, contpte_ptep_test_and_clear_young() and contpte_ptep_clear_flush_young()
> > > only clear the young flag and flush TLBs for PTEs within the contiguous range.
> > > To support batch PTE operations for other sized large folios in the following
> > > patches, adding a new parameter to specify the number of PTEs.
> > >
> > > While we are at it, rename the functions to maintain consistency with other
> > > contpte_*() functions.
> > >
> > > Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > > ---
> > >   arch/arm64/include/asm/pgtable.h | 12 ++++-----
> > >   arch/arm64/mm/contpte.c          | 44 ++++++++++++++++++++++----------
> > >   2 files changed, 37 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > index 0944e296dd4a..e03034683156 100644
> > > --- a/arch/arm64/include/asm/pgtable.h
> > > +++ b/arch/arm64/include/asm/pgtable.h
> > > @@ -1679,10 +1679,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
> > >   extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
> > >   				unsigned long addr, pte_t *ptep,
> > >   				unsigned int nr, int full);
> > > -extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> > > -				unsigned long addr, pte_t *ptep);
> > > -extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> > > -				unsigned long addr, pte_t *ptep);
> > > +extern int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
> > > +				unsigned long addr, pte_t *ptep, unsigned int nr);
> > > +extern int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
> > > +				unsigned long addr, pte_t *ptep, unsigned int nr);
> >
> > In core mm at least, as a convention, we strip 'extern' from header declarations
> > as we go as they're not necessary.
>
> Sure. I can remove the 'extern'.

Thanks.

>
> > I wonder about this naming convention also. The contpte_xxx_() prefix
> > obviously implies we are operating upon PTEs in the contiguous range, now
> > we are doing something... different and 'nr' is a bit vague.
> >
> > Is it the nr of contiguous ranges? Well in reality it's nr_pages right? But
>
> Yes, 'nr' here means nr_pages, which follows the convention of the
> parameters in contpte_xxx_() functions.

Except you're violating the convention already by doing non-contpte stuff in
contpte functions?

The confusion is between nr of contpte ranges and nr of pages, so I think we
should be explicit.

>
> > now we're not saying they're necessarily contiguous in the sense of contpte...
> >
> > I wonder whether we'd be better off introducing a new function that
> > explicitly has 'batch' or 'batched' in the name, and have the usual
> > contpte_***() stuff and callers that need batching using a new underlying helper?
>
> OK. I get your point. However, this is outside the scope of my patchset.

Well, no, this is review feedback that needs to be addressed, I really don't
like this patch as-is.

So for this to be upstreamable you really need to separate this out.

>
> Perhaps we can clean up all the contpte_xxx_() functions if everyone agrees
> that this is confusing.

I mean, unless Ryan explicitly makes a case for this being fine, I'm inclined to
reject the patch unless we do this.

The contpte logic is already very confusing, and this is only adding to it.

>
> > Obviously I defer to Ryan on all this, just my thoughts...
> >
> > >   extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
> > >   				pte_t *ptep, unsigned int nr);
> > >   extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> > > @@ -1854,7 +1854,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> > >   	if (likely(!pte_valid_cont(orig_pte)))
> > >   		return __ptep_test_and_clear_young(vma, addr, ptep);
> > >
> > > -	return contpte_ptep_test_and_clear_young(vma, addr, ptep);
> > > +	return contpte_test_and_clear_young_ptes(vma, addr, ptep, CONT_PTES);
> > >   }
> > >
> > >   #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
> > > @@ -1866,7 +1866,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
> > >   	if (likely(!pte_valid_cont(orig_pte)))
> > >   		return __ptep_clear_flush_young(vma, addr, ptep);
> > >
> > > -	return contpte_ptep_clear_flush_young(vma, addr, ptep);
> > > +	return contpte_clear_flush_young_ptes(vma, addr, ptep, CONT_PTES);
> > >   }
> > >
> > >   #define wrprotect_ptes wrprotect_ptes
> > > diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> > > index c0557945939c..19b122441be3 100644
> > > --- a/arch/arm64/mm/contpte.c
> > > +++ b/arch/arm64/mm/contpte.c
> > > @@ -488,8 +488,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
> > >   }
> > >   EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
> > >
> > > -int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> > > -					unsigned long addr, pte_t *ptep)
> > > +int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
> > > +					unsigned long addr, pte_t *ptep,
> > > +					unsigned int nr)
> > >   {
> > >   	/*
> > >   	 * ptep_clear_flush_young() technically requires us to clear the access
> > > @@ -500,39 +501,56 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> > >   	 * having to unfold.
> > >   	 */
> >
> > Hmm shouldn't you need to update this comment now 'nr' is a thing? E.g.:
> >
> > "And since we only create a contig range when the range is covered by a single
> > folio, we can get away with clearing young for the whole contig range here, so
> > we avoid having to unfold."
>
> I think the original comments are still clear:
> "
> /*
>  * ptep_clear_flush_young() technically requires us to clear the access
>  * flag for a _single_ pte. However, the core-mm code actually tracks
>  * access/dirty per folio, not per page. And since we only create a
>  * contig range when the range is covered by a single folio, we can get
>  * away with clearing young for the whole contig range here, so we avoid
>  * having to unfold.
>  */
> "

Except nr means you can span more than a single contig pte range? So this
comment is no longer applicable as-is?

You've changed the function, the comment needs to change too.

>
> > However now you're allowing for large folios that are not contpte?
> >
> > Overall feeling pretty iffy about implementing this this way.
>
> Please see the above comments, which explain why we should do that.

You haven't explained anything? Maybe I missed it?

Can you explain clearly what nr might actually be in relation to contpte's? I'm
confused even as to the utility here.

Is it batched ranges of contptes? But then why are you trying to see if end is a
contpte + expand the range if so?

>
> > > +	unsigned long start = addr;
> > > +	unsigned long end = start + nr * PAGE_SIZE;
> >
> > So now [addr, addr + nr * PAGE_SIZE) must for sure be within a single PTE
> > table?
>
> Caller has made sure that (see folio_referenced_one()).

You should comment this somehow, I hate this nested assumptions stuff 'function
X which calls function Y which calls function Z makes sure that parameter A
fulfils requirements B but we don't mention it anywhere'.

>
> > >   	int young = 0;
> > >   	int i;
> > >
> > > -	ptep = contpte_align_down(ptep);
> > > -	addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
> > > +	if (pte_cont(__ptep_get(ptep + nr - 1)))
> > > +		end = ALIGN(end, CONT_PTE_SIZE);
> >
> > OK so I guess for PTE_CONT to be set, it must be aligned to CONT_PTE_SIZE,
> > with other PTE entries present there not having PTE_CONT set (Ryan - do
> > correct me if I'm wrong!)
> >
> > I guess then no worry about running off the end of the PTE table here.
> >
> > I wonder about using pte_cont_addr_end() here, but I guess you are not
> > intending to limit to a range here but rather capture the entire contpte
> > range of the end.
>
> Right.
>
> > I don't love that now a function prefixed with contpte_... can have a
> > condition where part of the input range is _not_ a contpte entry, or is
> > specified partially...
>
> Like I said above, this is outside the scope of my patchset. If Ryan also
> thinks we need to do some cleanups for all the contpte_xxx() functions in
> the contpte.c file, we can send a follow-up patchset to rename all the
> contpte_xxx() functions to make it clear.

It's really not outside of the scope of the patch set, this is review feedback
you have to address :) trying not to be a grumpy kernel dev here :>)

In general in the kernel we don't accept confusing patches then defer making
them not-confusing until later.

We review until the series is at the correct standard to be accepted before we
merge.

As always I'm happy to be convinced that I'm mistaken or something's not
necesary, however!

>
> > > -	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
> > > -		young |= __ptep_test_and_clear_young(vma, addr, ptep);
> > > +	if (pte_cont(__ptep_get(ptep))) {
> > > +		start = ALIGN_DOWN(start, CONT_PTE_SIZE);
> > > +		ptep = contpte_align_down(ptep);
> > > +	}
> > > +
> > > +	nr = (end - start) / PAGE_SIZE;
> >
> > Hm don't love this reuse of input param.
>
> OK. Will use another local variable instead.

Thanks.

>
> >
> > > +	for (i = 0; i < nr; i++, ptep++, start += PAGE_SIZE)
> > > +		young |= __ptep_test_and_clear_young(vma, start, ptep);
> >
> > Again, overall find it a bit iffy that we are essentially overloading the
> > code with non-contpte behaviour.
>
> Let me clarify further: this is the handling logic of the contpte_xxx()
> functions in the contpte.c file. For example, the function
> contpte_set_ptes() has a parameter 'nr', which may be greater than
> CONT_PTES, meaning that it can handle multiple CONT_PTES range sizes of a
> large folio.
>
> It might be a bit confusing, and I understand this is why you want to change
> the 'contpte_' prefix to the 'batch_' prefix. Of course, we can hope Ryan
> can provide some input on this.
>
> Thanks for reviewing.

OK so we have some reeeeal confusion here. And this speaks to the patch needing
work.

This seems to answer what I asked above - basically - are we doing _batches_ of
_contpte_ entries, or are we just accepting arbitrary large folios here and
batching up operations on them, including non-contpte entries.

Presumably from the above you're saying - this is dealing with batches of
contpte entries.

In which case this has to be made _super_ clear. Not just adding an arbitrary
'nr' parameter and letting people guess.

The contpte code is _already_ confusing and somewhat surprising if you're not
aware of it, we need to be going in the other direction.

Thanks, Lorenzo