[PATCH 4/5] mm: support batched checking of the young flag for MGLRU

Baolin Wang posted 5 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by Baolin Wang 1 month, 1 week ago
Use the batched helper clear_young_ptes_notify() to check and clear the
young flag to improve the performance during large folio reclamation when
MGLRU is enabled.

Meanwhile, we can also support batched checking of the young and dirty
flags when MGLRU walks the mm's page table to update the folios' generation
counter. Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags()
with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio.

Then we can remove the ptep_clear_young_notify() since it has no users now.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/mmzone.h |  5 +++--
 mm/internal.h          | 12 ------------
 mm/rmap.c              | 30 ++++++++++++++++--------------
 mm/vmscan.c            | 37 +++++++++++++++++++++++++++++--------
 4 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index db41b18a919d..de9fee4244d9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -630,7 +630,7 @@ struct lru_gen_memcg {
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int batched);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -649,7 +649,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw,
+				       unsigned int batched)
 {
 	return false;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 1b59be99dc3f..4e8d37570f46 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1824,12 +1824,6 @@ static inline int clear_young_ptes_notify(struct vm_area_struct *vma,
 	return young;
 }
 
-static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
-					  unsigned long addr, pte_t *ptep)
-{
-	return clear_young_ptes_notify(vma, addr, ptep, 1);
-}
-
 static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
 					  unsigned long addr, pmd_t *pmdp)
 {
@@ -1847,12 +1841,6 @@ static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
 #define clear_young_ptes_notify	test_and_clear_young_ptes
 #define pmdp_clear_young_notify	pmdp_test_and_clear_young
 
-static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
-					  unsigned long addr, pte_t *ptep)
-{
-	return test_and_clear_young_ptes(vma, addr, ptep, 1);
-}
-
 #endif /* CONFIG_MMU_NOTIFIER */
 
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index be785dfc9336..1c147251ae28 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -958,25 +958,21 @@ static bool folio_referenced_one(struct folio *folio,
 			return false;
 		}
 
+		if (pvmw.pte && folio_test_large(folio)) {
+			unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
+			unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
+			pte_t pteval = ptep_get(pvmw.pte);
+
+			nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr);
+			ptes += nr;
+		}
+
 		if (lru_gen_enabled() && pvmw.pte) {
-			if (lru_gen_look_around(&pvmw))
+			if (lru_gen_look_around(&pvmw, nr))
 				referenced++;
 		} else if (pvmw.pte) {
-			if (folio_test_large(folio)) {
-				unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
-				unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
-				pte_t pteval = ptep_get(pvmw.pte);
-
-				nr = folio_pte_batch(folio, pvmw.pte,
-						     pteval, max_nr);
-			}
-
-			ptes += nr;
 			if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
 				referenced++;
-			/* Skip the batched PTEs */
-			pvmw.pte += nr - 1;
-			pvmw.address += (nr - 1) * PAGE_SIZE;
 		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
 			if (pmdp_clear_flush_young_notify(vma, address,
 						pvmw.pmd))
@@ -995,6 +991,12 @@ static bool folio_referenced_one(struct folio *folio,
 			page_vma_mapped_walk_done(&pvmw);
 			break;
 		}
+
+		/* Skip the batched PTEs */
+		if (nr > 1) {
+			pvmw.pte += nr - 1;
+			pvmw.address += (nr - 1) * PAGE_SIZE;
+		}
 	}
 
 	if (referenced)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 728868c61750..d83962468b2e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3494,6 +3494,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
 	DEFINE_MAX_SEQ(walk->lruvec);
 	int gen = lru_gen_from_seq(max_seq);
+	unsigned int nr;
 	pmd_t pmdval;
 
 	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
@@ -3512,11 +3513,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 
 	lazy_mmu_mode_enable();
 restart:
-	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+	for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) {
 		unsigned long pfn;
 		struct folio *folio;
-		pte_t ptent = ptep_get(pte + i);
+		pte_t *ptep = pte + i;
+		pte_t ptent = ptep_get(ptep);
 
+		nr = 1;
 		total++;
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
@@ -3528,7 +3531,14 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		if (!folio)
 			continue;
 
-		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+		if (folio_test_large(folio)) {
+			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
+						   max_nr, FPB_MERGE_YOUNG_DIRTY);
+		}
+
+		if (!clear_young_ptes_notify(args->vma, addr, ptep, nr))
 			continue;
 
 		if (last != folio) {
@@ -4186,7 +4196,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int batched)
 {
 	int i;
 	bool dirty;
@@ -4205,11 +4215,13 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	int gen = lru_gen_from_seq(max_seq);
+	unsigned int nr;
+	pte_t *ptep;
 
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
-	if (!ptep_clear_young_notify(vma, addr, pte))
+	if (!clear_young_ptes_notify(vma, addr, pte, batched))
 		return false;
 
 	if (spin_is_contended(pvmw->ptl))
@@ -4243,10 +4255,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	pte -= (addr - start) / PAGE_SIZE;
 
-	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+	for (i = 0, addr = start, ptep = pte; addr != end;
+	     i += nr, ptep += nr, addr += nr * PAGE_SIZE) {
 		unsigned long pfn;
-		pte_t ptent = ptep_get(pte + i);
+		pte_t ptent = ptep_get(ptep);
 
+		nr = 1;
 		pfn = get_pte_pfn(ptent, vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
@@ -4255,7 +4269,14 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		if (!folio)
 			continue;
 
-		if (!ptep_clear_young_notify(vma, addr, pte + i))
+		if (folio_test_large(folio)) {
+			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
+						   max_nr, FPB_MERGE_YOUNG_DIRTY);
+		}
+
+		if (!clear_young_ptes_notify(vma, addr, ptep, nr))
 			continue;
 
 		if (last != folio) {
-- 
2.47.3
Re: [PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
On 2/24/26 02:56, Baolin Wang wrote:
> Use the batched helper clear_young_ptes_notify() to check and clear the
> young flag to improve the performance during large folio reclamation when
> MGLRU is enabled.
> 
> Meanwhile, we can also support batched checking the young and dirty flag
> when MGLRU walks the mm's pagetable to update the folios' generation
> counter. Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags()
> with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio.
> 
> Then we can remove the ptep_clear_young_notify() since it has no users now.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---

[...]

>  
> -static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
> -					  unsigned long addr, pte_t *ptep)
> -{
> -	return clear_young_ptes_notify(vma, addr, ptep, 1);
> -}
> -
>  static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
>  					  unsigned long addr, pmd_t *pmdp)
>  {
> @@ -1847,12 +1841,6 @@ static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
>  #define clear_young_ptes_notify	test_and_clear_young_ptes
>  #define pmdp_clear_young_notify	pmdp_test_and_clear_young
>  
> -static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
> -					  unsigned long addr, pte_t *ptep)
> -{
> -	return test_and_clear_young_ptes(vma, addr, ptep, 1);
> -}
> -

Oh, we remove the last user, nice.


>  #endif /* CONFIG_MMU_NOTIFIER */
>  
>  #endif	/* __MM_INTERNAL_H */
> diff --git a/mm/rmap.c b/mm/rmap.c
> index be785dfc9336..1c147251ae28 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -958,25 +958,21 @@ static bool folio_referenced_one(struct folio *folio,
>  			return false;
>  		}
>  
> +		if (pvmw.pte && folio_test_large(folio)) {
> +			unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
> +			unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;

Both could be const.

> +			pte_t pteval = ptep_get(pvmw.pte);

I wonder if there could be a way to avoid this ptep_get() by letting
page_vma_mapped_walk() just provide the last value it used (in
check_pte() I guess). Something for another patch.

> +
> +			nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr);
> +			ptes += nr;
> +		}
> +
>  		if (lru_gen_enabled() && pvmw.pte) {
> -			if (lru_gen_look_around(&pvmw))
> +			if (lru_gen_look_around(&pvmw, nr))
>  				referenced++;
>  		} else if (pvmw.pte) {
> -			if (folio_test_large(folio)) {
> -				unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
> -				unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
> -				pte_t pteval = ptep_get(pvmw.pte);
> -
> -				nr = folio_pte_batch(folio, pvmw.pte,
> -						     pteval, max_nr);
> -			}
> -
> -			ptes += nr;
>  			if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
>  				referenced++;
> -			/* Skip the batched PTEs */
> -			pvmw.pte += nr - 1;
> -			pvmw.address += (nr - 1) * PAGE_SIZE;
>  		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
>  			if (pmdp_clear_flush_young_notify(vma, address,
>  						pvmw.pmd))
> @@ -995,6 +991,12 @@ static bool folio_referenced_one(struct folio *folio,
>  			page_vma_mapped_walk_done(&pvmw);
>  			break;
>  		}
> +
> +		/* Skip the batched PTEs */
> +		if (nr > 1) {
> +			pvmw.pte += nr - 1;
> +			pvmw.address += (nr - 1) * PAGE_SIZE;
> +		}

As nr >= 1, you can just unconditionaly do

pvmw.pte += nr - 1;
pvmw.address += (nr - 1) * PAGE_SIZE;

>  	}
>  
>  	if (referenced)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 728868c61750..d83962468b2e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3494,6 +3494,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
>  	DEFINE_MAX_SEQ(walk->lruvec);
>  	int gen = lru_gen_from_seq(max_seq);
> +	unsigned int nr;
>  	pmd_t pmdval;
>  
>  	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
> @@ -3512,11 +3513,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  
>  	lazy_mmu_mode_enable();
>  restart:
> -	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
> +	for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) {
>  		unsigned long pfn;
>  		struct folio *folio;
> -		pte_t ptent = ptep_get(pte + i);
> +		pte_t *ptep = pte + i;
> +		pte_t ptent = ptep_get(ptep);


Existing "pte vs ptent" vs. "ptep vs. pte" is already confusing.
Combining them into "pte vs. ptep vs. ptent" is no good.

If you need another variable, call it "cur_pte". Or rename "pte" to
"start_pte".

>  
> +		nr = 1;
>  		total++;
>  		walk->mm_stats[MM_LEAF_TOTAL]++;
>  
> @@ -3528,7 +3531,14 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  		if (!folio)
>  			continue;
>  
> -		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
> +		if (folio_test_large(folio)) {
> +			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
> +
> +			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
> +						   max_nr, FPB_MERGE_YOUNG_DIRTY);
> +		}
> +
> +		if (!clear_young_ptes_notify(args->vma, addr, ptep, nr))
>  			continue;
>  
>  		if (last != folio) {
> @@ -4186,7 +4196,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>   * the PTE table to the Bloom filter. This forms a feedback loop between the
>   * eviction and the aging.
>   */
> -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
> +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int batched)

What is "batched"? Did you mean "nr_ptes" ? Or just the initial value
for "nr" ?

[...]

>  
> -		if (!ptep_clear_young_notify(vma, addr, pte + i))
> +		if (folio_test_large(folio)) {
> +			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;

Can be const.

> +
> +			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
> +						   max_nr, FPB_MERGE_YOUNG_DIRTY);
> +		}

I guess we might benefit from a FPB_MERGE_YOUNG only here. But this
should work.

-- 
Cheers,

David
Re: [PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by Baolin Wang 1 month, 1 week ago

On 2/25/26 10:25 PM, David Hildenbrand (Arm) wrote:
> On 2/24/26 02:56, Baolin Wang wrote:
>> Use the batched helper clear_young_ptes_notify() to check and clear the
>> young flag to improve the performance during large folio reclamation when
>> MGLRU is enabled.
>>
>> Meanwhile, we can also support batched checking the young and dirty flag
>> when MGLRU walks the mm's pagetable to update the folios' generation
>> counter. Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags()
>> with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio.
>>
>> Then we can remove the ptep_clear_young_notify() since it has no users now.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
> 
> [...]
> 
>>   
>> -static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
>> -					  unsigned long addr, pte_t *ptep)
>> -{
>> -	return clear_young_ptes_notify(vma, addr, ptep, 1);
>> -}
>> -
>>   static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
>>   					  unsigned long addr, pmd_t *pmdp)
>>   {
>> @@ -1847,12 +1841,6 @@ static inline int pmdp_clear_young_notify(struct vm_area_struct *vma,
>>   #define clear_young_ptes_notify	test_and_clear_young_ptes
>>   #define pmdp_clear_young_notify	pmdp_test_and_clear_young
>>   
>> -static inline int ptep_clear_young_notify(struct vm_area_struct *vma,
>> -					  unsigned long addr, pte_t *ptep)
>> -{
>> -	return test_and_clear_young_ptes(vma, addr, ptep, 1);
>> -}
>> -
> 
> Oh, we remove the last user, nice.
> 
> 
>>   #endif /* CONFIG_MMU_NOTIFIER */
>>   
>>   #endif	/* __MM_INTERNAL_H */
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index be785dfc9336..1c147251ae28 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -958,25 +958,21 @@ static bool folio_referenced_one(struct folio *folio,
>>   			return false;
>>   		}
>>   
>> +		if (pvmw.pte && folio_test_large(folio)) {
>> +			unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
>> +			unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
> 
> Both could be const.

Ack.

> 
>> +			pte_t pteval = ptep_get(pvmw.pte);
> 
> I wonder if there could be a way to avoid this ptep_get() by letting
> page_vma_mapped_walk() just provide the last value it used (in
> check_pte() I guess). Something for another patch.

Well, we’d need to add a new field to ‘struct page_vma_mapped_walk’ to 
store the last value (e.g., pvmw.pteval), but this makes me wonder if it 
is worth adding a new field just to avoid a lightweight read (which 
should have no obvious performance impact).

>> +
>> +			nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr);
>> +			ptes += nr;
>> +		}
>> +
>>   		if (lru_gen_enabled() && pvmw.pte) {
>> -			if (lru_gen_look_around(&pvmw))
>> +			if (lru_gen_look_around(&pvmw, nr))
>>   				referenced++;
>>   		} else if (pvmw.pte) {
>> -			if (folio_test_large(folio)) {
>> -				unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
>> -				unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
>> -				pte_t pteval = ptep_get(pvmw.pte);
>> -
>> -				nr = folio_pte_batch(folio, pvmw.pte,
>> -						     pteval, max_nr);
>> -			}
>> -
>> -			ptes += nr;
>>   			if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
>>   				referenced++;
>> -			/* Skip the batched PTEs */
>> -			pvmw.pte += nr - 1;
>> -			pvmw.address += (nr - 1) * PAGE_SIZE;
>>   		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
>>   			if (pmdp_clear_flush_young_notify(vma, address,
>>   						pvmw.pmd))
>> @@ -995,6 +991,12 @@ static bool folio_referenced_one(struct folio *folio,
>>   			page_vma_mapped_walk_done(&pvmw);
>>   			break;
>>   		}
>> +
>> +		/* Skip the batched PTEs */
>> +		if (nr > 1) {
>> +			pvmw.pte += nr - 1;
>> +			pvmw.address += (nr - 1) * PAGE_SIZE;
>> +		}
> 
> As nr >= 1, you can just unconditionaly do
> 
> pvmw.pte += nr - 1;
> pvmw.address += (nr - 1) * PAGE_SIZE;

Actually, I want to filter out the THP case where the 'pvmw.pte' is 
NULL. But it shouldn’t be a problem, because 'nr' is always 1 for the 
THP case. I can remove the check.

>>   	}
>>   
>>   	if (referenced)
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 728868c61750..d83962468b2e 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -3494,6 +3494,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>>   	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
>>   	DEFINE_MAX_SEQ(walk->lruvec);
>>   	int gen = lru_gen_from_seq(max_seq);
>> +	unsigned int nr;
>>   	pmd_t pmdval;
>>   
>>   	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
>> @@ -3512,11 +3513,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>>   
>>   	lazy_mmu_mode_enable();
>>   restart:
>> -	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
>> +	for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) {
>>   		unsigned long pfn;
>>   		struct folio *folio;
>> -		pte_t ptent = ptep_get(pte + i);
>> +		pte_t *ptep = pte + i;
>> +		pte_t ptent = ptep_get(ptep);
> 
> 
> Existing "pte vs ptent" vs. "ptep vs. pte" is already confusing.
> Combining them into "pte vs. ptep vs. ptent" is no good.
> 
> If you need another variable, call it "cur_pte". Or rename "pte" to
> "start_pte".

OK. "cur_pte" sounds good to me.

>> +		nr = 1;
>>   		total++;
>>   		walk->mm_stats[MM_LEAF_TOTAL]++;
>>   
>> @@ -3528,7 +3531,14 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>>   		if (!folio)
>>   			continue;
>>   
>> -		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
>> +		if (folio_test_large(folio)) {
>> +			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
>> +
>> +			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
>> +						   max_nr, FPB_MERGE_YOUNG_DIRTY);
>> +		}
>> +
>> +		if (!clear_young_ptes_notify(args->vma, addr, ptep, nr))
>>   			continue;
>>   
>>   		if (last != folio) {
>> @@ -4186,7 +4196,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>>    * the PTE table to the Bloom filter. This forms a feedback loop between the
>>    * eviction and the aging.
>>    */
>> -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>> +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int batched)
> 
> What is "batched"? Did you mean "nr_ptes" ? Or just the initial value
> for "nr" ?

There is already an 'nr' variable in this function. "nr_ptes" sounds 
good to me, and I will use it.

>> -		if (!ptep_clear_young_notify(vma, addr, pte + i))
>> +		if (folio_test_large(folio)) {
>> +			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
> 
> Can be const.

Ack.

> 
>> +
>> +			nr = folio_pte_batch_flags(folio, NULL, ptep, &ptent,
>> +						   max_nr, FPB_MERGE_YOUNG_DIRTY);
>> +		}
> 
> I guess we might benefit from a FPB_MERGE_YOUNG only here. But this
> should work.

I’ve thought about it. Instead of adding another flag and some new 'if' 
branches for folio_pte_batch_flags(), and given that it brings no 
performance improvement for MGLRU, I still prefer the current 
FPB_MERGE_YOUNG_DIRTY method. :)

Thanks for reviewing.
Re: [PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
>>
>>> +            pte_t pteval = ptep_get(pvmw.pte);
>>
>> I wonder if there could be a way to avoid this ptep_get() by letting
>> page_vma_mapped_walk() just provide the last value it used (in
>> check_pte() I guess). Something for another patch.
> 
> Well, we’d need to add a new field to ‘struct page_vma_mapped_walk’ to
> store the last value (e.g., pvmw.pteval),

Yes.

> but this makes me wonder if it
> is worth adding a new field just to avoid a lightweight read (which
> should have no obvious performance impact).

You recall that ptep_get() on arm64 is not that lightweight due to
cont-PTEs? :)

But yeah, something for another day.

[...]

>>
>> What is "batched"? Did you mean "nr_ptes" ? Or just the initial value
>> for "nr" ?
> 
> There is already an 'nr' variable in this function. "nr_ptes" sounds
> good to me, and will use it.

You can just use "nr" here and reuse it for the existing variable?

Both have the same semantics (nr of ptes / pages), so having a single
value might cause less confusion.

-- 
Cheers,

David
Re: [PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by Baolin Wang 1 month, 1 week ago

On 2/26/26 5:08 PM, David Hildenbrand (Arm) wrote:
> 
>>>
>>>> +            pte_t pteval = ptep_get(pvmw.pte);
>>>
>>> I wonder if there could be a way to avoid this ptep_get() by letting
>>> page_vma_mapped_walk() just provide the last value it used (in
>>> check_pte() I guess). Something for another patch.
>>
>> Well, we’d need to add a new field to ‘struct page_vma_mapped_walk’ to
>> store the last value (e.g., pvmw.pteval),
> 
> Yes.
> 
>> but this makes me wonder if it
>> is worth adding a new field just to avoid a lightweight read (which
>> should have no obvious performance impact).
> 
> You recall that ptep_get() on arm64 is not that lightweight due to
> cont-PTEs? :)
> 
> But yeah, something for another day.

OK.

>>> What is "batched"? Did you mean "nr_ptes" ? Or just the initial value
>>> for "nr" ?
>>
>> There is already an 'nr' variable in this function. "nr_ptes" sounds
>> good to me, and will use it.
> 
> You can just use "nr" here and reuse it for the existing variable?
> 
> Both have the same semantics (nr of ptes / pages), so having a single
> value might cause less confusion.

Sure. Will do.
Re: [PATCH 4/5] mm: support batched checking of the young flag for MGLRU
Posted by Rik van Riel 1 month, 1 week ago
On Tue, 2026-02-24 at 09:56 +0800, Baolin Wang wrote:
> Use the batched helper clear_young_ptes_notify() to check and clear
> the
> young flag to improve the performance during large folio reclamation
> when
> MGLRU is enabled.
> 
> Meanwhile, we can also support batched checking the young and dirty
> flag
> when MGLRU walks the mm's pagetable to update the folios' generation
> counter. Since MGLRU also checks the PTE dirty bit, use
> folio_pte_batch_flags()
> with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large
> folio.
> 
> Then we can remove the ptep_clear_young_notify() since it has no
> users now.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> 
I'm not entirely happy with how much code is duplicated
between MGLRU and classic LRU, but merging duplicate
functionality seems like a thing for a separate patch.

This one looks good to me.

Reviewed-by: Rik van Riel <riel@surriel.com>

-- 
All Rights Reversed.