[PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation

Barry Song posted 4 patches 12 months ago
[PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 12 months ago
From: Barry Song <v-songbaohua@oppo.com>

Currently, the PTEs and rmap of a large folio are removed one at a time.
This is not only slow but also causes the large folio to be unnecessarily
added to deferred_split, which can lead to races between the
deferred_split shrinker callback and memory reclamation. This patch
releases all PTEs and rmap entries in a batch.
Currently, it only handles lazyfree large folios.

The microbenchmark below reclaims 128MB of lazyfree large folios, each
64KiB in size:

 #include <stdio.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <time.h>

 #define SIZE 128*1024*1024  // 128 MB

 unsigned long read_split_deferred()
 {
 	FILE *file = fopen("/sys/kernel/mm/transparent_hugepage"
			"/hugepages-64kB/stats/split_deferred", "r");
 	if (!file) {
 		perror("Error opening file");
 		return 0;
 	}

 	unsigned long value;
 	if (fscanf(file, "%lu", &value) != 1) {
 		perror("Error reading value");
 		fclose(file);
 		return 0;
 	}

 	fclose(file);
 	return value;
 }

 int main(int argc, char *argv[])
 {
 	while(1) {
 		volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 		memset((void *)p, 1, SIZE);

 		madvise((void *)p, SIZE, MADV_FREE);

 		clock_t start_time = clock();
 		unsigned long start_split = read_split_deferred();
 		madvise((void *)p, SIZE, MADV_PAGEOUT);
 		clock_t end_time = clock();
 		unsigned long end_split = read_split_deferred();

 		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
 		printf("Time taken by reclamation: %f seconds, split_deferred: %ld\n",
 			elapsed_time, end_split - start_split);

 		munmap((void *)p, SIZE);
 	}
 	return 0;
 }

w/o patch:
~ # ./a.out
Time taken by reclamation: 0.177418 seconds, split_deferred: 2048
Time taken by reclamation: 0.178348 seconds, split_deferred: 2048
Time taken by reclamation: 0.174525 seconds, split_deferred: 2048
Time taken by reclamation: 0.171620 seconds, split_deferred: 2048
Time taken by reclamation: 0.172241 seconds, split_deferred: 2048
Time taken by reclamation: 0.174003 seconds, split_deferred: 2048
Time taken by reclamation: 0.171058 seconds, split_deferred: 2048
Time taken by reclamation: 0.171993 seconds, split_deferred: 2048
Time taken by reclamation: 0.169829 seconds, split_deferred: 2048
Time taken by reclamation: 0.172895 seconds, split_deferred: 2048
Time taken by reclamation: 0.176063 seconds, split_deferred: 2048
Time taken by reclamation: 0.172568 seconds, split_deferred: 2048
Time taken by reclamation: 0.171185 seconds, split_deferred: 2048
Time taken by reclamation: 0.170632 seconds, split_deferred: 2048
Time taken by reclamation: 0.170208 seconds, split_deferred: 2048
Time taken by reclamation: 0.174192 seconds, split_deferred: 2048
...

w/ patch:
~ # ./a.out
Time taken by reclamation: 0.074231 seconds, split_deferred: 0
Time taken by reclamation: 0.071026 seconds, split_deferred: 0
Time taken by reclamation: 0.072029 seconds, split_deferred: 0
Time taken by reclamation: 0.071873 seconds, split_deferred: 0
Time taken by reclamation: 0.073573 seconds, split_deferred: 0
Time taken by reclamation: 0.071906 seconds, split_deferred: 0
Time taken by reclamation: 0.073604 seconds, split_deferred: 0
Time taken by reclamation: 0.075903 seconds, split_deferred: 0
Time taken by reclamation: 0.073191 seconds, split_deferred: 0
Time taken by reclamation: 0.071228 seconds, split_deferred: 0
Time taken by reclamation: 0.071391 seconds, split_deferred: 0
Time taken by reclamation: 0.071468 seconds, split_deferred: 0
Time taken by reclamation: 0.071896 seconds, split_deferred: 0
Time taken by reclamation: 0.072508 seconds, split_deferred: 0
Time taken by reclamation: 0.071884 seconds, split_deferred: 0
Time taken by reclamation: 0.072433 seconds, split_deferred: 0
Time taken by reclamation: 0.071939 seconds, split_deferred: 0
...

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 89e51a7a9509..8786704bd466 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
 #endif
 }
 
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+			struct folio *folio, pte_t *ptep)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	int max_nr = folio_nr_pages(folio);
+	pte_t pte = ptep_get(ptep);
+
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return false;
+	if (pte_unused(pte))
+		return false;
+	if (pte_pfn(pte) != folio_pfn(folio))
+		return false;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+			       NULL, NULL) == max_nr;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1794,6 +1813,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	struct page *subpage;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	unsigned long nr_pages = 1, end_addr;
 	unsigned long pfn;
 	unsigned long hsz = 0;
 
@@ -1933,23 +1953,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
-			flush_cache_page(vma, address, pfn);
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+				nr_pages = folio_nr_pages(folio);
+			end_addr = address + nr_pages * PAGE_SIZE;
+			flush_cache_range(vma, address, end_addr);
 
-				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			/* Nuke the page table entry. */
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			/*
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
+			 */
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+			else
+				flush_tlb_range(vma, address, end_addr);
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else {
@@ -2027,7 +2050,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * redirtied either using the page table or a previously
 					 * obtained GUP reference.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					folio_set_swapbacked(folio);
 					goto walk_abort;
 				} else if (ref_count != 1 + map_count) {
@@ -2040,10 +2063,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * We'll come back here later and detect if the folio was
 					 * dirtied when the additional reference is gone.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					goto walk_abort;
 				}
-				dec_mm_counter(mm, MM_ANONPAGES);
+				add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
 				goto discard;
 			}
 
@@ -2108,13 +2131,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+			folio_ref_sub(folio, nr_pages - 1);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
+		/* We have already batched the entire folio */
+		if (nr_pages > 1)
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;
-- 
2.39.3 (Apple Git-146)
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Harry Yoo 7 months, 1 week ago
On Fri, Feb 14, 2025 at 10:30:14PM +1300, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> [...]

I'm still following the long discussions and follow-up patch series,
but let me ask a possibly silly question here :)

>  mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 50 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 89e51a7a9509..8786704bd466 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1933,23 +1953,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>  			if (pte_dirty(pteval))
>  				folio_mark_dirty(folio);
>  		} else if (likely(pte_present(pteval))) {
> -			flush_cache_page(vma, address, pfn);
> -			/* Nuke the page table entry. */
> -			if (should_defer_flush(mm, flags)) {
> -				/*
> -				 * We clear the PTE but do not flush so potentially
> -				 * a remote CPU could still be writing to the folio.
> -				 * If the entry was previously clean then the
> -				 * architecture must guarantee that a clear->dirty
> -				 * transition on a cached TLB entry is written through
> -				 * and traps if the PTE is unmapped.
> -				 */
> -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
> +			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> +			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> +				nr_pages = folio_nr_pages(folio);
> +			end_addr = address + nr_pages * PAGE_SIZE;
> +			flush_cache_range(vma, address, end_addr);
>  
> -				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
> -			} else {
> -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
> -			}
> +			/* Nuke the page table entry. */
> +			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
> +			/*
> +			 * We clear the PTE but do not flush so potentially
> +			 * a remote CPU could still be writing to the folio.
> +			 * If the entry was previously clean then the
> +			 * architecture must guarantee that a clear->dirty
> +			 * transition on a cached TLB entry is written through
> +			 * and traps if the PTE is unmapped.
> +			 */
> +			if (should_defer_flush(mm, flags))
> +				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);

When the first pte of a PTE-mapped THP has the _PAGE_PROTNONE bit set
(by NUMA balancing), can set_tlb_ubc_flush_pending() mistakenly think that
it doesn't need to flush the whole range, even though some ptes in the range
don't have the _PAGE_PROTNONE bit set?

> +			else
> +				flush_tlb_range(vma, address, end_addr);
>  			if (pte_dirty(pteval))
>  				folio_mark_dirty(folio);
>  		} else {

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Harry Yoo 7 months, 1 week ago
On Tue, Jul 01, 2025 at 07:03:50PM +0900, Harry Yoo wrote:
> On Fri, Feb 14, 2025 at 10:30:14PM +1300, Barry Song wrote:
> > From: Barry Song <v-songbaohua@oppo.com>
> > [...]
> 
> I'm still following the long discussions and follow-up patch series,
> but let me ask a possibly silly question here :)
> 
> >  mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 50 insertions(+), 22 deletions(-)
> > 
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 89e51a7a9509..8786704bd466 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1933,23 +1953,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >  			if (pte_dirty(pteval))
> >  				folio_mark_dirty(folio);
> >  		} else if (likely(pte_present(pteval))) {
> > -			flush_cache_page(vma, address, pfn);
> > -			/* Nuke the page table entry. */
> > -			if (should_defer_flush(mm, flags)) {
> > -				/*
> > -				 * We clear the PTE but do not flush so potentially
> > -				 * a remote CPU could still be writing to the folio.
> > -				 * If the entry was previously clean then the
> > -				 * architecture must guarantee that a clear->dirty
> > -				 * transition on a cached TLB entry is written through
> > -				 * and traps if the PTE is unmapped.
> > -				 */
> > -				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
> > +			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> > +			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> > +				nr_pages = folio_nr_pages(folio);
> > +			end_addr = address + nr_pages * PAGE_SIZE;
> > +			flush_cache_range(vma, address, end_addr);
> >  
> > -				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
> > -			} else {
> > -				pteval = ptep_clear_flush(vma, address, pvmw.pte);
> > -			}
> > +			/* Nuke the page table entry. */
> > +			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
> > +			/*
> > +			 * We clear the PTE but do not flush so potentially
> > +			 * a remote CPU could still be writing to the folio.
> > +			 * If the entry was previously clean then the
> > +			 * architecture must guarantee that a clear->dirty
> > +			 * transition on a cached TLB entry is written through
> > +			 * and traps if the PTE is unmapped.
> > +			 */
> > +			if (should_defer_flush(mm, flags))
> > +				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
> 
> When the first pte of a PTE-mapped THP has the _PAGE_PROTNONE bit set
> (by NUMA balancing), can set_tlb_ubc_flush_pending() mistakenly think that
> it doesn't need to flush the whole range, even though some ptes in the range
> don't have the _PAGE_PROTNONE bit set?

No, then folio_pte_batch() should have returned nr < folio_nr_pages(folio).

> > +			else
> > +				flush_tlb_range(vma, address, end_addr);
> >  			if (pte_dirty(pteval))
> >  				folio_mark_dirty(folio);
> >  		} else {
> 
> -- 
> Cheers,
> Harry / Hyeonggon

-- 
Cheers,
Harry / Hyeonggon
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 1 week ago
>>> +			/* Nuke the page table entry. */
>>> +			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
>>> +			/*
>>> +			 * We clear the PTE but do not flush so potentially
>>> +			 * a remote CPU could still be writing to the folio.
>>> +			 * If the entry was previously clean then the
>>> +			 * architecture must guarantee that a clear->dirty
>>> +			 * transition on a cached TLB entry is written through
>>> +			 * and traps if the PTE is unmapped.
>>> +			 */
>>> +			if (should_defer_flush(mm, flags))
>>> +				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
>>
>> When the first pte of a PTE-mapped THP has the _PAGE_PROTNONE bit set
>> (by NUMA balancing), can set_tlb_ubc_flush_pending() mistakenly think that
>> it doesn't need to flush the whole range, even though some ptes in the range
>> don't have the _PAGE_PROTNONE bit set?
> 
> No, then folio_pte_batch() should have returned nr < folio_nr_pages(folio).

Right, folio_pte_batch() does currently not batch across PTEs that 
differ in pte_protnone().
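
In terms of the check this patch adds, that means the following (this is just
the return statement from can_batch_unmap_folio_ptes() above, with the
implication spelled out; no new code):

```c
	/*
	 * folio_pte_batch() stops at the first PTE whose relevant bits
	 * (pte_protnone() among them) differ from the expected next PTE,
	 * so a protnone PTE inside the folio makes the batch shorter than
	 * max_nr and try_to_unmap_one() keeps using the single-PTE path.
	 */
	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
			       NULL, NULL) == max_nr;
```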

-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 14.02.25 10:30, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> [...]
>   mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++-----------------
>   1 file changed, 50 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 89e51a7a9509..8786704bd466 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> +/* We support batch unmapping of PTEs for lazyfree large folios */
> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> +			struct folio *folio, pte_t *ptep)
> +{
> +	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +	int max_nr = folio_nr_pages(folio);

Let's assume we have the first page of a folio mapped at the last page 
table entry in our page table.

What prevents folio_pte_batch() from reading outside the page table?


-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago
On 2025/6/24 20:55, David Hildenbrand wrote:
> On 14.02.25 10:30, Barry Song wrote:
>> From: Barry Song <v-songbaohua@oppo.com>
[...]
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 89e51a7a9509..8786704bd466 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio, 
>> struct page *page,
>>   #endif
>>   }
>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>> +            struct folio *folio, pte_t *ptep)
>> +{
>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +    int max_nr = folio_nr_pages(folio);
> 
> Let's assume we have the first page of a folio mapped at the last page 
> table entry in our page table.

Good point. I'm curious if it is something we've seen in practice ;)

> 
> What prevents folio_pte_batch() from reading outside the page table?

Assuming such a scenario is possible, to prevent any chance of an
out-of-bounds read, how about this change:

diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..9aeae811a38b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1852,6 +1852,25 @@ static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
 	int max_nr = folio_nr_pages(folio);
 	pte_t pte = ptep_get(ptep);
+	unsigned long end_addr;
+
+	/*
+	 * To batch unmap, the entire folio's PTEs must be contiguous
+	 * and mapped within the same PTE page table, which corresponds to
+	 * a single PMD entry. Before calling folio_pte_batch(), which does
+	 * not perform boundary checks itself, we must verify that the
+	 * address range covered by the folio does not cross a PMD boundary.
+	 */
+	end_addr = addr + (max_nr * PAGE_SIZE) - 1;
+
+	/*
+	 * A fast way to check for a PMD boundary cross is to align both
+	 * the start and end addresses to the PMD boundary and see if they
+	 * are different. If they are, the range spans across at least two
+	 * different PMD-managed regions.
+	 */
+	if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
+		return false;
 
 	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
 		return false;
--
Thanks,
Lance
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 24.06.25 17:26, Lance Yang wrote:
> On 2025/6/24 20:55, David Hildenbrand wrote:
>> On 14.02.25 10:30, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
> [...]
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 89e51a7a9509..8786704bd466 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio,
>>> struct page *page,
>>>    #endif
>>>    }
>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>> +            struct folio *folio, pte_t *ptep)
>>> +{
>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> +    int max_nr = folio_nr_pages(folio);
>>
>> Let's assume we have the first page of a folio mapped at the last page
>> table entry in our page table.
> 
> Good point. I'm curious if it is something we've seen in practice ;)

I challenge you to write a reproducer :P I assume it might be doable 
through simple mremap().

> 
>>
>> What prevents folio_pte_batch() from reading outside the page table?
> 
> Assuming such a scenario is possible, to prevent any chance of an
> out-of-bounds read, how about this change:
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fb63d9256f09..9aeae811a38b 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1852,6 +1852,25 @@ static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>   	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>   	int max_nr = folio_nr_pages(folio);
>   	pte_t pte = ptep_get(ptep);
> +	unsigned long end_addr;
> +
> +	/*
> +	 * To batch unmap, the entire folio's PTEs must be contiguous
> +	 * and mapped within the same PTE page table, which corresponds to
> +	 * a single PMD entry. Before calling folio_pte_batch(), which does
> +	 * not perform boundary checks itself, we must verify that the
> +	 * address range covered by the folio does not cross a PMD boundary.
> +	 */
> +	end_addr = addr + (max_nr * PAGE_SIZE) - 1;
> +
> +	/*
> +	 * A fast way to check for a PMD boundary cross is to align both
> +	 * the start and end addresses to the PMD boundary and see if they
> +	 * are different. If they are, the range spans across at least two
> +	 * different PMD-managed regions.
> +	 */
> +	if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
> +		return false;

You should not be messing with max_nr = folio_nr_pages(folio) here at 
all. folio_pte_batch() takes care of that.

Also, way too many comments ;)

You may only batch within a single VMA and within a single page table.

So simply align the addr up to the next PMD, and make sure it does not 
exceed the vma end.

ALIGN and friends can help avoiding excessive comments.
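
Something along these lines would be a minimal sketch of that idea (untested,
and the helper name is made up here rather than taken from whatever fix
eventually lands):

```c
/* Clamp the batch so it never crosses the current page table or VMA. */
static inline unsigned int folio_unmap_max_nr(unsigned long addr,
		struct vm_area_struct *vma, struct folio *folio)
{
	unsigned long end = min3(addr + folio_size(folio),
				 ALIGN(addr + 1, PMD_SIZE),
				 vma->vm_end);

	return (end - addr) >> PAGE_SHIFT;
}
```

The result would then be passed to folio_pte_batch() as max_nr instead of
folio_nr_pages(folio).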

-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/24 23:34, David Hildenbrand wrote:
> On 24.06.25 17:26, Lance Yang wrote:
>> On 2025/6/24 20:55, David Hildenbrand wrote:
>>> On 14.02.25 10:30, Barry Song wrote:
>>>> From: Barry Song <v-songbaohua@oppo.com>
>> [...]
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 89e51a7a9509..8786704bd466 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio,
>>>> struct page *page,
>>>>    #endif
>>>>    }
>>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> +            struct folio *folio, pte_t *ptep)
>>>> +{
>>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> +    int max_nr = folio_nr_pages(folio);
>>>
>>> Let's assume we have the first page of a folio mapped at the last page
>>> table entry in our page table.
>>
>> Good point. I'm curious if it is something we've seen in practice ;)
> 
> I challenge you to write a reproducer :P I assume it might be doable 
> through simple mremap().

Yes! The scenario is indeed reproducible from userspace ;p

First, I get a 64KB folio by allocating a large anonymous mapping and
advising the kernel with madvise(MADV_HUGEPAGE). After faulting in the
pages, /proc/self/pagemap confirms the PFNs are contiguous.

Then, the key is to use mremap() with MREMAP_FIXED to move the folio to
a virtual address that crosses a PMD boundary. Doing so ensures the
physically contiguous folio is mapped by PTEs from two different page
tables.

The C reproducer is attached. It was tested on a system with 64KB mTHP
enabled (in madvise mode). Please correct me if I'm wrong ;)


```
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <stdbool.h>

#define PAGE_SIZE ((size_t)sysconf(_SC_PAGESIZE))
#define FOLIO_SIZE (64 * 1024)
#define NUM_PAGES_IN_FOLIO (FOLIO_SIZE / PAGE_SIZE)
#define PMD_SIZE (2 * 1024 * 1024)

int get_pagemap_entry(uint64_t *entry, int pagemap_fd, uintptr_t vaddr) {
     size_t offset = (vaddr / PAGE_SIZE) * sizeof(uint64_t);
     if (pread(pagemap_fd, entry, sizeof(uint64_t), offset) != sizeof(uint64_t)) {
         perror("pread pagemap");
         return -1;
     }
     return 0;
}

int is_page_present(uint64_t entry) { return (entry >> 63) & 1; }
uint64_t get_pfn(uint64_t entry) { return entry & ((1ULL << 55) - 1); }

bool verify_contiguity(int pagemap_fd, uintptr_t vaddr, size_t size, const char *label) {
     printf("\n--- Verifying Contiguity for: %s at 0x%lx ---\n", label, vaddr);
     printf("Page |      Virtual Address      | Present |   PFN (Physical)   | Contiguous?\n");
     printf("-----+---------------------------+---------+--------------------+-------------\n");

     uint64_t first_pfn = 0;
     bool is_contiguous = true;
     int num_pages = size / PAGE_SIZE;

     for (int i = 0; i < num_pages; ++i) {
         uintptr_t current_vaddr = vaddr + i * PAGE_SIZE;
         uint64_t pagemap_entry;

         if (get_pagemap_entry(&pagemap_entry, pagemap_fd, current_vaddr) != 0) {
             is_contiguous = false;
             break;
         }

         if (!is_page_present(pagemap_entry)) {
             printf(" %2d  | 0x%016lx |    No   |        N/A         | 
Error\n", i, current_vaddr);
             is_contiguous = false;
             continue;
         }

         uint64_t pfn = get_pfn(pagemap_entry);
         char contiguous_str[4] = "Yes";

         if (i == 0) {
             first_pfn = pfn;
         } else {
             if (pfn != first_pfn + i) {
                 strcpy(contiguous_str, "No!");
                 is_contiguous = false;
             }
         }

         printf(" %2d  | 0x%016lx |   Yes   | 0x%-16lx |     %s\n", i, 
current_vaddr, pfn, contiguous_str);
     }

     if (is_contiguous) {
         printf("Verification PASSED: PFNs are contiguous for %s.\n", 
label);
     } else {
         printf("Verification FAILED: PFNs are NOT contiguous for 
%s.\n", label);
     }
     return is_contiguous;
}


int main(void) {
     printf("--- Folio-across-PMD-boundary reproducer ---\n");
     printf("Page size: %zu KB, Folio size: %zu KB, PMD coverage: %zu MB\n",
            PAGE_SIZE / 1024, FOLIO_SIZE / 1024, PMD_SIZE / (1024 * 1024));

     size_t source_size = 4 * 1024 * 1024;
     void *source_addr = mmap(NULL, source_size, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (source_addr == MAP_FAILED) {
         perror("mmap source"); exit(EXIT_FAILURE);
     }
     printf("\n1. Source memory mapped at: %p\n", source_addr);

     if (madvise(source_addr, source_size, MADV_HUGEPAGE) != 0) {
         perror("madvise MADV_HUGEPAGE");
     }
     printf("2. Advised kernel to use large folios (MADV_HUGEPAGE).\n");

     memset(source_addr, 'A', source_size);
     printf("3. Faulted in source pages.\n");

     int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
     if (pagemap_fd < 0) {
         perror("open /proc/self/pagemap");
         exit(EXIT_FAILURE);
     }

     if (!verify_contiguity(pagemap_fd, (uintptr_t)source_addr, FOLIO_SIZE, "Source Address (pre-mremap)")) {
         fprintf(stderr, "\nInitial folio allocation failed. Cannot proceed.\n");
         close(pagemap_fd);
         munmap(source_addr, source_size);
         exit(EXIT_FAILURE);
     }

     uintptr_t search_base = 0x10000000000UL;
     uintptr_t pmd_boundary = (search_base + PMD_SIZE) & ~(PMD_SIZE - 1);
     uintptr_t target_vaddr = pmd_boundary - PAGE_SIZE;
     printf("\n5. Calculated target address to be 0x%lx\n", target_vaddr);

     munmap((void *)target_vaddr, FOLIO_SIZE);
     void *new_addr = mremap(source_addr, FOLIO_SIZE, FOLIO_SIZE,
                             MREMAP_MAYMOVE | MREMAP_FIXED, (void *)target_vaddr);
     if (new_addr == MAP_FAILED) {
         perror("mremap");
         close(pagemap_fd);
         exit(EXIT_FAILURE);
     }
     printf("6. Successfully mremap'd %zu KB to 0x%lx.\n", FOLIO_SIZE / 
1024, (uintptr_t)new_addr);

     bool final_success = verify_contiguity(pagemap_fd, (uintptr_t)new_addr, FOLIO_SIZE, "Target Address (post-mremap)");

     printf("\n--- Final Conclusion ---\n");
     if (final_success) {
         printf("✅ SUCCESS: The folio's pages remained physically 
contiguous after remapping to a PMD-crossing virtual address.\n");
         printf("   The reproducer successfully created the desired 
edge-case memory layout.\n");
     } else {
         printf("❌ UNEXPECTED FAILURE: The pages were not contiguous 
after mremap.\n");
     }

     close(pagemap_fd);
     munmap(new_addr, FOLIO_SIZE);

     return 0;
}
```

$ a.out

```
--- Folio-across-PMD-boundary reproducer ---
Page size: 4 KB, Folio size: 64 KB, PMD coverage: 2 MB

1. Source memory mapped at: 0x7f2e41200000
2. Advised kernel to use large folios (MADV_HUGEPAGE).
3. Faulted in source pages.

--- Verifying Contiguity for: Source Address (pre-mremap) at 0x7f2e41200000 ---
Page |      Virtual Address      | Present |   PFN (Physical)   | Contiguous?
-----+---------------------------+---------+--------------------+-------------
   0  | 0x00007f2e41200000 |   Yes   | 0x113aa0           |     Yes
   1  | 0x00007f2e41201000 |   Yes   | 0x113aa1           |     Yes
   2  | 0x00007f2e41202000 |   Yes   | 0x113aa2           |     Yes
   3  | 0x00007f2e41203000 |   Yes   | 0x113aa3           |     Yes
   4  | 0x00007f2e41204000 |   Yes   | 0x113aa4           |     Yes
   5  | 0x00007f2e41205000 |   Yes   | 0x113aa5           |     Yes
   6  | 0x00007f2e41206000 |   Yes   | 0x113aa6           |     Yes
   7  | 0x00007f2e41207000 |   Yes   | 0x113aa7           |     Yes
   8  | 0x00007f2e41208000 |   Yes   | 0x113aa8           |     Yes
   9  | 0x00007f2e41209000 |   Yes   | 0x113aa9           |     Yes
  10  | 0x00007f2e4120a000 |   Yes   | 0x113aaa           |     Yes
  11  | 0x00007f2e4120b000 |   Yes   | 0x113aab           |     Yes
  12  | 0x00007f2e4120c000 |   Yes   | 0x113aac           |     Yes
  13  | 0x00007f2e4120d000 |   Yes   | 0x113aad           |     Yes
  14  | 0x00007f2e4120e000 |   Yes   | 0x113aae           |     Yes
  15  | 0x00007f2e4120f000 |   Yes   | 0x113aaf           |     Yes
Verification PASSED: PFNs are contiguous for Source Address (pre-mremap).

5. Calculated target address to be 0x100001ff000
6. Successfully mremap'd 64 KB to 0x100001ff000.

--- Verifying Contiguity for: Target Address (post-mremap) at 0x100001ff000 ---
Page |      Virtual Address      | Present |   PFN (Physical)   | Contiguous?
-----+---------------------------+---------+--------------------+-------------
   0  | 0x00000100001ff000 |   Yes   | 0x113aa0           |     Yes
   1  | 0x0000010000200000 |   Yes   | 0x113aa1           |     Yes
   2  | 0x0000010000201000 |   Yes   | 0x113aa2           |     Yes
   3  | 0x0000010000202000 |   Yes   | 0x113aa3           |     Yes
   4  | 0x0000010000203000 |   Yes   | 0x113aa4           |     Yes
   5  | 0x0000010000204000 |   Yes   | 0x113aa5           |     Yes
   6  | 0x0000010000205000 |   Yes   | 0x113aa6           |     Yes
   7  | 0x0000010000206000 |   Yes   | 0x113aa7           |     Yes
   8  | 0x0000010000207000 |   Yes   | 0x113aa8           |     Yes
   9  | 0x0000010000208000 |   Yes   | 0x113aa9           |     Yes
  10  | 0x0000010000209000 |   Yes   | 0x113aaa           |     Yes
  11  | 0x000001000020a000 |   Yes   | 0x113aab           |     Yes
  12  | 0x000001000020b000 |   Yes   | 0x113aac           |     Yes
  13  | 0x000001000020c000 |   Yes   | 0x113aad           |     Yes
  14  | 0x000001000020d000 |   Yes   | 0x113aae           |     Yes
  15  | 0x000001000020e000 |   Yes   | 0x113aaf           |     Yes
Verification PASSED: PFNs are contiguous for Target Address (post-mremap).

--- Final Conclusion ---
✅ SUCCESS: The folio's pages remained physically contiguous after remapping to a PMD-crossing virtual address.
    The reproducer successfully created the desired edge-case memory layout.
```
Thanks,
Lance

> [...]

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/25 16:44, Lance Yang wrote:
> 
> 
> On 2025/6/24 23:34, David Hildenbrand wrote:
>> [...]
>> I challenge you to write a reproducer :P I assume it might be doable 
>> through simple mremap().
> 
> Yes! The scenario is indeed reproducible from userspace ;p
> 
> First, I get a 64KB folio by allocating a large anonymous mapping and
> advising the kernel with madvise(MADV_HUGEPAGE). After faulting in the
> pages, /proc/self/pagemap confirms the PFNs are contiguous.
> 
> Then, the key is to use mremap() with MREMAP_FIXED to move the folio to
> a virtual address that crosses a PMD boundary. Doing so ensures the
> physically contiguous folio is mapped by PTEs from two different page
> tables.

Forgot to add:

The mTHP split counters didn't change during the mremap, which confirms
the large folio was only remapped, not split.
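
A quick way to double-check that in the reproducer is to sample the per-size
mTHP stats around the mremap() call. A sketch, reusing the same sysfs counters
as the microbenchmark in the patch (the helper name below is mine, not an
existing API):

```c
#include <stdio.h>

/* Read one hugepages-64kB stat counter, e.g. "split" or "split_deferred". */
static unsigned long read_mthp_stat(const char *name)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/%s",
		 name);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%lu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

/*
 * Usage around the mremap() in the reproducer:
 *
 *	unsigned long split_before = read_mthp_stat("split");
 *	... mremap(..., MREMAP_MAYMOVE | MREMAP_FIXED, ...) ...
 *	printf("split delta: %lu\n", read_mthp_stat("split") - split_before);
 */
```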

Thanks,
Lance

> 
> The C reproducer is attached. It was tested on a system with 64KB mTHP
> enabled (in madvise mode). Please correct me if I'm wrong ;)
> 
> 
> ```
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <stdint.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <errno.h>
> #include <sys/mman.h>
> #include <stdbool.h>
> 
> #define PAGE_SIZE ((size_t)sysconf(_SC_PAGESIZE))
> #define FOLIO_SIZE (64 * 1024)
> #define NUM_PAGES_IN_FOLIO (FOLIO_SIZE / PAGE_SIZE)
> #define PMD_SIZE (2 * 1024 * 1024)
> 
> int get_pagemap_entry(uint64_t *entry, int pagemap_fd, uintptr_t vaddr) {
>      size_t offset = (vaddr / PAGE_SIZE) * sizeof(uint64_t);
>      if (pread(pagemap_fd, entry, sizeof(uint64_t), offset) != 
> sizeof(uint64_t)) {
>          perror("pread pagemap");
>          return -1;
>      }
>      return 0;
> }
> 
> int is_page_present(uint64_t entry) { return (entry >> 63) & 1; }
> uint64_t get_pfn(uint64_t entry) { return entry & ((1ULL << 55) - 1); }
> 
> bool verify_contiguity(int pagemap_fd, uintptr_t vaddr, size_t size, 
> const char *label) {
>      printf("\n--- Verifying Contiguity for: %s at 0x%lx ---\n", label, 
> vaddr);
>      printf("Page |      Virtual Address      | Present |   PFN 
> (Physical)   | Contiguous?\n");
> 
> printf("-----+---------------------------+---------+-------------------- 
> +-------------\n");
> 
>      uint64_t first_pfn = 0;
>      bool is_contiguous = true;
>      int num_pages = size / PAGE_SIZE;
> 
>      for (int i = 0; i < num_pages; ++i) {
>          uintptr_t current_vaddr = vaddr + i * PAGE_SIZE;
>          uint64_t pagemap_entry;
> 
>          if (get_pagemap_entry(&pagemap_entry, pagemap_fd, 
> current_vaddr) != 0) {
>              is_contiguous = false;
>              break;
>          }
> 
>          if (!is_page_present(pagemap_entry)) {
>              printf(" %2d  | 0x%016lx |    No   |        N/A         | 
> Error\n", i, current_vaddr);
>              is_contiguous = false;
>              continue;
>          }
> 
>          uint64_t pfn = get_pfn(pagemap_entry);
>          char contiguous_str[4] = "Yes";
> 
>          if (i == 0) {
>              first_pfn = pfn;
>          } else {
>              if (pfn != first_pfn + i) {
>                  strcpy(contiguous_str, "No!");
>                  is_contiguous = false;
>              }
>          }
> 
>          printf(" %2d  | 0x%016lx |   Yes   | 0x%-16lx |     %s\n", i, 
> current_vaddr, pfn, contiguous_str);
>      }
> 
>      if (is_contiguous) {
>          printf("Verification PASSED: PFNs are contiguous for %s.\n", 
> label);
>      } else {
>          printf("Verification FAILED: PFNs are NOT contiguous for %s. 
> \n", label);
>      }
>      return is_contiguous;
> }
> 
> 
> int main(void) {
>      printf("--- Folio-across-PMD-boundary reproducer ---\n");
>      printf("Page size: %zu KB, Folio size: %zu KB, PMD coverage: %zu 
> MB\n",
>             PAGE_SIZE / 1024, FOLIO_SIZE / 1024, PMD_SIZE / (1024 * 1024));
> 
>      size_t source_size = 4 * 1024 * 1024;
>      void *source_addr = mmap(NULL, source_size, PROT_READ | PROT_WRITE,
>                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>      if (source_addr == MAP_FAILED) {
>          perror("mmap source"); exit(EXIT_FAILURE);
>      }
>      printf("\n1. Source memory mapped at: %p\n", source_addr);
> 
>      if (madvise(source_addr, source_size, MADV_HUGEPAGE) != 0) {
>          perror("madvise MADV_HUGEPAGE");
>      }
>      printf("2. Advised kernel to use large folios (MADV_HUGEPAGE).\n");
> 
>      memset(source_addr, 'A', source_size);
>      printf("3. Faulted in source pages.\n");
> 
>      int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
>      if (pagemap_fd < 0) {
>          perror("open /proc/self/pagemap");
>          exit(EXIT_FAILURE);
>      }
> 
>      if (!verify_contiguity(pagemap_fd, (uintptr_t)source_addr, 
> FOLIO_SIZE, "Source Address (pre-mremap)")) {
>          fprintf(stderr, "\nInitial folio allocation failed. Cannot 
> proceed.\n");
>          close(pagemap_fd);
>          munmap(source_addr, source_size);
>          exit(EXIT_FAILURE);
>      }
> 
>      uintptr_t search_base = 0x10000000000UL;
>      uintptr_t pmd_boundary = (search_base + PMD_SIZE) & ~(PMD_SIZE - 1);
>      uintptr_t target_vaddr = pmd_boundary - PAGE_SIZE;
>      printf("\n5. Calculated target address to be 0x%lx\n", target_vaddr);
> 
>      munmap((void *)target_vaddr, FOLIO_SIZE);
>      void *new_addr = mremap(source_addr, FOLIO_SIZE, FOLIO_SIZE, 
> MREMAP_MAYMOVE | MREMAP_FIXED, (void *)target_vaddr);
>      if (new_addr == MAP_FAILED) {
>          perror("mremap");
>          close(pagemap_fd);
>          exit(EXIT_FAILURE);
>      }
>      printf("6. Successfully mremap'd %zu KB to 0x%lx.\n", FOLIO_SIZE / 
> 1024, (uintptr_t)new_addr);
> 
>      bool final_success = verify_contiguity(pagemap_fd, 
> (uintptr_t)new_addr, FOLIO_SIZE, "Target Address (post-mremap)");
> 
>      printf("\n--- Final Conclusion ---\n");
>      if (final_success) {
>          printf("✅ SUCCESS: The folio's pages remained physically 
> contiguous after remapping to a PMD-crossing virtual address.\n");
>          printf("   The reproducer successfully created the desired 
> edge-case memory layout.\n");
>      } else {
>          printf("❌ UNEXPECTED FAILURE: The pages were not contiguous 
> after mremap.\n");
>      }
> 
>      close(pagemap_fd);
>      munmap(new_addr, FOLIO_SIZE);
> 
>      return 0;
> }
> ```
> 
> $ a.out
> 
> ```
> --- Folio-across-PMD-boundary reproducer ---
> Page size: 4 KB, Folio size: 64 KB, PMD coverage: 2 MB
> 
> 1. Source memory mapped at: 0x7f2e41200000
> 2. Advised kernel to use large folios (MADV_HUGEPAGE).
> 3. Faulted in source pages.
> 
> --- Verifying Contiguity for: Source Address (pre-mremap) at 
> 0x7f2e41200000 ---
> Page |      Virtual Address      | Present |   PFN (Physical)   | 
> Contiguous?
> -----+---------------------------+---------+-------------------- 
> +-------------
>    0  | 0x00007f2e41200000 |   Yes   | 0x113aa0           |     Yes
>    1  | 0x00007f2e41201000 |   Yes   | 0x113aa1           |     Yes
>    2  | 0x00007f2e41202000 |   Yes   | 0x113aa2           |     Yes
>    3  | 0x00007f2e41203000 |   Yes   | 0x113aa3           |     Yes
>    4  | 0x00007f2e41204000 |   Yes   | 0x113aa4           |     Yes
>    5  | 0x00007f2e41205000 |   Yes   | 0x113aa5           |     Yes
>    6  | 0x00007f2e41206000 |   Yes   | 0x113aa6           |     Yes
>    7  | 0x00007f2e41207000 |   Yes   | 0x113aa7           |     Yes
>    8  | 0x00007f2e41208000 |   Yes   | 0x113aa8           |     Yes
>    9  | 0x00007f2e41209000 |   Yes   | 0x113aa9           |     Yes
>   10  | 0x00007f2e4120a000 |   Yes   | 0x113aaa           |     Yes
>   11  | 0x00007f2e4120b000 |   Yes   | 0x113aab           |     Yes
>   12  | 0x00007f2e4120c000 |   Yes   | 0x113aac           |     Yes
>   13  | 0x00007f2e4120d000 |   Yes   | 0x113aad           |     Yes
>   14  | 0x00007f2e4120e000 |   Yes   | 0x113aae           |     Yes
>   15  | 0x00007f2e4120f000 |   Yes   | 0x113aaf           |     Yes
> Verification PASSED: PFNs are contiguous for Source Address (pre-mremap).
> 
> 5. Calculated target address to be 0x100001ff000
> 6. Successfully mremap'd 64 KB to 0x100001ff000.
> 
> --- Verifying Contiguity for: Target Address (post-mremap) at 
> 0x100001ff000 ---
> Page |      Virtual Address      | Present |   PFN (Physical)   | 
> Contiguous?
> -----+---------------------------+---------+-------------------- 
> +-------------
>    0  | 0x00000100001ff000 |   Yes   | 0x113aa0           |     Yes
>    1  | 0x0000010000200000 |   Yes   | 0x113aa1           |     Yes
>    2  | 0x0000010000201000 |   Yes   | 0x113aa2           |     Yes
>    3  | 0x0000010000202000 |   Yes   | 0x113aa3           |     Yes
>    4  | 0x0000010000203000 |   Yes   | 0x113aa4           |     Yes
>    5  | 0x0000010000204000 |   Yes   | 0x113aa5           |     Yes
>    6  | 0x0000010000205000 |   Yes   | 0x113aa6           |     Yes
>    7  | 0x0000010000206000 |   Yes   | 0x113aa7           |     Yes
>    8  | 0x0000010000207000 |   Yes   | 0x113aa8           |     Yes
>    9  | 0x0000010000208000 |   Yes   | 0x113aa9           |     Yes
>   10  | 0x0000010000209000 |   Yes   | 0x113aaa           |     Yes
>   11  | 0x000001000020a000 |   Yes   | 0x113aab           |     Yes
>   12  | 0x000001000020b000 |   Yes   | 0x113aac           |     Yes
>   13  | 0x000001000020c000 |   Yes   | 0x113aad           |     Yes
>   14  | 0x000001000020d000 |   Yes   | 0x113aae           |     Yes
>   15  | 0x000001000020e000 |   Yes   | 0x113aaf           |     Yes
> Verification PASSED: PFNs are contiguous for Target Address (post-mremap).
> 
> --- Final Conclusion ---
> ✅ SUCCESS: The folio's pages remained physically contiguous after remapping to a PMD-crossing virtual address.
>     The reproducer successfully created the desired edge-case memory layout.
> ```
> Thanks,
> Lance
> 
>>
>>>
>>>>
>>>> What prevents folio_pte_batch() from reading outside the page table?
>>>
>>> Assuming such a scenario is possible, to prevent any chance of an
>>> out-of-bounds read, how about this change:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index fb63d9256f09..9aeae811a38b 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1852,6 +1852,25 @@ static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>       int max_nr = folio_nr_pages(folio);
>>>       pte_t pte = ptep_get(ptep);
>>> +    unsigned long end_addr;
>>> +
>>> +    /*
>>> +     * To batch unmap, the entire folio's PTEs must be contiguous
>>> +     * and mapped within the same PTE page table, which corresponds to
>>> +     * a single PMD entry. Before calling folio_pte_batch(), which does
>>> +     * not perform boundary checks itself, we must verify that the
>>> +     * address range covered by the folio does not cross a PMD boundary.
>>> +     */
>>> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
>>> +
>>> +    /*
>>> +     * A fast way to check for a PMD boundary cross is to align both
>>> +     * the start and end addresses to the PMD boundary and see if they
>>> +     * are different. If they are, the range spans across at least two
>>> +     * different PMD-managed regions.
>>> +     */
>>> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
>>> +        return false;
>>
>> You should not be messing with max_nr = folio_nr_pages(folio) here at 
>> all. folio_pte_batch() takes care of that.
>>
>> Also, way too many comments ;)
>>
>> You may only batch within a single VMA and within a single page table.
>>
>> So simply align the addr up to the next PMD, and make sure it does not 
>> exceed the vma end.
>>
>> ALIGN and friends can help avoiding excessive comments.
>>
> 

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago
On 2025/6/24 23:34, David Hildenbrand wrote:
> On 24.06.25 17:26, Lance Yang wrote:
>> On 2025/6/24 20:55, David Hildenbrand wrote:
>>> On 14.02.25 10:30, Barry Song wrote:
>>>> From: Barry Song <v-songbaohua@oppo.com>
>> [...]
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 89e51a7a9509..8786704bd466 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio,
>>>> struct page *page,
>>>>    #endif
>>>>    }
>>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> +            struct folio *folio, pte_t *ptep)
>>>> +{
>>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> +    int max_nr = folio_nr_pages(folio);
>>>
>>> Let's assume we have the first page of a folio mapped at the last page
>>> table entry in our page table.
>>
>> Good point. I'm curious if it is something we've seen in practice ;)
> 
> I challenge you to write a reproducer :P I assume it might be doable 
> through simple mremap().
> 
>>
>>>
>>> What prevents folio_pte_batch() from reading outside the page table?
>>
>> Assuming such a scenario is possible, to prevent any chance of an
>> out-of-bounds read, how about this change:
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index fb63d9256f09..9aeae811a38b 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1852,6 +1852,25 @@ static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>       int max_nr = folio_nr_pages(folio);
>>       pte_t pte = ptep_get(ptep);
>> +    unsigned long end_addr;
>> +
>> +    /*
>> +     * To batch unmap, the entire folio's PTEs must be contiguous
>> +     * and mapped within the same PTE page table, which corresponds to
>> +     * a single PMD entry. Before calling folio_pte_batch(), which does
>> +     * not perform boundary checks itself, we must verify that the
>> +     * address range covered by the folio does not cross a PMD boundary.
>> +     */
>> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
>> +
>> +    /*
>> +     * A fast way to check for a PMD boundary cross is to align both
>> +     * the start and end addresses to the PMD boundary and see if they
>> +     * are different. If they are, the range spans across at least two
>> +     * different PMD-managed regions.
>> +     */
>> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
>> +        return false;
> 
> You should not be messing with max_nr = folio_nr_pages(folio) here at 
> all. folio_pte_batch() takes care of that.
> 
> Also, way too many comments ;)
> 
> You may only batch within a single VMA and within a single page table.
> 
> So simply align the addr up to the next PMD, and make sure it does not 
> exceed the vma end.
> 
> ALIGN and friends can help avoiding excessive comments.

Thanks! How about this updated version based on your suggestion:

diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..241d55a92a47 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
 
 /* We support batch unmapping of PTEs for lazyfree large folios */
 static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
-			struct folio *folio, pte_t *ptep)
+					      struct folio *folio, pte_t *ptep,
+					      struct vm_area_struct *vma)
 {
 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	unsigned long next_pmd, vma_end, end_addr;
 	int max_nr = folio_nr_pages(folio);
 	pte_t pte = ptep_get(ptep);
 
+	/*
+	 * Limit the batch scan within a single VMA and within a single
+	 * page table.
+	 */
+	vma_end = vma->vm_end;
+	next_pmd = ALIGN(addr + 1, PMD_SIZE);
+	end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
+
+	if (end_addr > min(next_pmd, vma_end))
+		return false;
+
 	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
 		return false;
 	if (pte_unused(pte))
@@ -2025,7 +2038,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
 			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
-			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte, pvmw.vma))
 				nr_pages = folio_nr_pages(folio);
 			end_addr = address + nr_pages * PAGE_SIZE;
 			flush_cache_range(vma, address, end_addr);
--
Thanks,
Lance
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 24.06.25 18:25, Lance Yang wrote:
> On 2025/6/24 23:34, David Hildenbrand wrote:
>> On 24.06.25 17:26, Lance Yang wrote:
>>> On 2025/6/24 20:55, David Hildenbrand wrote:
>>>> On 14.02.25 10:30, Barry Song wrote:
>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>> [...]
>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>> index 89e51a7a9509..8786704bd466 100644
>>>>> --- a/mm/rmap.c
>>>>> +++ b/mm/rmap.c
>>>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio,
>>>>> struct page *page,
>>>>>     #endif
>>>>>     }
>>>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>>> +            struct folio *folio, pte_t *ptep)
>>>>> +{
>>>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>>> +    int max_nr = folio_nr_pages(folio);
>>>>
>>>> Let's assume we have the first page of a folio mapped at the last page
>>>> table entry in our page table.
>>>
>>> Good point. I'm curious if it is something we've seen in practice ;)
>>
>> I challenge you to write a reproducer :P I assume it might be doable
>> through simple mremap().
>>
>>>
>>>>
>>>> What prevents folio_pte_batch() from reading outside the page table?
>>>
>>> Assuming such a scenario is possible, to prevent any chance of an
>>> out-of-bounds read, how about this change:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index fb63d9256f09..9aeae811a38b 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1852,6 +1852,25 @@ static inline bool
>>> can_batch_unmap_folio_ptes(unsigned long addr,
>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>        int max_nr = folio_nr_pages(folio);
>>>        pte_t pte = ptep_get(ptep);
>>> +    unsigned long end_addr;
>>> +
>>> +    /*
>>> +     * To batch unmap, the entire folio's PTEs must be contiguous
>>> +     * and mapped within the same PTE page table, which corresponds to
>>> +     * a single PMD entry. Before calling folio_pte_batch(), which does
>>> +     * not perform boundary checks itself, we must verify that the
>>> +     * address range covered by the folio does not cross a PMD boundary.
>>> +     */
>>> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
>>> +
>>> +    /*
>>> +     * A fast way to check for a PMD boundary cross is to align both
>>> +     * the start and end addresses to the PMD boundary and see if they
>>> +     * are different. If they are, the range spans across at least two
>>> +     * different PMD-managed regions.
>>> +     */
>>> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
>>> +        return false;
>>
>> You should not be messing with max_nr = folio_nr_pages(folio) here at
>> all. folio_pte_batch() takes care of that.
>>
>> Also, way too many comments ;)
>>
>> You may only batch within a single VMA and within a single page table.
>>
>> So simply align the addr up to the next PMD, and make sure it does not
>> exceed the vma end.
>>
>> ALIGN and friends can help avoiding excessive comments.
> 
> Thanks! How about this updated version based on your suggestion:
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fb63d9256f09..241d55a92a47 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   
>   /* We support batch unmapping of PTEs for lazyfree large folios */
>   static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -			struct folio *folio, pte_t *ptep)
> +					      struct folio *folio, pte_t *ptep,
> +					      struct vm_area_struct *vma)
>   {
>   	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +	unsigned long next_pmd, vma_end, end_addr;
>   	int max_nr = folio_nr_pages(folio);
>   	pte_t pte = ptep_get(ptep);
>   
> +	/*
> +	 * Limit the batch scan within a single VMA and within a single
> +	 * page table.
> +	 */
> +	vma_end = vma->vm_end;
> +	next_pmd = ALIGN(addr + 1, PMD_SIZE);
> +	end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
> +
> +	if (end_addr > min(next_pmd, vma_end))
> +		return false;

May I suggest that we clean all that up as we fix it?

Maybe something like this:

diff --git a/mm/rmap.c b/mm/rmap.c
index 3b74bb19c11dd..11fbddc6ad8d6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
  #endif
  }
  
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
-                       struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
+               pte_t pte)
  {
         const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
-       int max_nr = folio_nr_pages(folio);
-       pte_t pte = ptep_get(ptep);
+       struct vm_area_struct *vma = pvmw->vma;
+       unsigned long end_addr, addr = pvmw->address;
+       unsigned int max_nr;
+
+       if (flags & TTU_HWPOISON)
+               return 1;
+       if (!folio_test_large(folio))
+               return 1;
+
+       /* We may only batch within a single VMA and a single page table. */
+       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
+       max_nr = (end_addr - addr) >> PAGE_SHIFT;
  
+       /* We only support lazyfree batching for now ... */
         if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
-               return false;
+               return 1;
         if (pte_unused(pte))
-               return false;
-       if (pte_pfn(pte) != folio_pfn(folio))
-               return false;
+               return 1;
+       /* ... where we must be able to batch the whole folio. */
+       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
+               return 1;
+       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+                                NULL, NULL, NULL);
  
-       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
-                              NULL, NULL) == max_nr;
+       if (max_nr != folio_nr_pages(folio))
+               return 1;
+       return max_nr;
  }
  
  /*
@@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                         if (pte_dirty(pteval))
                                 folio_mark_dirty(folio);
                 } else if (likely(pte_present(pteval))) {
-                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
-                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
-                               nr_pages = folio_nr_pages(folio);
+                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
                         end_addr = address + nr_pages * PAGE_SIZE;
                         flush_cache_range(vma, address, end_addr);
  

Note that I don't quite understand why we have to batch the whole thing or fallback to
individual pages. Why can't we perform other batches that span only some PTEs? What's special
about 1 PTE vs. 2 PTEs vs. all PTEs?


Can someone enlighten me why that is required?

-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/25 18:00, David Hildenbrand wrote:
> On 24.06.25 18:25, Lance Yang wrote:
>> On 2025/6/24 23:34, David Hildenbrand wrote:
>>> On 24.06.25 17:26, Lance Yang wrote:
>>>> On 2025/6/24 20:55, David Hildenbrand wrote:
>>>>> On 14.02.25 10:30, Barry Song wrote:
>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>> [...]
>>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>>> index 89e51a7a9509..8786704bd466 100644
>>>>>> --- a/mm/rmap.c
>>>>>> +++ b/mm/rmap.c
>>>>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio 
>>>>>> *folio,
>>>>>> struct page *page,
>>>>>>     #endif
>>>>>>     }
>>>>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>>>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>>>> +            struct folio *folio, pte_t *ptep)
>>>>>> +{
>>>>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | 
>>>>>> FPB_IGNORE_SOFT_DIRTY;
>>>>>> +    int max_nr = folio_nr_pages(folio);
>>>>>
>>>>> Let's assume we have the first page of a folio mapped at the last page
>>>>> table entry in our page table.
>>>>
>>>> Good point. I'm curious if it is something we've seen in practice ;)
>>>
>>> I challenge you to write a reproducer :P I assume it might be doable
>>> through simple mremap().
>>>
>>>>
>>>>>
>>>>> What prevents folio_pte_batch() from reading outside the page table?
>>>>
>>>> Assuming such a scenario is possible, to prevent any chance of an
>>>> out-of-bounds read, how about this change:
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index fb63d9256f09..9aeae811a38b 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1852,6 +1852,25 @@ static inline bool
>>>> can_batch_unmap_folio_ptes(unsigned long addr,
>>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | 
>>>> FPB_IGNORE_SOFT_DIRTY;
>>>>        int max_nr = folio_nr_pages(folio);
>>>>        pte_t pte = ptep_get(ptep);
>>>> +    unsigned long end_addr;
>>>> +
>>>> +    /*
>>>> +     * To batch unmap, the entire folio's PTEs must be contiguous
>>>> +     * and mapped within the same PTE page table, which corresponds to
>>>> +     * a single PMD entry. Before calling folio_pte_batch(), which 
>>>> does
>>>> +     * not perform boundary checks itself, we must verify that the
>>>> +     * address range covered by the folio does not cross a PMD 
>>>> boundary.
>>>> +     */
>>>> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
>>>> +
>>>> +    /*
>>>> +     * A fast way to check for a PMD boundary cross is to align both
>>>> +     * the start and end addresses to the PMD boundary and see if they
>>>> +     * are different. If they are, the range spans across at least two
>>>> +     * different PMD-managed regions.
>>>> +     */
>>>> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
>>>> +        return false;
>>>
>>> You should not be messing with max_nr = folio_nr_pages(folio) here at
>>> all. folio_pte_batch() takes care of that.
>>>
>>> Also, way too many comments ;)
>>>
>>> You may only batch within a single VMA and within a single page table.
>>>
>>> So simply align the addr up to the next PMD, and make sure it does not
>>> exceed the vma end.
>>>
>>> ALIGN and friends can help avoiding excessive comments.
>>
>> Thanks! How about this updated version based on your suggestion:
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index fb63d9256f09..241d55a92a47 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio 
>> *folio, struct page *page,
>>   /* We support batch unmapping of PTEs for lazyfree large folios */
>>   static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>> -            struct folio *folio, pte_t *ptep)
>> +                          struct folio *folio, pte_t *ptep,
>> +                          struct vm_area_struct *vma)
>>   {
>>       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +    unsigned long next_pmd, vma_end, end_addr;
>>       int max_nr = folio_nr_pages(folio);
>>       pte_t pte = ptep_get(ptep);
>> +    /*
>> +     * Limit the batch scan within a single VMA and within a single
>> +     * page table.
>> +     */
>> +    vma_end = vma->vm_end;
>> +    next_pmd = ALIGN(addr + 1, PMD_SIZE);
>> +    end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
>> +
>> +    if (end_addr > min(next_pmd, vma_end))
>> +        return false;
> 
> May I suggest that we clean all that up as we fix it?

Yeah, that looks much better. Thanks for the suggestion!

> 
> Maybe something like this:
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 3b74bb19c11dd..11fbddc6ad8d6 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   #endif
>   }
> 
> -/* We support batch unmapping of PTEs for lazyfree large folios */
> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -                       struct folio *folio, pte_t *ptep)
> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
> +               pte_t pte)
>   {
>          const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> -       int max_nr = folio_nr_pages(folio);
> -       pte_t pte = ptep_get(ptep);
> +       struct vm_area_struct *vma = pvmw->vma;
> +       unsigned long end_addr, addr = pvmw->address;
> +       unsigned int max_nr;
> +
> +       if (flags & TTU_HWPOISON)
> +               return 1;
> +       if (!folio_test_large(folio))
> +               return 1;
> +
> +       /* We may only batch within a single VMA and a single page table. */
> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
> 
> +       /* We only support lazyfree batching for now ... */
>          if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
> -               return false;
> +               return 1;
>          if (pte_unused(pte))
> -               return false;
> -       if (pte_pfn(pte) != folio_pfn(folio))
> -               return false;
> +               return 1;
> +       /* ... where we must be able to batch the whole folio. */
> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
> +               return 1;
> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> +                                NULL, NULL, NULL);
> 
> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
> -                              NULL, NULL) == max_nr;
> +       if (max_nr != folio_nr_pages(folio))
> +               return 1;
> +       return max_nr;
>   }
> 
>   /*
> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>                          if (pte_dirty(pteval))
>                                  folio_mark_dirty(folio);
>                  } else if (likely(pte_present(pteval))) {
> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> -                               nr_pages = folio_nr_pages(folio);
> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>                          end_addr = address + nr_pages * PAGE_SIZE;
>                          flush_cache_range(vma, address, end_addr);
> 
> 
> Note that I don't quite understand why we have to batch the whole thing 
> or fallback to
> individual pages. Why can't we perform other batches that span only some 
> PTEs? What's special
> about 1 PTE vs. 2 PTEs vs. all PTEs?

That's a good point about the "all-or-nothing" batching logic ;)

It seems the "all-or-nothing" approach is specific to the lazyfree use
case, which needs to unmap the entire folio for reclamation. If that's
not possible, it falls back to the single-page slow path.
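
FWIW, the caller side roughly expects this (a sketch from memory, not the
exact reclaim code; keep_folio is just a stand-in label):

	try_to_unmap(folio, flags);
	if (folio_mapped(folio))
		goto keep_folio;	/* cannot be freed while any PTE remains */

so reclaim only proceeds once every PTE of the folio is gone; a partial
batch would not, by itself, buy this path anything.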

Also, supporting partial batches would be useful, but it's not the common
case I guess, so let's leave it as is ;p

Thanks,
Lance

> 
> 
> Can someone enlighten me why that is required?
> 

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
> >
> > Note that I don't quite understand why we have to batch the whole thing
> > or fallback to
> > individual pages. Why can't we perform other batches that span only some
> > PTEs? What's special
> > about 1 PTE vs. 2 PTEs vs. all PTEs?
>
> That's a good point about the "all-or-nothing" batching logic ;)
>
> It seems the "all-or-nothing" approach is specific to the lazyfree use
> case, which needs to unmap the entire folio for reclamation. If that's
> not possible, it falls back to the single-page slow path.

Other cases advance the PTE themselves, while try_to_unmap_one() relies
on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
to me seems like a violation of layers. :-)
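
Roughly, for illustration only (do_one_batch() is just a made-up
placeholder here, not a real helper): the other users drive the cursor
themselves, something like

	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE)
		nr = do_one_batch(vma, pte, addr);

whereas try_to_unmap_one() sits inside

	while (page_vma_mapped_walk(&pvmw)) {
		/* pvmw.address and pvmw.pte are advanced by the walker */
		...
	}

so any partial batch would need to reach into pvmw to skip the PTEs it
has already handled.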

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 12:57, Barry Song wrote:
>>>
>>> Note that I don't quite understand why we have to batch the whole thing
>>> or fallback to
>>> individual pages. Why can't we perform other batches that span only some
>>> PTEs? What's special
>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>
>> That's a good point about the "all-or-nothing" batching logic ;)
>>
>> It seems the "all-or-nothing" approach is specific to the lazyfree use
>> case, which needs to unmap the entire folio for reclamation. If that's
>> not possible, it falls back to the single-page slow path.
> 
> Other cases advance the PTE themselves, while try_to_unmap_one() relies
> on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
> modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
> to me seems like a violation of layers. :-)

Please explain to me why the following is not clearer and better:

diff --git a/mm/rmap.c b/mm/rmap.c
index 8200d705fe4ac..09e2c2f28aa58 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,31 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
  #endif
  }
  
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
-                       struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
+               pte_t pte)
  {
         const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
-       int max_nr = folio_nr_pages(folio);
-       pte_t pte = ptep_get(ptep);
+       struct vm_area_struct *vma = pvmw->vma;
+       unsigned long end_addr, addr = pvmw->address;
+       unsigned int max_nr;
+
+       if (flags & TTU_HWPOISON)
+               return 1;
+       if (!folio_test_large(folio))
+               return 1;
+
+       /* We may only batch within a single VMA and a single page table. */
+       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
+       max_nr = (end_addr - addr) >> PAGE_SHIFT;
  
+       /* We only support lazyfree batching for now ... */
         if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
-               return false;
+               return 1;
         if (pte_unused(pte))
-               return false;
-       if (pte_pfn(pte) != folio_pfn(folio))
-               return false;
-
-       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
-                              NULL, NULL) == max_nr;
+               return 1;
+       return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+                              NULL, NULL, NULL);
  }
  
  /*
@@ -2024,9 +2032,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                         if (pte_dirty(pteval))
                                 folio_mark_dirty(folio);
                 } else if (likely(pte_present(pteval))) {
-                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
-                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
-                               nr_pages = folio_nr_pages(folio);
+                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
                         end_addr = address + nr_pages * PAGE_SIZE;
                         flush_cache_range(vma, address, end_addr);


-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 25.06.25 12:57, Barry Song wrote:
> >>>
> >>> Note that I don't quite understand why we have to batch the whole thing
> >>> or fallback to
> >>> individual pages. Why can't we perform other batches that span only some
> >>> PTEs? What's special
> >>> about 1 PTE vs. 2 PTEs vs. all PTEs?
> >>
> >> That's a good point about the "all-or-nothing" batching logic ;)
> >>
> >> It seems the "all-or-nothing" approach is specific to the lazyfree use
> >> case, which needs to unmap the entire folio for reclamation. If that's
> >> not possible, it falls back to the single-page slow path.
> >
> > Other cases advance the PTE themselves, while try_to_unmap_one() relies
> > on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
> > modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
> > to me seems like a violation of layers. :-)
>
> Please explain to me why the following is not clearer and better:

This part is much clearer, but that doesn’t necessarily improve the overall
picture. The main challenge is how to exit the iteration of
while (page_vma_mapped_walk(&pvmw)).

Right now, we have it laid out quite straightforwardly:
                /* We have already batched the entire folio */
                if (nr_pages > 1)
                        goto walk_done;

with any nr between 1 and folio_nr_pages(), we have to consider two issues:
1. How to skip PTE checks inside page_vma_mapped_walk for entries that
were already handled in the previous batch;
2. How to break the iteration when this batch has arrived at the end.

Of course, we could avoid both, but that would mean performing unnecessary
checks inside page_vma_mapped_walk().

We’ll still need to introduce some “complicated” code to address the issues
mentioned above, won’t we?
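
For issue 1, for example, we would end up with something like this
(purely illustrative -- exactly the kind of pvmw poking I'd rather avoid):

	/* hypothetical: skip the PTEs the previous batch already cleared */
	pvmw.pte += nr_pages - 1;
	pvmw.address += (nr_pages - 1) * PAGE_SIZE;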

>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 8200d705fe4ac..09e2c2f28aa58 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1845,23 +1845,31 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   #endif
>   }
>
> -/* We support batch unmapping of PTEs for lazyfree large folios */
> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -                       struct folio *folio, pte_t *ptep)
> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
> +               pte_t pte)
>   {
>          const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> -       int max_nr = folio_nr_pages(folio);
> -       pte_t pte = ptep_get(ptep);
> +       struct vm_area_struct *vma = pvmw->vma;
> +       unsigned long end_addr, addr = pvmw->address;
> +       unsigned int max_nr;
> +
> +       if (flags & TTU_HWPOISON)
> +               return 1;
> +       if (!folio_test_large(folio))
> +               return 1;
> +
> +       /* We may only batch within a single VMA and a single page table. */
> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
>
> +       /* We only support lazyfree batching for now ... */
>          if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
> -               return false;
> +               return 1;
>          if (pte_unused(pte))
> -               return false;
> -       if (pte_pfn(pte) != folio_pfn(folio))
> -               return false;
> -
> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
> -                              NULL, NULL) == max_nr;
> +               return 1;
> +       return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> +                              NULL, NULL, NULL);
>   }
>
>   /*
> @@ -2024,9 +2032,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>                          if (pte_dirty(pteval))
>                                  folio_mark_dirty(folio);
>                  } else if (likely(pte_present(pteval))) {
> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> -                               nr_pages = folio_nr_pages(folio);
> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>                          end_addr = address + nr_pages * PAGE_SIZE;
>                          flush_cache_range(vma, address, end_addr);
>
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 13:15, Barry Song wrote:
> On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 25.06.25 12:57, Barry Song wrote:
>>>>>
>>>>> Note that I don't quite understand why we have to batch the whole thing
>>>>> or fallback to
>>>>> individual pages. Why can't we perform other batches that span only some
>>>>> PTEs? What's special
>>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>>>
>>>> That's a good point about the "all-or-nothing" batching logic ;)
>>>>
>>>> It seems the "all-or-nothing" approach is specific to the lazyfree use
>>>> case, which needs to unmap the entire folio for reclamation. If that's
>>>> not possible, it falls back to the single-page slow path.
>>>
>>> Other cases advance the PTE themselves, while try_to_unmap_one() relies
>>> on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
>>> modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
>>> to me seems like a violation of layers. :-)
>>
>> Please explain to me why the following is not clearer and better:
> 
> This part is much clearer, but that doesn’t necessarily improve the overall
> picture. The main challenge is how to exit the iteration of
> while (page_vma_mapped_walk(&pvmw)).

Okay, I get what you mean now.

> 
> Right now, we have it laid out quite straightforwardly:
>                  /* We have already batched the entire folio */
>                  if (nr_pages > 1)
>                          goto walk_done;


Given that the comment is completely confusing when seeing the check ... :)

/*
  * If we are sure that we batched the entire folio and cleared all PTEs,
  * we can just optimize and stop right here.
  */
if (nr_pages == folio_nr_pages(folio))
	goto walk_done;

would make the comment match.

> 
> with any nr between 1 and folio_nr_pages(), we have to consider two issues:
> 1. How to skip PTE checks inside page_vma_mapped_walk for entries that
> were already handled in the previous batch;

They are cleared if we reach that point. So the pte_none() checks will 
simply skip them?

> 2. How to break the iteration when this batch has arrived at the end.

page_vma_mapped_walk() should be doing that?

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Wed, Jun 25, 2025 at 11:27 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 25.06.25 13:15, Barry Song wrote:
> > On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 25.06.25 12:57, Barry Song wrote:
> >>>>>
> >>>>> Note that I don't quite understand why we have to batch the whole thing
> >>>>> or fallback to
> >>>>> individual pages. Why can't we perform other batches that span only some
> >>>>> PTEs? What's special
> >>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
> >>>>
> >>>> That's a good point about the "all-or-nothing" batching logic ;)
> >>>>
> >>>> It seems the "all-or-nothing" approach is specific to the lazyfree use
> >>>> case, which needs to unmap the entire folio for reclamation. If that's
> >>>> not possible, it falls back to the single-page slow path.
> >>>
> >>> Other cases advance the PTE themselves, while try_to_unmap_one() relies
> >>> on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
> >>> modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
> >>> to me seems like a violation of layers. :-)
> >>
> >> Please explain to me why the following is not clearer and better:
> >
> > This part is much clearer, but that doesn’t necessarily improve the overall
> > picture. The main challenge is how to exit the iteration of
> > while (page_vma_mapped_walk(&pvmw)).
>
> Okay, I get what you mean now.
>
> >
> > Right now, we have it laid out quite straightforwardly:
> >                  /* We have already batched the entire folio */
> >                  if (nr_pages > 1)
> >                          goto walk_done;
>
>
> Given that the comment is completely confusing whens seeing the check ... :)
>
> /*
>   * If we are sure that we batched the entire folio and cleared all PTEs,
>   * we can just optimize and stop right here.
>   */
> if (nr_pages == folio_nr_pages(folio))
>         goto walk_done;
>
> would make the comment match.

Yes, that clarifies it.

>
> >
> > with any nr between 1 and folio_nr_pages(), we have to consider two issues:
> > 1. How to skip PTE checks inside page_vma_mapped_walk for entries that
> > were already handled in the previous batch;
>
> They are cleared if we reach that point. So the pte_none() checks will
> simply skip them?
>
> > 2. How to break the iteration when this batch has arrived at the end.
>
> page_vma_mapped_walk() should be doing that?

It seems you might have missed the part in my reply that says:
"Of course, we could avoid both, but that would mean performing unnecessary
checks inside page_vma_mapped_walk()."

That’s true for both. But I’m wondering why we’re still doing the check,
even when we’re fairly sure they’ve already been cleared or we’ve reached
the end :-)

Somehow, I feel we could combine your cleanup code—which handles a batch
size of "nr" between 1 and nr_pages—with the
"if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
In practice, this would let us skip almost all unnecessary checks,
except for a few rare corner cases.

For those corner cases where "nr" truly falls between 1 and nr_pages,
we can just leave them as-is—performing the redundant check inside
page_vma_mapped_walk().
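
i.e. roughly this on top of your cleanup (untested sketch, reusing your
folio_unmap_pte_batch() naming):

	nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
	...
	/* Fully batched and cleared the whole folio: nothing left to walk. */
	if (nr_pages == folio_nr_pages(folio))
		goto walk_done;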

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 13:42, Barry Song wrote:
> On Wed, Jun 25, 2025 at 11:27 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 25.06.25 13:15, Barry Song wrote:
>>> On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 25.06.25 12:57, Barry Song wrote:
>>>>>>>
>>>>>>> Note that I don't quite understand why we have to batch the whole thing
>>>>>>> or fallback to
>>>>>>> individual pages. Why can't we perform other batches that span only some
>>>>>>> PTEs? What's special
>>>>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>>>>>
>>>>>> That's a good point about the "all-or-nothing" batching logic ;)
>>>>>>
>>>>>> It seems the "all-or-nothing" approach is specific to the lazyfree use
>>>>>> case, which needs to unmap the entire folio for reclamation. If that's
>>>>>> not possible, it falls back to the single-page slow path.
>>>>>
>>>>> Other cases advance the PTE themselves, while try_to_unmap_one() relies
>>>>> on page_vma_mapped_walk() to advance the PTE. Unless we want to manually
>>>>> modify pvmw.pte and pvmw.address outside of page_vma_mapped_walk(), which
>>>>> to me seems like a violation of layers. :-)
>>>>
>>>> Please explain to me why the following is not clearer and better:
>>>
>>> This part is much clearer, but that doesn’t necessarily improve the overall
>>> picture. The main challenge is how to exit the iteration of
>>> while (page_vma_mapped_walk(&pvmw)).
>>
>> Okay, I get what you mean now.
>>
>>>
>>> Right now, we have it laid out quite straightforwardly:
>>>                   /* We have already batched the entire folio */
>>>                   if (nr_pages > 1)
>>>                           goto walk_done;
>>
>>
>> Given that the comment is completely confusing whens seeing the check ... :)
>>
>> /*
>>    * If we are sure that we batched the entire folio and cleared all PTEs,
>>    * we can just optimize and stop right here.
>>    */
>> if (nr_pages == folio_nr_pages(folio))
>>          goto walk_done;
>>
>> would make the comment match.
> 
> Yes, that clarifies it.
> 
>>
>>>
>>> with any nr between 1 and folio_nr_pages(), we have to consider two issues:
>>> 1. How to skip PTE checks inside page_vma_mapped_walk for entries that
>>> were already handled in the previous batch;
>>
>> They are cleared if we reach that point. So the pte_none() checks will
>> simply skip them?
>>
>>> 2. How to break the iteration when this batch has arrived at the end.
>>
>> page_vma_mapped_walk() should be doing that?
> 
> It seems you might have missed the part in my reply that says:
> "Of course, we could avoid both, but that would mean performing unnecessary
> checks inside page_vma_mapped_walk()."
> That’s true for both. But I’m wondering why we’re still doing the check,
> even when we’re fairly sure they’ve already been cleared or we’ve reached
> the end :-)

:)

> 
> Somehow, I feel we could combine your cleanup code—which handles a batch
> size of "nr" between 1 and nr_pages—with the
> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.

Yeah, that's what I was suggesting. It would have to be part of the 
cleanup I think.

I'm still wondering if there is a case where

if (nr_pages == folio_nr_pages(folio))
	goto walk_done;

would be wrong when dealing with small folios.

> In practice, this would let us skip almost all unnecessary checks,
> except for a few rare corner cases.
> 
> For those corner cases where "nr" truly falls between 1 and nr_pages,
> we can just leave them as-is—performing the redundant check inside
> page_vma_mapped_walk().

I mean, batching mapcount+refcount updates etc. is always a win. If we 
end up doing some unnecessary pte_none() checks, that might be 
suboptimal but mostly noise in contrast to the other stuff we will 
optimize out :)

Agreed that if we can easily avoid these pte_none() checks, we should do 
that. Optimizing that for "nr_pages == folio_nr_pages(folio)" makes sense.

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/25 20:09, David Hildenbrand wrote:
> On 25.06.25 13:42, Barry Song wrote:
>> On Wed, Jun 25, 2025 at 11:27 PM David Hildenbrand <david@redhat.com> 
>> wrote:
>>>
>>> On 25.06.25 13:15, Barry Song wrote:
>>>> On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand 
>>>> <david@redhat.com> wrote:
>>>>>
>>>>> On 25.06.25 12:57, Barry Song wrote:
>>>>>>>>
>>>>>>>> Note that I don't quite understand why we have to batch the 
>>>>>>>> whole thing
>>>>>>>> or fallback to
>>>>>>>> individual pages. Why can't we perform other batches that span 
>>>>>>>> only some
>>>>>>>> PTEs? What's special
>>>>>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>>>>>>
>>>>>>> That's a good point about the "all-or-nothing" batching logic ;)
>>>>>>>
>>>>>>> It seems the "all-or-nothing" approach is specific to the 
>>>>>>> lazyfree use
>>>>>>> case, which needs to unmap the entire folio for reclamation. If 
>>>>>>> that's
>>>>>>> not possible, it falls back to the single-page slow path.
>>>>>>
>>>>>> Other cases advance the PTE themselves, while try_to_unmap_one() 
>>>>>> relies
>>>>>> on page_vma_mapped_walk() to advance the PTE. Unless we want to 
>>>>>> manually
>>>>>> modify pvmw.pte and pvmw.address outside of 
>>>>>> page_vma_mapped_walk(), which
>>>>>> to me seems like a violation of layers. :-)
>>>>>
>>>>> Please explain to me why the following is not clearer and better:
>>>>
>>>> This part is much clearer, but that doesn’t necessarily improve the 
>>>> overall
>>>> picture. The main challenge is how to exit the iteration of
>>>> while (page_vma_mapped_walk(&pvmw)).
>>>
>>> Okay, I get what you mean now.
>>>
>>>>
>>>> Right now, we have it laid out quite straightforwardly:
>>>>                   /* We have already batched the entire folio */
>>>>                   if (nr_pages > 1)
>>>>                           goto walk_done;
>>>
>>>
>>> Given that the comment is completely confusing whens seeing the 
>>> check ... :)
>>>
>>> /*
>>>    * If we are sure that we batched the entire folio and cleared all 
>>> PTEs,
>>>    * we can just optimize and stop right here.
>>>    */
>>> if (nr_pages == folio_nr_pages(folio))
>>>          goto walk_done;
>>>
>>> would make the comment match.
>>
>> Yes, that clarifies it.
>>
>>>
>>>>
>>>> with any nr between 1 and folio_nr_pages(), we have to consider two 
>>>> issues:
>>>> 1. How to skip PTE checks inside page_vma_mapped_walk for entries that
>>>> were already handled in the previous batch;
>>>
>>> They are cleared if we reach that point. So the pte_none() checks will
>>> simply skip them?
>>>
>>>> 2. How to break the iteration when this batch has arrived at the end.
>>>
>>> page_vma_mapped_walk() should be doing that?
>>
>> It seems you might have missed the part in my reply that says:
>> "Of course, we could avoid both, but that would mean performing 
>> unnecessary
>> checks inside page_vma_mapped_walk()."
>> That’s true for both. But I’m wondering why we’re still doing the check,
>> even when we’re fairly sure they’ve already been cleared or we’ve reached
>> the end :-)
> 
> :)
> 
>>
>> Somehow, I feel we could combine your cleanup code—which handles a batch
>> size of "nr" between 1 and nr_pages—with the
>> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
> 
> Yeah, that's what I was suggesting. It would have to be part of the 
> cleanup I think.
> 
> I'm still wondering if there is a case where
> 
> if (nr_pages == folio_nr_pages(folio))
>      goto walk_done;
> 
> would be wrong when dealing with small folios.

We can make the check more explicit to avoid any future trouble ;)

if (nr_pages > 1 && nr_pages == folio_nr_pages(folio))
     goto walk_done;

It should be safe for small folios.

Thanks,
Lance

> 
>> In practice, this would let us skip almost all unnecessary checks,
>> except for a few rare corner cases.
>>
>> For those corner cases where "nr" truly falls between 1 and nr_pages,
>> we can just leave them as-is—performing the redundant check inside
>> page_vma_mapped_walk().
> 
> I mean, batching mapcount+refcount updates etc. is always a win. If we 
> end up doing some unnecessary pte_none() checks, that might be 
> suboptimal but mostly noise in contrast to the other stuff we will 
> optimize out :)
> 
> Agreed that if we can easily avoid these pte_none() checks, we should do 
> that. Optimizing that for "nr_pages == folio_nr_pages(folio)" makes sense.
> 

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 14:58, Lance Yang wrote:
> 
> 
> On 2025/6/25 20:09, David Hildenbrand wrote:
>> On 25.06.25 13:42, Barry Song wrote:
>>> On Wed, Jun 25, 2025 at 11:27 PM David Hildenbrand <david@redhat.com>
>>> wrote:
>>>>
>>>> On 25.06.25 13:15, Barry Song wrote:
>>>>> On Wed, Jun 25, 2025 at 11:01 PM David Hildenbrand
>>>>> <david@redhat.com> wrote:
>>>>>>
>>>>>> On 25.06.25 12:57, Barry Song wrote:
>>>>>>>>>
>>>>>>>>> Note that I don't quite understand why we have to batch the
>>>>>>>>> whole thing
>>>>>>>>> or fallback to
>>>>>>>>> individual pages. Why can't we perform other batches that span
>>>>>>>>> only some
>>>>>>>>> PTEs? What's special
>>>>>>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>>>>>>>
>>>>>>>> That's a good point about the "all-or-nothing" batching logic ;)
>>>>>>>>
>>>>>>>> It seems the "all-or-nothing" approach is specific to the
>>>>>>>> lazyfree use
>>>>>>>> case, which needs to unmap the entire folio for reclamation. If
>>>>>>>> that's
>>>>>>>> not possible, it falls back to the single-page slow path.
>>>>>>>
>>>>>>> Other cases advance the PTE themselves, while try_to_unmap_one()
>>>>>>> relies
>>>>>>> on page_vma_mapped_walk() to advance the PTE. Unless we want to
>>>>>>> manually
>>>>>>> modify pvmw.pte and pvmw.address outside of
>>>>>>> page_vma_mapped_walk(), which
>>>>>>> to me seems like a violation of layers. :-)
>>>>>>
>>>>>> Please explain to me why the following is not clearer and better:
>>>>>
>>>>> This part is much clearer, but that doesn’t necessarily improve the
>>>>> overall
>>>>> picture. The main challenge is how to exit the iteration of
>>>>> while (page_vma_mapped_walk(&pvmw)).
>>>>
>>>> Okay, I get what you mean now.
>>>>
>>>>>
>>>>> Right now, we have it laid out quite straightforwardly:
>>>>>                    /* We have already batched the entire folio */
>>>>>                    if (nr_pages > 1)
>>>>>                            goto walk_done;
>>>>
>>>>
>>>> Given that the comment is completely confusing whens seeing the
>>>> check ... :)
>>>>
>>>> /*
>>>>     * If we are sure that we batched the entire folio and cleared all
>>>> PTEs,
>>>>     * we can just optimize and stop right here.
>>>>     */
>>>> if (nr_pages == folio_nr_pages(folio))
>>>>           goto walk_done;
>>>>
>>>> would make the comment match.
>>>
>>> Yes, that clarifies it.
>>>
>>>>
>>>>>
>>>>> with any nr between 1 and folio_nr_pages(), we have to consider two
>>>>> issues:
>>>>> 1. How to skip PTE checks inside page_vma_mapped_walk for entries that
>>>>> were already handled in the previous batch;
>>>>
>>>> They are cleared if we reach that point. So the pte_none() checks will
>>>> simply skip them?
>>>>
>>>>> 2. How to break the iteration when this batch has arrived at the end.
>>>>
>>>> page_vma_mapped_walk() should be doing that?
>>>
>>> It seems you might have missed the part in my reply that says:
>>> "Of course, we could avoid both, but that would mean performing
>>> unnecessary
>>> checks inside page_vma_mapped_walk()."
>>> That’s true for both. But I’m wondering why we’re still doing the check,
>>> even when we’re fairly sure they’ve already been cleared or we’ve reached
>>> the end :-)
>>
>> :)
>>
>>>
>>> Somehow, I feel we could combine your cleanup code—which handles a batch
>>> size of "nr" between 1 and nr_pages—with the
>>> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
>>
>> Yeah, that's what I was suggesting. It would have to be part of the
>> cleanup I think.
>>
>> I'm still wondering if there is a case where
>>
>> if (nr_pages == folio_nr_pages(folio))
>>       goto walk_done;
>>
>> would be wrong when dealing with small folios.
> 
> We can make the check more explicit to avoid any future trouble ;)
> 
> if (nr_pages > 1 && nr_pages == folio_nr_pages(folio))
>       goto walk_done;
> 
> It should be safe for small folios.

I mean, we are walking a single VMA, and a small folio should never be 
mapped multiple times into the same VMA. (ignoring KSM, but KSM is 
handled differently either way)

So we should be good, just wanted to raise it.

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/25 20:09, David Hildenbrand wrote:
>>
>> Somehow, I feel we could combine your cleanup code—which handles a batch
>> size of "nr" between 1 and nr_pages—with the
>> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
> 
> Yeah, that's what I was suggesting. It would have to be part of the 
> cleanup I think.
> 
> I'm still wondering if there is a case where
> 
> if (nr_pages == folio_nr_pages(folio))
>      goto walk_done;
> 
> would be wrong when dealing with small folios.
> 
>> In practice, this would let us skip almost all unnecessary checks,
>> except for a few rare corner cases.
>>
>> For those corner cases where "nr" truly falls between 1 and nr_pages,
>> we can just leave them as-is—performing the redundant check inside
>> page_vma_mapped_walk().
> 
> I mean, batching mapcount+refcount updates etc. is always a win. If we 
> end up doing some unnecessary pte_none() checks, that might be 
> suboptimal but mostly noise in contrast to the other stuff we will 
> optimize out 🙂
> 
> Agreed that if we can easily avoid these pte_none() checks, we should do 
> that. Optimizing that for "nr_pages == folio_nr_pages(folio)" makes sense.

Hmm... I have a question about the reference counting here ...

		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put(folio);
		/* We have already batched the entire folio */

Does anyone else still hold a reference to this folio after folio_put()?

		if (nr_pages == folio_nr_pages(folio))
			goto walk_done;
		continue;
walk_abort:
		ret = false;
walk_done:
		page_vma_mapped_walk_done(&pvmw);
		break;
	}

Thanks,
Lance

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 14:20, Lance Yang wrote:
> 
> 
> On 2025/6/25 20:09, David Hildenbrand wrote:
>>>
>>> Somehow, I feel we could combine your cleanup code—which handles a batch
>>> size of "nr" between 1 and nr_pages—with the
>>> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
>>
>> Yeah, that's what I was suggesting. It would have to be part of the
>> cleanup I think.
>>
>> I'm still wondering if there is a case where
>>
>> if (nr_pages == folio_nr_pages(folio))
>>       goto walk_done;
>>
>> would be wrong when dealing with small folios.
>>
>>> In practice, this would let us skip almost all unnecessary checks,
>>> except for a few rare corner cases.
>>>
>>> For those corner cases where "nr" truly falls between 1 and nr_pages,
>>> we can just leave them as-is—performing the redundant check inside
>>> page_vma_mapped_walk().
>>
>> I mean, batching mapcount+refcount updates etc. is always a win. If we
>> end up doing some unnecessary pte_none() checks, that might be
>> suboptimal but mostly noise in contrast to the other stuff we will
>> optimize out 🙂
>>
>> Agreed that if we can easily avoid these pte_none() checks, we should do
>> that. Optimizing that for "nr_pages == folio_nr_pages(folio)" makes sense.
> 
> Hmm... I have a question about the reference counting here ...
> 
> 		if (vma->vm_flags & VM_LOCKED)
> 			mlock_drain_local();
> 		folio_put(folio);
> 		/* We have already batched the entire folio */
> 
> Does anyone else still hold a reference to this folio after folio_put()?

The caller of the unmap operation should better hold a reference :)

Also, I am not sure why we don't perform a

folio_put_refs(folio, nr_pages);

... :)

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Thu, Jun 26, 2025 at 12:25 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 25.06.25 14:20, Lance Yang wrote:
[...]
> > Hmm... I have a question about the reference counting here ...
> >
> >               if (vma->vm_flags & VM_LOCKED)
> >                       mlock_drain_local();
> >               folio_put(folio);
> >               /* We have already batched the entire folio */
> >
> > Does anyone else still hold a reference to this folio after folio_put()?
>
> The caller of the unmap operation should better hold a reference :)
>
> Also, I am not sure why we don't perform a
>
> folio_put_refs(folio, nr_pages);

Because we've already called folio_ref_sub(folio, nr_pages - 1);
Looking back, it’s kind of ugly, huh.

discard:
                if (unlikely(folio_test_hugetlb(folio))) {
                        hugetlb_remove_rmap(folio);
                } else {
                        folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
                        folio_ref_sub(folio, nr_pages - 1);
                }

I assume Lance will send a patch? If so, remember to remove this
when switching to folio_put_refs(folio, nr_pages);
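
i.e. roughly (untested, just to show what I mean):

discard:
		if (unlikely(folio_test_hugetlb(folio))) {
			hugetlb_remove_rmap(folio);
		} else {
			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
		}
		...
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put_refs(folio, nr_pages);

so the folio_ref_sub(folio, nr_pages - 1) and the final folio_put()
collapse into a single folio_put_refs().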

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/26 05:03, Barry Song wrote:
> On Thu, Jun 26, 2025 at 12:25 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 25.06.25 14:20, Lance Yang wrote:
> [...]
>>> Hmm... I have a question about the reference counting here ...
>>>
>>>                if (vma->vm_flags & VM_LOCKED)
>>>                        mlock_drain_local();
>>>                folio_put(folio);
>>>                /* We have already batched the entire folio */
>>>
>>> Does anyone else still hold a reference to this folio after folio_put()?
>>
>> The caller of the unmap operation should better hold a reference :)
>>
>> Also, I am not sure why we don't perform a
>>
>> folio_put_refs(folio, nr_pages);
> 
> Because we've already called folio_ref_sub(folio, nr_pages - 1);
> Looking back, it’s kind of ugly, huh.
> 
> discard:
>                  if (unlikely(folio_test_hugetlb(folio))) {
>                          hugetlb_remove_rmap(folio);
>                  } else {
>                          folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>                          folio_ref_sub(folio, nr_pages - 1);
>                  }
> 
> I assume Lance will send a patch? If so, remember to remove this
> when switching to folio_put_refs(folio, nr_pages);

Ah, got it. Thanks for pointing that out!

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 26.06.25 03:17, Lance Yang wrote:
> 
> 
> On 2025/6/26 05:03, Barry Song wrote:
>> On Thu, Jun 26, 2025 at 12:25 AM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 25.06.25 14:20, Lance Yang wrote:
>> [...]
>>>> Hmm... I have a question about the reference counting here ...
>>>>
>>>>                 if (vma->vm_flags & VM_LOCKED)
>>>>                         mlock_drain_local();
>>>>                 folio_put(folio);
>>>>                 /* We have already batched the entire folio */
>>>>
>>>> Does anyone else still hold a reference to this folio after folio_put()?
>>>
>>> The caller of the unmap operation should better hold a reference :)
>>>
>>> Also, I am not sure why we don't perform a
>>>
>>> folio_put_refs(folio, nr_pages);
>>
>> Because we've already called folio_ref_sub(folio, nr_pages - 1);
>> Looking back, it’s kind of ugly, huh.
>>
>> discard:
>>                   if (unlikely(folio_test_hugetlb(folio))) {
>>                           hugetlb_remove_rmap(folio);
>>                   } else {
>>                           folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>>                           folio_ref_sub(folio, nr_pages - 1);
>>                   }
>>
>> I assume Lance will send a patch? If so, remember to remove this
>> when switching to folio_put_refs(folio, nr_pages);
> 
> Ah, got it. Thanks for pointing that out!

Obviously I was hinting that the split refcount update can be merged 
into a single refcount update :)

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago
Before I send out the real patch, I'd like to get some quick feedback to
ensure I've understood the discussion correctly ;)

Does this look like the right direction?

diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..5ebffe2137e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
 #endif
 }
 
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
-			struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+			struct page_vma_mapped_walk *pvmw,
+			enum ttu_flags flags, pte_t pte)
 {
 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
-	int max_nr = folio_nr_pages(folio);
-	pte_t pte = ptep_get(ptep);
+	unsigned long end_addr, addr = pvmw->address;
+	struct vm_area_struct *vma = pvmw->vma;
+	unsigned int max_nr;
+
+	if (flags & TTU_HWPOISON)
+		return 1;
+	if (!folio_test_large(folio))
+		return 1;
 
+	/* We may only batch within a single VMA and a single page table. */
+	end_addr = pmd_addr_end(addr, vma->vm_end);
+	max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+	/* We only support lazyfree batching for now ... */
 	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
-		return false;
+		return 1;
 	if (pte_unused(pte))
-		return false;
-	if (pte_pfn(pte) != folio_pfn(folio))
-		return false;
+		return 1;
+
+	/* ... where we must be able to batch the whole folio. */
+	if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
+		return 1;
+	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+				 NULL, NULL, NULL);
 
-	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
-			       NULL, NULL) == max_nr;
+	return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
 }
 
 /*
@@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
-			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
-			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
-				nr_pages = folio_nr_pages(folio);
+			nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
 			end_addr = address + nr_pages * PAGE_SIZE;
 			flush_cache_range(vma, address, end_addr);
 
@@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			hugetlb_remove_rmap(folio);
 		} else {
 			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
-			folio_ref_sub(folio, nr_pages - 1);
 		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
-		folio_put(folio);
-		/* We have already batched the entire folio */
-		if (nr_pages > 1)
+		folio_put_refs(folio, nr_pages);
+
+		/*
+		 * If we are sure that we batched the entire folio and cleared
+		 * all PTEs, we can just optimize and stop right here.
+		 */
+		if (nr_pages == folio_nr_pages(folio))
 			goto walk_done;
 		continue;
 walk_abort:
--

Thanks,
Lance
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Thu, Jun 26, 2025 at 9:29 PM Lance Yang <ioworker0@gmail.com> wrote:
>
[...]
> +
> +               /*
> +                * If we are sure that we batched the entire folio and cleared
> +                * all PTEs, we can just optimize and stop right here.
> +                */
> +               if (nr_pages == folio_nr_pages(folio))

David also mentioned if (nr_pages > 1 && nr_pages == folio_nr_pages(folio)).
I assume it’s still fine when nr_pages == 1 for small folios? No?

>                         goto walk_done;
>                 continue;
>  walk_abort:
> --
>

Thanks,
Lance
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 26.06.25 23:46, Barry Song wrote:
> On Thu, Jun 26, 2025 at 9:29 PM Lance Yang <ioworker0@gmail.com> wrote:
>>
> [...]
>> +
>> +               /*
>> +                * If we are sure that we batched the entire folio and cleared
>> +                * all PTEs, we can just optimize and stop right here.
>> +                */
>> +               if (nr_pages == folio_nr_pages(folio))
> 
> David also mentioned if (nr_pages > 1 && nr_pages == folio_nr_pages(folio)).
> I assume it’s still fine when nr_pages == 1 for small folios? No?

Yeah, as raised I think it is fine. We should never have any folio page 
mapped multiple times into the same VMA in any case. (excluding KSM 
pages, but they are handled differently, using a specialized rmap walk)

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago
On 2025/6/26 17:29, Lance Yang wrote:
> Before I send out the real patch, I'd like to get some quick feedback to
> ensure I've understood the discussion correctly ;)
> 
> Does this look like the right direction?
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fb63d9256f09..5ebffe2137e4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> -/* We support batch unmapping of PTEs for lazyfree large folios */
> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -			struct folio *folio, pte_t *ptep)
> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
> +			struct page_vma_mapped_walk *pvmw,
> +			enum ttu_flags flags, pte_t pte)
>   {
>   	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> -	int max_nr = folio_nr_pages(folio);
> -	pte_t pte = ptep_get(ptep);
> +	unsigned long end_addr, addr = pvmw->address;
> +	struct vm_area_struct *vma = pvmw->vma;
> +	unsigned int max_nr;
> +
> +	if (flags & TTU_HWPOISON)
> +		return 1;
> +	if (!folio_test_large(folio))
> +		return 1;
>   
> +	/* We may only batch within a single VMA and a single page table. */
> +	end_addr = pmd_addr_end(addr, vma->vm_end);
> +	max_nr = (end_addr - addr) >> PAGE_SHIFT;
> +
> +	/* We only support lazyfree batching for now ... */
>   	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
> -		return false;
> +		return 1;
>   	if (pte_unused(pte))
> -		return false;
> -	if (pte_pfn(pte) != folio_pfn(folio))
> -		return false;
> +		return 1;
> +
> +	/* ... where we must be able to batch the whole folio. */
> +	if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
> +		return 1;
> +	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> +				 NULL, NULL, NULL);
>   
> -	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
> -			       NULL, NULL) == max_nr;
> +	return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>   }
>   
>   /*
> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>   			if (pte_dirty(pteval))
>   				folio_mark_dirty(folio);
>   		} else if (likely(pte_present(pteval))) {
> -			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> -			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> -				nr_pages = folio_nr_pages(folio);
> +			nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>   			end_addr = address + nr_pages * PAGE_SIZE;
>   			flush_cache_range(vma, address, end_addr);
>   
> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>   			hugetlb_remove_rmap(folio);
>   		} else {
>   			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
> -			folio_ref_sub(folio, nr_pages - 1);
>   		}
>   		if (vma->vm_flags & VM_LOCKED)
>   			mlock_drain_local();
> -		folio_put(folio);
> -		/* We have already batched the entire folio */
> -		if (nr_pages > 1)
> +		folio_put_refs(folio, nr_pages);
> +
> +		/*
> +		 * If we are sure that we batched the entire folio and cleared
> +		 * all PTEs, we can just optimize and stop right here.
> +		 */
> +		if (nr_pages == folio_nr_pages(folio))
>   			goto walk_done;
>   		continue;
>   walk_abort:
> --

Oops ... Through testing on my machine, I found that the logic doesn't
behave as expected because I messed up the meaning of max_nr (the available
scan room in the page table) with folio_nr_pages(folio) :(

With the following change:

diff --git a/mm/rmap.c b/mm/rmap.c
index 5ebffe2137e4..b1407348e14e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1850,9 +1850,9 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
 			enum ttu_flags flags, pte_t pte)
 {
 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	unsigned int max_nr, nr_pages = folio_nr_pages(folio);
 	unsigned long end_addr, addr = pvmw->address;
 	struct vm_area_struct *vma = pvmw->vma;
-	unsigned int max_nr;
 
 	if (flags & TTU_HWPOISON)
 		return 1;
@@ -1870,12 +1870,13 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
 		return 1;
 
 	/* ... where we must be able to batch the whole folio. */
-	if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
+	if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
 		return 1;
-	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
-				 NULL, NULL, NULL);
 
-	return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
+	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
+				 fpb_flags, NULL, NULL, NULL);
+
+	return (max_nr != nr_pages) ? 1 : max_nr;
 }
 
 /*
--

... then things work as expected for the lazyfree case, without any
splitting.
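
For completeness, with both hunks applied the helper ends up looking roughly
like this (reconstructed from the two diffs above, just to make the combined
result easier to read):

static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
			struct page_vma_mapped_walk *pvmw,
			enum ttu_flags flags, pte_t pte)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	unsigned int max_nr, nr_pages = folio_nr_pages(folio);
	unsigned long end_addr, addr = pvmw->address;
	struct vm_area_struct *vma = pvmw->vma;

	if (flags & TTU_HWPOISON)
		return 1;
	if (!folio_test_large(folio))
		return 1;

	/* We may only batch within a single VMA and a single page table. */
	end_addr = pmd_addr_end(addr, vma->vm_end);
	max_nr = (end_addr - addr) >> PAGE_SHIFT;

	/* We only support lazyfree batching for now ... */
	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
		return 1;
	if (pte_unused(pte))
		return 1;

	/* ... where we must be able to batch the whole folio. */
	if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
		return 1;

	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
				 fpb_flags, NULL, NULL, NULL);

	return (max_nr != nr_pages) ? 1 : max_nr;
}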
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 26.06.25 14:44, Lance Yang wrote:
> 
> On 2025/6/26 17:29, Lance Yang wrote:
>> Before I send out the real patch, I'd like to get some quick feedback to
>> ensure I've understood the discussion correctly ;)
>>
>> Does this look like the right direction?
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index fb63d9256f09..5ebffe2137e4 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>>    #endif
>>    }
>>    
>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>> -			struct folio *folio, pte_t *ptep)
>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>> +			struct page_vma_mapped_walk *pvmw,
>> +			enum ttu_flags flags, pte_t pte)
>>    {
>>    	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> -	int max_nr = folio_nr_pages(folio);
>> -	pte_t pte = ptep_get(ptep);
>> +	unsigned long end_addr, addr = pvmw->address;
>> +	struct vm_area_struct *vma = pvmw->vma;
>> +	unsigned int max_nr;
>> +
>> +	if (flags & TTU_HWPOISON)
>> +		return 1;
>> +	if (!folio_test_large(folio))
>> +		return 1;
>>    
>> +	/* We may only batch within a single VMA and a single page table. */
>> +	end_addr = pmd_addr_end(addr, vma->vm_end);
>> +	max_nr = (end_addr - addr) >> PAGE_SHIFT;
>> +
>> +	/* We only support lazyfree batching for now ... */
>>    	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>> -		return false;
>> +		return 1;
>>    	if (pte_unused(pte))
>> -		return false;
>> -	if (pte_pfn(pte) != folio_pfn(folio))
>> -		return false;
>> +		return 1;
>> +
>> +	/* ... where we must be able to batch the whole folio. */
>> +	if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
>> +		return 1;
>> +	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
>> +				 NULL, NULL, NULL);
>>    
>> -	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
>> -			       NULL, NULL) == max_nr;
>> +	return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>    }
>>    
>>    /*
>> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>    			if (pte_dirty(pteval))
>>    				folio_mark_dirty(folio);
>>    		} else if (likely(pte_present(pteval))) {
>> -			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>> -			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>> -				nr_pages = folio_nr_pages(folio);
>> +			nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>>    			end_addr = address + nr_pages * PAGE_SIZE;
>>    			flush_cache_range(vma, address, end_addr);
>>    
>> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>    			hugetlb_remove_rmap(folio);
>>    		} else {
>>    			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>> -			folio_ref_sub(folio, nr_pages - 1);
>>    		}
>>    		if (vma->vm_flags & VM_LOCKED)
>>    			mlock_drain_local();
>> -		folio_put(folio);
>> -		/* We have already batched the entire folio */
>> -		if (nr_pages > 1)
>> +		folio_put_refs(folio, nr_pages);
>> +
>> +		/*
>> +		 * If we are sure that we batched the entire folio and cleared
>> +		 * all PTEs, we can just optimize and stop right here.
>> +		 */
>> +		if (nr_pages == folio_nr_pages(folio))
>>    			goto walk_done;
>>    		continue;
>>    walk_abort:
>> --
> 
> Oops ... Through testing on my machine, I found that the logic doesn't
> behave as expected because I messed up the meaning of max_nr (the available
> scan room in the page table) with folio_nr_pages(folio) :(
> 
> With the following change:
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 5ebffe2137e4..b1407348e14e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1850,9 +1850,9 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>   			enum ttu_flags flags, pte_t pte)
>   {
>   	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +	unsigned int max_nr, nr_pages = folio_nr_pages(folio);
>   	unsigned long end_addr, addr = pvmw->address;
>   	struct vm_area_struct *vma = pvmw->vma;
> -	unsigned int max_nr;
>   
>   	if (flags & TTU_HWPOISON)
>   		return 1;
> @@ -1870,12 +1870,13 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>   		return 1;
>   
>   	/* ... where we must be able to batch the whole folio. */

Why is that still required? :)

> -	if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
> +	if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>   		return 1;
> -	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> -				 NULL, NULL, NULL);
>   
> -	return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
> +	max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
> +				 fpb_flags, NULL, NULL, NULL);
> +
> +	return (max_nr != nr_pages) ? 1 : max_nr;

Why is that still required? :)

-- 
Cheers,

David / dhildenb
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/26 21:16, David Hildenbrand wrote:
> On 26.06.25 14:44, Lance Yang wrote:
>>
>> On 2025/6/26 17:29, Lance Yang wrote:
>>> Before I send out the real patch, I'd like to get some quick feedback to
>>> ensure I've understood the discussion correctly ;)
>>>
>>> Does this look like the right direction?
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index fb63d9256f09..5ebffe2137e4 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio 
>>> *folio, struct page *page,
>>>    #endif
>>>    }
>>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>> -            struct folio *folio, pte_t *ptep)
>>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>>> +            struct page_vma_mapped_walk *pvmw,
>>> +            enum ttu_flags flags, pte_t pte)
>>>    {
>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> -    int max_nr = folio_nr_pages(folio);
>>> -    pte_t pte = ptep_get(ptep);
>>> +    unsigned long end_addr, addr = pvmw->address;
>>> +    struct vm_area_struct *vma = pvmw->vma;
>>> +    unsigned int max_nr;
>>> +
>>> +    if (flags & TTU_HWPOISON)
>>> +        return 1;
>>> +    if (!folio_test_large(folio))
>>> +        return 1;
>>> +    /* We may only batch within a single VMA and a single page 
>>> table. */
>>> +    end_addr = pmd_addr_end(addr, vma->vm_end);
>>> +    max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>> +
>>> +    /* We only support lazyfree batching for now ... */
>>>        if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>>> -        return false;
>>> +        return 1;
>>>        if (pte_unused(pte))
>>> -        return false;
>>> -    if (pte_pfn(pte) != folio_pfn(folio))
>>> -        return false;
>>> +        return 1;
>>> +
>>> +    /* ... where we must be able to batch the whole folio. */
>>> +    if (pte_pfn(pte) != folio_pfn(folio) || max_nr != 
>>> folio_nr_pages(folio))
>>> +        return 1;
>>> +    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 
>>> fpb_flags,
>>> +                 NULL, NULL, NULL);
>>> -    return folio_pte_batch(folio, addr, ptep, pte, max_nr, 
>>> fpb_flags, NULL,
>>> -                   NULL, NULL) == max_nr;
>>> +    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>>    }
>>>    /*
>>> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio 
>>> *folio, struct vm_area_struct *vma,
>>>                if (pte_dirty(pteval))
>>>                    folio_mark_dirty(folio);
>>>            } else if (likely(pte_present(pteval))) {
>>> -            if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>>> -                can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>>> -                nr_pages = folio_nr_pages(folio);
>>> +            nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, 
>>> pteval);
>>>                end_addr = address + nr_pages * PAGE_SIZE;
>>>                flush_cache_range(vma, address, end_addr);
>>> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio 
>>> *folio, struct vm_area_struct *vma,
>>>                hugetlb_remove_rmap(folio);
>>>            } else {
>>>                folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>>> -            folio_ref_sub(folio, nr_pages - 1);
>>>            }
>>>            if (vma->vm_flags & VM_LOCKED)
>>>                mlock_drain_local();
>>> -        folio_put(folio);
>>> -        /* We have already batched the entire folio */
>>> -        if (nr_pages > 1)
>>> +        folio_put_refs(folio, nr_pages);
>>> +
>>> +        /*
>>> +         * If we are sure that we batched the entire folio and cleared
>>> +         * all PTEs, we can just optimize and stop right here.
>>> +         */
>>> +        if (nr_pages == folio_nr_pages(folio))
>>>                goto walk_done;
>>>            continue;
>>>    walk_abort:
>>> -- 
>>
>> Oops ... Through testing on my machine, I found that the logic doesn't
>> behave as expected because I messed up the meaning of max_nr (the 
>> available
>> scan room in the page table) with folio_nr_pages(folio) :(
>>
>> With the following change:
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 5ebffe2137e4..b1407348e14e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1850,9 +1850,9 @@ static inline unsigned int 
>> folio_unmap_pte_batch(struct folio *folio,
>>               enum ttu_flags flags, pte_t pte)
>>   {
>>       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +    unsigned int max_nr, nr_pages = folio_nr_pages(folio);
>>       unsigned long end_addr, addr = pvmw->address;
>>       struct vm_area_struct *vma = pvmw->vma;
>> -    unsigned int max_nr;
>>       if (flags & TTU_HWPOISON)
>>           return 1;
>> @@ -1870,12 +1870,13 @@ static inline unsigned int 
>> folio_unmap_pte_batch(struct folio *folio,
>>           return 1;
>>       /* ... where we must be able to batch the whole folio. */
> 
> Why is that still required? :)

Sorry ... I was still stuck in the "all-or-nothing" mindset ...

So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
check and just let folio_pte_batch handle whatever partial batch it
safely can.

> 
>> -    if (pte_pfn(pte) != folio_pfn(folio) || max_nr != 
>> folio_nr_pages(folio))
>> +    if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>>           return 1;
>> -    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 
>> fpb_flags,
>> -                 NULL, NULL, NULL);
>> -    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>> +    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
>> +                 fpb_flags, NULL, NULL, NULL);
>> +
>> +    return (max_nr != nr_pages) ? 1 : max_nr;
> 
> Why is that still required? :)

Then simply return the number of PTEs that consecutively map to the
large folio. Right?
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 26.06.25 15:52, Lance Yang wrote:
> 
> 
> On 2025/6/26 21:16, David Hildenbrand wrote:
>> On 26.06.25 14:44, Lance Yang wrote:
>>>
>>> On 2025/6/26 17:29, Lance Yang wrote:
>>>> Before I send out the real patch, I'd like to get some quick feedback to
>>>> ensure I've understood the discussion correctly ;)
>>>>
>>>> Does this look like the right direction?
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index fb63d9256f09..5ebffe2137e4 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio
>>>> *folio, struct page *page,
>>>>     #endif
>>>>     }
>>>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> -            struct folio *folio, pte_t *ptep)
>>>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>>>> +            struct page_vma_mapped_walk *pvmw,
>>>> +            enum ttu_flags flags, pte_t pte)
>>>>     {
>>>>         const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> -    int max_nr = folio_nr_pages(folio);
>>>> -    pte_t pte = ptep_get(ptep);
>>>> +    unsigned long end_addr, addr = pvmw->address;
>>>> +    struct vm_area_struct *vma = pvmw->vma;
>>>> +    unsigned int max_nr;
>>>> +
>>>> +    if (flags & TTU_HWPOISON)
>>>> +        return 1;
>>>> +    if (!folio_test_large(folio))
>>>> +        return 1;
>>>> +    /* We may only batch within a single VMA and a single page
>>>> table. */
>>>> +    end_addr = pmd_addr_end(addr, vma->vm_end);
>>>> +    max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>>> +
>>>> +    /* We only support lazyfree batching for now ... */
>>>>         if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>>>> -        return false;
>>>> +        return 1;
>>>>         if (pte_unused(pte))
>>>> -        return false;
>>>> -    if (pte_pfn(pte) != folio_pfn(folio))
>>>> -        return false;
>>>> +        return 1;
>>>> +
>>>> +    /* ... where we must be able to batch the whole folio. */
>>>> +    if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>>> folio_nr_pages(folio))
>>>> +        return 1;
>>>> +    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>>> fpb_flags,
>>>> +                 NULL, NULL, NULL);
>>>> -    return folio_pte_batch(folio, addr, ptep, pte, max_nr,
>>>> fpb_flags, NULL,
>>>> -                   NULL, NULL) == max_nr;
>>>> +    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>>>     }
>>>>     /*
>>>> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>>                 if (pte_dirty(pteval))
>>>>                     folio_mark_dirty(folio);
>>>>             } else if (likely(pte_present(pteval))) {
>>>> -            if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>>>> -                can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>>>> -                nr_pages = folio_nr_pages(folio);
>>>> +            nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags,
>>>> pteval);
>>>>                 end_addr = address + nr_pages * PAGE_SIZE;
>>>>                 flush_cache_range(vma, address, end_addr);
>>>> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>>                 hugetlb_remove_rmap(folio);
>>>>             } else {
>>>>                 folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>>>> -            folio_ref_sub(folio, nr_pages - 1);
>>>>             }
>>>>             if (vma->vm_flags & VM_LOCKED)
>>>>                 mlock_drain_local();
>>>> -        folio_put(folio);
>>>> -        /* We have already batched the entire folio */
>>>> -        if (nr_pages > 1)
>>>> +        folio_put_refs(folio, nr_pages);
>>>> +
>>>> +        /*
>>>> +         * If we are sure that we batched the entire folio and cleared
>>>> +         * all PTEs, we can just optimize and stop right here.
>>>> +         */
>>>> +        if (nr_pages == folio_nr_pages(folio))
>>>>                 goto walk_done;
>>>>             continue;
>>>>     walk_abort:
>>>> -- 
>>>
>>> Oops ... Through testing on my machine, I found that the logic doesn't
>>> behave as expected because I messed up the meaning of max_nr (the
>>> available
>>> scan room in the page table) with folio_nr_pages(folio) :(
>>>
>>> With the following change:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 5ebffe2137e4..b1407348e14e 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1850,9 +1850,9 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>>                enum ttu_flags flags, pte_t pte)
>>>    {
>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> +    unsigned int max_nr, nr_pages = folio_nr_pages(folio);
>>>        unsigned long end_addr, addr = pvmw->address;
>>>        struct vm_area_struct *vma = pvmw->vma;
>>> -    unsigned int max_nr;
>>>        if (flags & TTU_HWPOISON)
>>>            return 1;
>>> @@ -1870,12 +1870,13 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>>            return 1;
>>>        /* ... where we must be able to batch the whole folio. */
>>
>> Why is that still required? :)
> 
> Sorry ... I was still stuck in the "all-or-nothing" mindset ...
> 
> So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
> check and just let folio_pte_batch handle whatever partial batch it
> safely can.
> 
>>
>>> -    if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>> folio_nr_pages(folio))
>>> +    if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>>>            return 1;
>>> -    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>> fpb_flags,
>>> -                 NULL, NULL, NULL);
>>> -    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>> +    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
>>> +                 fpb_flags, NULL, NULL, NULL);
>>> +
>>> +    return (max_nr != nr_pages) ? 1 : max_nr;
>>
>> Why is that still required? :)
> 
> Then simply return the number of PTEs that consecutively map to the
> large folio. Right?

Yes. Any part of the large folio. Just return folio_pte_batch() ...

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/26 22:39, David Hildenbrand wrote:
[...]
>>>> @@ -1870,12 +1870,13 @@ static inline unsigned int
>>>> folio_unmap_pte_batch(struct folio *folio,
>>>>            return 1;
>>>>        /* ... where we must be able to batch the whole folio. */
>>>
>>> Why is that still required? :)
>>
>> Sorry ... I was still stuck in the "all-or-nothing" mindset ...
>>
>> So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
>> check and just let folio_pte_batch handle whatever partial batch it
>> safely can.
>>
>>>
>>>> -    if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>>> folio_nr_pages(folio))
>>>> +    if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>>>>            return 1;
>>>> -    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>>> fpb_flags,
>>>> -                 NULL, NULL, NULL);
>>>> -    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>>> +    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
>>>> +                 fpb_flags, NULL, NULL, NULL);
>>>> +
>>>> +    return (max_nr != nr_pages) ? 1 : max_nr;
>>>
>>> Why is that still required? :)
>>
>> Then simply return the number of PTEs that consecutively map to the
>> large folio. Right?
> 
> Yes. Any part of the large folio. Just return folio_pte_batch() ...
> 

Ah, got it. Thanks!
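
So the helper would collapse to roughly the following (just a sketch to
confirm my understanding, not the patch itself): keep the single-VMA /
single-page-table clamp, then simply return whatever folio_pte_batch()
finds, with no whole-folio requirement.

static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
			struct page_vma_mapped_walk *pvmw,
			enum ttu_flags flags, pte_t pte)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	unsigned long end_addr, addr = pvmw->address;
	struct vm_area_struct *vma = pvmw->vma;
	unsigned int max_nr;

	if (flags & TTU_HWPOISON)
		return 1;
	if (!folio_test_large(folio))
		return 1;

	/* We may only batch within a single VMA and a single page table. */
	end_addr = pmd_addr_end(addr, vma->vm_end);
	max_nr = (end_addr - addr) >> PAGE_SHIFT;

	/* We only support lazyfree batching for now ... */
	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
		return 1;
	if (pte_unused(pte))
		return 1;

	/* Batch however many consecutive PTEs of this folio we can. */
	return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
			       NULL, NULL, NULL);
}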

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Lance Yang 7 months, 2 weeks ago

On 2025/6/25 20:25, David Hildenbrand wrote:
> On 25.06.25 14:20, Lance Yang wrote:
>>
>>
>> On 2025/6/25 20:09, David Hildenbrand wrote:
>>>>
>>>> Somehow, I feel we could combine your cleanup code—which handles a 
>>>> batch
>>>> size of "nr" between 1 and nr_pages—with the
>>>> "if (nr_pages == folio_nr_pages(folio)) goto walk_done" check.
>>>
>>> Yeah, that's what I was suggesting. It would have to be part of the
>>> cleanup I think.
>>>
>>> I'm still wondering if there is a case where
>>>
>>> if (nr_pages == folio_nr_pages(folio))
>>>       goto walk_done;
>>>
>>> would be wrong when dealing with small folios.
>>>
>>>> In practice, this would let us skip almost all unnecessary checks,
>>>> except for a few rare corner cases.
>>>>
>>>> For those corner cases where "nr" truly falls between 1 and nr_pages,
>>>> we can just leave them as-is—performing the redundant check inside
>>>> page_vma_mapped_walk().
>>>
>>> I mean, batching mapcount+refcount updates etc. is always a win. If we
>>> end up doing some unnecessary pte_none() checks, that might be
>>> suboptimal but mostly noise in contrast to the other stuff we will
>>> optimize out 🙂
>>>
>>> Agreed that if we can easily avoid these pte_none() checks, we should do
>>> that. Optimizing that for "nr_pages == folio_nr_pages(folio)" makes 
>>> sense.
>>
>> Hmm... I have a question about the reference counting here ...
>>
>>         if (vma->vm_flags & VM_LOCKED)
>>             mlock_drain_local();
>>         folio_put(folio);
>>         /* We have already batched the entire folio */
>>
>> Does anyone else still hold a reference to this folio after folio_put()?
> 
> The caller of the unmap operation should better hold a reference :)

Ah, you're right. I should have realized that :(

Thanks,
Lance

> 
> Also, I am not sure why we don't perform a
> 
> folio_put_refs(folio, nr_pages);
> 
> ... :)
> 

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 12:47, Lance Yang wrote:
> 
> 
> On 2025/6/25 18:00, David Hildenbrand wrote:
>> On 24.06.25 18:25, Lance Yang wrote:
>>> On 2025/6/24 23:34, David Hildenbrand wrote:
>>>> On 24.06.25 17:26, Lance Yang wrote:
>>>>> On 2025/6/24 20:55, David Hildenbrand wrote:
>>>>>> On 14.02.25 10:30, Barry Song wrote:
>>>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>> [...]
>>>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>>>> index 89e51a7a9509..8786704bd466 100644
>>>>>>> --- a/mm/rmap.c
>>>>>>> +++ b/mm/rmap.c
>>>>>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio
>>>>>>> *folio,
>>>>>>> struct page *page,
>>>>>>>      #endif
>>>>>>>      }
>>>>>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
>>>>>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>>>>> +            struct folio *folio, pte_t *ptep)
>>>>>>> +{
>>>>>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
>>>>>>> FPB_IGNORE_SOFT_DIRTY;
>>>>>>> +    int max_nr = folio_nr_pages(folio);
>>>>>>
>>>>>> Let's assume we have the first page of a folio mapped at the last page
>>>>>> table entry in our page table.
>>>>>
>>>>> Good point. I'm curious if it is something we've seen in practice ;)
>>>>
>>>> I challenge you to write a reproducer :P I assume it might be doable
>>>> through simple mremap().
>>>>
>>>>>
>>>>>>
>>>>>> What prevents folio_pte_batch() from reading outside the page table?
>>>>>
>>>>> Assuming such a scenario is possible, to prevent any chance of an
>>>>> out-of-bounds read, how about this change:
>>>>>
>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>> index fb63d9256f09..9aeae811a38b 100644
>>>>> --- a/mm/rmap.c
>>>>> +++ b/mm/rmap.c
>>>>> @@ -1852,6 +1852,25 @@ static inline bool
>>>>> can_batch_unmap_folio_ptes(unsigned long addr,
>>>>>         const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
>>>>> FPB_IGNORE_SOFT_DIRTY;
>>>>>         int max_nr = folio_nr_pages(folio);
>>>>>         pte_t pte = ptep_get(ptep);
>>>>> +    unsigned long end_addr;
>>>>> +
>>>>> +    /*
>>>>> +     * To batch unmap, the entire folio's PTEs must be contiguous
>>>>> +     * and mapped within the same PTE page table, which corresponds to
>>>>> +     * a single PMD entry. Before calling folio_pte_batch(), which
>>>>> does
>>>>> +     * not perform boundary checks itself, we must verify that the
>>>>> +     * address range covered by the folio does not cross a PMD
>>>>> boundary.
>>>>> +     */
>>>>> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
>>>>> +
>>>>> +    /*
>>>>> +     * A fast way to check for a PMD boundary cross is to align both
>>>>> +     * the start and end addresses to the PMD boundary and see if they
>>>>> +     * are different. If they are, the range spans across at least two
>>>>> +     * different PMD-managed regions.
>>>>> +     */
>>>>> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
>>>>> +        return false;
>>>>
>>>> You should not be messing with max_nr = folio_nr_pages(folio) here at
>>>> all. folio_pte_batch() takes care of that.
>>>>
>>>> Also, way too many comments ;)
>>>>
>>>> You may only batch within a single VMA and within a single page table.
>>>>
>>>> So simply align the addr up to the next PMD, and make sure it does not
>>>> exceed the vma end.
>>>>
>>>> ALIGN and friends can help avoiding excessive comments.
>>>
>>> Thanks! How about this updated version based on your suggestion:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index fb63d9256f09..241d55a92a47 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio
>>> *folio, struct page *page,
>>>    /* We support batch unmapping of PTEs for lazyfree large folios */
>>>    static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>> -            struct folio *folio, pte_t *ptep)
>>> +                          struct folio *folio, pte_t *ptep,
>>> +                          struct vm_area_struct *vma)
>>>    {
>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> +    unsigned long next_pmd, vma_end, end_addr;
>>>        int max_nr = folio_nr_pages(folio);
>>>        pte_t pte = ptep_get(ptep);
>>> +    /*
>>> +     * Limit the batch scan within a single VMA and within a single
>>> +     * page table.
>>> +     */
>>> +    vma_end = vma->vm_end;
>>> +    next_pmd = ALIGN(addr + 1, PMD_SIZE);
>>> +    end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
>>> +
>>> +    if (end_addr > min(next_pmd, vma_end))
>>> +        return false;
>>
>> May I suggest that we clean all that up as we fix it?
> 
> Yeah, that looks much better. Thanks for the suggestion!
> 
>>
>> Maybe something like this:
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 3b74bb19c11dd..11fbddc6ad8d6 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio,
>> struct page *page,
>>    #endif
>>    }
>>
>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>> -                       struct folio *folio, pte_t *ptep)
>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
>> +               pte_t pte)
>>    {
>>           const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> -       int max_nr = folio_nr_pages(folio);
>> -       pte_t pte = ptep_get(ptep);
>> +       struct vm_area_struct *vma = pvmw->vma;
>> +       unsigned long end_addr, addr = pvmw->address;
>> +       unsigned int max_nr;
>> +
>> +       if (flags & TTU_HWPOISON)
>> +               return 1;
>> +       if (!folio_test_large(folio))
>> +               return 1;
>> +
>> +       /* We may only batch within a single VMA and a single page
>> table. */
>> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma-
>>   >vm_end);
>> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>
>> +       /* We only support lazyfree batching for now ... */
>>           if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>> -               return false;
>> +               return 1;
>>           if (pte_unused(pte))
>> -               return false;
>> -       if (pte_pfn(pte) != folio_pfn(folio))
>> -               return false;
>> +               return 1;
>> +       /* ... where we must be able to batch the whole folio. */
>> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>> folio_nr_pages(folio))
>> +               return 1;
>> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>> fpb_flags,
>> +                                NULL, NULL, NULL);
>>
>> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr,
>> fpb_flags, NULL,
>> -                              NULL, NULL) == max_nr;
>> +       if (max_nr != folio_nr_pages(folio))
>> +               return 1;
>> +       return max_nr;
>>    }
>>
>>    /*
>> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio,
>> struct vm_area_struct *vma,
>>                           if (pte_dirty(pteval))
>>                                   folio_mark_dirty(folio);
>>                   } else if (likely(pte_present(pteval))) {
>> -                       if (folio_test_large(folio) && !(flags &
>> TTU_HWPOISON) &&
>> -                           can_batch_unmap_folio_ptes(address, folio,
>> pvmw.pte))
>> -                               nr_pages = folio_nr_pages(folio);
>> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw,
>> flags, pteval);
>>                           end_addr = address + nr_pages * PAGE_SIZE;
>>                           flush_cache_range(vma, address, end_addr);
>>
>>
>> Note that I don't quite understand why we have to batch the whole thing
>> or fallback to
>> individual pages. Why can't we perform other batches that span only some
>> PTEs? What's special
>> about 1 PTE vs. 2 PTEs vs. all PTEs?
> 
> That's a good point about the "all-or-nothing" batching logic ;)
> 
> It seems the "all-or-nothing" approach is specific to the lazyfree use
> case, which needs to unmap the entire folio for reclamation. If that's
> not possible, it falls back to the single-page slow path.
> 
> Also, supporting partial batches would be useful, but not common case
> I guess, so let's leave it as is ;p

We can literally make this code less complicated if we just support it? :)

I mean, it's dropping 3 if conditions from the code I shared.

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index fb63d9256f09..241d55a92a47 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
> >
> >   /* We support batch unmapping of PTEs for lazyfree large folios */
> >   static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> > -                     struct folio *folio, pte_t *ptep)
> > +                                           struct folio *folio, pte_t *ptep,
> > +                                           struct vm_area_struct *vma)
> >   {
> >       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> > +     unsigned long next_pmd, vma_end, end_addr;
> >       int max_nr = folio_nr_pages(folio);
> >       pte_t pte = ptep_get(ptep);
> >
> > +     /*
> > +      * Limit the batch scan within a single VMA and within a single
> > +      * page table.
> > +      */
> > +     vma_end = vma->vm_end;
> > +     next_pmd = ALIGN(addr + 1, PMD_SIZE);
> > +     end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
> > +
> > +     if (end_addr > min(next_pmd, vma_end))
> > +             return false;
>
> May I suggest that we clean all that up as we fix it?
>
> Maybe something like this:
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 3b74bb19c11dd..11fbddc6ad8d6 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>   #endif
>   }
>
> -/* We support batch unmapping of PTEs for lazyfree large folios */
> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -                       struct folio *folio, pte_t *ptep)
> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
> +               pte_t pte)
>   {
>          const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> -       int max_nr = folio_nr_pages(folio);
> -       pte_t pte = ptep_get(ptep);
> +       struct vm_area_struct *vma = pvmw->vma;
> +       unsigned long end_addr, addr = pvmw->address;
> +       unsigned int max_nr;
> +
> +       if (flags & TTU_HWPOISON)
> +               return 1;
> +       if (!folio_test_large(folio))
> +               return 1;
> +
> +       /* We may only batch within a single VMA and a single page table. */
> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);

Is this pmd_addr_end()?

> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
>
> +       /* We only support lazyfree batching for now ... */
>          if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
> -               return false;
> +               return 1;
>          if (pte_unused(pte))
> -               return false;
> -       if (pte_pfn(pte) != folio_pfn(folio))
> -               return false;
> +               return 1;
> +       /* ... where we must be able to batch the whole folio. */
> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
> +               return 1;
> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> +                                NULL, NULL, NULL);
>
> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
> -                              NULL, NULL) == max_nr;
> +       if (max_nr != folio_nr_pages(folio))
> +               return 1;
> +       return max_nr;
>   }
>
>   /*
> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>                          if (pte_dirty(pteval))
>                                  folio_mark_dirty(folio);
>                  } else if (likely(pte_present(pteval))) {
> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> -                               nr_pages = folio_nr_pages(folio);
> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>                          end_addr = address + nr_pages * PAGE_SIZE;
>                          flush_cache_range(vma, address, end_addr);
>
>
> Note that I don't quite understand why we have to batch the whole thing or fallback to
> individual pages. Why can't we perform other batches that span only some PTEs? What's special
> about 1 PTE vs. 2 PTEs vs. all PTEs?
>
>
> Can someone enlighten me why that is required?

It's probably not a strict requirement — I thought cases where the
count is greater than 1 but less than nr_pages might not provide much
practical benefit, except perhaps in very rare edge cases, since
madv_free() already calls split_folio().

if (folio_test_large(folio)) {
	bool any_young, any_dirty;

	nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent,
				     &any_young, &any_dirty);

	if (nr < folio_nr_pages(folio)) {
		...
		err = split_folio(folio);
		...
	}
}

Another reason is that when we extend this to non-lazyfree anonymous
folios [1], things get complicated: checking anon_exclusive and updating
folio_try_share_anon_rmap_pte with the number of PTEs becomes tricky if
a folio is partially exclusive and partially shared.

[1] https://lore.kernel.org/linux-mm/20250513084620.58231-1-21cnbao@gmail.com/

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 12:38, Barry Song wrote:
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index fb63d9256f09..241d55a92a47 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>>>
>>>    /* We support batch unmapping of PTEs for lazyfree large folios */
>>>    static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>> -                     struct folio *folio, pte_t *ptep)
>>> +                                           struct folio *folio, pte_t *ptep,
>>> +                                           struct vm_area_struct *vma)
>>>    {
>>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> +     unsigned long next_pmd, vma_end, end_addr;
>>>        int max_nr = folio_nr_pages(folio);
>>>        pte_t pte = ptep_get(ptep);
>>>
>>> +     /*
>>> +      * Limit the batch scan within a single VMA and within a single
>>> +      * page table.
>>> +      */
>>> +     vma_end = vma->vm_end;
>>> +     next_pmd = ALIGN(addr + 1, PMD_SIZE);
>>> +     end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
>>> +
>>> +     if (end_addr > min(next_pmd, vma_end))
>>> +             return false;
>>
>> May I suggest that we clean all that up as we fix it?
>>
>> Maybe something like this:
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 3b74bb19c11dd..11fbddc6ad8d6 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>>    #endif
>>    }
>>
>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>> -                       struct folio *folio, pte_t *ptep)
>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
>> +               pte_t pte)
>>    {
>>           const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> -       int max_nr = folio_nr_pages(folio);
>> -       pte_t pte = ptep_get(ptep);
>> +       struct vm_area_struct *vma = pvmw->vma;
>> +       unsigned long end_addr, addr = pvmw->address;
>> +       unsigned int max_nr;
>> +
>> +       if (flags & TTU_HWPOISON)
>> +               return 1;
>> +       if (!folio_test_large(folio))
>> +               return 1;
>> +
>> +       /* We may only batch within a single VMA and a single page table. */
>> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
> 
> Is this pmd_addr_end()?
> 

Yes, that could be reused as well here.
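
That is, the open-coded min_t()/ALIGN() above becomes roughly:

	/* We may only batch within a single VMA and a single page table. */
	end_addr = pmd_addr_end(addr, vma->vm_end);
	max_nr = (end_addr - addr) >> PAGE_SHIFT;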

>> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>
>> +       /* We only support lazyfree batching for now ... */
>>           if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>> -               return false;
>> +               return 1;
>>           if (pte_unused(pte))
>> -               return false;
>> -       if (pte_pfn(pte) != folio_pfn(folio))
>> -               return false;
>> +               return 1;
>> +       /* ... where we must be able to batch the whole folio. */
>> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
>> +               return 1;
>> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
>> +                                NULL, NULL, NULL);
>>
>> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
>> -                              NULL, NULL) == max_nr;
>> +       if (max_nr != folio_nr_pages(folio))
>> +               return 1;
>> +       return max_nr;
>>    }
>>
>>    /*
>> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>                           if (pte_dirty(pteval))
>>                                   folio_mark_dirty(folio);
>>                   } else if (likely(pte_present(pteval))) {
>> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>> -                               nr_pages = folio_nr_pages(folio);
>> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>>                           end_addr = address + nr_pages * PAGE_SIZE;
>>                           flush_cache_range(vma, address, end_addr);
>>
>>
>> Note that I don't quite understand why we have to batch the whole thing or fallback to
>> individual pages. Why can't we perform other batches that span only some PTEs? What's special
>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>
>>
>> Can someone enlighten me why that is required?
> 
> It's probably not a strict requirement — I thought cases where the
> count is greater than 1 but less than nr_pages might not provide much
> practical benefit, except perhaps in very rare edge cases, since
> madv_free() already calls split_folio().

Okay, but it makes the code more complicated. If there is no reason to
prevent the batching, we should drop it.

> 
> if (folio_test_large(folio)) {
> 	bool any_young, any_dirty;
> 
> 	nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent,
> 				     &any_young, &any_dirty);
> 
> 	if (nr < folio_nr_pages(folio)) {
> 		...
> 		err = split_folio(folio);
> 		...
> 	}
> }
> 
> Another reason is that when we extend this to non-lazyfree anonymous
> folios [1], things get complicated: checking anon_exclusive and updating
> folio_try_share_anon_rmap_pte with the number of PTEs becomes tricky if
> a folio is partially exclusive and partially shared.

Right, but that's just another limitation on top of how much we can batch, 
right?


-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Wed, Jun 25, 2025 at 10:43 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 25.06.25 12:38, Barry Song wrote:
> >>> diff --git a/mm/rmap.c b/mm/rmap.c
> >>> index fb63d9256f09..241d55a92a47 100644
> >>> --- a/mm/rmap.c
> >>> +++ b/mm/rmap.c
> >>> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
> >>>
> >>>    /* We support batch unmapping of PTEs for lazyfree large folios */
> >>>    static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> >>> -                     struct folio *folio, pte_t *ptep)
> >>> +                                           struct folio *folio, pte_t *ptep,
> >>> +                                           struct vm_area_struct *vma)
> >>>    {
> >>>        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> >>> +     unsigned long next_pmd, vma_end, end_addr;
> >>>        int max_nr = folio_nr_pages(folio);
> >>>        pte_t pte = ptep_get(ptep);
> >>>
> >>> +     /*
> >>> +      * Limit the batch scan within a single VMA and within a single
> >>> +      * page table.
> >>> +      */
> >>> +     vma_end = vma->vm_end;
> >>> +     next_pmd = ALIGN(addr + 1, PMD_SIZE);
> >>> +     end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
> >>> +
> >>> +     if (end_addr > min(next_pmd, vma_end))
> >>> +             return false;
> >>
> >> May I suggest that we clean all that up as we fix it?
> >>
> >> Maybe something like this:
> >>
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index 3b74bb19c11dd..11fbddc6ad8d6 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
> >>    #endif
> >>    }
> >>
> >> -/* We support batch unmapping of PTEs for lazyfree large folios */
> >> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> >> -                       struct folio *folio, pte_t *ptep)
> >> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
> >> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
> >> +               pte_t pte)
> >>    {
> >>           const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> >> -       int max_nr = folio_nr_pages(folio);
> >> -       pte_t pte = ptep_get(ptep);
> >> +       struct vm_area_struct *vma = pvmw->vma;
> >> +       unsigned long end_addr, addr = pvmw->address;
> >> +       unsigned int max_nr;
> >> +
> >> +       if (flags & TTU_HWPOISON)
> >> +               return 1;
> >> +       if (!folio_test_large(folio))
> >> +               return 1;
> >> +
> >> +       /* We may only batch within a single VMA and a single page table. */
> >> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
> >
> > Is this pmd_addr_end()?
> >
>
> Yes, that could be reused as well here.
>
> >> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
> >>
> >> +       /* We only support lazyfree batching for now ... */
> >>           if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
> >> -               return false;
> >> +               return 1;
> >>           if (pte_unused(pte))
> >> -               return false;
> >> -       if (pte_pfn(pte) != folio_pfn(folio))
> >> -               return false;
> >> +               return 1;
> >> +       /* ... where we must be able to batch the whole folio. */
> >> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
> >> +               return 1;
> >> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
> >> +                                NULL, NULL, NULL);
> >>
> >> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
> >> -                              NULL, NULL) == max_nr;
> >> +       if (max_nr != folio_nr_pages(folio))
> >> +               return 1;
> >> +       return max_nr;
> >>    }
> >>
> >>    /*
> >> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >>                           if (pte_dirty(pteval))
> >>                                   folio_mark_dirty(folio);
> >>                   } else if (likely(pte_present(pteval))) {
> >> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> >> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
> >> -                               nr_pages = folio_nr_pages(folio);
> >> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
> >>                           end_addr = address + nr_pages * PAGE_SIZE;
> >>                           flush_cache_range(vma, address, end_addr);
> >>
> >>
> >> Note that I don't quite understand why we have to batch the whole thing or fallback to
> >> individual pages. Why can't we perform other batches that span only some PTEs? What's special
> >> about 1 PTE vs. 2 PTEs vs. all PTEs?
> >>
> >>
> >> Can someone enlighten me why that is required?
> >
> > It's probably not a strict requirement — I thought cases where the
> > count is greater than 1 but less than nr_pages might not provide much
> > practical benefit, except perhaps in very rare edge cases, since
> > madv_free() already calls split_folio().
>
> Okay, but it makes the code more complicated. If there is no reason to
> prevent the batching, we should drop it.

It's not necessarily more complex, since page_vma_mapped_walk() still
has to check each PTE individually and can't skip ahead based on nr.
With nr_pages batched, we can exit the loop early in one go.
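
To make that concrete, here is a toy userspace model (purely illustrative,
not the kernel walker or its API; FOLIO_NR_PAGES is just a made-up constant)
of how a whole-folio nr lets the caller leave the per-PTE loop in one step:

 #include <stdio.h>

 #define FOLIO_NR_PAGES 16	/* pretend a 64KiB folio of 4KiB pages */

 int main(void)
 {
 	int iterations = 0;

 	for (int pte = 0; pte < FOLIO_NR_PAGES; ) {
 		/* batch the whole folio at its first PTE, else handle one */
 		int nr = (pte == 0) ? FOLIO_NR_PAGES : 1;

 		pte += nr;
 		iterations++;
 		if (nr == FOLIO_NR_PAGES)
 			break;	/* nothing left to visit: stop the walk */
 	}

 	printf("%d loop iteration(s) instead of %d\n",
 		iterations, FOLIO_NR_PAGES);
 	return 0;
 }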

Thanks
Barry
Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by David Hildenbrand 7 months, 2 weeks ago
On 25.06.25 12:49, Barry Song wrote:
> On Wed, Jun 25, 2025 at 10:43 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 25.06.25 12:38, Barry Song wrote:
>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>> index fb63d9256f09..241d55a92a47 100644
>>>>> --- a/mm/rmap.c
>>>>> +++ b/mm/rmap.c
>>>>> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>>>>>
>>>>>     /* We support batch unmapping of PTEs for lazyfree large folios */
>>>>>     static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>>> -                     struct folio *folio, pte_t *ptep)
>>>>> +                                           struct folio *folio, pte_t *ptep,
>>>>> +                                           struct vm_area_struct *vma)
>>>>>     {
>>>>>         const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>>> +     unsigned long next_pmd, vma_end, end_addr;
>>>>>         int max_nr = folio_nr_pages(folio);
>>>>>         pte_t pte = ptep_get(ptep);
>>>>>
>>>>> +     /*
>>>>> +      * Limit the batch scan within a single VMA and within a single
>>>>> +      * page table.
>>>>> +      */
>>>>> +     vma_end = vma->vm_end;
>>>>> +     next_pmd = ALIGN(addr + 1, PMD_SIZE);
>>>>> +     end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
>>>>> +
>>>>> +     if (end_addr > min(next_pmd, vma_end))
>>>>> +             return false;
>>>>
>>>> May I suggest that we clean all that up as we fix it?
>>>>
>>>> Maybe something like this:
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 3b74bb19c11dd..11fbddc6ad8d6 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1845,23 +1845,38 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>>>>     #endif
>>>>     }
>>>>
>>>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> -                       struct folio *folio, pte_t *ptep)
>>>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>>>> +               struct page_vma_mapped_walk *pvmw, enum ttu_flags flags,
>>>> +               pte_t pte)
>>>>     {
>>>>            const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> -       int max_nr = folio_nr_pages(folio);
>>>> -       pte_t pte = ptep_get(ptep);
>>>> +       struct vm_area_struct *vma = pvmw->vma;
>>>> +       unsigned long end_addr, addr = pvmw->address;
>>>> +       unsigned int max_nr;
>>>> +
>>>> +       if (flags & TTU_HWPOISON)
>>>> +               return 1;
>>>> +       if (!folio_test_large(folio))
>>>> +               return 1;
>>>> +
>>>> +       /* We may only batch within a single VMA and a single page table. */
>>>> +       end_addr = min_t(unsigned long, ALIGN(addr + 1, PMD_SIZE), vma->vm_end);
>>>
>>> Is this pmd_addr_end()?
>>>
>>
>> Yes, that could be reused as well here.
>>
>>>> +       max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>>>
>>>> +       /* We only support lazyfree batching for now ... */
>>>>            if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>>>> -               return false;
>>>> +               return 1;
>>>>            if (pte_unused(pte))
>>>> -               return false;
>>>> -       if (pte_pfn(pte) != folio_pfn(folio))
>>>> -               return false;
>>>> +               return 1;
>>>> +       /* ... where we must be able to batch the whole folio. */
>>>> +       if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
>>>> +               return 1;
>>>> +       max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
>>>> +                                NULL, NULL, NULL);
>>>>
>>>> -       return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
>>>> -                              NULL, NULL) == max_nr;
>>>> +       if (max_nr != folio_nr_pages(folio))
>>>> +               return 1;
>>>> +       return max_nr;
>>>>     }
>>>>
>>>>     /*
>>>> @@ -2024,9 +2039,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>>>                            if (pte_dirty(pteval))
>>>>                                    folio_mark_dirty(folio);
>>>>                    } else if (likely(pte_present(pteval))) {
>>>> -                       if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>>>> -                           can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>>>> -                               nr_pages = folio_nr_pages(folio);
>>>> +                       nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
>>>>                            end_addr = address + nr_pages * PAGE_SIZE;
>>>>                            flush_cache_range(vma, address, end_addr);
>>>>
>>>>
>>>> Note that I don't quite understand why we have to batch the whole thing or fallback to
>>>> individual pages. Why can't we perform other batches that span only some PTEs? What's special
>>>> about 1 PTE vs. 2 PTEs vs. all PTEs?
>>>>
>>>>
>>>> Can someone enlighten me why that is required?
>>>
>>> It's probably not a strict requirement — I thought cases where the
>>> count is greater than 1 but less than nr_pages might not provide much
>>> practical benefit, except perhaps in very rare edge cases, since
>>> madv_free() already calls split_folio().
>>
>> Okay, but it makes the code more complicated. If there is no reason to
>> prevent the batching, we should drop it.
> 
> It's not necessarily more complex, since page_vma_mapped_walk() still
> has to check each PTE individually and can't skip ahead based on nr.
> With nr_pages batched, we can exit the loop early in one go.

I said "complicated", not "complex". The code is more complicated than 
necessary.

-- 
Cheers,

David / dhildenb

Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Posted by Barry Song 7 months, 2 weeks ago
On Wed, Jun 25, 2025 at 4:27 AM Lance Yang <ioworker0@gmail.com> wrote:
>
> On 2025/6/24 23:34, David Hildenbrand wrote:
> > On 24.06.25 17:26, Lance Yang wrote:
> >> On 2025/6/24 20:55, David Hildenbrand wrote:
> >>> On 14.02.25 10:30, Barry Song wrote:
> >>>> From: Barry Song <v-songbaohua@oppo.com>
> >> [...]
> >>>> diff --git a/mm/rmap.c b/mm/rmap.c
> >>>> index 89e51a7a9509..8786704bd466 100644
> >>>> --- a/mm/rmap.c
> >>>> +++ b/mm/rmap.c
> >>>> @@ -1781,6 +1781,25 @@ void folio_remove_rmap_pud(struct folio *folio,
> >>>> struct page *page,
> >>>>    #endif
> >>>>    }
> >>>> +/* We support batch unmapping of PTEs for lazyfree large folios */
> >>>> +static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> >>>> +            struct folio *folio, pte_t *ptep)
> >>>> +{
> >>>> +    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> >>>> +    int max_nr = folio_nr_pages(folio);
> >>>
> >>> Let's assume we have the first page of a folio mapped at the last page
> >>> table entry in our page table.
> >>
> >> Good point. I'm curious if it is something we've seen in practice ;)
> >
> > I challenge you to write a reproducer :P I assume it might be doable
> > through simple mremap().
> >
> >>
> >>>
> >>> What prevents folio_pte_batch() from reading outside the page table?
> >>
> >> Assuming such a scenario is possible, to prevent any chance of an
> >> out-of-bounds read, how about this change:
> >>
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index fb63d9256f09..9aeae811a38b 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -1852,6 +1852,25 @@ static inline bool
> >> can_batch_unmap_folio_ptes(unsigned long addr,
> >>       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> >>       int max_nr = folio_nr_pages(folio);
> >>       pte_t pte = ptep_get(ptep);
> >> +    unsigned long end_addr;
> >> +
> >> +    /*
> >> +     * To batch unmap, the entire folio's PTEs must be contiguous
> >> +     * and mapped within the same PTE page table, which corresponds to
> >> +     * a single PMD entry. Before calling folio_pte_batch(), which does
> >> +     * not perform boundary checks itself, we must verify that the
> >> +     * address range covered by the folio does not cross a PMD boundary.
> >> +     */
> >> +    end_addr = addr + (max_nr * PAGE_SIZE) - 1;
> >> +
> >> +    /*
> >> +     * A fast way to check for a PMD boundary cross is to align both
> >> +     * the start and end addresses to the PMD boundary and see if they
> >> +     * are different. If they are, the range spans across at least two
> >> +     * different PMD-managed regions.
> >> +     */
> >> +    if ((addr & PMD_MASK) != (end_addr & PMD_MASK))
> >> +        return false;
> >
> > You should not be messing with max_nr = folio_nr_pages(folio) here at
> > all. folio_pte_batch() takes care of that.
> >
> > Also, way too many comments ;)
> >
> > You may only batch within a single VMA and within a single page table.
> >
> > So simply align the addr up to the next PMD, and make sure it does not
> > exceed the vma end.
> >
> > ALIGN and friends can help avoiding excessive comments.
>
> Thanks! How about this updated version based on your suggestion:
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fb63d9256f09..241d55a92a47 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1847,12 +1847,25 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>
>  /* We support batch unmapping of PTEs for lazyfree large folios */
>  static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
> -                       struct folio *folio, pte_t *ptep)
> +                                             struct folio *folio, pte_t *ptep,
> +                                             struct vm_area_struct *vma)
>  {
>         const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +       unsigned long next_pmd, vma_end, end_addr;
>         int max_nr = folio_nr_pages(folio);
>         pte_t pte = ptep_get(ptep);
>
> +       /*
> +        * Limit the batch scan within a single VMA and within a single
> +        * page table.
> +        */
> +       vma_end = vma->vm_end;
> +       next_pmd = ALIGN(addr + 1, PMD_SIZE);
> +       end_addr = addr + (unsigned long)max_nr * PAGE_SIZE;
> +
> +       if (end_addr > min(next_pmd, vma_end))
> +               return false;
> +

I had a similar check in do_swap_page() to guard against running off the
page table in either direction, but I forgot to add one for this unmap path.

This is the check in do_swap_page():

        if (folio_test_large(folio) && folio_test_swapcache(folio)) {
                int nr = folio_nr_pages(folio);
                unsigned long idx = folio_page_idx(folio, page);
                unsigned long folio_start = address - idx * PAGE_SIZE;
                unsigned long folio_end = folio_start + nr * PAGE_SIZE;

                pte_t *folio_ptep;
                pte_t folio_pte;

                if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
                        goto check_folio;
                if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
                        goto check_folio;
       }

So maybe something like folio_end > pmd_addr_end(address, vma->vm_end)?
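
For my own sanity, a quick userspace-only sketch (assuming 4KiB pages and
2MiB PMDs; pmd_addr_end_like(), ALIGN() and MIN() are simplified local
stand-ins, not the real kernel macros) that the ALIGN()+min() bound in the
diff above and a pmd_addr_end()-style bound pick the same end address:

 #include <assert.h>
 #include <stdio.h>

 #define PAGE_SIZE	4096UL
 #define PMD_SIZE	(512 * PAGE_SIZE)
 #define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
 #define MIN(a, b)	((a) < (b) ? (a) : (b))

 /* simplified pmd_addr_end(): next PMD boundary after addr, clamped to end */
 static unsigned long pmd_addr_end_like(unsigned long addr, unsigned long end)
 {
 	unsigned long boundary = (addr + PMD_SIZE) & ~(PMD_SIZE - 1);

 	return boundary < end ? boundary : end;
 }

 int main(void)
 {
 	unsigned long vma_end = 0x700000600000UL;

 	/* walk the last few PMDs of the VMA, page by page */
 	for (unsigned long addr = vma_end - 4 * PMD_SIZE;
 	     addr < vma_end; addr += PAGE_SIZE) {
 		unsigned long a = MIN(ALIGN(addr + 1, PMD_SIZE), vma_end);
 		unsigned long b = pmd_addr_end_like(addr, vma_end);

 		assert(a == b);
 	}
 	printf("both upper bounds agree\n");
 	return 0;
 }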

Thanks
Barry