[PATCH 3/3] mm: Optimize mremap() by PTE batching

Posted by Dev Jain 7 months, 2 weeks ago
Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
so as to elide TLBIs on each contig block, which was previously done by
ptep_get_and_clear().

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/mremap.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 1a08a7c3b92f..3621c07d8eea 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 	struct vm_area_struct *vma = pmc->old;
 	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
 	struct mm_struct *mm = vma->vm_mm;
-	pte_t *old_ptep, *new_ptep, pte;
+	pte_t *old_ptep, *new_ptep, old_pte, pte;
 	pmd_t dummy_pmdval;
 	spinlock_t *old_ptl, *new_ptl;
 	bool force_flush = false;
@@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 	unsigned long old_end = old_addr + extent;
 	unsigned long len = old_end - old_addr;
 	int err = 0;
+	int nr;
 
 	/*
 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
 
 	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
 				   new_ptep++, new_addr += PAGE_SIZE) {
-		if (pte_none(ptep_get(old_ptep)))
+		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
+
+		nr = 1;
+		old_pte = ptep_get(old_ptep);
+		if (pte_none(old_pte))
 			continue;
 
-		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
 		/*
 		 * If we are remapping a valid PTE, make sure
 		 * to flush TLB before we drop the PTL for the
@@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
 		 * the TLB entry for the old mapping has been
 		 * flushed.
 		 */
-		if (pte_present(pte))
+		if (pte_present(old_pte)) {
+			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
+				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
+
+				if (folio && folio_test_large(folio))
+					nr = folio_pte_batch(folio, old_addr, old_ptep,
+					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
+			}
 			force_flush = true;
+		}
+		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
 		pte = move_pte(pte, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
 
@@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 				else if (is_swap_pte(pte))
 					pte = pte_swp_clear_uffd_wp(pte);
 			}
-			set_pte_at(mm, new_addr, new_ptep, pte);
+			set_ptes(mm, new_addr, new_ptep, pte, nr);
 		}
 	}
 
-- 
2.30.2
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Lorenzo Stoakes 7 months, 2 weeks ago
On Tue, May 06, 2025 at 10:30:56AM +0530, Dev Jain wrote:
> Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
> so as to elide TLBIs on each contig block, which was previously done by
> ptep_get_and_clear().

No mention of large folios

>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/mremap.c | 24 +++++++++++++++++++-----
>  1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 1a08a7c3b92f..3621c07d8eea 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  	struct vm_area_struct *vma = pmc->old;
>  	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
>  	struct mm_struct *mm = vma->vm_mm;
> -	pte_t *old_ptep, *new_ptep, pte;
> +	pte_t *old_ptep, *new_ptep, old_pte, pte;

Obviously given previous comment you know what I'm going to say here :) let's
put old_pte, pte in a new decl.

>  	pmd_t dummy_pmdval;
>  	spinlock_t *old_ptl, *new_ptl;
>  	bool force_flush = false;
> @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  	unsigned long old_end = old_addr + extent;
>  	unsigned long len = old_end - old_addr;
>  	int err = 0;
> +	int nr;
>
>  	/*
>  	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
> @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
>
>  	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
>  				   new_ptep++, new_addr += PAGE_SIZE) {

Hm this just seems wrong, even if we're dealing with a large folio we're still
offsetting by PAGE_SIZE each time and iterating through further sub-pages?

Shouldn't we be doing something like += nr and += PAGE_SIZE * nr?

Then it'd make sense to initialise nr to 1.
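i.e. something roughly like (illustrative only, with nr kept at function
scope as in this version so that it is visible to the loop increments):

	for (; old_addr < old_end; old_ptep += nr, old_addr += nr * PAGE_SIZE,
				   new_ptep += nr, new_addr += nr * PAGE_SIZE) {
		/* Reset each iteration; the batching logic may raise it. */
		nr = 1;
		...
	}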

Honestly I'd prefer us though to refactor move_ptes() to something like:

	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
				   new_ptep++, new_addr += PAGE_SIZE) {
		pte_t old_pte = ptep_get(old_ptep);

		if (pte_none(old_pte))
			continue;

		move_pte(pmc, vma, old_ptep, old_pte);
	}

Declaring this new move_pte() where you can put the rest of the stuff.

I'd much rather we do this than add to the mess as-is.



> -		if (pte_none(ptep_get(old_ptep)))
> +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
> +
> +		nr = 1;
> +		old_pte = ptep_get(old_ptep);

You can declare this in the for loop, no need for us to contaminate whole
function scope with it.

Same with 'nr' in this implementation (though I'd rather you changed it up, see
above).

> +		if (pte_none(old_pte))
>  			continue;
>
> -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
>  		/*
>  		 * If we are remapping a valid PTE, make sure
>  		 * to flush TLB before we drop the PTL for the
> @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  		 * the TLB entry for the old mapping has been
>  		 * flushed.
>  		 */
> -		if (pte_present(pte))
> +		if (pte_present(old_pte)) {
> +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
> +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
> +
> +				if (folio && folio_test_large(folio))
> +					nr = folio_pte_batch(folio, old_addr, old_ptep,
> +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);

Indentation seems completely broken here? I also hate that we're nesting to this
degree? Can we please find a way not to?

This function is already a bit of a clogged mess, I'd rather we clean up then
add to it.

(See above again :)


> +			}
>  			force_flush = true;
> +		}
> +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
>  		pte = move_pte(pte, old_addr, new_addr);
>  		pte = move_soft_dirty_pte(pte);
>
> @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  				else if (is_swap_pte(pte))
>  					pte = pte_swp_clear_uffd_wp(pte);
>  			}
> -			set_pte_at(mm, new_addr, new_ptep, pte);
> +			set_ptes(mm, new_addr, new_ptep, pte, nr);
>  		}
>  	}
>
> --
> 2.30.2
>

Cheers, Lorenzo
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Dev Jain 7 months, 2 weeks ago

On 06/05/25 7:19 pm, Lorenzo Stoakes wrote:
> On Tue, May 06, 2025 at 10:30:56AM +0530, Dev Jain wrote:
>> Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
>> so as to elide TLBIs on each contig block, which was previously done by
>> ptep_get_and_clear().
> 
> No mention of large folios
> 
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/mremap.c | 24 +++++++++++++++++++-----
>>   1 file changed, 19 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/mremap.c b/mm/mremap.c
>> index 1a08a7c3b92f..3621c07d8eea 100644
>> --- a/mm/mremap.c
>> +++ b/mm/mremap.c
>> @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   	struct vm_area_struct *vma = pmc->old;
>>   	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
>>   	struct mm_struct *mm = vma->vm_mm;
>> -	pte_t *old_ptep, *new_ptep, pte;
>> +	pte_t *old_ptep, *new_ptep, old_pte, pte;
> 
> Obviously given previous comment you know what I'm going to say here :) let's
> put old_pte, pte in a new decl.
> 
>>   	pmd_t dummy_pmdval;
>>   	spinlock_t *old_ptl, *new_ptl;
>>   	bool force_flush = false;
>> @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   	unsigned long old_end = old_addr + extent;
>>   	unsigned long len = old_end - old_addr;
>>   	int err = 0;
>> +	int nr;
>>
>>   	/*
>>   	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
>> @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>
>>   	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
>>   				   new_ptep++, new_addr += PAGE_SIZE) {
> 
> Hm this just seems wrong, even if we're dealing with a large folio we're still
> offsetting by PAGE_SIZE each time and iterating through further sub-pages?
> 
> Shouldn't we be doing something like += nr and += PAGE_SIZE * nr?

This is embarrassing *facepalm*. The crazy part is that I didn't even
notice this, because I still saw an optimization from
get_and_clear_full_ptes(): the number of TLB flushes was reduced, and the
loop just continued due to pte_none().

> 
> Then it'd make sense to initialise nr to 1.
> 
> Honestly I'd prefer us though to refactor move_ptes() to something like:
> 
> 	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
> 				   new_ptep++, new_addr += PAGE_SIZE) {
> 		pte_t old_pte = ptep_get(old_ptep);
> 
> 		if (pte_none(old_pte))
> 			continue;
> 
> 		move_pte(pmc, vma, old_ptep, old_pte);
> 	}
> 
> Declaring this new move_pte() where you can put the rest of the stuff.
> 
> I'd much rather we do this than add to the mess as-is.
> 
> 
> 
>> -		if (pte_none(ptep_get(old_ptep)))
>> +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
>> +
>> +		nr = 1;
>> +		old_pte = ptep_get(old_ptep);
> 
> You can declare this in the for loop, no need for us to contaminate whole
> function scope with it.
> 
> Same with 'nr' in this implementation (though I'd rather you changed it up, see
> above).
> 
>> +		if (pte_none(old_pte))
>>   			continue;
>>
>> -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
>>   		/*
>>   		 * If we are remapping a valid PTE, make sure
>>   		 * to flush TLB before we drop the PTL for the
>> @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   		 * the TLB entry for the old mapping has been
>>   		 * flushed.
>>   		 */
>> -		if (pte_present(pte))
>> +		if (pte_present(old_pte)) {
>> +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
>> +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
>> +
>> +				if (folio && folio_test_large(folio))
>> +					nr = folio_pte_batch(folio, old_addr, old_ptep,
>> +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
> 
> Indentation seems completely broken here? I also hate that we're nesting to this
> degree? Can we please find a way not to?
> 
> This function is already a bit of a clogged mess, I'd rather we clean up then
> add to it.
> 
> (See above again :)
> 
> 
>> +			}
>>   			force_flush = true;
>> +		}
>> +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
>>   		pte = move_pte(pte, old_addr, new_addr);
>>   		pte = move_soft_dirty_pte(pte);
>>
>> @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   				else if (is_swap_pte(pte))
>>   					pte = pte_swp_clear_uffd_wp(pte);
>>   			}
>> -			set_pte_at(mm, new_addr, new_ptep, pte);
>> +			set_ptes(mm, new_addr, new_ptep, pte, nr);
>>   		}
>>   	}
>>
>> --
>> 2.30.2
>>
> 
> Cheers, Lorenzo
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Lorenzo Stoakes 7 months, 2 weeks ago
On Tue, May 06, 2025 at 07:40:49PM +0530, Dev Jain wrote:
>
>
> On 06/05/25 7:19 pm, Lorenzo Stoakes wrote:
> > On Tue, May 06, 2025 at 10:30:56AM +0530, Dev Jain wrote:
> > > Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
> > > so as to elide TLBIs on each contig block, which was previously done by
> > > ptep_get_and_clear().
> >
> > No mention of large folios
> >
> > >
> > > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > > ---
> > >   mm/mremap.c | 24 +++++++++++++++++++-----
> > >   1 file changed, 19 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/mm/mremap.c b/mm/mremap.c
> > > index 1a08a7c3b92f..3621c07d8eea 100644
> > > --- a/mm/mremap.c
> > > +++ b/mm/mremap.c
> > > @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> > >   	struct vm_area_struct *vma = pmc->old;
> > >   	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
> > >   	struct mm_struct *mm = vma->vm_mm;
> > > -	pte_t *old_ptep, *new_ptep, pte;
> > > +	pte_t *old_ptep, *new_ptep, old_pte, pte;
> >
> > Obviously given previous comment you know what I'm going to say here :) let's
> > put old_pte, pte in a new decl.
> >
> > >   	pmd_t dummy_pmdval;
> > >   	spinlock_t *old_ptl, *new_ptl;
> > >   	bool force_flush = false;
> > > @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> > >   	unsigned long old_end = old_addr + extent;
> > >   	unsigned long len = old_end - old_addr;
> > >   	int err = 0;
> > > +	int nr;
> > >
> > >   	/*
> > >   	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
> > > @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
> > >
> > >   	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
> > >   				   new_ptep++, new_addr += PAGE_SIZE) {
> >
> > Hm this just seems wrong, even if we're dealing with a large folio we're still
> > offsetting by PAGE_SIZE each time and iterating through further sub-pages?
> >
> > Shouldn't we be doing something like += nr and += PAGE_SIZE * nr?
>
> This is embarrassing *facepalm*. The crazy part is that I didn't even
> notice this, because I still saw an optimization from
> get_and_clear_full_ptes(): the number of TLB flushes was reduced, and the
> loop just continued due to pte_none().

Haha don't worry, we've all missed things like this in the past (I know I
have...), this is what code review is for :>)

>
> >
> > Then it'd make sense to initialise nr to 1.
> >
> > Honestly I'd prefer us though to refactor move_ptes() to something like:
> >
> > 	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
> > 				   new_ptep++, new_addr += PAGE_SIZE) {
> > 		pte_t old_pte = ptep_get(old_ptep);
> >
> > 		if (pte_none(old_pte))
> > 			continue;
> >
> > 		move_pte(pmc, vma, old_ptep, old_pte);
> > 	}
> >
> > Declaring this new move_pte() where you can put the rest of the stuff.
> >
> > I'd much rather we do this than add to the mess as-is.
> >
> >
> >
> > > -		if (pte_none(ptep_get(old_ptep)))
> > > +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> > > +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
> > > +
> > > +		nr = 1;
> > > +		old_pte = ptep_get(old_ptep);
> >
> > You can declare this in the for loop, no need for us to contaminate whole
> > function scope with it.
> >
> > Same with 'nr' in this implementation (though I'd rather you changed it up, see
> > above).
> >
> > > +		if (pte_none(old_pte))
> > >   			continue;
> > >
> > > -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
> > >   		/*
> > >   		 * If we are remapping a valid PTE, make sure
> > >   		 * to flush TLB before we drop the PTL for the
> > > @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
> > >   		 * the TLB entry for the old mapping has been
> > >   		 * flushed.
> > >   		 */
> > > -		if (pte_present(pte))
> > > +		if (pte_present(old_pte)) {
> > > +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
> > > +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
> > > +
> > > +				if (folio && folio_test_large(folio))
> > > +					nr = folio_pte_batch(folio, old_addr, old_ptep,
> > > +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
> >
> > Indentation seems completely broken here? I also hate that we're nesting to this
> > degree? Can we please find a way not to?
> >
> > This function is already a bit of a clogged mess, I'd rather we clean up then
> > add to it.
> >
> > (See above again :)
> >
> >
> > > +			}
> > >   			force_flush = true;
> > > +		}
> > > +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
> > >   		pte = move_pte(pte, old_addr, new_addr);
> > >   		pte = move_soft_dirty_pte(pte);
> > >
> > > @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> > >   				else if (is_swap_pte(pte))
> > >   					pte = pte_swp_clear_uffd_wp(pte);
> > >   			}
> > > -			set_pte_at(mm, new_addr, new_ptep, pte);
> > > +			set_ptes(mm, new_addr, new_ptep, pte, nr);
> > >   		}
> > >   	}
> > >
> > > --
> > > 2.30.2
> > >
> >
> > Cheers, Lorenzo
>
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Lorenzo Stoakes 7 months, 2 weeks ago
On Tue, May 06, 2025 at 02:49:04PM +0100, Lorenzo Stoakes wrote:
> On Tue, May 06, 2025 at 10:30:56AM +0530, Dev Jain wrote:
> > Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
> > so as to elide TLBIs on each contig block, which was previously done by
> > ptep_get_and_clear().
>
> No mention of large folios

Sorry, I didn't finish my sentence here. I suggest you add more detail:
again, the 'why', what this is for, and so on.

Equally I don't think having code that seemingly randomly invokes batched
functions is a good idea; I think the relevant logic should be separated
into functions that explicitly reference large folios, or should have
comments explaining what you're doing (rough sketch below).

You can see some of how I separated out such things for my
MREMAP_RELOCATE_ANON series at [0] for instance.

[0]: https://lore.kernel.org/linux-mm/8ca7c8219a2a0e67e1c5c277d0c0d070052de401.1746305604.git.lorenzo.stoakes@oracle.com/
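
To make that concrete, the batching decision could live in its own little
helper so the large folio handling is self-documenting, something like (the
name and exact shape below are purely illustrative):

	/*
	 * How many PTEs, starting at ptep/pte, map consecutive pages of the
	 * same large folio and can therefore be moved as one batch?
	 * Returns 1 when there is nothing worth batching.
	 */
	static int mremap_folio_pte_batch(struct vm_area_struct *vma,
					  unsigned long addr, pte_t *ptep,
					  pte_t pte, int max_nr)
	{
		const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
		struct folio *folio;

		if (max_nr == 1)
			return 1;

		folio = vm_normal_folio(vma, addr, pte);
		if (!folio || !folio_test_large(folio))
			return 1;

		return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags,
				       NULL, NULL, NULL);
	}

Then the caller in move_ptes() is just nr = mremap_folio_pte_batch(...) and
the nesting (and the need for a comment at the call site) largely goes away.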

Thanks!

>
> >
> > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > ---
> >  mm/mremap.c | 24 +++++++++++++++++++-----
> >  1 file changed, 19 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index 1a08a7c3b92f..3621c07d8eea 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> >  	struct vm_area_struct *vma = pmc->old;
> >  	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
> >  	struct mm_struct *mm = vma->vm_mm;
> > -	pte_t *old_ptep, *new_ptep, pte;
> > +	pte_t *old_ptep, *new_ptep, old_pte, pte;
>
> Obviously given previous comment you know what I'm going to say here :) let's
> put old_pte, pte in a new decl.
>
> >  	pmd_t dummy_pmdval;
> >  	spinlock_t *old_ptl, *new_ptl;
> >  	bool force_flush = false;
> > @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> >  	unsigned long old_end = old_addr + extent;
> >  	unsigned long len = old_end - old_addr;
> >  	int err = 0;
> > +	int nr;
> >
> >  	/*
> >  	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
> > @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
> >
> >  	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
> >  				   new_ptep++, new_addr += PAGE_SIZE) {
>
> Hm this just seems wrong, even if we're dealing with a large folio we're still
> offsetting by PAGE_SIZE each time and iterating through further sub-pages?
>
> Shouldn't we be doing something like += nr and += PAGE_SIZE * nr?
>
> Then it'd make sense to initialise nr to 1.
>
> Honestly I'd prefer us though to refactor move_ptes() to something like:
>
> 	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
> 				   new_ptep++, new_addr += PAGE_SIZE) {
> 		pte_t old_pte = ptep_get(old_ptep);
>
> 		if (pte_none(old_pte))
> 			continue;
>
> 		move_pte(pmc, vma, old_ptep, old_pte);
> 	}
>
> Declaring this new move_pte() where you can put the rest of the stuff.
>
> I'd much rather we do this than add to the mess as-is.
>
>
>
> > -		if (pte_none(ptep_get(old_ptep)))
> > +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> > +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
> > +
> > +		nr = 1;
> > +		old_pte = ptep_get(old_ptep);
>
> You can declare this in the for loop, no need for us to contaminate whole
> function scope with it.
>
> Same with 'nr' in this implementation (though I'd rather you changed it up, see
> above).
>
> > +		if (pte_none(old_pte))
> >  			continue;
> >
> > -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
> >  		/*
> >  		 * If we are remapping a valid PTE, make sure
> >  		 * to flush TLB before we drop the PTL for the
> > @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
> >  		 * the TLB entry for the old mapping has been
> >  		 * flushed.
> >  		 */
> > -		if (pte_present(pte))
> > +		if (pte_present(old_pte)) {
> > +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
> > +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
> > +
> > +				if (folio && folio_test_large(folio))
> > +					nr = folio_pte_batch(folio, old_addr, old_ptep,
> > +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
>
> Indentation seems completely broken here? I also hate that we're nesting to this
> degree? Can we please find a way not to?
>
> This function is already a bit of a clogged mess, I'd rather we clean up then
> add to it.
>
> (See above again :)
>
>
> > +			}
> >  			force_flush = true;
> > +		}
> > +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
> >  		pte = move_pte(pte, old_addr, new_addr);
> >  		pte = move_soft_dirty_pte(pte);
> >
> > @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
> >  				else if (is_swap_pte(pte))
> >  					pte = pte_swp_clear_uffd_wp(pte);
> >  			}
> > -			set_pte_at(mm, new_addr, new_ptep, pte);
> > +			set_ptes(mm, new_addr, new_ptep, pte, nr);
> >  		}
> >  	}
> >
> > --
> > 2.30.2
> >
>
> Cheers, Lorenzo
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Anshuman Khandual 7 months, 2 weeks ago
On 5/6/25 10:30, Dev Jain wrote:
> Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
> so as to elide TLBIs on each contig block, which was previously done by
> ptep_get_and_clear().
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/mremap.c | 24 +++++++++++++++++++-----
>  1 file changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 1a08a7c3b92f..3621c07d8eea 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  	struct vm_area_struct *vma = pmc->old;
>  	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
>  	struct mm_struct *mm = vma->vm_mm;
> -	pte_t *old_ptep, *new_ptep, pte;
> +	pte_t *old_ptep, *new_ptep, old_pte, pte;
>  	pmd_t dummy_pmdval;
>  	spinlock_t *old_ptl, *new_ptl;
>  	bool force_flush = false;
> @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  	unsigned long old_end = old_addr + extent;
>  	unsigned long len = old_end - old_addr;
>  	int err = 0;
> +	int nr;
>  
>  	/*
>  	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
> @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  
>  	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
>  				   new_ptep++, new_addr += PAGE_SIZE) {
> -		if (pte_none(ptep_get(old_ptep)))
> +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
> +
> +		nr = 1;
> +		old_pte = ptep_get(old_ptep);
> +		if (pte_none(old_pte))
>  			continue;
>  
> -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
>  		/*
>  		 * If we are remapping a valid PTE, make sure
>  		 * to flush TLB before we drop the PTL for the
> @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  		 * the TLB entry for the old mapping has been
>  		 * flushed.
>  		 */
> -		if (pte_present(pte))
> +		if (pte_present(old_pte)) {
> +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {

The maybe_contiguous_pte_pfns() cost will apply to any memory area larger
than a single PAGE_SIZE (i.e. max_nr != 1)? This helper fetches one more
consecutive pte, ensures that it is a valid mapping, and extracts its pfn
before comparing for the span.

There is some cost associated with the above code sequence, which looks
justified for sequential access of memory buffers that have consecutive
physical memory backing. But what happens when such buffers are less
probable? Will those buffers take a performance hit for all the comparisons
that just turn out to be negative?
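
For reference, that check amounts to roughly the below per present PTE, i.e.
one extra ptep_get() plus a pfn comparison (sketch only, going by the
description above; the real definition is in patch 1/3, not quoted here):

	static inline bool maybe_contiguous_pte_pfns(pte_t *ptep, pte_t pte)
	{
		/* Caller guarantees ptep + 1 is still within this PTE table. */
		pte_t next_pte = ptep_get(ptep + 1);

		/* Only worth batching if the next entry maps the next pfn. */
		return pte_present(next_pte) &&
		       pte_pfn(next_pte) == pte_pfn(pte) + 1;
	}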

> +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
> +
> +				if (folio && folio_test_large(folio))
> +					nr = folio_pte_batch(folio, old_addr, old_ptep,
> +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
> +			}
>  			force_flush = true;
> +		}
> +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
>  		pte = move_pte(pte, old_addr, new_addr);
>  		pte = move_soft_dirty_pte(pte);
>  
> @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  				else if (is_swap_pte(pte))
>  					pte = pte_swp_clear_uffd_wp(pte);
>  			}
> -			set_pte_at(mm, new_addr, new_ptep, pte);
> +			set_ptes(mm, new_addr, new_ptep, pte, nr);
>  		}
>  	}
>
Re: [PATCH 3/3] mm: Optimize mremap() by PTE batching
Posted by Dev Jain 7 months, 2 weeks ago

On 06/05/25 3:40 pm, Anshuman Khandual wrote:
> On 5/6/25 10:30, Dev Jain wrote:
>> Use folio_pte_batch() to optimize move_ptes(). Use get_and_clear_full_ptes()
>> so as to elide TLBIs on each contig block, which was previously done by
>> ptep_get_and_clear().
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/mremap.c | 24 +++++++++++++++++++-----
>>   1 file changed, 19 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/mremap.c b/mm/mremap.c
>> index 1a08a7c3b92f..3621c07d8eea 100644
>> --- a/mm/mremap.c
>> +++ b/mm/mremap.c
>> @@ -176,7 +176,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   	struct vm_area_struct *vma = pmc->old;
>>   	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
>>   	struct mm_struct *mm = vma->vm_mm;
>> -	pte_t *old_ptep, *new_ptep, pte;
>> +	pte_t *old_ptep, *new_ptep, old_pte, pte;
>>   	pmd_t dummy_pmdval;
>>   	spinlock_t *old_ptl, *new_ptl;
>>   	bool force_flush = false;
>> @@ -185,6 +185,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   	unsigned long old_end = old_addr + extent;
>>   	unsigned long len = old_end - old_addr;
>>   	int err = 0;
>> +	int nr;
>>   
>>   	/*
>>   	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
>> @@ -237,10 +238,14 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   
>>   	for (; old_addr < old_end; old_ptep++, old_addr += PAGE_SIZE,
>>   				   new_ptep++, new_addr += PAGE_SIZE) {
>> -		if (pte_none(ptep_get(old_ptep)))
>> +		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
>> +
>> +		nr = 1;
>> +		old_pte = ptep_get(old_ptep);
>> +		if (pte_none(old_pte))
>>   			continue;
>>   
>> -		pte = ptep_get_and_clear(mm, old_addr, old_ptep);
>>   		/*
>>   		 * If we are remapping a valid PTE, make sure
>>   		 * to flush TLB before we drop the PTL for the
>> @@ -252,8 +257,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   		 * the TLB entry for the old mapping has been
>>   		 * flushed.
>>   		 */
>> -		if (pte_present(pte))
>> +		if (pte_present(old_pte)) {
>> +			if ((max_nr != 1) && maybe_contiguous_pte_pfns(old_ptep, old_pte)) {
> 
> The maybe_contiguous_pte_pfns() cost will apply to any memory area larger
> than a single PAGE_SIZE (i.e. max_nr != 1)? This helper fetches one more
> consecutive pte, ensures that it is a valid mapping, and extracts its pfn
> before comparing for the span.
> 
> There is some cost associated with the above code sequence, which looks
> justified for sequential access of memory buffers that have consecutive
> physical memory backing.

I did not see any regression for the simple case of mremapping base pages.

> But what happens when such buffers are less probable? Will those buffers
> take a performance hit for all the comparisons that just turn out to be
> negative?

When would that be the case? We are remapping consecutive ptes to 
consecutive ptes.

> 
>> +				struct folio *folio = vm_normal_folio(vma, old_addr, old_pte);
>> +
>> +				if (folio && folio_test_large(folio))
>> +					nr = folio_pte_batch(folio, old_addr, old_ptep,
>> +					old_pte, max_nr, fpb_flags, NULL, NULL, NULL);
>> +			}
>>   			force_flush = true;
>> +		}
>> +		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr, 0);
>>   		pte = move_pte(pte, old_addr, new_addr);
>>   		pte = move_soft_dirty_pte(pte);
>>   
>> @@ -266,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>>   				else if (is_swap_pte(pte))
>>   					pte = pte_swp_clear_uffd_wp(pte);
>>   			}
>> -			set_pte_at(mm, new_addr, new_ptep, pte);
>> +			set_ptes(mm, new_addr, new_ptep, pte, nr);
>>   		}
>>   	}
>>