[v4] Optimize mprotect() for large folios

[PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

In case of prot_numa, there are various cases in which we can skip to the
next iteration. Since the skip condition is based on the folio and not
the PTEs, we can skip a PTE batch. Additionally refactor all of this
into a new function to clean up the existing code.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 87 insertions(+), 47 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88709c01177b..af10a7fbe6b8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 	return pte_dirty(pte);
 }
 
+static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
+		pte_t *ptep, pte_t pte, int max_nr_ptes)
+{
+	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+
+	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
+		return 1;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
+			       NULL, NULL, NULL);
+}
+
+static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
+		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
+		int max_nr_ptes)
+{
+	struct folio *folio = NULL;
+	int nr_ptes = 1;
+	bool toptier;
+	int nid;
+
+	/* Avoid TLB flush if possible */
+	if (pte_protnone(oldpte))
+		goto skip_batch;
+
+	folio = vm_normal_folio(vma, addr, oldpte);
+	if (!folio)
+		goto skip_batch;
+
+	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+		goto skip_batch;
+
+	/* Also skip shared copy-on-write pages */
+	if (is_cow_mapping(vma->vm_flags) &&
+	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+		goto skip_batch;
+
+	/*
+	 * While migration can move some dirty pages,
+	 * it cannot move them all from MIGRATE_ASYNC
+	 * context.
+	 */
+	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+		goto skip_batch;
+
+	/*
+	 * Don't mess with PTEs if page is already on the node
+	 * a single-threaded process is running on.
+	 */
+	nid = folio_nid(folio);
+	if (target_node == nid)
+		goto skip_batch;
+
+	toptier = node_is_toptier(nid);
+
+	/*
+	 * Skip scanning top tier node if normal numa
+	 * balancing is disabled
+	 */
+	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+		goto skip_batch;
+
+	if (folio_use_access_time(folio)) {
+		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+		/* Do not skip in this case */
+		nr_ptes = 0;
+		goto out;
+	}
+
+skip_batch:
+	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
+out:
+	*foliop = folio;
+	return nr_ptes;
+}
+
 static long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+	int nr_ptes;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
 	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
+		nr_ptes = 1;
 		oldpte = ptep_get(pte);
 		if (pte_present(oldpte)) {
+			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+			struct folio *folio = NULL;
 			pte_t ptent;
 
 			/*
@@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * pages. See similar comment in change_huge_pmd.
 			 */
 			if (prot_numa) {
-				struct folio *folio;
-				int nid;
-				bool toptier;
-
-				/* Avoid TLB flush if possible */
-				if (pte_protnone(oldpte))
-					continue;
-
-				folio = vm_normal_folio(vma, addr, oldpte);
-				if (!folio || folio_is_zone_device(folio) ||
-				    folio_test_ksm(folio))
-					continue;
-
-				/* Also skip shared copy-on-write pages */
-				if (is_cow_mapping(vma->vm_flags) &&
-				    (folio_maybe_dma_pinned(folio) ||
-				     folio_maybe_mapped_shared(folio)))
-					continue;
-
-				/*
-				 * While migration can move some dirty pages,
-				 * it cannot move them all from MIGRATE_ASYNC
-				 * context.
-				 */
-				if (folio_is_file_lru(folio) &&
-				    folio_test_dirty(folio))
-					continue;
-
-				/*
-				 * Don't mess with PTEs if page is already on the node
-				 * a single-threaded process is running on.
-				 */
-				nid = folio_nid(folio);
-				if (target_node == nid)
-					continue;
-				toptier = node_is_toptier(nid);
-
-				/*
-				 * Skip scanning top tier node if normal numa
-				 * balancing is disabled
-				 */
-				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
-				    toptier)
+				nr_ptes = prot_numa_skip_ptes(&folio, vma,
+							      addr, oldpte, pte,
+							      target_node,
+							      max_nr_ptes);
+				if (nr_ptes)
 					continue;
-				if (folio_use_access_time(folio))
-					folio_xchg_access_time(folio,
-						jiffies_to_msecs(jiffies));
 			}
 
 			oldpte = ptep_modify_prot_start(vma, addr, pte);
@@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				pages++;
 			}
 		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-- 
2.30.2

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Lorenzo Stoakes 3 months, 1 week ago

On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
> In case of prot_numa, there are various cases in which we can skip to the
> next iteration. Since the skip condition is based on the folio and not
> the PTEs, we can skip a PTE batch. Additionally refactor all of this
> into a new function to clean up the existing code.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>  1 file changed, 87 insertions(+), 47 deletions(-)
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 88709c01177b..af10a7fbe6b8 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>  	return pte_dirty(pte);
>  }
>
> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
> +{
> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +
> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
> +		return 1;
> +
> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
> +			       NULL, NULL, NULL);
> +}
> +
> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
> +		int max_nr_ptes)
> +{

While it's nice to separate this out, it's not so nice to pass folio as a
pointer to a pointer like this and maybe or maybe not set it.

Just get the folio before you call this... you'll need it either way.

I'll wait until you separate it all out before reviewing next revision as a bit
tricky as-is.

Thanks!

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

On 02/07/25 3:07 pm, Lorenzo Stoakes wrote:
> On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
>> In case of prot_numa, there are various cases in which we can skip to the
>> next iteration. Since the skip condition is based on the folio and not
>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>> into a new function to clean up the existing code.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>   1 file changed, 87 insertions(+), 47 deletions(-)
>>
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index 88709c01177b..af10a7fbe6b8 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>>   	return pte_dirty(pte);
>>   }
>>
>> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
>> +{
>> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +
>> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>> +		return 1;
>> +
>> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>> +			       NULL, NULL, NULL);
>> +}
>> +
>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
>> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>> +		int max_nr_ptes)
>> +{
> While it's nice to separate this out, it's not so nice to pass folio as a
> pointer to a pointer like this and maybe or maybe not set it.
>
> Just get the folio before you call this... you'll need it either way.

I did that on David's suggestion:

https://lore.kernel.org/all/8c389ee5-f7a4-44f6-a0d6-cc01c3da4d91@redhat.com/

We were trying to reuse the folio if available from prot_numa_skip_ptes,
to avoid using vm_normal_folio() again. Not sure if avoiding vm_normal_folio
is worth the complexity.

>
> I'll wait until you separate it all out before reviewing next revision as a bit
> tricky as-is.
>
> Thanks!

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Lorenzo Stoakes 3 months, 1 week ago

On Wed, Jul 02, 2025 at 08:31:33PM +0530, Dev Jain wrote:
>
> On 02/07/25 3:07 pm, Lorenzo Stoakes wrote:
> > On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
> > > In case of prot_numa, there are various cases in which we can skip to the
> > > next iteration. Since the skip condition is based on the folio and not
> > > the PTEs, we can skip a PTE batch. Additionally refactor all of this
> > > into a new function to clean up the existing code.
> > >
> > > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > > ---
> > >   mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
> > >   1 file changed, 87 insertions(+), 47 deletions(-)
> > >
> > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > index 88709c01177b..af10a7fbe6b8 100644
> > > --- a/mm/mprotect.c
> > > +++ b/mm/mprotect.c
> > > @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> > >   	return pte_dirty(pte);
> > >   }
> > >
> > > +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
> > > +		pte_t *ptep, pte_t pte, int max_nr_ptes)
> > > +{
> > > +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> > > +
> > > +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
> > > +		return 1;
> > > +
> > > +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
> > > +			       NULL, NULL, NULL);
> > > +}
> > > +
> > > +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
> > > +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
> > > +		int max_nr_ptes)
> > > +{
> > While it's nice to separate this out, it's not so nice to pass folio as a
> > pointer to a pointer like this and maybe or maybe not set it.
> >
> > Just get the folio before you call this... you'll need it either way.
>
> I did that on David's suggestion:
>
> https://lore.kernel.org/all/8c389ee5-f7a4-44f6-a0d6-cc01c3da4d91@redhat.com/
>
> We were trying to reuse the folio if available from prot_numa_skip_ptes,
> to avoid using vm_normal_folio() again. Not sure if avoiding vm_normal_folio
> is worth the complexity.

Well, do you need to? You're doing vm_normal_folio() in both cases, why not just
put the vm_normal_folio() lookup in change_pte_range() before invoking this
function, then reuse the folio in the loop?

Oh right, I guess David was concerned about not needing to look it up in the
pte_protnone(oldpte) case?

I'm not sure that's worth it honestly. If we do _really_ want to do this, then
at least put the param last

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Lorenzo Stoakes 3 months, 1 week ago

On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
> In case of prot_numa, there are various cases in which we can skip to the
> next iteration. Since the skip condition is based on the folio and not
> the PTEs, we can skip a PTE batch. Additionally refactor all of this
> into a new function to clean up the existing code.

Hmm, is this a completely new concept for this series?

Please try not to introduce brand new things to a series midway through.

This seems to be adding a whole ton of questionable logic for an edge case.

Can we maybe just drop this for this series please?

>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>  1 file changed, 87 insertions(+), 47 deletions(-)
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 88709c01177b..af10a7fbe6b8 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>  	return pte_dirty(pte);
>  }
>
> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
> +{
> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +
> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
> +		return 1;
> +
> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
> +			       NULL, NULL, NULL);
> +}

I find it really odd that you're introducing this in a seemingly unrelated change.

Also won't this conflict with David's changes?

I know you like to rush out a dozen series at once, but once again I'm asking
maybe please hold off?

I seem to remember David asked you for the same thing because of this, but maybe
I'm misremembering.

We have only so much review resource and adding in brand new concepts mid-way
and doing things that blatantly conflict with other series really doesn't help.

> +
> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
> +		int max_nr_ptes)
> +{
> +	struct folio *folio = NULL;
> +	int nr_ptes = 1;
> +	bool toptier;
> +	int nid;
> +
> +	/* Avoid TLB flush if possible */
> +	if (pte_protnone(oldpte))
> +		goto skip_batch;
> +
> +	folio = vm_normal_folio(vma, addr, oldpte);
> +	if (!folio)
> +		goto skip_batch;
> +
> +	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
> +		goto skip_batch;
> +
> +	/* Also skip shared copy-on-write pages */
> +	if (is_cow_mapping(vma->vm_flags) &&
> +	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
> +		goto skip_batch;
> +
> +	/*
> +	 * While migration can move some dirty pages,
> +	 * it cannot move them all from MIGRATE_ASYNC
> +	 * context.
> +	 */
> +	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
> +		goto skip_batch;
> +
> +	/*
> +	 * Don't mess with PTEs if page is already on the node
> +	 * a single-threaded process is running on.
> +	 */
> +	nid = folio_nid(folio);
> +	if (target_node == nid)
> +		goto skip_batch;
> +
> +	toptier = node_is_toptier(nid);
> +
> +	/*
> +	 * Skip scanning top tier node if normal numa
> +	 * balancing is disabled
> +	 */
> +	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
> +		goto skip_batch;
> +
> +	if (folio_use_access_time(folio)) {
> +		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
> +
> +		/* Do not skip in this case */
> +		nr_ptes = 0;
> +		goto out;
> +	}
> +
> +skip_batch:
> +	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
> +out:
> +	*foliop = folio;
> +	return nr_ptes;
> +}

Yeah yuck. I don't like that we're doing all this for this edge case.

> +
>  static long change_pte_range(struct mmu_gather *tlb,
>  		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>  		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>  	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>  	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
> +	int nr_ptes;
>
>  	tlb_change_page_size(tlb, PAGE_SIZE);
>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>  	flush_tlb_batched_pending(vma->vm_mm);
>  	arch_enter_lazy_mmu_mode();
>  	do {
> +		nr_ptes = 1;
>  		oldpte = ptep_get(pte);
>  		if (pte_present(oldpte)) {
> +			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
> +			struct folio *folio = NULL;
>  			pte_t ptent;
>
>  			/*
> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>  			 * pages. See similar comment in change_huge_pmd.
>  			 */
>  			if (prot_numa) {
> -				struct folio *folio;
> -				int nid;
> -				bool toptier;
> -
> -				/* Avoid TLB flush if possible */
> -				if (pte_protnone(oldpte))
> -					continue;
> -
> -				folio = vm_normal_folio(vma, addr, oldpte);
> -				if (!folio || folio_is_zone_device(folio) ||
> -				    folio_test_ksm(folio))
> -					continue;
> -
> -				/* Also skip shared copy-on-write pages */
> -				if (is_cow_mapping(vma->vm_flags) &&
> -				    (folio_maybe_dma_pinned(folio) ||
> -				     folio_maybe_mapped_shared(folio)))
> -					continue;
> -
> -				/*
> -				 * While migration can move some dirty pages,
> -				 * it cannot move them all from MIGRATE_ASYNC
> -				 * context.
> -				 */
> -				if (folio_is_file_lru(folio) &&
> -				    folio_test_dirty(folio))
> -					continue;
> -
> -				/*
> -				 * Don't mess with PTEs if page is already on the node
> -				 * a single-threaded process is running on.
> -				 */
> -				nid = folio_nid(folio);
> -				if (target_node == nid)
> -					continue;
> -				toptier = node_is_toptier(nid);
> -
> -				/*
> -				 * Skip scanning top tier node if normal numa
> -				 * balancing is disabled
> -				 */
> -				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
> -				    toptier)
> +				nr_ptes = prot_numa_skip_ptes(&folio, vma,
> +							      addr, oldpte, pte,
> +							      target_node,
> +							      max_nr_ptes);
> +				if (nr_ptes)

I'm not really a fan of this being added (unless I'm missing something here) but
_generally_ it's better to separate out a move and a change if you can.

>  					continue;
> -				if (folio_use_access_time(folio))
> -					folio_xchg_access_time(folio,
> -						jiffies_to_msecs(jiffies));
>  			}
>
>  			oldpte = ptep_modify_prot_start(vma, addr, pte);
> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  				pages++;
>  			}
>  		}
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(pte - 1, ptl);
>
> --
> 2.30.2
>

Anyway will hold off on reviewing the actual changes here until we can figure
out whether this is event appropriate here.

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

On 30/06/25 4:55 pm, Lorenzo Stoakes wrote:
> On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
>> In case of prot_numa, there are various cases in which we can skip to the
>> next iteration. Since the skip condition is based on the folio and not
>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>> into a new function to clean up the existing code.
> Hmm, is this a completely new concept for this series?
>
> Please try not to introduce brand new things to a series midway through.
>
> This seems to be adding a whole ton of questionable logic for an edge case.
>
> Can we maybe just drop this for this series please?

I refactored this into a new function on David's suggestion:

https://lore.kernel.org/all/912757c0-8a75-4307-a0bd-8755f6135b5a@redhat.com/

Maybe you are saying, having a refactoring patch first and then the "skip a
PTE batch" second, I'll be happy to do that, that would be cleaner.

>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>   1 file changed, 87 insertions(+), 47 deletions(-)
>>
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index 88709c01177b..af10a7fbe6b8 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>>   	return pte_dirty(pte);
>>   }
>>
>> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
>> +{
>> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +
>> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>> +		return 1;
>> +
>> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>> +			       NULL, NULL, NULL);
>> +}
> I find it really odd that you're introducing this in a seemingly unrelated change.
>
> Also won't this conflict with David's changes?

This series was (ofcourse, IMHO) pretty stable at v3, and there were comments
coming on David's series, so I guessed that he will have to post a v2 anyways
after mine gets merged. My guess could have been wrong. For the khugepaged
batching series, I have sent the migration race patch separately exactly
because of his series, so that in that case the rebasing burden is mine.

>
> I know you like to rush out a dozen series at once, but once again I'm asking
> maybe please hold off?

Lorenzo : ) Except for the mremap series which you pointed out, I make it a point
to never repost in the same week, unless it is an obvious single patch, and even
in that case I give 2-3 days for the reviews to settle. I posted
v3 of this series more than a month ago, so it makes total sense to post this.
Also I have seen many people spamming the list with the next versions on literally
the same day, not that I am using this as a precedent. The mistake I made here
is that on Saturday I saw David's series but then forgot that I am using the
same infrastructure he is changing and went ahead posting this. I suddenly
remembered this during the khugepaged series and dropped the first two patches
for that.
  

>
> I seem to remember David asked you for the same thing because of this, but maybe
> I'm misremembering.

I don't recollect that happening, maybe I am wrong.

>
> We have only so much review resource and adding in brand new concepts mid-way
> and doing things that blatantly conflict with other series really doesn't help.
>
>> +
>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
>> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>> +		int max_nr_ptes)
>> +{
>> +	struct folio *folio = NULL;
>> +	int nr_ptes = 1;
>> +	bool toptier;
>> +	int nid;
>> +
>> +	/* Avoid TLB flush if possible */
>> +	if (pte_protnone(oldpte))
>> +		goto skip_batch;
>> +
>> +	folio = vm_normal_folio(vma, addr, oldpte);
>> +	if (!folio)
>> +		goto skip_batch;
>> +
>> +	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
>> +		goto skip_batch;
>> +
>> +	/* Also skip shared copy-on-write pages */
>> +	if (is_cow_mapping(vma->vm_flags) &&
>> +	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * While migration can move some dirty pages,
>> +	 * it cannot move them all from MIGRATE_ASYNC
>> +	 * context.
>> +	 */
>> +	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * Don't mess with PTEs if page is already on the node
>> +	 * a single-threaded process is running on.
>> +	 */
>> +	nid = folio_nid(folio);
>> +	if (target_node == nid)
>> +		goto skip_batch;
>> +
>> +	toptier = node_is_toptier(nid);
>> +
>> +	/*
>> +	 * Skip scanning top tier node if normal numa
>> +	 * balancing is disabled
>> +	 */
>> +	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
>> +		goto skip_batch;
>> +
>> +	if (folio_use_access_time(folio)) {
>> +		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>> +
>> +		/* Do not skip in this case */
>> +		nr_ptes = 0;
>> +		goto out;
>> +	}
>> +
>> +skip_batch:
>> +	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
>> +out:
>> +	*foliop = folio;
>> +	return nr_ptes;
>> +}
> Yeah yuck. I don't like that we're doing all this for this edge case.
>
>> +
>>   static long change_pte_range(struct mmu_gather *tlb,
>>   		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>>   		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>>   	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>>   	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>> +	int nr_ptes;
>>
>>   	tlb_change_page_size(tlb, PAGE_SIZE);
>>   	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   	flush_tlb_batched_pending(vma->vm_mm);
>>   	arch_enter_lazy_mmu_mode();
>>   	do {
>> +		nr_ptes = 1;
>>   		oldpte = ptep_get(pte);
>>   		if (pte_present(oldpte)) {
>> +			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
>> +			struct folio *folio = NULL;
>>   			pte_t ptent;
>>
>>   			/*
>> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   			 * pages. See similar comment in change_huge_pmd.
>>   			 */
>>   			if (prot_numa) {
>> -				struct folio *folio;
>> -				int nid;
>> -				bool toptier;
>> -
>> -				/* Avoid TLB flush if possible */
>> -				if (pte_protnone(oldpte))
>> -					continue;
>> -
>> -				folio = vm_normal_folio(vma, addr, oldpte);
>> -				if (!folio || folio_is_zone_device(folio) ||
>> -				    folio_test_ksm(folio))
>> -					continue;
>> -
>> -				/* Also skip shared copy-on-write pages */
>> -				if (is_cow_mapping(vma->vm_flags) &&
>> -				    (folio_maybe_dma_pinned(folio) ||
>> -				     folio_maybe_mapped_shared(folio)))
>> -					continue;
>> -
>> -				/*
>> -				 * While migration can move some dirty pages,
>> -				 * it cannot move them all from MIGRATE_ASYNC
>> -				 * context.
>> -				 */
>> -				if (folio_is_file_lru(folio) &&
>> -				    folio_test_dirty(folio))
>> -					continue;
>> -
>> -				/*
>> -				 * Don't mess with PTEs if page is already on the node
>> -				 * a single-threaded process is running on.
>> -				 */
>> -				nid = folio_nid(folio);
>> -				if (target_node == nid)
>> -					continue;
>> -				toptier = node_is_toptier(nid);
>> -
>> -				/*
>> -				 * Skip scanning top tier node if normal numa
>> -				 * balancing is disabled
>> -				 */
>> -				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
>> -				    toptier)
>> +				nr_ptes = prot_numa_skip_ptes(&folio, vma,
>> +							      addr, oldpte, pte,
>> +							      target_node,
>> +							      max_nr_ptes);
>> +				if (nr_ptes)
> I'm not really a fan of this being added (unless I'm missing something here) but
> _generally_ it's better to separate out a move and a change if you can.

Yup I'll split this patch.

>
>>   					continue;
>> -				if (folio_use_access_time(folio))
>> -					folio_xchg_access_time(folio,
>> -						jiffies_to_msecs(jiffies));
>>   			}
>>
>>   			oldpte = ptep_modify_prot_start(vma, addr, pte);
>> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   				pages++;
>>   			}
>>   		}
>> -	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>>   	arch_leave_lazy_mmu_mode();
>>   	pte_unmap_unlock(pte - 1, ptl);
>>
>> --
>> 2.30.2
>>
> Anyway will hold off on reviewing the actual changes here until we can figure
> out whether this is event appropriate here.

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Lorenzo Stoakes 3 months, 1 week ago

On Mon, Jun 30, 2025 at 05:10:36PM +0530, Dev Jain wrote:
>
> On 30/06/25 4:55 pm, Lorenzo Stoakes wrote:
> > On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
> > > In case of prot_numa, there are various cases in which we can skip to the
> > > next iteration. Since the skip condition is based on the folio and not
> > > the PTEs, we can skip a PTE batch. Additionally refactor all of this
> > > into a new function to clean up the existing code.
> > Hmm, is this a completely new concept for this series?
> >
> > Please try not to introduce brand new things to a series midway through.
> >
> > This seems to be adding a whole ton of questionable logic for an edge case.
> >
> > Can we maybe just drop this for this series please?
>
> I refactored this into a new function on David's suggestion:
>
> https://lore.kernel.org/all/912757c0-8a75-4307-a0bd-8755f6135b5a@redhat.com/
>
> Maybe you are saying, having a refactoring patch first and then the "skip a
> PTE batch" second, I'll be happy to do that, that would be cleaner.

OK apologies then, my mistake.

So essentially you were doing this explicitly for the prot numa case and it just
wasn't clear in subject line before, ok.

Yes please separate the two out!

> This series was (ofcourse, IMHO) pretty stable at v3, and there were comments
> coming on David's series, so I guessed that he will have to post a v2 anyways
> after mine gets merged. My guess could have been wrong. For the khugepaged
> batching series, I have sent the migration race patch separately exactly
> because of his series, so that in that case the rebasing burden is mine.

This stuff can be difficult to align on, but I'd suggest that when there's
another series in the interim that is fundamentally changing a function
signature that will make your code not compile it's probably best to hold off.

And that's why I'm suggesting this here.

>
> >
> > I know you like to rush out a dozen series at once, but once again I'm asking
> > maybe please hold off?
>
> Lorenzo : ) Except for the mremap series which you pointed out, I make it a point
> to never repost in the same week, unless it is an obvious single patch, and even
> in that case I give 2-3 days for the reviews to settle. I posted
> v3 of this series more than a month ago, so it makes total sense to post this.
> Also I have seen many people spamming the list with the next versions on literally
> the same day, not that I am using this as a precedent. The mistake I made here
> is that on Saturday I saw David's series but then forgot that I am using the
> same infrastructure he is changing and went ahead posting this. I suddenly
> remembered this during the khugepaged series and dropped the first two patches
> for that.

I'm not saying you shot this out shortly after a previous version, I'm saying
you have a whole bunch of series at once (I know as I'm cc'd on them all I think
:), and I'm politely asking you to maybe not send another that causes a known
conflict.

Whether or not you do so is up to you, it was a request.

>
> >
> > I seem to remember David asked you for the same thing because of this, but maybe
> > I'm misremembering.
>
> I don't recollect that happening, maybe I am wrong.

Yeah maybe I'm misremembering, apologies if so!

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

On 30/06/25 5:21 pm, Lorenzo Stoakes wrote:
> On Mon, Jun 30, 2025 at 05:10:36PM +0530, Dev Jain wrote:
>> On 30/06/25 4:55 pm, Lorenzo Stoakes wrote:
>>> On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
>>>> In case of prot_numa, there are various cases in which we can skip to the
>>>> next iteration. Since the skip condition is based on the folio and not
>>>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>>>> into a new function to clean up the existing code.
>>> Hmm, is this a completely new concept for this series?
>>>
>>> Please try not to introduce brand new things to a series midway through.
>>>
>>> This seems to be adding a whole ton of questionable logic for an edge case.
>>>
>>> Can we maybe just drop this for this series please?
>> I refactored this into a new function on David's suggestion:
>>
>> https://lore.kernel.org/all/912757c0-8a75-4307-a0bd-8755f6135b5a@redhat.com/
>>
>> Maybe you are saying, having a refactoring patch first and then the "skip a
>> PTE batch" second, I'll be happy to do that, that would be cleaner.
> OK apologies then, my mistake.
>
> So essentially you were doing this explicitly for the prot numa case and it just
> wasn't clear in subject line before, ok.
>
> Yes please separate the two out!
>
>> This series was (ofcourse, IMHO) pretty stable at v3, and there were comments
>> coming on David's series, so I guessed that he will have to post a v2 anyways
>> after mine gets merged. My guess could have been wrong. For the khugepaged
>> batching series, I have sent the migration race patch separately exactly
>> because of his series, so that in that case the rebasing burden is mine.
> This stuff can be difficult to align on, but I'd suggest that when there's
> another series in the interim that is fundamentally changing a function
> signature that will make your code not compile it's probably best to hold off.

Good point about changing the function signature. Anyways my bad, should have
been more careful, apologies.

>
> And that's why I'm suggesting this here.
>
>>> I know you like to rush out a dozen series at once, but once again I'm asking
>>> maybe please hold off?
>> Lorenzo : ) Except for the mremap series which you pointed out, I make it a point
>> to never repost in the same week, unless it is an obvious single patch, and even
>> in that case I give 2-3 days for the reviews to settle. I posted
>> v3 of this series more than a month ago, so it makes total sense to post this.
>> Also I have seen many people spamming the list with the next versions on literally
>> the same day, not that I am using this as a precedent. The mistake I made here
>> is that on Saturday I saw David's series but then forgot that I am using the
>> same infrastructure he is changing and went ahead posting this. I suddenly
>> remembered this during the khugepaged series and dropped the first two patches
>> for that.
> I'm not saying you shot this out shortly after a previous version, I'm saying
> you have a whole bunch of series at once (I know as I'm cc'd on them all I think
> :), and I'm politely asking you to maybe not send another that causes a known
> conflict.
>
> Whether or not you do so is up to you, it was a request.

Yup my bad for forgetting about the conflict, apologies!

>
>>> I seem to remember David asked you for the same thing because of this, but maybe
>>> I'm misremembering.
>> I don't recollect that happening, maybe I am wrong.
> Yeah maybe I'm misremembering, apologies if so!

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Ryan Roberts 3 months, 1 week ago

On 30/06/2025 12:25, Lorenzo Stoakes wrote:
> On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
>> In case of prot_numa, there are various cases in which we can skip to the
>> next iteration. Since the skip condition is based on the folio and not
>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>> into a new function to clean up the existing code.
> 
> Hmm, is this a completely new concept for this series?
> 
> Please try not to introduce brand new things to a series midway through.
> 
> This seems to be adding a whole ton of questionable logic for an edge case.
> 
> Can we maybe just drop this for this series please?

From my perspective, at least, there are no new logical changes in here vs the
previous version. And I don't think the patches have been re-organised either.
David (I think?) was asking for the name of the patch to be changed to include
MM_CP_PROT_NUMA and also for the code to be moved out of line to it's own
function. That's all that Dev has done AFAICT (although as per my review
comments, the refactoring has introduced a bug).

My preference is that we should ultimately support this batching. It could be a
separate series if you insist, but it's all contbuting to the same goal
ultimately; making mprotect support PTE batching.

Just my 2c.

Thanks,
Ryan

> 
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>  mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>  1 file changed, 87 insertions(+), 47 deletions(-)
>>
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index 88709c01177b..af10a7fbe6b8 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>>  	return pte_dirty(pte);
>>  }
>>
>> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
>> +{
>> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +
>> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>> +		return 1;
>> +
>> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>> +			       NULL, NULL, NULL);
>> +}
> 
> I find it really odd that you're introducing this in a seemingly unrelated change.
> 
> Also won't this conflict with David's changes?
> 
> I know you like to rush out a dozen series at once, but once again I'm asking
> maybe please hold off?
> 
> I seem to remember David asked you for the same thing because of this, but maybe
> I'm misremembering.
> 
> We have only so much review resource and adding in brand new concepts mid-way
> and doing things that blatantly conflict with other series really doesn't help.
> 
>> +
>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
>> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>> +		int max_nr_ptes)
>> +{
>> +	struct folio *folio = NULL;
>> +	int nr_ptes = 1;
>> +	bool toptier;
>> +	int nid;
>> +
>> +	/* Avoid TLB flush if possible */
>> +	if (pte_protnone(oldpte))
>> +		goto skip_batch;
>> +
>> +	folio = vm_normal_folio(vma, addr, oldpte);
>> +	if (!folio)
>> +		goto skip_batch;
>> +
>> +	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
>> +		goto skip_batch;
>> +
>> +	/* Also skip shared copy-on-write pages */
>> +	if (is_cow_mapping(vma->vm_flags) &&
>> +	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * While migration can move some dirty pages,
>> +	 * it cannot move them all from MIGRATE_ASYNC
>> +	 * context.
>> +	 */
>> +	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * Don't mess with PTEs if page is already on the node
>> +	 * a single-threaded process is running on.
>> +	 */
>> +	nid = folio_nid(folio);
>> +	if (target_node == nid)
>> +		goto skip_batch;
>> +
>> +	toptier = node_is_toptier(nid);
>> +
>> +	/*
>> +	 * Skip scanning top tier node if normal numa
>> +	 * balancing is disabled
>> +	 */
>> +	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
>> +		goto skip_batch;
>> +
>> +	if (folio_use_access_time(folio)) {
>> +		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>> +
>> +		/* Do not skip in this case */
>> +		nr_ptes = 0;
>> +		goto out;
>> +	}
>> +
>> +skip_batch:
>> +	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
>> +out:
>> +	*foliop = folio;
>> +	return nr_ptes;
>> +}
> 
> Yeah yuck. I don't like that we're doing all this for this edge case.
> 
>> +
>>  static long change_pte_range(struct mmu_gather *tlb,
>>  		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>>  		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>  	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>>  	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>>  	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>> +	int nr_ptes;
>>
>>  	tlb_change_page_size(tlb, PAGE_SIZE);
>>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>>  	flush_tlb_batched_pending(vma->vm_mm);
>>  	arch_enter_lazy_mmu_mode();
>>  	do {
>> +		nr_ptes = 1;
>>  		oldpte = ptep_get(pte);
>>  		if (pte_present(oldpte)) {
>> +			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
>> +			struct folio *folio = NULL;
>>  			pte_t ptent;
>>
>>  			/*
>> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>>  			 * pages. See similar comment in change_huge_pmd.
>>  			 */
>>  			if (prot_numa) {
>> -				struct folio *folio;
>> -				int nid;
>> -				bool toptier;
>> -
>> -				/* Avoid TLB flush if possible */
>> -				if (pte_protnone(oldpte))
>> -					continue;
>> -
>> -				folio = vm_normal_folio(vma, addr, oldpte);
>> -				if (!folio || folio_is_zone_device(folio) ||
>> -				    folio_test_ksm(folio))
>> -					continue;
>> -
>> -				/* Also skip shared copy-on-write pages */
>> -				if (is_cow_mapping(vma->vm_flags) &&
>> -				    (folio_maybe_dma_pinned(folio) ||
>> -				     folio_maybe_mapped_shared(folio)))
>> -					continue;
>> -
>> -				/*
>> -				 * While migration can move some dirty pages,
>> -				 * it cannot move them all from MIGRATE_ASYNC
>> -				 * context.
>> -				 */
>> -				if (folio_is_file_lru(folio) &&
>> -				    folio_test_dirty(folio))
>> -					continue;
>> -
>> -				/*
>> -				 * Don't mess with PTEs if page is already on the node
>> -				 * a single-threaded process is running on.
>> -				 */
>> -				nid = folio_nid(folio);
>> -				if (target_node == nid)
>> -					continue;
>> -				toptier = node_is_toptier(nid);
>> -
>> -				/*
>> -				 * Skip scanning top tier node if normal numa
>> -				 * balancing is disabled
>> -				 */
>> -				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
>> -				    toptier)
>> +				nr_ptes = prot_numa_skip_ptes(&folio, vma,
>> +							      addr, oldpte, pte,
>> +							      target_node,
>> +							      max_nr_ptes);
>> +				if (nr_ptes)
> 
> I'm not really a fan of this being added (unless I'm missing something here) but
> _generally_ it's better to separate out a move and a change if you can.
> 
>>  					continue;
>> -				if (folio_use_access_time(folio))
>> -					folio_xchg_access_time(folio,
>> -						jiffies_to_msecs(jiffies));
>>  			}
>>
>>  			oldpte = ptep_modify_prot_start(vma, addr, pte);
>> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>  				pages++;
>>  			}
>>  		}
>> -	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>>  	arch_leave_lazy_mmu_mode();
>>  	pte_unmap_unlock(pte - 1, ptl);
>>
>> --
>> 2.30.2
>>
> 
> Anyway will hold off on reviewing the actual changes here until we can figure
> out whether this is event appropriate here.

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Lorenzo Stoakes 3 months, 1 week ago

On Mon, Jun 30, 2025 at 12:39:33PM +0100, Ryan Roberts wrote:
> On 30/06/2025 12:25, Lorenzo Stoakes wrote:
> > On Sat, Jun 28, 2025 at 05:04:32PM +0530, Dev Jain wrote:
> >> In case of prot_numa, there are various cases in which we can skip to the
> >> next iteration. Since the skip condition is based on the folio and not
> >> the PTEs, we can skip a PTE batch. Additionally refactor all of this
> >> into a new function to clean up the existing code.
> >
> > Hmm, is this a completely new concept for this series?
> >
> > Please try not to introduce brand new things to a series midway through.
> >
> > This seems to be adding a whole ton of questionable logic for an edge case.
> >
> > Can we maybe just drop this for this series please?
>
> From my perspective, at least, there are no new logical changes in here vs the
> previous version. And I don't think the patches have been re-organised either.
> David (I think?) was asking for the name of the patch to be changed to include
> MM_CP_PROT_NUMA and also for the code to be moved out of line to it's own
> function. That's all that Dev has done AFAICT (although as per my review
> comments, the refactoring has introduced a bug).
>
> My preference is that we should ultimately support this batching. It could be a
> separate series if you insist, but it's all contbuting to the same goal
> ultimately; making mprotect support PTE batching.
>
> Just my 2c.
>
> Thanks,
> Ryan

Ack, was my mistake, apologies. I hadn't realised this was part of the series, I
hadn't looked it for a while...

But I think it's better to have the refactor and the batch bit done separately
so it's clear which is which, unless the change is so trivial as for that to be
just noise.

Cheers, Lorenzo

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Ryan Roberts 3 months, 1 week ago

On 28/06/2025 12:34, Dev Jain wrote:
> In case of prot_numa, there are various cases in which we can skip to the
> next iteration. Since the skip condition is based on the folio and not
> the PTEs, we can skip a PTE batch. Additionally refactor all of this
> into a new function to clean up the existing code.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>  1 file changed, 87 insertions(+), 47 deletions(-)
> 
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 88709c01177b..af10a7fbe6b8 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>  	return pte_dirty(pte);
>  }
>  
> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
> +{
> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +
> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))

The !folio check wasn't in the previous version. Why is it needed now?

> +		return 1;
> +
> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
> +			       NULL, NULL, NULL);
> +}
> +
> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
> +		int max_nr_ptes)
> +{
> +	struct folio *folio = NULL;
> +	int nr_ptes = 1;
> +	bool toptier;
> +	int nid;
> +
> +	/* Avoid TLB flush if possible */
> +	if (pte_protnone(oldpte))
> +		goto skip_batch;
> +
> +	folio = vm_normal_folio(vma, addr, oldpte);
> +	if (!folio)
> +		goto skip_batch;
> +
> +	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
> +		goto skip_batch;
> +
> +	/* Also skip shared copy-on-write pages */
> +	if (is_cow_mapping(vma->vm_flags) &&
> +	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
> +		goto skip_batch;
> +
> +	/*
> +	 * While migration can move some dirty pages,
> +	 * it cannot move them all from MIGRATE_ASYNC
> +	 * context.
> +	 */
> +	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
> +		goto skip_batch;
> +
> +	/*
> +	 * Don't mess with PTEs if page is already on the node
> +	 * a single-threaded process is running on.
> +	 */
> +	nid = folio_nid(folio);
> +	if (target_node == nid)
> +		goto skip_batch;
> +
> +	toptier = node_is_toptier(nid);
> +
> +	/*
> +	 * Skip scanning top tier node if normal numa
> +	 * balancing is disabled
> +	 */
> +	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
> +		goto skip_batch;
> +
> +	if (folio_use_access_time(folio)) {
> +		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
> +
> +		/* Do not skip in this case */
> +		nr_ptes = 0;
> +		goto out;

This doesn't smell right... perhaps I'm not understanding the logic. Why do you
return nr_ptes = 0 if you end up in this conditional, but nr_ptes = 1 if you
don't take this conditional? I think you want to return nr_ptes == 0 for both
cases?...

> +	}
> +
> +skip_batch:
> +	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
> +out:
> +	*foliop = folio;
> +	return nr_ptes;
> +}
> +
>  static long change_pte_range(struct mmu_gather *tlb,
>  		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>  		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>  	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>  	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
> +	int nr_ptes;
>  
>  	tlb_change_page_size(tlb, PAGE_SIZE);
>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>  	flush_tlb_batched_pending(vma->vm_mm);
>  	arch_enter_lazy_mmu_mode();
>  	do {
> +		nr_ptes = 1;
>  		oldpte = ptep_get(pte);
>  		if (pte_present(oldpte)) {
> +			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
> +			struct folio *folio = NULL;
>  			pte_t ptent;
>  
>  			/*
> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>  			 * pages. See similar comment in change_huge_pmd.
>  			 */
>  			if (prot_numa) {
> -				struct folio *folio;
> -				int nid;
> -				bool toptier;
> -
> -				/* Avoid TLB flush if possible */
> -				if (pte_protnone(oldpte))
> -					continue;
> -
> -				folio = vm_normal_folio(vma, addr, oldpte);
> -				if (!folio || folio_is_zone_device(folio) ||
> -				    folio_test_ksm(folio))
> -					continue;
> -
> -				/* Also skip shared copy-on-write pages */
> -				if (is_cow_mapping(vma->vm_flags) &&
> -				    (folio_maybe_dma_pinned(folio) ||
> -				     folio_maybe_mapped_shared(folio)))
> -					continue;
> -
> -				/*
> -				 * While migration can move some dirty pages,
> -				 * it cannot move them all from MIGRATE_ASYNC
> -				 * context.
> -				 */
> -				if (folio_is_file_lru(folio) &&
> -				    folio_test_dirty(folio))
> -					continue;
> -
> -				/*
> -				 * Don't mess with PTEs if page is already on the node
> -				 * a single-threaded process is running on.
> -				 */
> -				nid = folio_nid(folio);
> -				if (target_node == nid)
> -					continue;
> -				toptier = node_is_toptier(nid);
> -
> -				/*
> -				 * Skip scanning top tier node if normal numa
> -				 * balancing is disabled
> -				 */
> -				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
> -				    toptier)
> +				nr_ptes = prot_numa_skip_ptes(&folio, vma,
> +							      addr, oldpte, pte,
> +							      target_node,
> +							      max_nr_ptes);
> +				if (nr_ptes)
>  					continue;

...But now here nr_ptes == 0 for the "don't skip" case, so won't you process
that PTE twice because while (pte += nr_ptes, ...) won't advance it?

Suggest forcing nr_ptes = 1 after this conditional "continue"?

Thanks,
Ryan


> -				if (folio_use_access_time(folio))
> -					folio_xchg_access_time(folio,
> -						jiffies_to_msecs(jiffies));
>  			}
>  
>  			oldpte = ptep_modify_prot_start(vma, addr, pte);
> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  				pages++;
>  			}
>  		}
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(pte - 1, ptl);
>

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

On 30/06/25 3:12 pm, Ryan Roberts wrote:
> On 28/06/2025 12:34, Dev Jain wrote:
>> In case of prot_numa, there are various cases in which we can skip to the
>> next iteration. Since the skip condition is based on the folio and not
>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>> into a new function to clean up the existing code.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>   1 file changed, 87 insertions(+), 47 deletions(-)
>>
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index 88709c01177b..af10a7fbe6b8 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>>   	return pte_dirty(pte);
>>   }
>>   
>> +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>> +		pte_t *ptep, pte_t pte, int max_nr_ptes)
>> +{
>> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>> +
>> +	if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
> The !folio check wasn't in the previous version. Why is it needed now?

It was there, actually. After prot_numa_skip_ptes(), if the folio is still
NULL, we get it using vm_normal_folio(). If this returns NULL, then
mprotect_folio_pte_batch() will return 1 to say that we cannot batch.

>
>> +		return 1;
>> +
>> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>> +			       NULL, NULL, NULL);
>> +}
>> +
>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct *vma,
>> +		unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>> +		int max_nr_ptes)
>> +{
>> +	struct folio *folio = NULL;
>> +	int nr_ptes = 1;
>> +	bool toptier;
>> +	int nid;
>> +
>> +	/* Avoid TLB flush if possible */
>> +	if (pte_protnone(oldpte))
>> +		goto skip_batch;
>> +
>> +	folio = vm_normal_folio(vma, addr, oldpte);
>> +	if (!folio)
>> +		goto skip_batch;
>> +
>> +	if (folio_is_zone_device(folio) || folio_test_ksm(folio))
>> +		goto skip_batch;
>> +
>> +	/* Also skip shared copy-on-write pages */
>> +	if (is_cow_mapping(vma->vm_flags) &&
>> +	    (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * While migration can move some dirty pages,
>> +	 * it cannot move them all from MIGRATE_ASYNC
>> +	 * context.
>> +	 */
>> +	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
>> +		goto skip_batch;
>> +
>> +	/*
>> +	 * Don't mess with PTEs if page is already on the node
>> +	 * a single-threaded process is running on.
>> +	 */
>> +	nid = folio_nid(folio);
>> +	if (target_node == nid)
>> +		goto skip_batch;
>> +
>> +	toptier = node_is_toptier(nid);
>> +
>> +	/*
>> +	 * Skip scanning top tier node if normal numa
>> +	 * balancing is disabled
>> +	 */
>> +	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
>> +		goto skip_batch;
>> +
>> +	if (folio_use_access_time(folio)) {
>> +		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>> +
>> +		/* Do not skip in this case */
>> +		nr_ptes = 0;
>> +		goto out;
> This doesn't smell right... perhaps I'm not understanding the logic. Why do you
> return nr_ptes = 0 if you end up in this conditional, but nr_ptes = 1 if you
> don't take this conditional? I think you want to return nr_ptes == 0 for both
> cases?...

In the existing code, we do not skip if we take this conditional. So nr_ptes == 0
is only a hint that we don't have to skip in this case.

>
>> +	}
>> +
>> +skip_batch:
>> +	nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
>> +out:
>> +	*foliop = folio;
>> +	return nr_ptes;
>> +}
>> +
>>   static long change_pte_range(struct mmu_gather *tlb,
>>   		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>>   		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>>   	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>>   	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>> +	int nr_ptes;
>>   
>>   	tlb_change_page_size(tlb, PAGE_SIZE);
>>   	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   	flush_tlb_batched_pending(vma->vm_mm);
>>   	arch_enter_lazy_mmu_mode();
>>   	do {
>> +		nr_ptes = 1;
>>   		oldpte = ptep_get(pte);
>>   		if (pte_present(oldpte)) {
>> +			int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
>> +			struct folio *folio = NULL;
>>   			pte_t ptent;
>>   
>>   			/*
>> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   			 * pages. See similar comment in change_huge_pmd.
>>   			 */
>>   			if (prot_numa) {
>> -				struct folio *folio;
>> -				int nid;
>> -				bool toptier;
>> -
>> -				/* Avoid TLB flush if possible */
>> -				if (pte_protnone(oldpte))
>> -					continue;
>> -
>> -				folio = vm_normal_folio(vma, addr, oldpte);
>> -				if (!folio || folio_is_zone_device(folio) ||
>> -				    folio_test_ksm(folio))
>> -					continue;
>> -
>> -				/* Also skip shared copy-on-write pages */
>> -				if (is_cow_mapping(vma->vm_flags) &&
>> -				    (folio_maybe_dma_pinned(folio) ||
>> -				     folio_maybe_mapped_shared(folio)))
>> -					continue;
>> -
>> -				/*
>> -				 * While migration can move some dirty pages,
>> -				 * it cannot move them all from MIGRATE_ASYNC
>> -				 * context.
>> -				 */
>> -				if (folio_is_file_lru(folio) &&
>> -				    folio_test_dirty(folio))
>> -					continue;
>> -
>> -				/*
>> -				 * Don't mess with PTEs if page is already on the node
>> -				 * a single-threaded process is running on.
>> -				 */
>> -				nid = folio_nid(folio);
>> -				if (target_node == nid)
>> -					continue;
>> -				toptier = node_is_toptier(nid);
>> -
>> -				/*
>> -				 * Skip scanning top tier node if normal numa
>> -				 * balancing is disabled
>> -				 */
>> -				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
>> -				    toptier)
>> +				nr_ptes = prot_numa_skip_ptes(&folio, vma,
>> +							      addr, oldpte, pte,
>> +							      target_node,
>> +							      max_nr_ptes);
>> +				if (nr_ptes)
>>   					continue;
> ...But now here nr_ptes == 0 for the "don't skip" case, so won't you process
> that PTE twice because while (pte += nr_ptes, ...) won't advance it?
>
> Suggest forcing nr_ptes = 1 after this conditional "continue"?

nr_ptes will be forced to a non zero value through mprotect_folio_pte_batch().

>
> Thanks,
> Ryan
>
>
>> -				if (folio_use_access_time(folio))
>> -					folio_xchg_access_time(folio,
>> -						jiffies_to_msecs(jiffies));
>>   			}
>>   
>>   			oldpte = ptep_modify_prot_start(vma, addr, pte);
>> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>   				pages++;
>>   			}
>>   		}
>> -	} while (pte++, addr += PAGE_SIZE, addr != end);
>> +	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>>   	arch_leave_lazy_mmu_mode();
>>   	pte_unmap_unlock(pte - 1, ptl);
>>

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Ryan Roberts 3 months, 1 week ago

On 30/06/2025 10:49, Dev Jain wrote:
> 
> On 30/06/25 3:12 pm, Ryan Roberts wrote:
>> On 28/06/2025 12:34, Dev Jain wrote:
>>> In case of prot_numa, there are various cases in which we can skip to the
>>> next iteration. Since the skip condition is based on the folio and not
>>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>>> into a new function to clean up the existing code.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>>   1 file changed, 87 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index 88709c01177b..af10a7fbe6b8 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma,
>>> unsigned long addr,
>>>       return pte_dirty(pte);
>>>   }
>>>   +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>>> +        pte_t *ptep, pte_t pte, int max_nr_ptes)
>>> +{
>>> +    const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> +
>>> +    if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>> The !folio check wasn't in the previous version. Why is it needed now?
> 
> It was there, actually. After prot_numa_skip_ptes(), if the folio is still
> NULL, we get it using vm_normal_folio(). If this returns NULL, then
> mprotect_folio_pte_batch() will return 1 to say that we cannot batch.
> 
>>
>>> +        return 1;
>>> +
>>> +    return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>>> +                   NULL, NULL, NULL);
>>> +}
>>> +
>>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct
>>> *vma,
>>> +        unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>>> +        int max_nr_ptes)
>>> +{
>>> +    struct folio *folio = NULL;
>>> +    int nr_ptes = 1;
>>> +    bool toptier;
>>> +    int nid;
>>> +
>>> +    /* Avoid TLB flush if possible */
>>> +    if (pte_protnone(oldpte))
>>> +        goto skip_batch;
>>> +
>>> +    folio = vm_normal_folio(vma, addr, oldpte);
>>> +    if (!folio)
>>> +        goto skip_batch;
>>> +
>>> +    if (folio_is_zone_device(folio) || folio_test_ksm(folio))
>>> +        goto skip_batch;
>>> +
>>> +    /* Also skip shared copy-on-write pages */
>>> +    if (is_cow_mapping(vma->vm_flags) &&
>>> +        (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
>>> +        goto skip_batch;
>>> +
>>> +    /*
>>> +     * While migration can move some dirty pages,
>>> +     * it cannot move them all from MIGRATE_ASYNC
>>> +     * context.
>>> +     */
>>> +    if (folio_is_file_lru(folio) && folio_test_dirty(folio))
>>> +        goto skip_batch;
>>> +
>>> +    /*
>>> +     * Don't mess with PTEs if page is already on the node
>>> +     * a single-threaded process is running on.
>>> +     */
>>> +    nid = folio_nid(folio);
>>> +    if (target_node == nid)
>>> +        goto skip_batch;
>>> +
>>> +    toptier = node_is_toptier(nid);
>>> +
>>> +    /*
>>> +     * Skip scanning top tier node if normal numa
>>> +     * balancing is disabled
>>> +     */
>>> +    if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
>>> +        goto skip_batch;
>>> +
>>> +    if (folio_use_access_time(folio)) {
>>> +        folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>>> +
>>> +        /* Do not skip in this case */
>>> +        nr_ptes = 0;
>>> +        goto out;
>> This doesn't smell right... perhaps I'm not understanding the logic. Why do you
>> return nr_ptes = 0 if you end up in this conditional, but nr_ptes = 1 if you
>> don't take this conditional? I think you want to return nr_ptes == 0 for both
>> cases?...
> 
> In the existing code, we do not skip if we take this conditional. So nr_ptes == 0
> is only a hint that we don't have to skip in this case.

We also do not skip if we do not take the conditional,right? "hint that we don't
have to skip in this case"... no I think it's a "directive that we must not
skip"? A hint is something that the implementation is free to ignore. But I
don't think that's the case here.

What I'm saying is that I think this block should actually be:

	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	/* Do not skip in this case */
	nr_ptes = 0;
	goto out;

> 
>>
>>> +    }
>>> +
>>> +skip_batch:
>>> +    nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
>>> +out:
>>> +    *foliop = folio;
>>> +    return nr_ptes;
>>> +}
>>> +
>>>   static long change_pte_range(struct mmu_gather *tlb,
>>>           struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>>>           unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>>> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>       bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>>>       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>>>       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>>> +    int nr_ptes;
>>>         tlb_change_page_size(tlb, PAGE_SIZE);
>>>       pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>>> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>       flush_tlb_batched_pending(vma->vm_mm);
>>>       arch_enter_lazy_mmu_mode();
>>>       do {
>>> +        nr_ptes = 1;
>>>           oldpte = ptep_get(pte);
>>>           if (pte_present(oldpte)) {
>>> +            int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
>>> +            struct folio *folio = NULL;
>>>               pte_t ptent;
>>>                 /*
>>> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>                * pages. See similar comment in change_huge_pmd.
>>>                */
>>>               if (prot_numa) {
>>> -                struct folio *folio;
>>> -                int nid;
>>> -                bool toptier;
>>> -
>>> -                /* Avoid TLB flush if possible */
>>> -                if (pte_protnone(oldpte))
>>> -                    continue;
>>> -
>>> -                folio = vm_normal_folio(vma, addr, oldpte);
>>> -                if (!folio || folio_is_zone_device(folio) ||
>>> -                    folio_test_ksm(folio))
>>> -                    continue;
>>> -
>>> -                /* Also skip shared copy-on-write pages */
>>> -                if (is_cow_mapping(vma->vm_flags) &&
>>> -                    (folio_maybe_dma_pinned(folio) ||
>>> -                     folio_maybe_mapped_shared(folio)))
>>> -                    continue;
>>> -
>>> -                /*
>>> -                 * While migration can move some dirty pages,
>>> -                 * it cannot move them all from MIGRATE_ASYNC
>>> -                 * context.
>>> -                 */
>>> -                if (folio_is_file_lru(folio) &&
>>> -                    folio_test_dirty(folio))
>>> -                    continue;
>>> -
>>> -                /*
>>> -                 * Don't mess with PTEs if page is already on the node
>>> -                 * a single-threaded process is running on.
>>> -                 */
>>> -                nid = folio_nid(folio);
>>> -                if (target_node == nid)
>>> -                    continue;
>>> -                toptier = node_is_toptier(nid);
>>> -
>>> -                /*
>>> -                 * Skip scanning top tier node if normal numa
>>> -                 * balancing is disabled
>>> -                 */
>>> -                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
>>> -                    toptier)
>>> +                nr_ptes = prot_numa_skip_ptes(&folio, vma,
>>> +                                  addr, oldpte, pte,
>>> +                                  target_node,
>>> +                                  max_nr_ptes);
>>> +                if (nr_ptes)
>>>                       continue;
>> ...But now here nr_ptes == 0 for the "don't skip" case, so won't you process
>> that PTE twice because while (pte += nr_ptes, ...) won't advance it?
>>
>> Suggest forcing nr_ptes = 1 after this conditional "continue"?
> 
> nr_ptes will be forced to a non zero value through mprotect_folio_pte_batch().

But you don't call mprotect_folio_pte_batch() if you have set nr_ptes = 0;
Perhaps you are referring to calling mprotect_folio_pte_batch() on the
processing path in a future patch? But that means that this patch is buggy
without the future patch.

> 
>>
>> Thanks,
>> Ryan
>>
>>
>>> -                if (folio_use_access_time(folio))
>>> -                    folio_xchg_access_time(folio,
>>> -                        jiffies_to_msecs(jiffies));
>>>               }
>>>                 oldpte = ptep_modify_prot_start(vma, addr, pte);
>>> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>                   pages++;
>>>               }
>>>           }
>>> -    } while (pte++, addr += PAGE_SIZE, addr != end);
>>> +    } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>>>       arch_leave_lazy_mmu_mode();
>>>       pte_unmap_unlock(pte - 1, ptl);
>>>

Re: [PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs

Posted by Dev Jain 3 months, 1 week ago

On 30/06/25 3:25 pm, Ryan Roberts wrote:
> On 30/06/2025 10:49, Dev Jain wrote:
>> On 30/06/25 3:12 pm, Ryan Roberts wrote:
>>> On 28/06/2025 12:34, Dev Jain wrote:
>>>> In case of prot_numa, there are various cases in which we can skip to the
>>>> next iteration. Since the skip condition is based on the folio and not
>>>> the PTEs, we can skip a PTE batch. Additionally refactor all of this
>>>> into a new function to clean up the existing code.
>>>>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>    mm/mprotect.c | 134 ++++++++++++++++++++++++++++++++------------------
>>>>    1 file changed, 87 insertions(+), 47 deletions(-)
>>>>
>>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>>> index 88709c01177b..af10a7fbe6b8 100644
>>>> --- a/mm/mprotect.c
>>>> +++ b/mm/mprotect.c
>>>> @@ -83,6 +83,83 @@ bool can_change_pte_writable(struct vm_area_struct *vma,
>>>> unsigned long addr,
>>>>        return pte_dirty(pte);
>>>>    }
>>>>    +static int mprotect_folio_pte_batch(struct folio *folio, unsigned long addr,
>>>> +        pte_t *ptep, pte_t pte, int max_nr_ptes)
>>>> +{
>>>> +    const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> +
>>>> +    if (!folio || !folio_test_large(folio) || (max_nr_ptes == 1))
>>> The !folio check wasn't in the previous version. Why is it needed now?
>> It was there, actually. After prot_numa_skip_ptes(), if the folio is still
>> NULL, we get it using vm_normal_folio(). If this returns NULL, then
>> mprotect_folio_pte_batch() will return 1 to say that we cannot batch.
>>
>>>> +        return 1;
>>>> +
>>>> +    return folio_pte_batch(folio, addr, ptep, pte, max_nr_ptes, flags,
>>>> +                   NULL, NULL, NULL);
>>>> +}
>>>> +
>>>> +static int prot_numa_skip_ptes(struct folio **foliop, struct vm_area_struct
>>>> *vma,
>>>> +        unsigned long addr, pte_t oldpte, pte_t *pte, int target_node,
>>>> +        int max_nr_ptes)
>>>> +{
>>>> +    struct folio *folio = NULL;
>>>> +    int nr_ptes = 1;
>>>> +    bool toptier;
>>>> +    int nid;
>>>> +
>>>> +    /* Avoid TLB flush if possible */
>>>> +    if (pte_protnone(oldpte))
>>>> +        goto skip_batch;
>>>> +
>>>> +    folio = vm_normal_folio(vma, addr, oldpte);
>>>> +    if (!folio)
>>>> +        goto skip_batch;
>>>> +
>>>> +    if (folio_is_zone_device(folio) || folio_test_ksm(folio))
>>>> +        goto skip_batch;
>>>> +
>>>> +    /* Also skip shared copy-on-write pages */
>>>> +    if (is_cow_mapping(vma->vm_flags) &&
>>>> +        (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
>>>> +        goto skip_batch;
>>>> +
>>>> +    /*
>>>> +     * While migration can move some dirty pages,
>>>> +     * it cannot move them all from MIGRATE_ASYNC
>>>> +     * context.
>>>> +     */
>>>> +    if (folio_is_file_lru(folio) && folio_test_dirty(folio))
>>>> +        goto skip_batch;
>>>> +
>>>> +    /*
>>>> +     * Don't mess with PTEs if page is already on the node
>>>> +     * a single-threaded process is running on.
>>>> +     */
>>>> +    nid = folio_nid(folio);
>>>> +    if (target_node == nid)
>>>> +        goto skip_batch;
>>>> +
>>>> +    toptier = node_is_toptier(nid);
>>>> +
>>>> +    /*
>>>> +     * Skip scanning top tier node if normal numa
>>>> +     * balancing is disabled
>>>> +     */
>>>> +    if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
>>>> +        goto skip_batch;
>>>> +
>>>> +    if (folio_use_access_time(folio)) {
>>>> +        folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>>>> +
>>>> +        /* Do not skip in this case */
>>>> +        nr_ptes = 0;
>>>> +        goto out;
>>> This doesn't smell right... perhaps I'm not understanding the logic. Why do you
>>> return nr_ptes = 0 if you end up in this conditional, but nr_ptes = 1 if you
>>> don't take this conditional? I think you want to return nr_ptes == 0 for both
>>> cases?...
>> In the existing code, we do not skip if we take this conditional. So nr_ptes == 0
>> is only a hint that we don't have to skip in this case.
> We also do not skip if we do not take the conditional,right? "hint that we don't
> have to skip in this case"... no I think it's a "directive that we must not
> skip"? A hint is something that the implementation is free to ignore. But I
> don't think that's the case here.
>
> What I'm saying is that I think this block should actually be:
>
> 	if (folio_use_access_time(folio))
> 		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
>
> 	/* Do not skip in this case */
> 	nr_ptes = 0;
> 	goto out;

Ah you are right. Thanks!

>>>> +    }
>>>> +
>>>> +skip_batch:
>>>> +    nr_ptes = mprotect_folio_pte_batch(folio, addr, pte, oldpte, max_nr_ptes);
>>>> +out:
>>>> +    *foliop = folio;
>>>> +    return nr_ptes;
>>>> +}
>>>> +
>>>>    static long change_pte_range(struct mmu_gather *tlb,
>>>>            struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>>>>            unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>>>> @@ -94,6 +171,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>>        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
>>>>        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
>>>>        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
>>>> +    int nr_ptes;
>>>>          tlb_change_page_size(tlb, PAGE_SIZE);
>>>>        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>>>> @@ -108,8 +186,11 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>>        flush_tlb_batched_pending(vma->vm_mm);
>>>>        arch_enter_lazy_mmu_mode();
>>>>        do {
>>>> +        nr_ptes = 1;
>>>>            oldpte = ptep_get(pte);
>>>>            if (pte_present(oldpte)) {
>>>> +            int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
>>>> +            struct folio *folio = NULL;
>>>>                pte_t ptent;
>>>>                  /*
>>>> @@ -117,53 +198,12 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>>                 * pages. See similar comment in change_huge_pmd.
>>>>                 */
>>>>                if (prot_numa) {
>>>> -                struct folio *folio;
>>>> -                int nid;
>>>> -                bool toptier;
>>>> -
>>>> -                /* Avoid TLB flush if possible */
>>>> -                if (pte_protnone(oldpte))
>>>> -                    continue;
>>>> -
>>>> -                folio = vm_normal_folio(vma, addr, oldpte);
>>>> -                if (!folio || folio_is_zone_device(folio) ||
>>>> -                    folio_test_ksm(folio))
>>>> -                    continue;
>>>> -
>>>> -                /* Also skip shared copy-on-write pages */
>>>> -                if (is_cow_mapping(vma->vm_flags) &&
>>>> -                    (folio_maybe_dma_pinned(folio) ||
>>>> -                     folio_maybe_mapped_shared(folio)))
>>>> -                    continue;
>>>> -
>>>> -                /*
>>>> -                 * While migration can move some dirty pages,
>>>> -                 * it cannot move them all from MIGRATE_ASYNC
>>>> -                 * context.
>>>> -                 */
>>>> -                if (folio_is_file_lru(folio) &&
>>>> -                    folio_test_dirty(folio))
>>>> -                    continue;
>>>> -
>>>> -                /*
>>>> -                 * Don't mess with PTEs if page is already on the node
>>>> -                 * a single-threaded process is running on.
>>>> -                 */
>>>> -                nid = folio_nid(folio);
>>>> -                if (target_node == nid)
>>>> -                    continue;
>>>> -                toptier = node_is_toptier(nid);
>>>> -
>>>> -                /*
>>>> -                 * Skip scanning top tier node if normal numa
>>>> -                 * balancing is disabled
>>>> -                 */
>>>> -                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
>>>> -                    toptier)
>>>> +                nr_ptes = prot_numa_skip_ptes(&folio, vma,
>>>> +                                  addr, oldpte, pte,
>>>> +                                  target_node,
>>>> +                                  max_nr_ptes);
>>>> +                if (nr_ptes)
>>>>                        continue;
>>> ...But now here nr_ptes == 0 for the "don't skip" case, so won't you process
>>> that PTE twice because while (pte += nr_ptes, ...) won't advance it?
>>>
>>> Suggest forcing nr_ptes = 1 after this conditional "continue"?
>> nr_ptes will be forced to a non zero value through mprotect_folio_pte_batch().
> But you don't call mprotect_folio_pte_batch() if you have set nr_ptes = 0;
> Perhaps you are referring to calling mprotect_folio_pte_batch() on the
> processing path in a future patch? But that means that this patch is buggy
> without the future patch.

Yup it is there in the future patch. You are correct, I'll respin and force
nr_ptes = 1 in this case.

>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>> -                if (folio_use_access_time(folio))
>>>> -                    folio_xchg_access_time(folio,
>>>> -                        jiffies_to_msecs(jiffies));
>>>>                }
>>>>                  oldpte = ptep_modify_prot_start(vma, addr, pte);
>>>> @@ -280,7 +320,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>>>>                    pages++;
>>>>                }
>>>>            }
>>>> -    } while (pte++, addr += PAGE_SIZE, addr != end);
>>>> +    } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
>>>>        arch_leave_lazy_mmu_mode();
>>>>        pte_unmap_unlock(pte - 1, ptl);
>>>>

[PATCH v4 1/4] mm: Optimize mprotect() for MM_CP_PROT_NUMA by batch-skipping PTEs
[PATCH v4 2/4] mm: Add batched versions of ptep_modify_prot_start/commit
[PATCH v4 3/4] mm: Optimize mprotect() by PTE-batching
[PATCH v4 4/4] arm64: Add batched versions of ptep_modify_prot_start/commit