Batch ptep_modify_prot_start/commit in preparation for optimizing mprotect,
implementing them as a simple loop over the corresponding single pte
helpers. Architecture may override these helpers.
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  4 +-
 2 files changed, 85 insertions(+), 3 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cf1515c163e2..e3b99920be05 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1331,7 +1331,9 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 
 /*
  * Commit an update to a pte, leaving any hardware-controlled bits in
- * the PTE unmodified.
+ * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
+ * additionally have young and/or dirty bits set where previously they were not,
+ * so the updated pte may have these additional changes.
  */
 static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 					   unsigned long addr,
@@ -1340,6 +1342,86 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 	__ptep_modify_prot_commit(vma, addr, ptep, pte);
 }
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
+/**
+ * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
+ * over a batch of ptes, which protects against asynchronous hardware
+ * modifications to the ptes. The intention is not to prevent the hardware from
+ * making pte updates, but to prevent any updates it may make from being lost.
+ * Please see the comment above ptep_modify_prot_start() for full description.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
+ * in the batch.
+ *
+ * Note that PTE bits in the PTE batch besides the PFN can differ.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. All other PTE bits must be identical for
+ * all PTEs in the batch except for young and dirty bits. The PTEs are all in
+ * the same PMD.
+ */
+#ifndef modify_prot_start_ptes
+static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+	pte_t pte, tmp_pte;
+
+	pte = ptep_modify_prot_start(vma, addr, ptep);
+	while (--nr) {
+		ptep++;
+		addr += PAGE_SIZE;
+		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
+		if (pte_dirty(tmp_pte))
+			pte = pte_mkdirty(pte);
+		if (pte_young(tmp_pte))
+			pte = pte_mkyoung(pte);
+	}
+	return pte;
+}
+#endif
+
+/**
+ * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
+ * hardware-controlled bits in the PTE unmodified.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @old_pte: Old page table entry (for the first entry) which is now cleared.
+ * @pte: New page table entry to be set.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_commit().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all in the same
+ * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by
+ * ptep_modify_prot_start() may additionally have young and/or dirty bits set
+ * where previously they were not, so the updated ptes may have these
+ * additional changes.
+ */
+#ifndef modify_prot_commit_ptes
+static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) {
+		ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+
+		/* Advance PFN only, set same prot */
+		old_pte = pte_next_pfn(old_pte);
+		pte = pte_next_pfn(pte);
+	}
+}
+#endif
+
 #endif /* CONFIG_MMU */
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 97adc62c50ab..4977f198168e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -204,7 +204,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				}
 			}
 
-			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
 			if (uffd_wp)
@@ -230,7 +230,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent, vma);
 
-			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
 			pages++;
--
2.30.2
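
The commit message notes that architectures may override these helpers. For readers unfamiliar with the convention, the #ifndef guards above follow the usual pgtable.h pattern: an architecture supplies its own implementation in its asm/pgtable.h and self-defines the name so the generic fallback is skipped. Below is a minimal, hypothetical sketch of that hook-up; arch_batched_modify_prot_start() is an invented placeholder, not code from this series. modify_prot_commit_ptes() could be overridden the same way.

/* In the architecture's <asm/pgtable.h> -- hypothetical sketch only. */
#define modify_prot_start_ptes modify_prot_start_ptes
static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep, unsigned int nr)
{
	/*
	 * An arch that can start the transaction for a contiguous range of
	 * ptes in one operation would do so here; this placeholder call
	 * (arch_batched_modify_prot_start() is hypothetical) only shows
	 * where such code would live.
	 */
	return arch_batched_modify_prot_start(vma, addr, ptep, nr);
}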
On 18 Jul 2025, at 5:02, Dev Jain wrote:

> Batch ptep_modify_prot_start/commit in preparation for optimizing mprotect,
> implementing them as a simple loop over the corresponding single pte
> helpers. Architecture may override these helpers.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
>  mm/mprotect.c           |  4 +-
>  2 files changed, 85 insertions(+), 3 deletions(-)
>

LGTM.

Reviewed-by: Zi Yan <ziy@nvidia.com>

Best Regards,
Yan, Zi
On 18/07/2025 10:02, Dev Jain wrote:
> Batch ptep_modify_prot_start/commit in preparation for optimizing mprotect,
> implementing them as a simple loop over the corresponding single pte
> helpers. Architecture may override these helpers.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>

> ---
>  include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
>  mm/mprotect.c           |  4 +-
>  2 files changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index cf1515c163e2..e3b99920be05 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1331,7 +1331,9 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
>
>  /*
>   * Commit an update to a pte, leaving any hardware-controlled bits in
> - * the PTE unmodified.
> + * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
> + * additionally have young and/or dirty bits set where previously they were not,
> + * so the updated pte may have these additional changes.

nit: I still find this difficult to parse (although I expect you might tell me
that this is the text I suggested last time around :) ). I think you mean that
"it is permissible for young and/or dirty bits to be set in old_pte, despite
being clear when originally returned by ptep_modify_prot_start()".

Anyway, no big deal. I think we all know what it's getting at.

Thanks,
Ryan

[...]
On Fri, Jul 18, 2025 at 5:03 PM Dev Jain <dev.jain@arm.com> wrote:
>
> Batch ptep_modify_prot_start/commit in preparation for optimizing mprotect,
> implementing them as a simple loop over the corresponding single pte
> helpers. Architecture may override these helpers.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>

Reviewed-by: Barry Song <baohua@kernel.org>

> ---
>  include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
>  mm/mprotect.c           |  4 +-
>  2 files changed, 85 insertions(+), 3 deletions(-)
>

[...]

> +#ifndef modify_prot_start_ptes
> +static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
> +		unsigned long addr, pte_t *ptep, unsigned int nr)
> +{
> +	pte_t pte, tmp_pte;
> +
> +	pte = ptep_modify_prot_start(vma, addr, ptep);
> +	while (--nr) {
> +		ptep++;
> +		addr += PAGE_SIZE;
> +		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
> +		if (pte_dirty(tmp_pte))
> +			pte = pte_mkdirty(pte);
> +		if (pte_young(tmp_pte))
> +			pte = pte_mkyoung(pte);

It might be interesting to explore whether a similar optimization could
apply here as well:
https://lore.kernel.org/linux-mm/20250624152549.2647828-1-xavier.qyxia@gmail.com/

I suspect it would, but it's probably not worth including in this patch.

> +	}
> +	return pte;
> +}
> +#endif
> +

Thanks
Barry
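
For illustration only: one possible reading of Barry's suggestion, assuming the linked change is about accumulating the accessed/dirty bits in local flags instead of folding them into the returned pte on every loop iteration. The content of the linked patch is not reproduced here, and the helper name below is invented for this sketch; it is not part of the posted series.

/* Hypothetical variant of modify_prot_start_ptes(), not from the posted patch. */
static inline pte_t modify_prot_start_ptes_collected(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep, unsigned int nr)
{
	bool dirty = false, young = false;
	pte_t pte, tmp_pte;

	pte = ptep_modify_prot_start(vma, addr, ptep);
	while (--nr) {
		ptep++;
		addr += PAGE_SIZE;
		/* Every pte in the batch must still go through the start hook. */
		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
		dirty |= pte_dirty(tmp_pte);
		young |= pte_young(tmp_pte);
	}

	/* Fold the collected a/d bits into the returned pte once, at the end. */
	if (dirty)
		pte = pte_mkdirty(pte);
	if (young)
		pte = pte_mkyoung(pte);
	return pte;
}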
On Fri, Jul 18, 2025 at 02:32:40PM +0530, Dev Jain wrote:
> Batch ptep_modify_prot_start/commit in preparation for optimizing mprotect,
> implementing them as a simple loop over the corresponding single pte
> helpers. Architecture may override these helpers.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>

LGTM, so:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

> ---
>  include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++-
>  mm/mprotect.c           |  4 +-
>  2 files changed, 85 insertions(+), 3 deletions(-)
>

[...]