mm/mprotect.c | 218 ++++++++++++++++++++++++++++---------------------- 1 file changed, 124 insertions(+), 94 deletions(-)
Micro-optimize the change_protection functionality and the
change_pte_range() routine. This set of functions works in an incredibly
tight loop, and even small inefficiencies are incredibly evident when spun
hundreds, thousands or hundreds of thousands of times.

There was an attempt to keep the batching functionality as much as possible,
which introduced some part of the slowness, but not all of it. Removing it
for !arm64 architectures would speed mprotect() up even further, but could
easily pessimize cases where large folios are mapped (which is not as rare
as it seems, particularly when it comes to the page cache these days).

The micro-benchmark used for the tests was [0] (usable using google/benchmark
and g++ -O2 -lbenchmark repro.cpp).

This resulted in the following (first entry is baseline):

---------------------------------------------------------
Benchmark               Time             CPU   Iterations
---------------------------------------------------------
mprotect_bench      85967 ns        85967 ns         6935
mprotect_bench      70684 ns        70684 ns         9887

After the patchset we can observe an ~18% speedup in mprotect. Wonderful
for the elusive mprotect-based workloads!

Testing & more ideas welcome. I suspect there is plenty of improvement
possible but it would require more time than what I have on my hands right
now. The entire inlined function (which inlines into change_protection())
is gigantic - I'm not surprised this is so finicky.

Note: per my profiling, the next _big_ bottleneck here is
modify_prot_start_ptes, exactly on the xchg() done by x86.
ptep_get_and_clear() is _expensive_. I don't think there's a properly safe
way to go about it since we do depend on the D bit quite a lot. This might
not be such an issue on other architectures.

Luke Yang reported [1]:

: On average, we see improvements ranging from a minimum of 5% to a
: maximum of 55%, with most improvements showing around a 25% speed up in
: the libmicro/mprot_tw4m micro benchmark.
Link: https://lore.kernel.org/all/aY8-XuFZ7zCvXulB@luyang-thinkpadp1gen7.toromso.csb/
Link: https://gist.github.com/heatd/1450d273005aba91fa5744f44dfcd933 [0]
Link: https://lkml.kernel.org/r/CAL2CeBxT4jtJ+LxYb6=BNxNMGinpgD_HYH5gGxOP-45Q2OncqQ@mail.gmail.com [1]
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Luke Yang <luyang@redhat.com>
Cc: jhladky@redhat.com
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

v3:
 - Collapse a few lines into a single line in patch 1 (David)
 - Bring the inlining to a higher level (David)
 - Pick up David's patch 1 ACK (thank you!)
 - Pick up Luke Yang's Tested-by (thank you!)
 - Add Luke's results and akpmify the Links: a bit (cover letter)

v2:
 - Addressed Sashiko's concerns
 - Picked up Lorenzo's R-b's (thank you!)
 - Squashed patch 1 and 4 into a single one (David)
 - Renamed the softleaf leaf function (David)
 - Dropped controversial noinlines & patch 3 (Lorenzo & David)

v1:
 https://lore.kernel.org/linux-mm/20260319183108.1105090-1-pfalcato@suse.de/

Pedro Falcato (2):
  mm/mprotect: move softleaf code out of the main function
  mm/mprotect: special-case small folios when applying write permissions

 mm/mprotect.c | 218 ++++++++++++++++++++++++++++----------------------
 1 file changed, 124 insertions(+), 94 deletions(-)

--
2.53.0
On Thu, 2 Apr 2026 15:16:26 +0100 Pedro Falcato <pfalcato@suse.de> wrote:
> Micro-optimize the change_protection functionality and the
> change_pte_range() routine. This set of functions works in an incredibly
> tight loop, and even small inefficiencies are incredibly evident when spun
> hundreds, thousands or hundreds of thousands of times.
Thanks, I updated mm.git's mm-unstable branch to this version.
The update is rather large. If people think it best to spill this work
into next -rc1 then please advise.
>
> v3:
> - Collapse a few lines into a single line in patch 1 (David)
> - Bring the inlining to a higher level (David)
> - Pick up David's patch 1 ACK (thank you!)
> - Pick up Luke Yang's Tested-by (thank you!)
> - Add Luke's results and akpmify the Links: a bit (cover letter)
Here's how v3 altered mm.git:
mm/mprotect.c | 98 +++++++++++++++++++++++++++---------------------
1 file changed, 56 insertions(+), 42 deletions(-)
--- a/mm/mprotect.c~b
+++ a/mm/mprotect.c
@@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_a
return can_change_shared_pte_writable(vma, pte);
}
-static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
pte_t pte, int max_nr_ptes, fpb_t flags)
{
/* No underlying folio, so cannot batch */
@@ -143,7 +143,7 @@ static __always_inline void prot_commit_
* !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
* that the ptes point to consecutive pages of the same anon large folio.
*/
-static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len,
struct page *first_page, bool expected_anon_exclusive)
{
int idx;
@@ -177,13 +177,6 @@ static __always_inline void commit_anon_
int sub_batch_idx = 0;
int len;
- /* Optimize for the common order-0 case. */
- if (likely(nr_ptes == 1)) {
- prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, 1,
- 0, PageAnonExclusive(first_page), tlb);
- return;
- }
-
while (nr_ptes) {
expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
@@ -195,7 +188,7 @@ static __always_inline void commit_anon_
}
}
-static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
@@ -234,8 +227,7 @@ static long change_softleaf_pte(struct v
* just be safe and disable write
*/
if (folio_test_anon(folio))
- entry = make_readable_exclusive_migration_entry(
- swp_offset(entry));
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
else
entry = make_readable_migration_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry);
@@ -246,8 +238,7 @@ static long change_softleaf_pte(struct v
* We do not preserve soft-dirtiness. See
* copy_nonpresent_pte() for explanation.
*/
- entry = make_readable_device_private_entry(
- swp_offset(entry));
+ entry = make_readable_device_private_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
@@ -286,6 +277,45 @@ static long change_softleaf_pte(struct v
return 0;
}
+static __always_inline void change_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ int nr_ptes, unsigned long end, pgprot_t newprot,
+ struct folio *folio, struct page *page, unsigned long cp_flags)
+{
+ const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ pte_t ptent, oldpte;
+
+ oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
+ ptent = pte_modify(oldpte, newprot);
+
+ if (uffd_wp)
+ ptent = pte_mkuffd_wp(ptent);
+ else if (uffd_wp_resolve)
+ ptent = pte_clear_uffd_wp(ptent);
+
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio, page,
+ addr, ptep, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+}
+
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -296,7 +326,6 @@ static long change_pte_range(struct mmu_
bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
- bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
@@ -317,7 +346,6 @@ static long change_pte_range(struct mmu_
int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
struct folio *folio = NULL;
struct page *page;
- pte_t ptent;
/* Already in the desired state. */
if (prot_numa && pte_protnone(oldpte))
@@ -343,34 +371,20 @@ static long change_pte_range(struct mmu_
nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
- oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
- ptent = pte_modify(oldpte, newprot);
-
- if (uffd_wp)
- ptent = pte_mkuffd_wp(ptent);
- else if (uffd_wp_resolve)
- ptent = pte_clear_uffd_wp(ptent);
-
/*
- * In some writable, shared mappings, we might want
- * to catch actual write access -- see
- * vma_wants_writenotify().
- *
- * In all writable, private mappings, we have to
- * properly handle COW.
- *
- * In both cases, we can sometimes still change PTEs
- * writable and avoid the write-fault handler, for
- * example, if a PTE is already dirty and no other
- * COW or special handling is required.
+ * Optimize for the small-folio common case by
+ * special-casing it here. Compiler constant propagation
+ * plus copious amounts of __always_inline does wonders.
*/
- if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent))
- set_write_prot_commit_flush_ptes(vma, folio, page,
- addr, pte, oldpte, ptent, nr_ptes, tlb);
- else
- prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
- nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ if (likely(nr_ptes == 1)) {
+ change_present_ptes(tlb, vma, addr, pte, 1,
+ end, newprot, folio, page, cp_flags);
+ } else {
+ change_present_ptes(tlb, vma, addr, pte,
+ nr_ptes, end, newprot, folio, page,
+ cp_flags);
+ }
+
pages += nr_ptes;
} else if (pte_none(oldpte)) {
/*
_
© 2016 - 2026 Red Hat, Inc.