[RFC PATCH 1/2] mm: make lazy MMU mode context-aware

Alexander Gordeev posted 2 patches 1 week, 1 day ago
[RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by Alexander Gordeev 1 week, 1 day ago
Lazy MMU mode is assumed to be context-independent, in the sense
that it does not need any additional information while operating.
However, the s390 architecture benefits from knowing the exact
page table entries being modified.

Introduce lazy_mmu_mode_enable_pte(), which is provided with the
process address space and the page table being operated on. This
information is required to enable s390-specific optimizations.

The function takes parameters that are typically passed to page-
table level walkers, which implies that the span of PTE entries
never crosses a page table boundary.

Architectures that do not require such information simply do not
need to define the arch_enter_lazy_mmu_mode_pte() callback.

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 fs/proc/task_mmu.c      |  2 +-
 include/linux/pgtable.h | 42 +++++++++++++++++++++++++++++++++++++++++
 mm/madvise.c            |  8 ++++----
 mm/memory.c             |  8 ++++----
 mm/mprotect.c           |  2 +-
 mm/mremap.c             |  2 +-
 mm/vmalloc.c            |  6 +++---
 7 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e091931d7ca1..4e3b1987874a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2752,7 +2752,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 		return 0;
 	}
 
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(vma->vm_mm, start, end, start_pte);
 
 	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
 		/* Fast path for performing exclusive WP */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a50df42a893f..481b45954800 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -271,6 +271,44 @@ static inline void lazy_mmu_mode_enable(void)
 		arch_enter_lazy_mmu_mode();
 }
 
+#ifndef arch_enter_lazy_mmu_mode_pte
+static inline void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
+						unsigned long addr,
+						unsigned long end,
+						pte_t *ptep)
+{
+	arch_enter_lazy_mmu_mode();
+}
+#endif
+
+/**
+ * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters
+ *
+ * Enters a new lazy MMU mode section; if the mode was not already enabled,
+ * enables it and calls arch_enter_lazy_mmu_mode_pte().
+ *
+ * Must be paired with a call to lazy_mmu_mode_disable().
+ *
+ * Has no effect if called:
+ * - While paused - see lazy_mmu_mode_pause()
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
+					    unsigned long addr,
+					    unsigned long end,
+					    pte_t *ptep)
+{
+	struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+	if (in_interrupt() || state->pause_count > 0)
+		return;
+
+	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
+
+	if (state->enable_count++ == 0)
+		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);
+}
+
 /**
  * lazy_mmu_mode_disable() - Disable the lazy MMU mode.
  *
@@ -353,6 +391,10 @@ static inline void lazy_mmu_mode_resume(void)
 }
 #else
 static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
+					    unsigned long addr,
+					    unsigned long end,
+					    pte_t *ptep) {}
 static inline void lazy_mmu_mode_disable(void) {}
 static inline void lazy_mmu_mode_pause(void) {}
 static inline void lazy_mmu_mode_resume(void) {}
diff --git a/mm/madvise.c b/mm/madvise.c
index dbb69400786d..02edc80f678b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -451,7 +451,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
 	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -506,7 +506,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				lazy_mmu_mode_enable();
+				lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
 				if (!err)
 					nr = 0;
 				continue;
@@ -673,7 +673,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
 	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -733,7 +733,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				lazy_mmu_mode_enable();
+				lazy_mmu_mode_enable_pte(mm, addr, end, pte);
 				if (!err)
 					nr = 0;
 				continue;
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..43fa9965fb5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1269,7 +1269,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 	orig_src_pte = src_pte;
 	orig_dst_pte = dst_pte;
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(src_mm, addr, end, src_pte);
 
 	do {
 		nr = 1;
@@ -1917,7 +1917,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		return addr;
 
 	flush_tlb_batched_pending(mm);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
 	do {
 		bool any_skipped = false;
 
@@ -2875,7 +2875,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, addr, end, mapped_pte);
 	do {
 		BUG_ON(!pte_none(ptep_get(pte)));
 		if (!pfn_modify_allowed(pfn, prot)) {
@@ -3235,7 +3235,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			return -EINVAL;
 	}
 
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, addr, end, mapped_pte);
 
 	if (fn) {
 		do {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..43a2a65b8caf 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 		is_private_single_threaded = vma_is_single_threaded_private(vma);
 
 	flush_tlb_batched_pending(vma->vm_mm);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(vma->vm_mm, addr, end, pte);
 	do {
 		nr_ptes = 1;
 		oldpte = ptep_get(pte);
diff --git a/mm/mremap.c b/mm/mremap.c
index 2be876a70cc0..ac7f649f3aad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	flush_tlb_batched_pending(vma->vm_mm);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(mm, old_addr, old_end, old_ptep);
 
 	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
 		new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 61caa55a4402..5e702bcf03fd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (!pte)
 		return -ENOMEM;
 
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
 
 	do {
 		if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	unsigned long size = PAGE_SIZE;
 
 	pte = pte_offset_kernel(pmd, addr);
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
 
 	do {
 #ifdef CONFIG_HUGETLB_PAGE
@@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!pte)
 		return -ENOMEM;
 
-	lazy_mmu_mode_enable();
+	lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
 
 	do {
 		struct page *page = pages[*nr];
-- 
2.51.0
Re: [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by David Hildenbrand (Arm) 1 week, 1 day ago
On 3/25/26 08:41, Alexander Gordeev wrote:
> Lazy MMU mode is assumed to be context-independent, in the sense
> that it does not need any additional information while operating.
> However, the s390 architecture benefits from knowing the exact
> page table entries being modified.
> 
> Introduce lazy_mmu_mode_enable_pte(), which is provided with the
> process address space and the page table being operated on. This
> information is required to enable s390-specific optimizations.
> 
> The function takes parameters that are typically passed to page-
> table level walkers, which implies that the span of PTE entries
> never crosses a page table boundary.
> 
> Architectures that do not require such information simply do not
> need to define the arch_enter_lazy_mmu_mode_pte() callback.
> 
> Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
> ---
>  fs/proc/task_mmu.c      |  2 +-
>  include/linux/pgtable.h | 42 +++++++++++++++++++++++++++++++++++++++++
>  mm/madvise.c            |  8 ++++----
>  mm/memory.c             |  8 ++++----
>  mm/mprotect.c           |  2 +-
>  mm/mremap.c             |  2 +-
>  mm/vmalloc.c            |  6 +++---
>  7 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index e091931d7ca1..4e3b1987874a 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2752,7 +2752,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>  		return 0;
>  	}
>  
> -	lazy_mmu_mode_enable();
> +	lazy_mmu_mode_enable_pte(vma->vm_mm, start, end, start_pte);
>  
>  	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
>  		/* Fast path for performing exclusive WP */
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index a50df42a893f..481b45954800 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -271,6 +271,44 @@ static inline void lazy_mmu_mode_enable(void)
>  		arch_enter_lazy_mmu_mode();
>  }
>  
> +#ifndef arch_enter_lazy_mmu_mode_pte
> +static inline void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
> +						unsigned long addr,
> +						unsigned long end,
> +						pte_t *ptep)

Two tab alignment please. (applies to other things here as well)

> +{
> +	arch_enter_lazy_mmu_mode();
> +}
> +#endif
> +
> +/**
> + * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters

You have to be a lot clearer about implications. For example, what
happens if we would bail out and not process all ptes? What are the
exact semantics.

> + *
> + * Enters a new lazy MMU mode section; if the mode was not already enabled,
> + * enables it and calls arch_enter_lazy_mmu_mode_pte().
> + *
> + * Must be paired with a call to lazy_mmu_mode_disable().
> + *
> + * Has no effect if called:
> + * - While paused - see lazy_mmu_mode_pause()
> + * - In interrupt context
> + */
> +static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
> +					    unsigned long addr,
> +					    unsigned long end,
> +					    pte_t *ptep)

It can be multiple ptes, so should this be some kind of "pte_range"?

lazy_mmu_mode_enable_for_pte_range()

A bit mouthful but clearer.

> +{
> +	struct lazy_mmu_state *state = &current->lazy_mmu_state;
> +
> +	if (in_interrupt() || state->pause_count > 0)
> +		return;
> +
> +	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
> +
> +	if (state->enable_count++ == 0)
> +		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);
> +}

I'm wondering whether that could instead be some optional interface that
we trigger after the lazy_mmu_mode_enable. But looking at
lazy_mmu_mode_enable() users, there don't seem to be cases where we
would process multiple different ranges under a single enable() call, right?

-- 
Cheers,

David
Re: [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by Alexander Gordeev 1 week, 1 day ago
On Wed, Mar 25, 2026 at 10:55:23AM +0100, David Hildenbrand (Arm) wrote:

Hi David,

> > +/**
> > + * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters
> 
> You have to be a lot clearer about implications. For example, what
> happens if we would bail out and not process all ptes? What are the
> exact semantics.

The only implication is "only this address/PTE range could be updated
and that range may span one page table at most".

Whether all or a portion of the PTEs were actually updated is not defined,
just like in the case of lazy_mmu_mode_enable().

Makes sense?

> > + * Enters a new lazy MMU mode section; if the mode was not already enabled,
> > + * enables it and calls arch_enter_lazy_mmu_mode_pte().
> > + *
> > + * Must be paired with a call to lazy_mmu_mode_disable().
> > + *
> > + * Has no effect if called:
> > + * - While paused - see lazy_mmu_mode_pause()
> > + * - In interrupt context
> > + */
> > +static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
> > +					    unsigned long addr,
> > +					    unsigned long end,
> > +					    pte_t *ptep)
> 
> It can be multiple ptes, so should this be some kind of "pte_range"/
> 
> lazy_mmu_mode_enable_for_pte_range()
> 
> A bit mouthful but clearer.
> 
> > +{
> > +	struct lazy_mmu_state *state = &current->lazy_mmu_state;
> > +
> > +	if (in_interrupt() || state->pause_count > 0)
> > +		return;
> > +
> > +	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
> > +
> > +	if (state->enable_count++ == 0)
> > +		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);

I will also change arch_enter_lazy_mmu_mode_pte() to
arch_enter_lazy_mmu_mode_for_pte_range() then.

> > +}
> 
> I'm wondering whether that could instead be some optional interface that
> we trigger after the lazy_mmu_mode_enable. But looking at

To me just two separate and (as you put it) mouthful names appeal better
than an optional follow-up interface.

> lazy_mmu_mode_enable() users, there don't seem to be cases where we
> would process multiple different ranges under a single enable() call, right?

Multiple different ranges still could be processed, but then one should
continue using arch_enter_lazy_mmu_mode(). E.g. these were less obvious
than traditional walkers and left them intact:

	mm/migrate_device.c
	mm/tests/lazy_mmu_mode_kunit.c
	mm/userfaultfd.c
	mm/vmscan.c

> -- 
> Cheers,
> 
> David

Thanks for the quick review!
Re: [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by David Hildenbrand (Arm) 2 days, 5 hours ago
On 3/25/26 17:20, Alexander Gordeev wrote:
> On Wed, Mar 25, 2026 at 10:55:23AM +0100, David Hildenbrand (Arm) wrote:
> 
> Hi David,
> 
>>> +/**
>>> + * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters
>>
>> You have to be a lot clearer about implications. For example, what
>> happens if we would bail out and not process all ptes? What are the
>> exact semantics.
> 
> The only implication is "only this address/PTE range could be updated
> and that range may span one page table at most".

Probably phrase it stronger. "No ptes outside of this range must be
updated" etc.

> 
> Whether all or portion of PTEs were actually updated is not defined,
> just like in case of lazy_mmu_mode_enable_pte().

Okay, then let's document that.

> 
> Makes sense?
> 

Yes.

>>> + * Enters a new lazy MMU mode section; if the mode was not already enabled,
>>> + * enables it and calls arch_enter_lazy_mmu_mode_pte().
>>> + *
>>> + * Must be paired with a call to lazy_mmu_mode_disable().
>>> + *
>>> + * Has no effect if called:
>>> + * - While paused - see lazy_mmu_mode_pause()
>>> + * - In interrupt context
>>> + */
>>> +static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
>>> +					    unsigned long addr,
>>> +					    unsigned long end,
>>> +					    pte_t *ptep)
>>
>> It can be multiple ptes, so should this be some kind of "pte_range"/
>>
>> lazy_mmu_mode_enable_for_pte_range()
>>
>> A bit mouthful but clearer.
>>
>>> +{
>>> +	struct lazy_mmu_state *state = &current->lazy_mmu_state;
>>> +
>>> +	if (in_interrupt() || state->pause_count > 0)
>>> +		return;
>>> +
>>> +	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
>>> +
>>> +	if (state->enable_count++ == 0)
>>> +		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);
> 
> I will also change arch_enter_lazy_mmu_mode_pte() to
> arch_enter_lazy_mmu_mode_for_pte_range() then.
> 
>>> +}
>>
>> I'm wondering whether that could instead be some optional interface that
>> we trigger after the lazy_mmu_mode_enable. But looking at
> 
> To me just two separate and (as you put it) mouthful names appeal better
> than an optional follow-up interface.

Yes, probably better.

-- 
Cheers,

David
Re: [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by Kevin Brodsky 2 days, 12 hours ago
On 25/03/2026 17:20, Alexander Gordeev wrote:
> On Wed, Mar 25, 2026 at 10:55:23AM +0100, David Hildenbrand (Arm) wrote:
>
> Hi David,
>
>>> +/**
>>> + * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters
>> You have to be a lot clearer about implications. For example, what
>> happens if we would bail out and not process all ptes? What are the
>> exact semantics.
> The only implication is "only this address/PTE range could be updated
> and that range may span one page table at most".
>
> Whether all or portion of PTEs were actually updated is not defined,
> just like in case of lazy_mmu_mode_enable_pte().
>
> Makes sense?

I also feel that the comment needs to be much more specific. From a
brief glance at patch 2, it seems that __ipte_batch_set_pte() assumes
that all PTEs processed after this function is called are contiguous.
This should be documented.

>>> + * Enters a new lazy MMU mode section; if the mode was not already enabled,
>>> + * enables it and calls arch_enter_lazy_mmu_mode_pte().
>>> + *
>>> + * Must be paired with a call to lazy_mmu_mode_disable().
>>> + *
>>> + * Has no effect if called:
>>> + * - While paused - see lazy_mmu_mode_pause()
>>> + * - In interrupt context
>>> + */
>>> +static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
>>> +					    unsigned long addr,
>>> +					    unsigned long end,
>>> +					    pte_t *ptep)
>> It can be multiple ptes, so should this be some kind of "pte_range"/
>>
>> lazy_mmu_mode_enable_for_pte_range()
>>
>> A bit mouthful but clearer.
>>
>>> +{
>>> +	struct lazy_mmu_state *state = &current->lazy_mmu_state;
>>> +
>>> +	if (in_interrupt() || state->pause_count > 0)
>>> +		return;
>>> +
>>> +	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
>>> +
>>> +	if (state->enable_count++ == 0)
>>> +		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);
> I will also change arch_enter_lazy_mmu_mode_pte() to
> arch_enter_lazy_mmu_mode_for_pte_range() then.

Makes sense. The interface looks reasonable to me with this new name.

One more comment though: in previous discussions you mentioned the need
for arch_{pause,resume} hooks, is that no longer necessary simply
because {pause,resume} are not used on the paths where you make use of
the new enable function?

- Kevin
Re: [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
Posted by Alexander Gordeev 1 week, 1 day ago
> Multiple different ranges still could be processed, but then one should
> continue using arch_enter_lazy_mmu_mode().
                 lazy_mmu_mode_enable()