[PATCH v2 2/7] mm: introduce local state for lazy_mmu sections

Posted by Kevin Brodsky 5 days, 21 hours ago
arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
(taking and returning no value). This is proving problematic in
situations where leave() needs to restore some context back to its
original state (before enter() was called). In particular, this
makes it difficult to support the nesting of lazy_mmu sections -
leave() does not know whether the matching enter() call occurred
while lazy_mmu was already enabled, and whether to disable it or
not.

This patch gives all architectures the chance to store local state
while inside a lazy_mmu section by making enter() return some value,
storing it in a local variable, and having leave() take that value.
That value is typed lazy_mmu_state_t - each architecture defining
__HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
For now we define it as int everywhere, which is sufficient to
support nesting.
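
For illustration only (not part of this patch): an architecture with a
TIF-based implementation, such as arm64, could eventually use the
returned state along these lines, where flush_lazy_mmu_pending() is a
placeholder for the arch-specific teardown:

static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
{
        if (test_and_set_thread_flag(TIF_LAZY_MMU))
                return LAZY_MMU_NESTED;  /* an outer section is already active */

        return LAZY_MMU_DEFAULT;
}

static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
{
        if (state == LAZY_MMU_NESTED)
                return;  /* the outer section handles teardown */

        flush_lazy_mmu_pending();  /* placeholder for arch-specific teardown */
        clear_thread_flag(TIF_LAZY_MMU);
}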

The diff is unfortunately rather large as all the API changes need
to be done atomically. Main parts:

* Changing the prototypes of arch_{enter,leave}_lazy_mmu_mode()
  in generic and arch code, and introducing lazy_mmu_state_t.

* Introducing LAZY_MMU_{DEFAULT,NESTED} for future support of
  nesting. enter() always returns LAZY_MMU_DEFAULT for now.
  (linux/mm_types.h is not the most natural location for defining
  those constants, but there is no other obvious header that is
  accessible where architectures implement the helpers.)

* Changing all lazy_mmu sections to introduce a lazy_mmu_state
  local variable, having enter() set it and leave() take it. Most of
  these changes were generated using the following Coccinelle script:

@@
@@
{
+ lazy_mmu_state_t lazy_mmu_state;
...
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_state = arch_enter_lazy_mmu_mode();
...
- arch_leave_lazy_mmu_mode();
+ arch_leave_lazy_mmu_mode(lazy_mmu_state);
...
}

* In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
  lazy_mmu is already enabled, and it temporarily disables it by
  calling leave() and then enter() again. Here we want to ensure
  that any operation between the leave() and enter() calls is
  completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
  leave() to fully disable lazy_mmu. enter() will then re-enable it
  - this achieves the expected behaviour, whether nesting occurred
  before that function was called or not.
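
  For instance, the resulting pattern in xen_flush_lazy_mmu() (see the
  corresponding hunk below) is:

  if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
          /* flush all pending updates and fully disable lazy_mmu */
          arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
          /* re-enable it; the returned state is not needed here */
          arch_enter_lazy_mmu_mode();
  }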

Note: it is difficult to provide a default definition of
lazy_mmu_state_t for architectures implementing lazy_mmu, because
that definition would need to be available in
arch/x86/include/asm/paravirt_types.h and adding a new generic
 #include there is very tricky due to the existing header soup.

Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
 arch/arm64/include/asm/pgtable.h              | 10 +++++++---
 .../include/asm/book3s/64/tlbflush-hash.h     |  9 ++++++---
 arch/powerpc/mm/book3s64/hash_tlb.c           | 10 ++++++----
 arch/powerpc/mm/book3s64/subpage_prot.c       |  5 +++--
 arch/sparc/include/asm/tlbflush_64.h          |  5 +++--
 arch/sparc/mm/tlb.c                           |  6 ++++--
 arch/x86/include/asm/paravirt.h               |  6 ++++--
 arch/x86/include/asm/paravirt_types.h         |  2 ++
 arch/x86/xen/enlighten_pv.c                   |  2 +-
 arch/x86/xen/mmu_pv.c                         |  2 +-
 fs/proc/task_mmu.c                            |  5 +++--
 include/linux/mm_types.h                      |  3 +++
 include/linux/pgtable.h                       |  6 ++++--
 mm/kasan/shadow.c                             |  4 ++--
 mm/madvise.c                                  | 20 ++++++++++---------
 mm/memory.c                                   | 20 +++++++++++--------
 mm/migrate_device.c                           |  5 +++--
 mm/mprotect.c                                 |  5 +++--
 mm/mremap.c                                   |  5 +++--
 mm/userfaultfd.c                              |  5 +++--
 mm/vmalloc.c                                  | 15 ++++++++------
 mm/vmscan.c                                   | 15 ++++++++------
 22 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 728d7b6ed20a..816197d08165 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -81,7 +81,9 @@ static inline void queue_pte_barriers(void)
 }
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-static inline void arch_enter_lazy_mmu_mode(void)
+typedef int lazy_mmu_state_t;
+
+static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
 {
 	/*
 	 * lazy_mmu_mode is not supposed to permit nesting. But in practice this
@@ -96,12 +98,14 @@ static inline void arch_enter_lazy_mmu_mode(void)
 	 */
 
 	if (in_interrupt())
-		return;
+		return LAZY_MMU_DEFAULT;
 
 	set_thread_flag(TIF_LAZY_MMU);
+
+	return LAZY_MMU_DEFAULT;
 }
 
-static inline void arch_leave_lazy_mmu_mode(void)
+static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
 {
 	if (in_interrupt())
 		return;
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 176d7fd79eeb..c9f1e819e567 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -25,13 +25,14 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
 
 #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+typedef int lazy_mmu_state_t;
 
-static inline void arch_enter_lazy_mmu_mode(void)
+static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
 {
 	struct ppc64_tlb_batch *batch;
 
 	if (radix_enabled())
-		return;
+		return LAZY_MMU_DEFAULT;
 	/*
 	 * apply_to_page_range can call us this preempt enabled when
 	 * operating on kernel page tables.
@@ -39,9 +40,11 @@ static inline void arch_enter_lazy_mmu_mode(void)
 	preempt_disable();
 	batch = this_cpu_ptr(&ppc64_tlb_batch);
 	batch->active = 1;
+
+	return LAZY_MMU_DEFAULT;
 }
 
-static inline void arch_leave_lazy_mmu_mode(void)
+static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
 {
 	struct ppc64_tlb_batch *batch;
 
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 21fcad97ae80..ee664f88e679 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -189,6 +189,7 @@ void hash__tlb_flush(struct mmu_gather *tlb)
  */
 void __flush_hash_table_range(unsigned long start, unsigned long end)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int hugepage_shift;
 	unsigned long flags;
 
@@ -205,7 +206,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
 	 * way to do things but is fine for our needs here.
 	 */
 	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
 		pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
 		unsigned long pte;
@@ -217,12 +218,13 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
 			continue;
 		hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
 	}
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	local_irq_restore(flags);
 }
 
 void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte;
 	pte_t *start_pte;
 	unsigned long flags;
@@ -237,7 +239,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long
 	 * way to do things but is fine for our needs here.
 	 */
 	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	start_pte = pte_offset_map(pmd, addr);
 	if (!start_pte)
 		goto out;
@@ -249,6 +251,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long
 	}
 	pte_unmap(start_pte);
 out:
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	local_irq_restore(flags);
 }
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index ec98e526167e..4720f9f321af 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -53,6 +53,7 @@ void subpage_prot_free(struct mm_struct *mm)
 static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
@@ -73,13 +74,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return;
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	for (; npages > 0; --npages) {
 		pte_update(mm, addr, pte, 0, 0, 0);
 		addr += PAGE_SIZE;
 		++pte;
 	}
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(pte - 1, ptl);
 }
 
diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h
index cd144eb31bdd..02c93a4e6af5 100644
--- a/arch/sparc/include/asm/tlbflush_64.h
+++ b/arch/sparc/include/asm/tlbflush_64.h
@@ -40,10 +40,11 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+typedef int lazy_mmu_state_t;
 
 void flush_tlb_pending(void);
-void arch_enter_lazy_mmu_mode(void);
-void arch_leave_lazy_mmu_mode(void);
+lazy_mmu_state_t arch_enter_lazy_mmu_mode(void);
+void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state);
 
 /* Local cpu only.  */
 void __flush_tlb_all(void);
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index a35ddcca5e76..bf5094b770af 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -50,16 +50,18 @@ void flush_tlb_pending(void)
 	put_cpu_var(tlb_batch);
 }
 
-void arch_enter_lazy_mmu_mode(void)
+lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
 {
 	struct tlb_batch *tb;
 
 	preempt_disable();
 	tb = this_cpu_ptr(&tlb_batch);
 	tb->active = 1;
+
+	return LAZY_MMU_DEFAULT;
 }
 
-void arch_leave_lazy_mmu_mode(void)
+void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
 {
 	struct tlb_batch *tb = this_cpu_ptr(&tlb_batch);
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index b5e59a7ba0d0..65a0d394fba1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -527,12 +527,14 @@ static inline void arch_end_context_switch(struct task_struct *next)
 }
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-static inline void arch_enter_lazy_mmu_mode(void)
+static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
 {
 	PVOP_VCALL0(mmu.lazy_mode.enter);
+
+	return LAZY_MMU_DEFAULT;
 }
 
-static inline void arch_leave_lazy_mmu_mode(void)
+static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
 {
 	PVOP_VCALL0(mmu.lazy_mode.leave);
 }
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 37a8627d8277..bc1af86868a3 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -41,6 +41,8 @@ struct pv_info {
 };
 
 #ifdef CONFIG_PARAVIRT_XXL
+typedef int lazy_mmu_state_t;
+
 struct pv_lazy_ops {
 	/* Set deferred update mode, used for batching operations. */
 	void (*enter)(void);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 26bbaf4b7330..a245ba47a631 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -426,7 +426,7 @@ static void xen_start_context_switch(struct task_struct *prev)
 	BUG_ON(preemptible());
 
 	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
-		arch_leave_lazy_mmu_mode();
+		arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
 		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
 	}
 	enter_lazy(XEN_LAZY_CPU);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2a4a8deaf612..2039d5132ca3 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2140,7 +2140,7 @@ static void xen_flush_lazy_mmu(void)
 	preempt_disable();
 
 	if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
-		arch_leave_lazy_mmu_mode();
+		arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
 		arch_enter_lazy_mmu_mode();
 	}
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ced01cf3c5ab..02aa55f83bae 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2682,6 +2682,7 @@ static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 				  unsigned long end, struct mm_walk *walk)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	struct pagemap_scan_private *p = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long addr, flush_end = 0;
@@ -2700,7 +2701,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 		return 0;
 	}
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
 		/* Fast path for performing exclusive WP */
@@ -2770,7 +2771,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 	if (flush_end)
 		flush_tlb_range(vma, start, addr);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(start_pte, ptl);
 
 	cond_resched();
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 275e8060d918..143d819c1386 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1489,6 +1489,9 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
+#define LAZY_MMU_DEFAULT	0
+#define LAZY_MMU_NESTED		1
+
 struct vm_fault;
 
 /**
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8d6007123cdf..df0eb898b3fc 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -232,8 +232,10 @@ static inline int pmd_dirty(pmd_t pmd)
  * and the mode cannot be used in interrupt context.
  */
 #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-#define arch_enter_lazy_mmu_mode()	do {} while (0)
-#define arch_leave_lazy_mmu_mode()	do {} while (0)
+typedef int lazy_mmu_state_t;
+
+#define arch_enter_lazy_mmu_mode()	(LAZY_MMU_DEFAULT)
+#define arch_leave_lazy_mmu_mode(state)	((void)(state))
 #endif
 
 #ifndef pte_batch_hint
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 5d2a876035d6..60b1b72f5ce1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
 	pte_t pte;
 	int index;
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
 
 	index = PFN_DOWN(addr - data->start);
 	page = data->pages[index];
@@ -482,7 +482,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
 	pte_t pte;
 	int none;
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
 
 	spin_lock(&init_mm.page_table_lock);
 	pte = ptep_get(ptep);
diff --git a/mm/madvise.c b/mm/madvise.c
index 35ed4ab0d7c5..72c032f2cf56 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -357,6 +357,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	struct madvise_walk_private *private = walk->private;
 	struct mmu_gather *tlb = private->tlb;
 	bool pageout = private->pageout;
@@ -455,7 +456,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -463,7 +464,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		if (++batch_count == SWAP_CLUSTER_MAX) {
 			batch_count = 0;
 			if (need_resched()) {
-				arch_leave_lazy_mmu_mode();
+				arch_leave_lazy_mmu_mode(lazy_mmu_state);
 				pte_unmap_unlock(start_pte, ptl);
 				cond_resched();
 				goto restart;
@@ -499,7 +500,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				if (!folio_trylock(folio))
 					continue;
 				folio_get(folio);
-				arch_leave_lazy_mmu_mode();
+				arch_leave_lazy_mmu_mode(lazy_mmu_state);
 				pte_unmap_unlock(start_pte, ptl);
 				start_pte = NULL;
 				err = split_folio(folio);
@@ -510,7 +511,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				arch_enter_lazy_mmu_mode();
+				lazy_mmu_state = arch_enter_lazy_mmu_mode();
 				if (!err)
 					nr = 0;
 				continue;
@@ -558,7 +559,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	}
 
 	if (start_pte) {
-		arch_leave_lazy_mmu_mode();
+		arch_leave_lazy_mmu_mode(lazy_mmu_state);
 		pte_unmap_unlock(start_pte, ptl);
 	}
 	if (pageout)
@@ -657,6 +658,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 {
 	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
+	lazy_mmu_state_t lazy_mmu_state;
 	struct mmu_gather *tlb = walk->private;
 	struct mm_struct *mm = tlb->mm;
 	struct vm_area_struct *vma = walk->vma;
@@ -677,7 +679,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -727,7 +729,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				if (!folio_trylock(folio))
 					continue;
 				folio_get(folio);
-				arch_leave_lazy_mmu_mode();
+				arch_leave_lazy_mmu_mode(lazy_mmu_state);
 				pte_unmap_unlock(start_pte, ptl);
 				start_pte = NULL;
 				err = split_folio(folio);
@@ -738,7 +740,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				arch_enter_lazy_mmu_mode();
+				lazy_mmu_state = arch_enter_lazy_mmu_mode();
 				if (!err)
 					nr = 0;
 				continue;
@@ -778,7 +780,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (nr_swap)
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 	if (start_pte) {
-		arch_leave_lazy_mmu_mode();
+		arch_leave_lazy_mmu_mode(lazy_mmu_state);
 		pte_unmap_unlock(start_pte, ptl);
 	}
 	cond_resched();
diff --git a/mm/memory.c b/mm/memory.c
index d9de6c056179..a60aae069f1e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1207,6 +1207,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 	       unsigned long end)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	struct mm_struct *src_mm = src_vma->vm_mm;
 	pte_t *orig_src_pte, *orig_dst_pte;
@@ -1254,7 +1255,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 	orig_src_pte = src_pte;
 	orig_dst_pte = dst_pte;
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	do {
 		nr = 1;
@@ -1323,7 +1324,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
 		 addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(orig_src_pte, src_ptl);
 	add_mm_rss_vec(dst_mm, rss);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
@@ -1822,6 +1823,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	bool force_flush = false, force_break = false;
 	struct mm_struct *mm = tlb->mm;
 	int rss[NR_MM_COUNTERS];
@@ -1842,7 +1844,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		return addr;
 
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	do {
 		bool any_skipped = false;
 
@@ -1874,7 +1876,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
 
 	add_mm_rss_vec(mm, rss);
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 
 	/* Do the actual TLB flush before dropping ptl */
 	if (force_flush) {
@@ -2811,6 +2813,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned long pfn, pgprot_t prot)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte, *mapped_pte;
 	spinlock_t *ptl;
 	int err = 0;
@@ -2818,7 +2821,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(ptep_get(pte)));
 		if (!pfn_modify_allowed(pfn, prot)) {
@@ -2828,7 +2831,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(mapped_pte, ptl);
 	return err;
 }
@@ -3117,6 +3120,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     pte_fn_t fn, void *data, bool create,
 				     pgtbl_mod_mask *mask)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte, *mapped_pte;
 	int err = 0;
 	spinlock_t *ptl;
@@ -3135,7 +3139,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			return -EINVAL;
 	}
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	if (fn) {
 		do {
@@ -3148,7 +3152,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	}
 	*mask |= PGTBL_PTE_MODIFIED;
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 
 	if (mm != &init_mm)
 		pte_unmap_unlock(mapped_pte, ptl);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..833ce5eafa40 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -59,6 +59,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 				   unsigned long end,
 				   struct mm_walk *walk)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	struct migrate_vma *migrate = walk->private;
 	struct folio *fault_folio = migrate->fault_page ?
 		page_folio(migrate->fault_page) : NULL;
@@ -110,7 +111,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	if (!ptep)
 		goto again;
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
 		struct dev_pagemap *pgmap;
@@ -287,7 +288,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	if (unmapped)
 		flush_tlb_range(walk->vma, start, end);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(ptep - 1, ptl);
 
 	return 0;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 113b48985834..7bba651e5aa3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -273,6 +273,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	long pages = 0;
@@ -293,7 +294,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 		target_node = numa_node_id();
 
 	flush_tlb_batched_pending(vma->vm_mm);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 	do {
 		nr_ptes = 1;
 		oldpte = ptep_get(pte);
@@ -439,7 +440,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 			}
 		}
 	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(pte - 1, ptl);
 
 	return pages;
diff --git a/mm/mremap.c b/mm/mremap.c
index 35de0a7b910e..a562d8cf1eee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -193,6 +193,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
 static int move_ptes(struct pagetable_move_control *pmc,
 		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	struct vm_area_struct *vma = pmc->old;
 	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
 	struct mm_struct *mm = vma->vm_mm;
@@ -256,7 +257,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	flush_tlb_batched_pending(vma->vm_mm);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
 		new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
@@ -301,7 +302,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 		}
 	}
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	if (force_flush)
 		flush_tlb_range(vma, old_end - len, old_end);
 	if (new_ptl != old_ptl)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 50aaa8dcd24c..6ee71ba68b12 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1076,6 +1076,7 @@ static long move_present_ptes(struct mm_struct *mm,
 			      struct folio **first_src_folio, unsigned long len,
 			      struct anon_vma *src_anon_vma)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int err = 0;
 	struct folio *src_folio = *first_src_folio;
 	unsigned long src_start = src_addr;
@@ -1100,7 +1101,7 @@ static long move_present_ptes(struct mm_struct *mm,
 	/* It's safe to drop the reference now as the page-table is holding one. */
 	folio_put(*first_src_folio);
 	*first_src_folio = NULL;
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	while (true) {
 		orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1138,7 +1139,7 @@ static long move_present_ptes(struct mm_struct *mm,
 			break;
 	}
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	if (src_addr > src_start)
 		flush_tlb_range(src_vma, src_start, src_addr);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4249e1e01947..9fc86ddf1711 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -95,6 +95,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
 			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte;
 	u64 pfn;
 	struct page *page;
@@ -105,7 +106,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	do {
 		if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -131,7 +132,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		pfn++;
 	} while (pte += PFN_DOWN(size), addr += size, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	*mask |= PGTBL_PTE_MODIFIED;
 	return 0;
 }
@@ -354,12 +355,13 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     pgtbl_mod_mask *mask)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	pte_t *pte;
 	pte_t ptent;
 	unsigned long size = PAGE_SIZE;
 
 	pte = pte_offset_kernel(pmd, addr);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	do {
 #ifdef CONFIG_HUGETLB_PAGE
@@ -378,7 +380,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
 	} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	*mask |= PGTBL_PTE_MODIFIED;
 }
 
@@ -514,6 +516,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int err = 0;
 	pte_t *pte;
 
@@ -526,7 +529,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	do {
 		struct page *page = pages[*nr];
@@ -548,7 +551,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		(*nr)++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	*mask |= PGTBL_PTE_MODIFIED;
 
 	return err;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca9e1cd3cd68..2872497a0453 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3514,6 +3514,7 @@ static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 			   struct mm_walk *args)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int i;
 	bool dirty;
 	pte_t *pte;
@@ -3543,7 +3544,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		return false;
 	}
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 restart:
 	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
 		unsigned long pfn;
@@ -3584,7 +3585,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
 		goto restart;
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	pte_unmap_unlock(pte, ptl);
 
 	return suitable_to_scan(total, young);
@@ -3593,6 +3594,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
 				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int i;
 	bool dirty;
 	pmd_t *pmd;
@@ -3625,7 +3627,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
 	if (!spin_trylock(ptl))
 		goto done;
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	do {
 		unsigned long pfn;
@@ -3672,7 +3674,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
 
 	walk_update_folio(walk, last, gen, dirty);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 	spin_unlock(ptl);
 done:
 	*first = -1;
@@ -4220,6 +4222,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  */
 bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+	lazy_mmu_state_t lazy_mmu_state;
 	int i;
 	bool dirty;
 	unsigned long start;
@@ -4271,7 +4274,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		}
 	}
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_state = arch_enter_lazy_mmu_mode();
 
 	pte -= (addr - start) / PAGE_SIZE;
 
@@ -4305,7 +4308,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	walk_update_folio(walk, last, gen, dirty);
 
-	arch_leave_lazy_mmu_mode();
+	arch_leave_lazy_mmu_mode(lazy_mmu_state);
 
 	/* feedback from rmap walkers to page table walkers */
 	if (mm_state && suitable_to_scan(i, young))
-- 
2.47.0
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Jürgen Groß 4 days, 19 hours ago
On 08.09.25 09:39, Kevin Brodsky wrote:
> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> (taking and returning no value). This is proving problematic in
> situations where leave() needs to restore some context back to its
> original state (before enter() was called). In particular, this
> makes it difficult to support the nesting of lazy_mmu sections -
> leave() does not know whether the matching enter() call occurred
> while lazy_mmu was already enabled, and whether to disable it or
> not.
> 
> This patch gives all architectures the chance to store local state
> while inside a lazy_mmu section by making enter() return some value,
> storing it in a local variable, and having leave() take that value.
> That value is typed lazy_mmu_state_t - each architecture defining
> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> For now we define it as int everywhere, which is sufficient to
> support nesting.
> 
> The diff is unfortunately rather large as all the API changes need
> to be done atomically. Main parts:
> 
> * Changing the prototypes of arch_{enter,leave}_lazy_mmu_mode()
>    in generic and arch code, and introducing lazy_mmu_state_t.
> 
> * Introducing LAZY_MMU_{DEFAULT,NESTED} for future support of
>    nesting. enter() always returns LAZY_MMU_DEFAULT for now.
>    (linux/mm_types.h is not the most natural location for defining
>    those constants, but there is no other obvious header that is
>    accessible where arch's implement the helpers.)
> 
> * Changing all lazy_mmu sections to introduce a lazy_mmu_state
>    local variable, having enter() set it and leave() take it. Most of
>    these changes were generated using the following Coccinelle script:
> 
> @@
> @@
> {
> + lazy_mmu_state_t lazy_mmu_state;
> ...
> - arch_enter_lazy_mmu_mode();
> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> ...
> - arch_leave_lazy_mmu_mode();
> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> ...
> }
> 
> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>    lazy_mmu is already enabled, and it temporarily disables it by
>    calling leave() and then enter() again. Here we want to ensure
>    that any operation between the leave() and enter() calls is
>    completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>    leave() to fully disable lazy_mmu. enter() will then re-enable it
>    - this achieves the expected behaviour, whether nesting occurred
>    before that function was called or not.
> 
> Note: it is difficult to provide a default definition of
> lazy_mmu_state_t for architectures implementing lazy_mmu, because
> that definition would need to be available in
> arch/x86/include/asm/paravirt_types.h and adding a new generic
>   #include there is very tricky due to the existing header soup.
> 
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>

Reviewed-by: Juergen Gross <jgross@suse.com> # arch/x86


Juergen
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 4 days, 20 hours ago
On 08.09.25 09:39, Kevin Brodsky wrote:
> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> (taking and returning no value). This is proving problematic in
> situations where leave() needs to restore some context back to its
> original state (before enter() was called). In particular, this
> makes it difficult to support the nesting of lazy_mmu sections -
> leave() does not know whether the matching enter() call occurred
> while lazy_mmu was already enabled, and whether to disable it or
> not.
> 
> This patch gives all architectures the chance to store local state
> while inside a lazy_mmu section by making enter() return some value,
> storing it in a local variable, and having leave() take that value.
> That value is typed lazy_mmu_state_t - each architecture defining
> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> For now we define it as int everywhere, which is sufficient to
> support nesting.
> 
> The diff is unfortunately rather large as all the API changes need
> to be done atomically. Main parts:
> 
> * Changing the prototypes of arch_{enter,leave}_lazy_mmu_mode()
>    in generic and arch code, and introducing lazy_mmu_state_t.
> 
> * Introducing LAZY_MMU_{DEFAULT,NESTED} for future support of
>    nesting. enter() always returns LAZY_MMU_DEFAULT for now.
>    (linux/mm_types.h is not the most natural location for defining
>    those constants, but there is no other obvious header that is
>    accessible where arch's implement the helpers.)
> 
> * Changing all lazy_mmu sections to introduce a lazy_mmu_state
>    local variable, having enter() set it and leave() take it. Most of
>    these changes were generated using the following Coccinelle script:
> 
> @@
> @@
> {
> + lazy_mmu_state_t lazy_mmu_state;
> ...
> - arch_enter_lazy_mmu_mode();
> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> ...
> - arch_leave_lazy_mmu_mode();
> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> ...
> }
> 
> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>    lazy_mmu is already enabled, and it temporarily disables it by
>    calling leave() and then enter() again. Here we want to ensure
>    that any operation between the leave() and enter() calls is
>    completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>    leave() to fully disable lazy_mmu. enter() will then re-enable it
>    - this achieves the expected behaviour, whether nesting occurred
>    before that function was called or not.
> 
> Note: it is difficult to provide a default definition of
> lazy_mmu_state_t for architectures implementing lazy_mmu, because
> that definition would need to be available in
> arch/x86/include/asm/paravirt_types.h and adding a new generic
>   #include there is very tricky due to the existing header soup.

Yeah, I was wondering about exactly that.

In particular because LAZY_MMU_DEFAULT etc. resides somewhere completely
different.

Which raises the question: is using a new type really of any benefit here?

Can't we just use an "enum lazy_mmu_state" and call it a day?
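
i.e. something like (where exactly it would live aside):

enum lazy_mmu_state {
        LAZY_MMU_DEFAULT,
        LAZY_MMU_NESTED,
};

with arch_enter_lazy_mmu_mode() returning enum lazy_mmu_state directly.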

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Juergen Gross 4 days, 18 hours ago
On 09.09.25 11:07, David Hildenbrand wrote:
> On 08.09.25 09:39, Kevin Brodsky wrote:
>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>> (taking and returning no value). This is proving problematic in
>> situations where leave() needs to restore some context back to its
>> original state (before enter() was called). In particular, this
>> makes it difficult to support the nesting of lazy_mmu sections -
>> leave() does not know whether the matching enter() call occurred
>> while lazy_mmu was already enabled, and whether to disable it or
>> not.
>>
>> This patch gives all architectures the chance to store local state
>> while inside a lazy_mmu section by making enter() return some value,
>> storing it in a local variable, and having leave() take that value.
>> That value is typed lazy_mmu_state_t - each architecture defining
>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>> For now we define it as int everywhere, which is sufficient to
>> support nesting.
>>
>> The diff is unfortunately rather large as all the API changes need
>> to be done atomically. Main parts:
>>
>> * Changing the prototypes of arch_{enter,leave}_lazy_mmu_mode()
>>    in generic and arch code, and introducing lazy_mmu_state_t.
>>
>> * Introducing LAZY_MMU_{DEFAULT,NESTED} for future support of
>>    nesting. enter() always returns LAZY_MMU_DEFAULT for now.
>>    (linux/mm_types.h is not the most natural location for defining
>>    those constants, but there is no other obvious header that is
>>    accessible where arch's implement the helpers.)
>>
>> * Changing all lazy_mmu sections to introduce a lazy_mmu_state
>>    local variable, having enter() set it and leave() take it. Most of
>>    these changes were generated using the following Coccinelle script:
>>
>> @@
>> @@
>> {
>> + lazy_mmu_state_t lazy_mmu_state;
>> ...
>> - arch_enter_lazy_mmu_mode();
>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
>> ...
>> - arch_leave_lazy_mmu_mode();
>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
>> ...
>> }
>>
>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>>    lazy_mmu is already enabled, and it temporarily disables it by
>>    calling leave() and then enter() again. Here we want to ensure
>>    that any operation between the leave() and enter() calls is
>>    completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>>    leave() to fully disable lazy_mmu. enter() will then re-enable it
>>    - this achieves the expected behaviour, whether nesting occurred
>>    before that function was called or not.
>>
>> Note: it is difficult to provide a default definition of
>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>> that definition would need to be available in
>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>   #include there is very tricky due to the existing header soup.
> 
> Yeah, I was wondering about exactly that.
> 
> In particular because LAZY_MMU_DEFAULT etc resides somewehere compeltely different.
> 
> Which raises the question: is using a new type really of any benefit here?
> 
> Can't we just use an "enum lazy_mmu_state" and call it a day?
> 

The comment about the "header soup" made me look into this problem:

It seems some of the "#include <asm/paravirt.h>" instances in the code
base can just be dropped.

For the remaining cases I'd like to suggest a reorg of the related headers:
Instead of having the non-paravirt definition in one header and the paravirt
ones in paravirt.h, maybe it would be better to have only the paravirt
generic definitions in paravirt.h and the specific functions in the header
defining the non-paravirt variant. This would probably resolve the problem
with the "soup", as paravirt.h wouldn't rely on so many other headers.

I'm just preparing a patch that removes the unneeded includes, but
I'd be willing to address the disentangling as noted above.

Thoughts?


Juergen
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 4 days, 15 hours ago
On 09/09/2025 12:57, Juergen Gross wrote:
> On 09.09.25 11:07, David Hildenbrand wrote:
>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>> [...]
>>>
>>> Note: it is difficult to provide a default definition of
>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>>> that definition would need to be available in
>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>>   #include there is very tricky due to the existing header soup.
>>
>> Yeah, I was wondering about exactly that.
>>
>> In particular because LAZY_MMU_DEFAULT etc resides somewehere
>> compeltely different.
>>
>> Which raises the question: is using a new type really of any benefit
>> here?
>>
>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>
>
> The comment about the "header soup" made me look into this problem:
>
> It seems some of the "#include <asm/paravirt.h>" instances in the code
> base can just be dropped.
>
> For the remaining cases I'd like to suggest a reorg of the related
> headers:
> Instead of having the non-paravirt definition in one header and the
> paravirt
> ones in paravirt.h, maybe it would be better to have only the paravirt
> generic definitions in paravirt.h and the specific functions in the
> header
> defining the non-paravirt variant. This would probably resolve the
> problem
> with the "soup", as paravirt.h wouldn't rely on so many other headers.
>
> I'm just preparing a patch doing the removal of the not needed
> includes, but
> I'd be willing to address the disentangling as noted above.
>
> Thoughts?

I don't know enough about these headers to express an informed opinion,
but it does sound like a useful cleanup. Do you think it would allow
<asm/paravirt_types.h> to include <linux/mm_types.h>? This is what we
would need to have a generic definition of lazy_mmu_state_t (which could
be overridden by defining some __HAVE_ARCH macro in <asm/mmu.h>).
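
Roughly like this (the macro and struct names are only made up for
illustration):

/* generic header, e.g. <linux/mm_types.h> */
#ifndef __HAVE_ARCH_LAZY_MMU_STATE
typedef int lazy_mmu_state_t;
#endif

/* an arch wanting its own type, e.g. in <asm/mmu.h> */
#define __HAVE_ARCH_LAZY_MMU_STATE
typedef struct arch_lazy_mmu_state *lazy_mmu_state_t;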

- Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 4 days, 19 hours ago
On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
> On 08.09.25 09:39, Kevin Brodsky wrote:
> > arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> > (taking and returning no value). This is proving problematic in
> > situations where leave() needs to restore some context back to its
> > original state (before enter() was called). In particular, this
> > makes it difficult to support the nesting of lazy_mmu sections -
> > leave() does not know whether the matching enter() call occurred
> > while lazy_mmu was already enabled, and whether to disable it or
> > not.
> > 
> > This patch gives all architectures the chance to store local state
> > while inside a lazy_mmu section by making enter() return some value,
> > storing it in a local variable, and having leave() take that value.
> > That value is typed lazy_mmu_state_t - each architecture defining
> > __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> > For now we define it as int everywhere, which is sufficient to
> > support nesting.
...
> > {
> > + lazy_mmu_state_t lazy_mmu_state;
> > ...
> > - arch_enter_lazy_mmu_mode();
> > + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> > ...
> > - arch_leave_lazy_mmu_mode();
> > + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> > ...
> > }
> > 
> > * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
> >    lazy_mmu is already enabled, and it temporarily disables it by
> >    calling leave() and then enter() again. Here we want to ensure
> >    that any operation between the leave() and enter() calls is
> >    completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
> >    leave() to fully disable lazy_mmu. enter() will then re-enable it
> >    - this achieves the expected behaviour, whether nesting occurred
> >    before that function was called or not.
> > 
> > Note: it is difficult to provide a default definition of
> > lazy_mmu_state_t for architectures implementing lazy_mmu, because
> > that definition would need to be available in
> > arch/x86/include/asm/paravirt_types.h and adding a new generic
> >   #include there is very tricky due to the existing header soup.
> 
> Yeah, I was wondering about exactly that.
> 
> In particular because LAZY_MMU_DEFAULT etc resides somewehere compeltely
> different.
> 
> Which raises the question: is using a new type really of any benefit here?
> 
> Can't we just use an "enum lazy_mmu_state" and call it a day?

I could envision something completely different for this type on s390,
e.g. a pointer to a per-cpu structure. So I would really ask to stick
with the current approach.

> -- 
> Cheers
> 
> David / dhildenb

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 4 days, 19 hours ago
On 09.09.25 11:40, Alexander Gordeev wrote:
> On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>>> (taking and returning no value). This is proving problematic in
>>> situations where leave() needs to restore some context back to its
>>> original state (before enter() was called). In particular, this
>>> makes it difficult to support the nesting of lazy_mmu sections -
>>> leave() does not know whether the matching enter() call occurred
>>> while lazy_mmu was already enabled, and whether to disable it or
>>> not.
>>>
>>> This patch gives all architectures the chance to store local state
>>> while inside a lazy_mmu section by making enter() return some value,
>>> storing it in a local variable, and having leave() take that value.
>>> That value is typed lazy_mmu_state_t - each architecture defining
>>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>>> For now we define it as int everywhere, which is sufficient to
>>> support nesting.
> ...
>>> {
>>> + lazy_mmu_state_t lazy_mmu_state;
>>> ...
>>> - arch_enter_lazy_mmu_mode();
>>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
>>> ...
>>> - arch_leave_lazy_mmu_mode();
>>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
>>> ...
>>> }
>>>
>>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>>>     lazy_mmu is already enabled, and it temporarily disables it by
>>>     calling leave() and then enter() again. Here we want to ensure
>>>     that any operation between the leave() and enter() calls is
>>>     completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>>>     leave() to fully disable lazy_mmu. enter() will then re-enable it
>>>     - this achieves the expected behaviour, whether nesting occurred
>>>     before that function was called or not.
>>>
>>> Note: it is difficult to provide a default definition of
>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>>> that definition would need to be available in
>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>>    #include there is very tricky due to the existing header soup.
>>
>> Yeah, I was wondering about exactly that.
>>
>> In particular because LAZY_MMU_DEFAULT etc resides somewehere compeltely
>> different.
>>
>> Which raises the question: is using a new type really of any benefit here?
>>
>> Can't we just use an "enum lazy_mmu_state" and call it a day?
> 
> I could envision something completely different for this type on s390,
> e.g. a pointer to a per-cpu structure. So I would really ask to stick
> with the current approach.

Would that integrate well with LAZY_MMU_DEFAULT etc?

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 4 days, 17 hours ago
On Tue, Sep 09, 2025 at 12:09:48PM +0200, David Hildenbrand wrote:
> On 09.09.25 11:40, Alexander Gordeev wrote:
> > On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
> > > On 08.09.25 09:39, Kevin Brodsky wrote:
> > > > arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> > > > (taking and returning no value). This is proving problematic in
> > > > situations where leave() needs to restore some context back to its
> > > > original state (before enter() was called). In particular, this
> > > > makes it difficult to support the nesting of lazy_mmu sections -
> > > > leave() does not know whether the matching enter() call occurred
> > > > while lazy_mmu was already enabled, and whether to disable it or
> > > > not.
> > > > 
> > > > This patch gives all architectures the chance to store local state
> > > > while inside a lazy_mmu section by making enter() return some value,
> > > > storing it in a local variable, and having leave() take that value.
> > > > That value is typed lazy_mmu_state_t - each architecture defining
> > > > __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> > > > For now we define it as int everywhere, which is sufficient to
> > > > support nesting.
> > ...
> > > > {
> > > > + lazy_mmu_state_t lazy_mmu_state;
> > > > ...
> > > > - arch_enter_lazy_mmu_mode();
> > > > + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> > > > ...
> > > > - arch_leave_lazy_mmu_mode();
> > > > + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> > > > ...
> > > > }
> > > > 
> > > > * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
> > > >     lazy_mmu is already enabled, and it temporarily disables it by
> > > >     calling leave() and then enter() again. Here we want to ensure
> > > >     that any operation between the leave() and enter() calls is
> > > >     completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
> > > >     leave() to fully disable lazy_mmu. enter() will then re-enable it
> > > >     - this achieves the expected behaviour, whether nesting occurred
> > > >     before that function was called or not.
> > > > 
> > > > Note: it is difficult to provide a default definition of
> > > > lazy_mmu_state_t for architectures implementing lazy_mmu, because
> > > > that definition would need to be available in
> > > > arch/x86/include/asm/paravirt_types.h and adding a new generic
> > > >    #include there is very tricky due to the existing header soup.
> > > 
> > > Yeah, I was wondering about exactly that.
> > > 
> > > In particular because LAZY_MMU_DEFAULT etc resides somewehere compeltely
> > > different.
> > > 
> > > Which raises the question: is using a new type really of any benefit here?
> > > 
> > > Can't we just use an "enum lazy_mmu_state" and call it a day?
> > 
> > I could envision something completely different for this type on s390,
> > e.g. a pointer to a per-cpu structure. So I would really ask to stick
> > with the current approach.
> 
> Would that integrate well with LAZY_MMU_DEFAULT etc?

Hmm... I thought the idea is to use LAZY_MMU_* by architectures that
want to use it - at least that is how I read the description above.

It is only kasan_populate|depopulate_vmalloc_pte() in generic code
that do not follow this pattern, and it looks like a problem to me.

> -- 
> Cheers
> 
> David / dhildenb

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 4 days, 17 hours ago
On 09.09.25 13:45, Alexander Gordeev wrote:
> On Tue, Sep 09, 2025 at 12:09:48PM +0200, David Hildenbrand wrote:
>> On 09.09.25 11:40, Alexander Gordeev wrote:
>>> On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
>>>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>>>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>>>>> (taking and returning no value). This is proving problematic in
>>>>> situations where leave() needs to restore some context back to its
>>>>> original state (before enter() was called). In particular, this
>>>>> makes it difficult to support the nesting of lazy_mmu sections -
>>>>> leave() does not know whether the matching enter() call occurred
>>>>> while lazy_mmu was already enabled, and whether to disable it or
>>>>> not.
>>>>>
>>>>> This patch gives all architectures the chance to store local state
>>>>> while inside a lazy_mmu section by making enter() return some value,
>>>>> storing it in a local variable, and having leave() take that value.
>>>>> That value is typed lazy_mmu_state_t - each architecture defining
>>>>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>>>>> For now we define it as int everywhere, which is sufficient to
>>>>> support nesting.
>>> ...
>>>>> {
>>>>> + lazy_mmu_state_t lazy_mmu_state;
>>>>> ...
>>>>> - arch_enter_lazy_mmu_mode();
>>>>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
>>>>> ...
>>>>> - arch_leave_lazy_mmu_mode();
>>>>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
>>>>> ...
>>>>> }
>>>>>
>>>>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>>>>>      lazy_mmu is already enabled, and it temporarily disables it by
>>>>>      calling leave() and then enter() again. Here we want to ensure
>>>>>      that any operation between the leave() and enter() calls is
>>>>>      completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>>>>>      leave() to fully disable lazy_mmu. enter() will then re-enable it
>>>>>      - this achieves the expected behaviour, whether nesting occurred
>>>>>      before that function was called or not.
>>>>>
>>>>> Note: it is difficult to provide a default definition of
>>>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>>>>> that definition would need to be available in
>>>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>>>>     #include there is very tricky due to the existing header soup.
>>>>
>>>> Yeah, I was wondering about exactly that.
>>>>
>>>> In particular because LAZY_MMU_DEFAULT etc resides somewehere compeltely
>>>> different.
>>>>
>>>> Which raises the question: is using a new type really of any benefit here?
>>>>
>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>>
>>> I could envision something completely different for this type on s390,
>>> e.g. a pointer to a per-cpu structure. So I would really ask to stick
>>> with the current approach.
>>
>> Would that integrate well with LAZY_MMU_DEFAULT etc?
> 
> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
> want to use it - at least that is how I read the description above.
> 
> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
> that do not follow this pattern, and it looks as a problem to me.

Yes, that's why I am asking.

What kind of information (a pointer to a per-cpu structure?) would you
want to return, and would handling it similarly to
pagefault_disable()/pagefault_enable(), e.g. using a variable in
"current" to track the nesting level, avoid the need for s390x to do that?

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 4 days, 15 hours ago
On 09/09/2025 13:54, David Hildenbrand wrote:
> On 09.09.25 13:45, Alexander Gordeev wrote:
>> On Tue, Sep 09, 2025 at 12:09:48PM +0200, David Hildenbrand wrote:
>>> On 09.09.25 11:40, Alexander Gordeev wrote:
>>>> On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
>>>>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>>>>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>>>>>> (taking and returning no value). This is proving problematic in
>>>>>> situations where leave() needs to restore some context back to its
>>>>>> original state (before enter() was called). In particular, this
>>>>>> makes it difficult to support the nesting of lazy_mmu sections -
>>>>>> leave() does not know whether the matching enter() call occurred
>>>>>> while lazy_mmu was already enabled, and whether to disable it or
>>>>>> not.
>>>>>>
>>>>>> This patch gives all architectures the chance to store local state
>>>>>> while inside a lazy_mmu section by making enter() return some value,
>>>>>> storing it in a local variable, and having leave() take that value.
>>>>>> That value is typed lazy_mmu_state_t - each architecture defining
>>>>>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>>>>>> For now we define it as int everywhere, which is sufficient to
>>>>>> support nesting.
>>>> ...
>>>>>> {
>>>>>> + lazy_mmu_state_t lazy_mmu_state;
>>>>>> ...
>>>>>> - arch_enter_lazy_mmu_mode();
>>>>>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
>>>>>> ...
>>>>>> - arch_leave_lazy_mmu_mode();
>>>>>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
>>>>>> ...
>>>>>> }
>>>>>>
>>>>>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>>>>>>      lazy_mmu is already enabled, and it temporarily disables it by
>>>>>>      calling leave() and then enter() again. Here we want to ensure
>>>>>>      that any operation between the leave() and enter() calls is
>>>>>>      completed immediately; for that reason we pass
>>>>>> LAZY_MMU_DEFAULT to
>>>>>>      leave() to fully disable lazy_mmu. enter() will then
>>>>>> re-enable it
>>>>>>      - this achieves the expected behaviour, whether nesting
>>>>>> occurred
>>>>>>      before that function was called or not.
>>>>>>
>>>>>> Note: it is difficult to provide a default definition of
>>>>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>>>>>> that definition would need to be available in
>>>>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>>>>>     #include there is very tricky due to the existing header soup.
>>>>>
>>>>> Yeah, I was wondering about exactly that.
>>>>>
>>>>> In particular because LAZY_MMU_DEFAULT etc resides somewehere
>>>>> compeltely
>>>>> different.
>>>>>
>>>>> Which raises the question: is using a new type really of any
>>>>> benefit here?
>>>>>
>>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>>>
>>>> I could envision something completely different for this type on s390,
>>>> e.g. a pointer to a per-cpu structure. So I would really ask to stick
>>>> with the current approach.

This is indeed the motivation - let every arch do whatever it sees fit.
lazy_mmu_state_t is basically an opaque type as far as generic code is
concerned, which also means that this API change is the first and last
one we need (famous last words, I know). 

I mentioned in the cover letter that the pkeys-based page table
protection series [1] would have an immediate use for lazy_mmu_state_t.
In that proposal, any helper writing to pgtables needs to modify the
pkey register and then restore it. To reduce the overhead, lazy_mmu is
used to set the pkey register only once in enter(), and then restore it
in leave() [2]. This currently relies on storing the original pkey
register value in thread_struct, which is suboptimal and most
importantly doesn't work if lazy_mmu sections nest. With this series, we
could instead store the pkey register value in lazy_mmu_state_t
(enlarging it to 64 bits or more).
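
For illustration only (hypothetical field names, not part of this
series), arm64 could then define something along these lines, with
enter() saving the current pkey register value into the state and
leave() writing it back:

typedef struct {
	u8  nested;	/* LAZY_MMU_DEFAULT or LAZY_MMU_NESTED */
	u64 pkey_reg;	/* pkey register value saved by enter() */
} lazy_mmu_state_t;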

I also considered going further and making lazy_mmu_state_t a pointer as
Alexander suggested - more complex to manage, but also a lot more flexible.

>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>
>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
>> want to use it - at least that is how I read the description above.
>>
>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>> that do not follow this pattern, and it looks as a problem to me.

This discussion also made me realise that this is problematic, as the
LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
convenience, not for generic code (where lazy_mmu_state_t should ideally
be an opaque type as mentioned above). It almost feels like the kasan
case deserves a different API, because this is not how enter() and
leave() are meant to be used. This would mean quite a bit of churn
though, so maybe just introduce another arch-defined value to pass to
leave() for such a situation - for instance,
arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?

>
> Yes, that's why I am asking.
>
> What kind of information (pointer to a per-cpu structure) would you
> want to return, and would handling it similar to how
> pagefault_disable()/pagefault_enable() e.g., using a variable in
> "current" to track the nesting level avoid having s390x to do that?

The pagefault_disabled approach works fine for simple use-cases, but it
doesn't scale well. The space allocated in task_struct/thread_struct to
track that state is wasted (unused) most of the time. Worse, it does not
truly enable states to be nested: it allows the outermost section to
store some state, but nested sections cannot allocate extra space. This
is really what the stack is for.

- Kevin

Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 4 days, 14 hours ago
On Tue, Sep 09, 2025 at 03:49:46PM +0200, Kevin Brodsky wrote:
> On 09/09/2025 13:54, David Hildenbrand wrote:
> > On 09.09.25 13:45, Alexander Gordeev wrote:
> >> On Tue, Sep 09, 2025 at 12:09:48PM +0200, David Hildenbrand wrote:
> >>> On 09.09.25 11:40, Alexander Gordeev wrote:
> >>>> On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
> >>>>> On 08.09.25 09:39, Kevin Brodsky wrote:
> >>>>>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> >>>>>> (taking and returning no value). This is proving problematic in
> >>>>>> situations where leave() needs to restore some context back to its
> >>>>>> original state (before enter() was called). In particular, this
> >>>>>> makes it difficult to support the nesting of lazy_mmu sections -
> >>>>>> leave() does not know whether the matching enter() call occurred
> >>>>>> while lazy_mmu was already enabled, and whether to disable it or
> >>>>>> not.
> >>>>>>
> >>>>>> This patch gives all architectures the chance to store local state
> >>>>>> while inside a lazy_mmu section by making enter() return some value,
> >>>>>> storing it in a local variable, and having leave() take that value.
> >>>>>> That value is typed lazy_mmu_state_t - each architecture defining
> >>>>>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> >>>>>> For now we define it as int everywhere, which is sufficient to
> >>>>>> support nesting.
> >>>> ...
> >>>>>> {
> >>>>>> + lazy_mmu_state_t lazy_mmu_state;
> >>>>>> ...
> >>>>>> - arch_enter_lazy_mmu_mode();
> >>>>>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> >>>>>> ...
> >>>>>> - arch_leave_lazy_mmu_mode();
> >>>>>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> >>>>>> ...
> >>>>>> }
> >>>>>>
> >>>>>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
> >>>>>>      lazy_mmu is already enabled, and it temporarily disables it by
> >>>>>>      calling leave() and then enter() again. Here we want to ensure
> >>>>>>      that any operation between the leave() and enter() calls is
> >>>>>>      completed immediately; for that reason we pass
> >>>>>> LAZY_MMU_DEFAULT to
> >>>>>>      leave() to fully disable lazy_mmu. enter() will then
> >>>>>> re-enable it
> >>>>>>      - this achieves the expected behaviour, whether nesting
> >>>>>> occurred
> >>>>>>      before that function was called or not.
> >>>>>>
> >>>>>> Note: it is difficult to provide a default definition of
> >>>>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
> >>>>>> that definition would need to be available in
> >>>>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
> >>>>>>     #include there is very tricky due to the existing header soup.
> >>>>>
> >>>>> Yeah, I was wondering about exactly that.
> >>>>>
> >>>>> In particular because LAZY_MMU_DEFAULT etc resides somewehere
> >>>>> compeltely
> >>>>> different.
> >>>>>
> >>>>> Which raises the question: is using a new type really of any
> >>>>> benefit here?
> >>>>>
> >>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
> >>>>
> >>>> I could envision something completely different for this type on s390,
> >>>> e.g. a pointer to a per-cpu structure. So I would really ask to stick
> >>>> with the current approach.
> 
> This is indeed the motivation - let every arch do whatever it sees fit.
> lazy_mmu_state_t is basically an opaque type as far as generic code is
> concerned, which also means that this API change is the first and last
> one we need (famous last words, I know). 
> 
> I mentioned in the cover letter that the pkeys-based page table
> protection series [1] would have an immediate use for lazy_mmu_state_t.
> In that proposal, any helper writing to pgtables needs to modify the
> pkey register and then restore it. To reduce the overhead, lazy_mmu is
> used to set the pkey register only once in enter(), and then restore it
> in leave() [2]. This currently relies on storing the original pkey
> register value in thread_struct, which is suboptimal and most
> importantly doesn't work if lazy_mmu sections nest. With this series, we
> could instead store the pkey register value in lazy_mmu_state_t
> (enlarging it to 64 bits or more).
> 
> I also considered going further and making lazy_mmu_state_t a pointer as
> Alexander suggested - more complex to manage, but also a lot more flexible.
> 
> >>> Would that integrate well with LAZY_MMU_DEFAULT etc?
> >>
> >> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
> >> want to use it - at least that is how I read the description above.
> >>
> >> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
> >> that do not follow this pattern, and it looks as a problem to me.
> 
> This discussion also made me realise that this is problematic, as the
> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
> convenience, not for generic code (where lazy_mmu_state_t should ideally
> be an opaque type as mentioned above). It almost feels like the kasan
> case deserves a different API, because this is not how enter() and
> leave() are meant to be used. This would mean quite a bit of churn
> though, so maybe just introduce another arch-defined value to pass to
> leave() for such a situation - for instance,
> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?

What about adjusting the semantics of apply_to_page_range() instead?

It currently assumes that any caller is fine with apply_to_pte_range()
entering the lazy mode. By contrast, kasan_(de)populate_vmalloc_pte()
are not fine with that at all and must leave the lazy mode. That in
itself suggests the original assumption is incorrect.

We could change int apply_to_pte_range(..., bool create, ...) to e.g.
apply_to_pte_range(..., unsigned int flags, ...) and introduce a flag
that simply skips entering the lazy mmu mode.
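
For illustration, a rough sketch of that shape (the flag names are
made up and the actual PTE walk is elided):

#define APPLY_PGTBL_CREATE	0x1	/* hypothetical: replaces the "create" bool */
#define APPLY_PGTBL_NO_LAZY_MMU	0x2	/* hypothetical: skip lazy MMU batching */

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr, unsigned long end,
			      pte_fn_t fn, void *data, unsigned int flags,
			      pgtbl_mod_mask *mask)
{
	bool lazy_mmu = !(flags & APPLY_PGTBL_NO_LAZY_MMU);
	lazy_mmu_state_t lazy_mmu_state;
	int err = 0;

	if (lazy_mmu)
		lazy_mmu_state = arch_enter_lazy_mmu_mode();

	/* ... walk the PTEs and call fn() for each, as today ... */

	if (lazy_mmu)
		arch_leave_lazy_mmu_mode(lazy_mmu_state);

	return err;
}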

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 3 days, 13 hours ago
On 09/09/2025 16:38, Alexander Gordeev wrote:
>>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
>>>> want to use it - at least that is how I read the description above.
>>>>
>>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>>>> that do not follow this pattern, and it looks as a problem to me.
>> This discussion also made me realise that this is problematic, as the
>> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
>> convenience, not for generic code (where lazy_mmu_state_t should ideally
>> be an opaque type as mentioned above). It almost feels like the kasan
>> case deserves a different API, because this is not how enter() and
>> leave() are meant to be used. This would mean quite a bit of churn
>> though, so maybe just introduce another arch-defined value to pass to
>> leave() for such a situation - for instance,
>> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?
> What about to adjust the semantics of apply_to_page_range() instead?
>
> It currently assumes any caller is fine with apply_to_pte_range() to
> enter the lazy mode. By contrast, kasan_(de)populate_vmalloc_pte() are
> not fine at all and must leave the lazy mode. That literally suggests
> the original assumption is incorrect.
>
> We could change int apply_to_pte_range(..., bool create, ...) to e.g.
> apply_to_pte_range(..., unsigned int flags, ...) and introduce a flag
> that simply skips entering the lazy mmu mode.

This is pretty much what Ryan proposed [1r] some time ago, although for
a different purpose (avoiding nesting). There wasn't much appetite for
it then, but I agree that this would be a more logical way to go about it.

- Kevin

[1r]
https://lore.kernel.org/all/20250530140446.2387131-4-ryan.roberts@arm.com/
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 2 days, 17 hours ago
On Wed, Sep 10, 2025 at 06:11:54PM +0200, Kevin Brodsky wrote:

Hi Kevin,

> On 09/09/2025 16:38, Alexander Gordeev wrote:
> >>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
> >>>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
> >>>> want to use it - at least that is how I read the description above.
> >>>>
> >>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
> >>>> that do not follow this pattern, and it looks as a problem to me.
> >> This discussion also made me realise that this is problematic, as the
> >> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
> >> convenience, not for generic code (where lazy_mmu_state_t should ideally
> >> be an opaque type as mentioned above). It almost feels like the kasan
> >> case deserves a different API, because this is not how enter() and
> >> leave() are meant to be used. This would mean quite a bit of churn
> >> though, so maybe just introduce another arch-defined value to pass to
> >> leave() for such a situation - for instance,
> >> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?
> > What about to adjust the semantics of apply_to_page_range() instead?
> >
> > It currently assumes any caller is fine with apply_to_pte_range() to
> > enter the lazy mode. By contrast, kasan_(de)populate_vmalloc_pte() are
> > not fine at all and must leave the lazy mode. That literally suggests
> > the original assumption is incorrect.
> >
> > We could change int apply_to_pte_range(..., bool create, ...) to e.g.
> > apply_to_pte_range(..., unsigned int flags, ...) and introduce a flag
> > that simply skips entering the lazy mmu mode.
> 
> This is pretty much what Ryan proposed [1r] some time ago, although for
> a different purpose (avoiding nesting). There wasn't much appetite for
> it then, but I agree that this would be a more logical way to go about it.
> 
> - Kevin
> 
> [1r]
> https://lore.kernel.org/all/20250530140446.2387131-4-ryan.roberts@arm.com/

Maybe I'm missing the point, but I read it as opposition to the whole
series in general and to the way apply_to_pte_range() would be altered
in particular:

 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     unsigned long addr, unsigned long end,
 				     pte_fn_t fn, void *data, bool create,
-				     pgtbl_mod_mask *mask)
+				     pgtbl_mod_mask *mask, bool lazy_mmu)

The idea of instructing apply_to_page_range() to skip the lazy mmu mode
was not countered. Quite the opposite, Liam suggested exactly the same:

<quote>
Could we do something like the pgtbl_mod_mask or zap_details and pass
through a struct or one unsigned int for create and lazy_mmu?

These wrappers are terrible for readability and annoying for argument
lists too.

At least we'd have better self-documenting code in the wrappers.. and if
we ever need a third boolean, we could avoid multiplying the wrappers
again.
</quote>

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 2 days, 12 hours ago
On 11/09/2025 14:06, Alexander Gordeev wrote:
> On Wed, Sep 10, 2025 at 06:11:54PM +0200, Kevin Brodsky wrote:
>
> Hi Kevin,
>
>> On 09/09/2025 16:38, Alexander Gordeev wrote:
>>>>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>>>>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
>>>>>> want to use it - at least that is how I read the description above.
>>>>>>
>>>>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>>>>>> that do not follow this pattern, and it looks as a problem to me.
>>>> This discussion also made me realise that this is problematic, as the
>>>> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
>>>> convenience, not for generic code (where lazy_mmu_state_t should ideally
>>>> be an opaque type as mentioned above). It almost feels like the kasan
>>>> case deserves a different API, because this is not how enter() and
>>>> leave() are meant to be used. This would mean quite a bit of churn
>>>> though, so maybe just introduce another arch-defined value to pass to
>>>> leave() for such a situation - for instance,
>>>> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?
>>> What about to adjust the semantics of apply_to_page_range() instead?
>>>
>>> It currently assumes any caller is fine with apply_to_pte_range() to
>>> enter the lazy mode. By contrast, kasan_(de)populate_vmalloc_pte() are
>>> not fine at all and must leave the lazy mode. That literally suggests
>>> the original assumption is incorrect.
>>>
>>> We could change int apply_to_pte_range(..., bool create, ...) to e.g.
>>> apply_to_pte_range(..., unsigned int flags, ...) and introduce a flag
>>> that simply skips entering the lazy mmu mode.
>> This is pretty much what Ryan proposed [1r] some time ago, although for
>> a different purpose (avoiding nesting). There wasn't much appetite for
>> it then, but I agree that this would be a more logical way to go about it.
>>
>> - Kevin
>>
>> [1r]
>> https://lore.kernel.org/all/20250530140446.2387131-4-ryan.roberts@arm.com/
> May be I missing the point, but I read it as an opposition to the whole
> series in general and to the way apply_to_pte_range() would be altered
> in particular:
>
>  static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  				     unsigned long addr, unsigned long end,
>  				     pte_fn_t fn, void *data, bool create,
> -				     pgtbl_mod_mask *mask)
> +				     pgtbl_mod_mask *mask, bool lazy_mmu)
>
> The idea of instructing apply_to_page_range() to skip the lazy mmu mode
> was not countered. Quite opposite, Liam suggested exactly the same:

Yes, that's a fair point. It would be sensible to post a new series
trying to eliminate the leave()/enter() calls in mm/kasan as you
suggested. Still, I think it makes sense to define an API to handle
that situation ("pausing" lazy_mmu), as discussed with David H.

- Kevin

>
> <quote>
> Could we do something like the pgtbl_mod_mask or zap_details and pass
> through a struct or one unsigned int for create and lazy_mmu?
>
> These wrappers are terrible for readability and annoying for argument
> lists too.
>
> Could we do something like the pgtbl_mod_mask or zap_details and pass
> through a struct or one unsigned int for create and lazy_mmu?
>
> At least we'd have better self-documenting code in the wrappers.. and if
> we ever need a third boolean, we could avoid multiplying the wrappers
> again.
> <quote>
>
> Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 4 days, 14 hours ago
>>>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>>>>
>>>>> I could envision something completely different for this type on s390,
>>>>> e.g. a pointer to a per-cpu structure. So I would really ask to stick
>>>>> with the current approach.
> 
> This is indeed the motivation - let every arch do whatever it sees fit.
> lazy_mmu_state_t is basically an opaque type as far as generic code is
> concerned, which also means that this API change is the first and last
> one we need (famous last words, I know).

It makes the API more complicated, though. :)

> 
> I mentioned in the cover letter that the pkeys-based page table
> protection series [1] would have an immediate use for lazy_mmu_state_t.
> In that proposal, any helper writing to pgtables needs to modify the
> pkey register and then restore it. To reduce the overhead, lazy_mmu is
> used to set the pkey register only once in enter(), and then restore it
> in leave() [2]. This currently relies on storing the original pkey
> register value in thread_struct, which is suboptimal and most

Can you elaborate why this is suboptimal? See below regarding the size of task_struct.

> importantly doesn't work if lazy_mmu sections nest.

Can you elaborate why it would be problematic with nesting (if we had
a count and could handle the 0->1 and 1->0 transitions)?

> With this series, we
> could instead store the pkey register value in lazy_mmu_state_t
> (enlarging it to 64 bits or more).

Yes.

> 
> I also considered going further and making lazy_mmu_state_t a pointer as
> Alexander suggested - more complex to manage, but also a lot more flexible.
> 
>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>>
>>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
>>> want to use it - at least that is how I read the description above.
>>>
>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>>> that do not follow this pattern, and it looks as a problem to me.
> 
> This discussion also made me realise that this is problematic, as the
> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
> convenience, not for generic code (where lazy_mmu_state_t should ideally
> be an opaque type as mentioned above). It almost feels like the kasan
> case deserves a different API, because this is not how enter() and
> leave() are meant to be used. This would mean quite a bit of churn
> though, so maybe just introduce another arch-defined value to pass to
> leave() for such a situation - for instance,
> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?

The discussion made me realize that it's a bit of a hack right now :)

If LAZY_MMU_DEFAULT etc. are not for common code, then please let the
individual archs maintain them as well, just like you do with the
opaque type.

> 
>>
>> Yes, that's why I am asking.
>>
>> What kind of information (pointer to a per-cpu structure) would you
>> want to return, and would handling it similar to how
>> pagefault_disable()/pagefault_enable() e.g., using a variable in
>> "current" to track the nesting level avoid having s390x to do that?
> 
> The pagefault_disabled approach works fine for simple use-cases, but it
> doesn't scale well. The space allocated in task_struct/thread_struct to
> track that state is wasted (unused) most of the time.

I'm not sure that's a concern. Fitting an int into existing holes should
work, and even another 64 bits (8 bytes)...

I just checked with pahole using the Fedora config on current mm-unstable.


/* size: 9792, cachelines: 153, members: 276 */
/* sum members: 9619, holes: 20, sum holes: 125 */
/* sum bitfield members: 85 bits, bit holes: 2, sum bit holes: 43 bits */
/* padding: 32 */
/* member types with holes: 4, total: 6, bit holes: 2, total: 2 */
/* paddings: 6, sum paddings: 49 */
/* forced alignments: 12, forced holes: 2, sum forced holes: 60 */

Due to some "arch_task_struct_size" we might actually allocate more space.


Staring at my live system:

$ sudo slabinfo
Name                   Objects Objsize           Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg
...
task_struct               1491   12376           24.8M      721/25/37    2 3   3  74


I am not sure even an additional 8 bytes would move the needle here.


> Worse, it does not
> truly enable states to be nested: it allows the outermost section to
> store some state, but nested sections cannot allocate extra space. This
> is really what the stack is for.

If it's really just 8 bytes I don't really see the problem. So likely there is
more to it?

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 3 days, 13 hours ago
+Mark Rutland

On 09/09/2025 16:28, David Hildenbrand wrote:
>>>>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>>>>>
>>>>>> I could envision something completely different for this type on
>>>>>> s390,
>>>>>> e.g. a pointer to a per-cpu structure. So I would really ask to
>>>>>> stick
>>>>>> with the current approach.
>>
>> This is indeed the motivation - let every arch do whatever it sees fit.
>> lazy_mmu_state_t is basically an opaque type as far as generic code is
>> concerned, which also means that this API change is the first and last
>> one we need (famous last words, I know).
>
> It makes the API more complicated, though. :)

Somewhat, but in the regular case where enter() is called followed by
leave() there is really no complexity for the caller, just an extra
local variable.

There are complications where we want to exit lazy_mmu temporarily, as
in mm/kasan/shadow.c [1k], but this is in fact unavoidable. Chatting
with Mark Rutland, I realised that to truly support nested sections,
this must be handled in a special way in any case. To be clear, I am
referring to this situation:

__kasan_populate_vmalloc:
    apply_to_page_range:
        arch_enter_lazy_mmu_mode() {1}

        kasan_populate_vmalloc_pte:
            arch_leave_lazy_mmu_mode() {2}
            arch_enter_lazy_mmu_mode() {3}

        arch_leave_lazy_mmu_mode() {4}

With the approach this series takes, call {2} is made safe by passing a
special parameter (say LAZY_MMU_FLUSH) that forces lazy_mmu to be fully
exited - and call {3} will then re-enter lazy_mmu. This works regardless
of whether __kasan_populate_vmalloc() has been called with lazy_mmu
already enabled (i.e. calls {1} and {4} can be nested).

On the other hand, with a pagefault_disabled-like approach, there is no
way to instruct call {3} to fully exit lazy_mmu regardless of the
nesting level.

It would be possible to make both approaches work by introducing a new
API, along the lines of:
- int arch_disable_save_lazy_mmu_mode() (the return value indicates the
nesting level)
- void arch_restore_lazy_mmu_mode(int state) (re-enter lazy_mmu at the
given nesting level)
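
For instance, mm/kasan/shadow.c could then do something like this (a
sketch using the names proposed above, with the actual shadow PTE
update elided):

static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
				      void *unused)
{
	int lazy_mmu_level;

	/* fully exit lazy_mmu, whatever the current nesting level */
	lazy_mmu_level = arch_disable_save_lazy_mmu_mode();

	/* ... set the shadow PTE; it must take effect immediately ... */

	/* re-enter lazy_mmu at the saved nesting level */
	arch_restore_lazy_mmu_mode(lazy_mmu_level);

	return 0;
}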

This is arguably more self-documenting than passing LAZY_MMU_FLUSH in
call {2}. This API is however no simpler when using a
pagefault_disabled-like approach (and less consistent than when always
saving state on the stack).

[1k]
https://lore.kernel.org/all/0d2efb7ddddbff6b288fbffeeb10166e90771718.1755528662.git.agordeev@linux.ibm.com/

>
>>
>> I mentioned in the cover letter that the pkeys-based page table
>> protection series [1] would have an immediate use for lazy_mmu_state_t.
>> In that proposal, any helper writing to pgtables needs to modify the
>> pkey register and then restore it. To reduce the overhead, lazy_mmu is
>> used to set the pkey register only once in enter(), and then restore it
>> in leave() [2]. This currently relies on storing the original pkey
>> register value in thread_struct, which is suboptimal and most
>
> Can you elaborate why this is suboptimal? See below regarding the size
> of task_struct.

Suboptimal in the sense that we're allocating fixed space for each task
that we are almost never using.

>
>> importantly doesn't work if lazy_mmu sections nest.
>
> Can you elaborate why it would be problematic with nesting (if we
> would have a count
> and can handle the transition from 0->1 and 1->0)?

It doesn't work in that specific patch I linked - but yes it can be made
to work if we have both an extra task_struct member to store the level
of nesting *and* an extra thread_struct member to store the saved pkey
register value (both of which are only used while in lazy_mmu).


>
>> With this series, we
>> could instead store the pkey register value in lazy_mmu_state_t
>> (enlarging it to 64 bits or more).
>
> Yes.
>
>>
>> I also considered going further and making lazy_mmu_state_t a pointer as
>> Alexander suggested - more complex to manage, but also a lot more
>> flexible.
>>
>>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>>>
>>>> Hmm... I though the idea is to use LAZY_MMU_* by architectures that
>>>> want to use it - at least that is how I read the description above.
>>>>
>>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>>>> that do not follow this pattern, and it looks as a problem to me.
>>
>> This discussion also made me realise that this is problematic, as the
>> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
>> convenience, not for generic code (where lazy_mmu_state_t should ideally
>> be an opaque type as mentioned above). It almost feels like the kasan
>> case deserves a different API, because this is not how enter() and
>> leave() are meant to be used. This would mean quite a bit of churn
>> though, so maybe just introduce another arch-defined value to pass to
>> leave() for such a situation - for instance,
>> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?
>
> The discussion made me realize that it's a bit hack right now :)
>
> If LAZY_MMU_DEFAULT etc. are not for common code, then please
> maintain them for the individual archs as well, just like you do with the
> opaque type.

I see your point - having them defined in <linux/mm_types.h> could be
misleading. I just wanted to avoid all 4 architectures defining the same
macros. Maybe call them __LAZY_MMU_* to suggest they're not supposed to
be used in generic code?

>
>>
>>>
>>> Yes, that's why I am asking.
>>>
>>> What kind of information (pointer to a per-cpu structure) would you
>>> want to return, and would handling it similar to how
>>> pagefault_disable()/pagefault_enable() e.g., using a variable in
>>> "current" to track the nesting level avoid having s390x to do that?
>>
>> The pagefault_disabled approach works fine for simple use-cases, but it
>> doesn't scale well. The space allocated in task_struct/thread_struct to
>> track that state is wasted (unused) most of the time.
>
> I'm not sure that's a concern. Fitting an int into existing holes
> should work
> and even another 64bit (8byte )...
>
> I just checked with pahole using the Fedora config on current
> mm-unstable.
>
>
> /* size: 9792, cachelines: 153, members: 276 */
> /* sum members: 9619, holes: 20, sum holes: 125 */
> /* sum bitfield members: 85 bits, bit holes: 2, sum bit holes: 43 bits */
> /* padding: 32 */
> /* member types with holes: 4, total: 6, bit holes: 2, total: 2 */
> /* paddings: 6, sum paddings: 49 */
> /* forced alignments: 12, forced holes: 2, sum forced holes: 60 */
>
> Due to some "arch_task_struct_size" we might actually allocate more
> space.
>
>
> Staring at my live system:
>
> $ sudo slabinfo
> Name                   Objects Objsize           Space Slabs/Part/Cpu 
> O/S O %Fr %Ef Flg
> ...
> task_struct               1491   12376           24.8M     
> 721/25/37    2 3   3  74
>
>
> I am not sure if even an additional 8byte would move the needle here.
>
>
> Worse, it does not
>> truly enable states to be nested: it allows the outermost section to
>> store some state, but nested sections cannot allocate extra space. This
>> is really what the stack is for.
>
> If it's really just 8 bytes I don't really see the problem. So likely
> there is
> more to it? 

I suppose 8 extra bytes per task is acceptable, but some architectures
may want to add more state there.

The one case that is truly problematic (though not required at this
point) is where each (nested) section needs to store its own state. With
this series it works just fine as there is a lazy_mmu_state_t for each
section, however if we use task_struct/thread_struct there can be only
one member shared by all nested sections.

- Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 3 days, 13 hours ago
> 
> Somewhat, but in the regular case where enter() is called followed by
> leave() there is really no complexity for the caller, just an extra
> local variable.
> 
> There are complications where we want to exit lazy_mmu temporarily, as
> in mm/kasan/shadow.c [1k], but this is in fact unavoidable. Chatting
> with Mark Rutland, I realised that to truly support nested sections,
> this must be handled in a special way in any case. To be clear, I am
> referring to this situation:
> 
> __kasan_populate_vmalloc:
>      apply_to_page_range:
>          arch_enter_lazy_mmu_mode() {1}
> 
>          kasan_populate_vmalloc_pte:
>              arch_leave_lazy_mmu_mode() {2}
>              arch_enter_lazy_mmu_mode() {3}
> 
>          arch_leave_lazy_mmu_mode() {4}
> 
> With the approach this series takes, call {2} is made safe by passing a
> special parameter (say LAZY_MMU_FLUSH) that forces lazy_mmu to be fully
> exited - and call {3} will then re-enter lazy_mmu. This works regardless
> of whether __kasan_populate_vmalloc() has been called with lazy_mmu
> already enabled (i.e. calls {1} and {4} can be nested).
> 
> On the other hand, with a pagefault_disabled-like approach, there is no
> way to instruct call {3} to fully exit lazy_mmu regardless of the
> nesting level.

Sure there is, with a better API. See below. :)

> 
> It would be possible to make both approaches work by introducing a new
> API, along the lines of:
> - int arch_disable_save_lazy_mmu_mode() (the return value indicates the
> nesting level)
> - void arch_restore_lazy_mmu_mode(int state) (re-enter lazy_mmu at the
> given nesting level)

Yes, I think we really need a proper API.

> 
> This is arguably more self-documenting than passing LAZY_MMU_FLUSH in
> call {2}. This API is however no simpler when using a
> pagefault_disabled-like approach (and less consistent than when always
> saving state on the stack).

Yes, a proper API is warranted. In particular, thinking about the following:

arch_enter_lazy_mmu_mode() {1}
	arch_enter_lazy_mmu_mode() {2}

	kasan_populate_vmalloc_pte:
		arch_leave_lazy_mmu_mode() {3}
		arch_enter_lazy_mmu_mode() {4}

	arch_leave_lazy_mmu_mode() {5}
arch_leave_lazy_mmu_mode() {6}


Imagine if we have the following API instead:

lazy_mmu_enable() {1}
	lazy_mmu_enable() {2}

	kasan_populate_vmalloc_pte:
		lazy_mmu_pause() {3}
		lazy_mmu_continue() {4}

	lazy_mmu_disable() {5}
lazy_mmu_disable() {6}


I think it is crucial that no more nesting happens while we are paused
(i.e. between lazy_mmu_pause() and lazy_mmu_continue()).

Assume we store in the task_struct

uint8_t lazy_mmu_enabled_count;
bool lazy_mmu_paused;

We can do things like

a) Sanity check that, while we are paused, we get no more
enable/disable requests
b) Sanity check that, while we are paused, we get no more pause requests.
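
A minimal sketch of the pause/continue side, assuming those two fields
live directly in task_struct and the arch hooks become stateless again
(as they were before this series):

static inline void lazy_mmu_pause(void)
{
	/* pausing only makes sense inside an enabled, not-yet-paused section */
	WARN_ON_ONCE(!current->lazy_mmu_enabled_count);
	WARN_ON_ONCE(current->lazy_mmu_paused);
	current->lazy_mmu_paused = true;
	arch_leave_lazy_mmu_mode();	/* flush batched state, stop batching */
}

static inline void lazy_mmu_continue(void)
{
	WARN_ON_ONCE(!current->lazy_mmu_paused);
	current->lazy_mmu_paused = false;
	arch_enter_lazy_mmu_mode();	/* resume batching */
}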

[...]

>>
>> If LAZY_MMU_DEFAULT etc. are not for common code, then please
>> maintain them for the individual archs as well, just like you do with the
>> opaque type.
> 
> I see your point - having them defined in <linux/mm_types.h> could be
> misleading. I just wanted to avoid all 4 architectures defining the same
> macros. Maybe call them __LAZY_MMU_* to suggest they're not supposed to
> be used in generic code?

Maybe look into avoiding them completely :) Let's agree on the API first 
and then figure out how to pass the information we need to pass.

[...]

>> Worse, it does not
>>> truly enable states to be nested: it allows the outermost section to
>>> store some state, but nested sections cannot allocate extra space. This
>>> is really what the stack is for.
>>
>> If it's really just 8 bytes I don't really see the problem. So likely
>> there is
>> more to it?
> 
> I suppose 8 extra bytes per task is acceptable, but some architectures
> may want to add more state there.

Just for reference: we currently perform an order-2 allocation, 
effectively leaving ~4KiB "unused".

If there is any real such case on the horizon where we need to store
significantly more (in which case storing it on the stack would
probably also be bad), please let me know.

> 
> The one case that is truly problematic (though not required at this
> point) is where each (nested) section needs to store its own state. With
> this series it works just fine as there is a lazy_mmu_state_t for each
> section, however if we use task_struct/thread_struct there can be only
> one member shared by all nested sections.

Do we have a use case for that on the horizon? If so, I fully agree, we 
have to store information per level. How/what information we have to 
store would be another question.

-- 
Cheers

David / dhildenb

Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 2 days, 12 hours ago
On 10/09/2025 17:37, David Hildenbrand wrote:
>>
>> Somewhat, but in the regular case where enter() is called followed by
>> leave() there is really no complexity for the caller, just an extra
>> local variable.
>>
>> There are complications where we want to exit lazy_mmu temporarily, as
>> in mm/kasan/shadow.c [1k], but this is in fact unavoidable. Chatting
>> with Mark Rutland, I realised that to truly support nested sections,
>> this must be handled in a special way in any case. To be clear, I am
>> referring to this situation:
>>
>> __kasan_populate_vmalloc:
>>      apply_to_page_range:
>>          arch_enter_lazy_mmu_mode() {1}
>>
>>          kasan_populate_vmalloc_pte:
>>              arch_leave_lazy_mmu_mode() {2}
>>              arch_enter_lazy_mmu_mode() {3}
>>
>>          arch_leave_lazy_mmu_mode() {4}
>>
>> With the approach this series takes, call {2} is made safe by passing a
>> special parameter (say LAZY_MMU_FLUSH) that forces lazy_mmu to be fully
>> exited - and call {3} will then re-enter lazy_mmu. This works regardless
>> of whether __kasan_populate_vmalloc() has been called with lazy_mmu
>> already enabled (i.e. calls {1} and {4} can be nested).
>>
>> On the other hand, with a pagefault_disabled-like approach, there is no
>> way to instruct call {3} to fully exit lazy_mmu regardless of the
>> nesting level.
>
> Sure there is, with a better API. See below. :) 

I meant while keeping the existing shape of the API, but yes, fair enough!

>
>>
>> It would be possible to make both approaches work by introducing a new
>> API, along the lines of:
>> - int arch_disable_save_lazy_mmu_mode() (the return value indicates the
>> nesting level)
>> - void arch_restore_lazy_mmu_mode(int state) (re-enter lazy_mmu at the
>> given nesting level)
>
> Yes, I think we really need a proper API.
>
>>
>> This is arguably more self-documenting than passing LAZY_MMU_FLUSH in
>> call {2}. This API is however no simpler when using a
>> pagefault_disabled-like approach (and less consistent than when always
>> saving state on the stack).
>
> Yes, a proper API is warranted. In particular, thinking about the
> following:
>
> arch_enter_lazy_mmu_mode() {1}
>     arch_enter_lazy_mmu_mode() {2}
>
>     kasan_populate_vmalloc_pte:
>         arch_leave_lazy_mmu_mode() {3}
>         arch_enter_lazy_mmu_mode() {4}
>
>     arch_leave_lazy_mmu_mode() {5}
> arch_leave_lazy_mmu_mode() {6}
>
>
> Imagine if we have the following API instead:
>
> lazy_mmu_enable() {1}
>     lazy_mmu_enable() {2}
>
>     kasan_populate_vmalloc_pte:
>         lazy_mmu_pause() {3}
>         lazy_mmu_continue() {4}
>
>     lazy_mmu_disable() {5}
> lazy_mmu_disable() {6}
>
>
> I think it is crucial that after lazy_mmu_save/lazy_mmu_restore, no
> more nesting must happen.

That makes sense to me - lazy_mmu should only be paused in very specific
situations and I don't see a justification for supporting nesting while
paused.

>
> Assume we store in the task_struct
>
> uint8_t lazy_mmu_enabled_count;
> bool lazy_mmu_paused;

I didn't think of that approach! I can't immediately see any problem
with it, assuming we're fine with storing arch-specific context in
thread_struct (which seems to be the case as things stand).

>
> We can do things like
>
> a) Sanity check that while we are paused that we get no more
> enable/disable requests
> b) Sanity check that while we are paused that we get no more pause
> requests.

These are good points - and this is only possible with such global
state. (Similarly we can check that the counter never underflows.)

>
> [...]
>
>>>
>>> If LAZY_MMU_DEFAULT etc. are not for common code, then please
>>> maintain them for the individual archs as well, just like you do
>>> with the
>>> opaque type.
>>
>> I see your point - having them defined in <linux/mm_types.h> could be
>> misleading. I just wanted to avoid all 4 architectures defining the same
>> macros. Maybe call them __LAZY_MMU_* to suggest they're not supposed to
>> be used in generic code?
>
> Maybe look into avoiding them completely :) Let's agree on the API
> first and then figure out how to pass the information we need to pass.
>
> [...]
>
>>> Worse, it does not
>>>> truly enable states to be nested: it allows the outermost section to
>>>> store some state, but nested sections cannot allocate extra space.
>>>> This
>>>> is really what the stack is for.
>>>
>>> If it's really just 8 bytes I don't really see the problem. So likely
>>> there is
>>> more to it?
>>
>> I suppose 8 extra bytes per task is acceptable, but some architectures
>> may want to add more state there.
>
> Just for reference: we currently perform an order-2 allocation,
> effectively leaving ~4KiB "unused".
>
> If there are any real such case on the horizon where we need to store
> significantly more (in which case storing it on the stack might
> probably also bad), please let me know.
>
>>
>> The one case that is truly problematic (though not required at this
>> point) is where each (nested) section needs to store its own state. With
>> this series it works just fine as there is a lazy_mmu_state_t for each
>> section, however if we use task_struct/thread_struct there can be only
>> one member shared by all nested sections.
>
> Do we have a use case for that on the horizon? If so, I fully agree,
> we have to store information per level. How/what information we have
> to store would be another question.

Not that I'm aware of, and all things considered it may not be so
likely: once lazy_mmu is enabled, entering nested sections isn't really
supposed to change any state.


Overall what you're proposing seems sensible to me, the additional
fields in task_struct don't take much space and we can keep the API
unchanged in most cases. It is also good to have the option to check
that the API is used correctly. I'll reply to the cover letter to let
anyone who didn't follow this thread chip in, before I go ahead and try
out that new approach.

- Kevin

Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 2 days, 11 hours ago
>>> On the other hand, with a pagefault_disabled-like approach, there is no
>>> way to instruct call {3} to fully exit lazy_mmu regardless of the
>>> nesting level.
>>
>> Sure there is, with a better API. See below. :)
> 
> I meant while keeping the existing shape of the API but yes fair enough!

Time to do it properly I guess :)

[...]

>> Assume we store in the task_struct
>>
>> uint8_t lazy_mmu_enabled_count;
>> bool lazy_mmu_paused;
> 
> I didn't think of that approach! I can't immediately see any problem
> with it, assuming we're fine with storing arch-specific context in
> thread_struct (which seems to be the case as things stand).

Right, just to complete the picture:

a) We will have some CONFIG_ARCH_LAZY_MMU

b) Without that config, all lazy_mmu_*() functions are a nop and no 
lazy_mmu_state is stored in task_struct

struct lazy_mmu_state {
	uint8_t enabled_count;
	bool paused;
};

c) With that config, common-code lazy_mmu_*() functions implement the 
updating of the lazy_mmu_state in task_struct and call into arch code
on the transition from 0->1, 1->0 etc.

Maybe that can be done through the existing
arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode() callbacks, maybe 
we need more. I feel like
we might be able to implement that through the existing helpers.
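
Something like this, perhaps (a sketch assuming the struct above is
embedded in task_struct as ->lazy_mmu and the existing arch helpers
are reused as the transition hooks):

#ifdef CONFIG_ARCH_LAZY_MMU
static inline void lazy_mmu_enable(void)
{
	/* only the 0 -> 1 transition reaches the arch hook */
	if (current->lazy_mmu.enabled_count++ == 0)
		arch_enter_lazy_mmu_mode();
}
#else
static inline void lazy_mmu_enable(void) { }	/* likewise for the other helpers */
#endif

with lazy_mmu_disable() mirroring it on the 1 -> 0 transition.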

> 
>>
>> We can do things like
>>
>> a) Sanity check that while we are paused that we get no more
>> enable/disable requests
>> b) Sanity check that while we are paused that we get no more pause
>> requests.
> 
> These are good points - and this is only possible with such global
> state. (Similarly we can check that the counter never underflows.)

Exactly.

[..]

> 
> Overall what you're proposing seems sensible to me, the additional
> fields in task_struct don't take much space and we can keep the API
> unchanged in most cases. It is also good to have the option to check
> that the API is used correctly. I'll reply to the cover letter to let
> anyone who didn't follow this thread chip in, before I go ahead and try
> out that new approach.

And on top of the proposal above we will have some

struct arch_lazy_mmu_state;

defined by the architecture (could be an empty struct on most).

We can store that inside "struct lazy_mmu_state;" or if we ever have to, 
start returning only that from the enable/disable etc. functions.

For now, I'd say just store it in the task struct in the lazy_mmu_state. 
But we can always adjust later if required.

In the first (this) series we probably don't even have to introduce 
arch_lazy_mmu_state.

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 1 day, 21 hours ago
On 11/09/2025 20:14, David Hildenbrand wrote:
>>>> On the other hand, with a pagefault_disabled-like approach, there
>>>> is no
>>>> way to instruct call {3} to fully exit lazy_mmu regardless of the
>>>> nesting level.
>>>
>>> Sure there is, with a better API. See below. :)
>>
>> I meant while keeping the existing shape of the API but yes fair enough!
>
> Time to do it properly I guess :)

Yes, I think the discussions on that series have shown that we might as
well refactor it completely. Once and for all™!

>
> [...]
>
>>> Assume we store in the task_struct
>>>
>>> uint8_t lazy_mmu_enabled_count;
>>> bool lazy_mmu_paused;
>>
>> I didn't think of that approach! I can't immediately see any problem
>> with it, assuming we're fine with storing arch-specific context in
>> thread_struct (which seems to be the case as things stand).
>
> Right, just to complete the picture:
>
> a) We will have some CONFIG_ARCH_LAZY_MMU
>
> b) Without that config, all lazy_mmu_*() functions are a nop and no
> lazy_mmu_state is stored in task_struct 

Agreed on both counts (replacing __HAVE_ARCH_ENTER_LAZY_MMU_MODE).

>
> struct lazy_mmu_state {
>     uint8_t enabled_count;
>     bool paused;

Looking at the arm64 implementation, I'm thinking: instead of the paused
member, how about a PF_LAZY_MMU task flag? It would be set when lazy_mmu
is actually enabled (i.e. inside an enter()/leave() section, and not
inside a pause()/resume() section). This way, architectures could use
that flag directly to tell if lazy_mmu is enabled instead of reinventing
the wheel, all in slightly different ways. Namely:

* arm64 uses a thread flag (TIF_LAZY_MMU) - this is trivially replaced
with PF_LAZY_MMU
* powerpc and sparc use batch->active where batch is a per-CPU variable;
I expect this can also be replaced with PF_LAZY_MMU
* x86/xen is more complex as it has xen_lazy_mode which tracks both
LAZY_MMU and LAZY_CPU modes. I'd probably leave that one alone, unless a
Xen expert is motivated to refactor it.

With that approach, the implementation of arch_enter() and arch_leave()
becomes very simple (no tracking of lazy_mmu status) on arm64, powerpc
and sparc.

(Of course we could also have an "enabled" member in lazy_mmu_state
instead of PF_LAZY_MMU, there is no functional difference.)
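
For illustration (PF_LAZY_MMU is only a proposal at this point; the
bit value below is arbitrary and would need a free PF_* bit, and the
helper name is made up):

#define PF_LAZY_MMU	0x01000000	/* task is in an active lazy_mmu section */

static inline bool in_lazy_mmu_mode(void)
{
	/* what arch code would test instead of TIF_LAZY_MMU or batch->active */
	return current->flags & PF_LAZY_MMU;
}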

> }
>
> c) With that config, common-code lazy_mmu_*() functions implement the
> updating of the lazy_mmu_state in task_struct and call into arch code
> on the transition from 0->1, 1->0 etc.

Indeed, this is how I thought about it. There is actually quite a lot
that can be moved to the generic functions:
* Updating lazy_mmu_state
* Sanity checks on lazy_mmu_state (e.g. underflow/overflow)
* Bailing out if in_interrupt() (not done consistently across arch's at
the moment)

>
> Maybe that can be done through exiting
> arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode() callbacks, maybe
> we need more. I feel like
> we might be able to implement that through the existing helpers.

We might want to rename them to align with the new generic helpers, but
yes otherwise the principle should remain unchanged.

In fact, we will also need to revive arch_flush_lazy_mmu_mode(). Indeed,
in the nested situation, we need the following arch calls:

enter() -> arch_enter()
    enter() -> [nothing]
    leave() -> arch_flush()
leave() -> arch_leave()

leave() must always flush whatever arch state was batched, as may be
expected by the caller.
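
As a sketch (assuming a per-task enabled_count as discussed and a
revived arch_flush_lazy_mmu_mode()), the generic leave() could be:

static inline void lazy_mmu_disable(void)
{
	if (--current->lazy_mmu.enabled_count > 0)
		arch_flush_lazy_mmu_mode();	/* nested: flush, stay in lazy mode */
	else
		arch_leave_lazy_mmu_mode();	/* outermost: flush and disable */
}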

How does all that sound?

>
> [...]
>
>>
>> Overall what you're proposing seems sensible to me, the additional
>> fields in task_struct don't take much space and we can keep the API
>> unchanged in most cases. It is also good to have the option to check
>> that the API is used correctly. I'll reply to the cover letter to let
>> anyone who didn't follow this thread chip in, before I go ahead and try
>> out that new approach.
>
> And on top of the proposal above we will have some
>
> struct arch_lazy_mmu_state;
>
> define by the architecture (could be an empty struct on most).
>
> We can store that inside "struct lazy_mmu_state;" or if we ever have
> to, start returning only that from the enable/disable etc. functions.

I'm not sure we'd want to mix those styles (task_struct member + local
variable), that's adding complexity without much upside... Also having a
local variable at every nesting level only makes sense if we have an
arch callback regardless of nesting level, which is unnecessary in this
proposed API.

>
> For now, I'd say just store it in the task struct in the
> lazy_mmu_state. But we can always adjust later if required.
>
> In the first (this) series we probably don't even have to introduce
> arch_lazy_mmu_state. 

I suppose this could improve the overall struct layout - but otherwise I
don't really see the need compared to adding members to thread_struct
(which is fully arch-specific).

- Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 1 day, 21 hours ago
>>
>> struct lazy_mmu_state {
>>      uint8_t enabled_count;
>>      bool paused;
> 
> Looking at the arm64 implementation, I'm thinking: instead of the paused
> member, how about a PF_LAZY_MMU task flag? It would be set when lazy_mmu
> is actually enabled (i.e. inside an enter()/leave() section, and not
> inside a pause()/resume() section). This way, architectures could use
> that flag directly to tell if lazy_mmu is enabled instead of reinventing
> the wheel, all in slightly different ways. Namely:
> 
> * arm64 uses a thread flag (TIF_LAZY_MMU) - this is trivially replaced
> with PF_LAZY_MMU
> * powerpc and sparc use batch->active where batch is a per-CPU variable;
> I expect this can also be replaced with PF_LAZY_MMU
> * x86/xen is more complex as it has xen_lazy_mode which tracks both
> LAZY_MMU and LAZY_CPU modes. I'd probably leave that one alone, unless a
> Xen expert is motivated to refactor it.
> 
> With that approach, the implementation of arch_enter() and arch_leave()
> becomes very simple (no tracking of lazy_mmu status) on arm64, powerpc
> and sparc.
> 
> (Of course we could also have an "enabled" member in lazy_mmu_state
> instead of PF_LAZY_MMU, there is no functional difference.)
> 

No strong opinion, but to me it feels like PF_LAZY_MMU is rather "the 
effective state when combining nested+paused", and might complicate the 
code + sanity checks?

Of course we could fairly easily maintain that in addition from the
core, instead of letting archs do that manually.

I would probably have to see the end result to judge whether removing 
the "paused" bool makes things look more complicated or not.

>> }
>>
>> c) With that config, common-code lazy_mmu_*() functions implement the
>> updating of the lazy_mmu_state in task_struct and call into arch code
>> on the transition from 0->1, 1->0 etc.
> 
> Indeed, this is how I thought about it. There is actually quite a lot
> that can be moved to the generic functions:
> * Updating lazy_mmu_state
> * Sanity checks on lazy_mmu_state (e.g. underflow/overflow)
> * Bailing out if in_interrupt() (not done consistently across arch's at
> the moment)
> 
>>
>> Maybe that can be done through exiting
>> arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode() callbacks, maybe
>> we need more. I feel like
>> we might be able to implement that through the existing helpers.
> 
> We might want to rename them to align with the new generic helpers, but
> yes otherwise the principle should remain unchanged.
> 
> In fact, we will also need to revive arch_flush_lazy_mmu_mode().

That's okay if it's all hidden behind a sane core API.

> Indeed,
> in the nested situation, we need the following arch calls:
> 
> enter() -> arch_enter()
>      enter() -> [nothing]
>      leave() -> arch_flush()
> leave() -> arch_leave()
> 
> leave() must always flush whatever arch state was batched, as may be
> expected by the caller.
> 
> How does all that sound?

I am no expert on the "always flush when leaving", but it sounds 
reasonable to me.

Which arch operations would you call from

pause()
continue()

?

>> And on top of the proposal above we will have some
>>
>> struct arch_lazy_mmu_state;
>>
>> define by the architecture (could be an empty struct on most).
>>
>> We can store that inside "struct lazy_mmu_state;" or if we ever have
>> to, start returning only that from the enable/disable etc. functions.
> 
> I'm not sure we'd want to mix those styles (task_struct member + local
> variable), that's adding complexity without much upside... Also having a
> local variable at every nesting level only makes sense if we have an
> arch callback regardless of nesting level, which is unnecessary in this
> proposed API.

Yes, that was rather an "if we ever really run out of space we could
look into that"; I am not a fan of it, obviously.

-- 
Cheers

David / dhildenb

Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 1 day, 20 hours ago
On 12/09/2025 10:04, David Hildenbrand wrote:
>>>
>>> struct lazy_mmu_state {
>>>      uint8_t enabled_count;
>>>      bool paused;
>>
>> Looking at the arm64 implementation, I'm thinking: instead of the paused
>> member, how about a PF_LAZY_MMU task flag? It would be set when lazy_mmu
>> is actually enabled (i.e. inside an enter()/leave() section, and not
>> inside a pause()/resume() section). This way, architectures could use
>> that flag directly to tell if lazy_mmu is enabled instead of reinventing
>> the wheel, all in slightly different ways. Namely:
>>
>> * arm64 uses a thread flag (TIF_LAZY_MMU) - this is trivially replaced
>> with PF_LAZY_MMU
>> * powerpc and sparc use batch->active where batch is a per-CPU variable;
>> I expect this can also be replaced with PF_LAZY_MMU
>> * x86/xen is more complex as it has xen_lazy_mode which tracks both
>> LAZY_MMU and LAZY_CPU modes. I'd probably leave that one alone, unless a
>> Xen expert is motivated to refactor it.
>>
>> With that approach, the implementation of arch_enter() and arch_leave()
>> becomes very simple (no tracking of lazy_mmu status) on arm64, powerpc
>> and sparc.
>>
>> (Of course we could also have an "enabled" member in lazy_mmu_state
>> instead of PF_LAZY_MMU, there is no functional difference.)
>>
>
> No strong opinion, but to me it feels like PF_LAZY_MMU is rather "the
> effective state when combining nested+paused", and might complicate
> the code + sanity checks?
>
> So we could of course maintain that in addition fairly easily from the
> core instead of letting archs do that manually.
>
> I would probably have to see the end result to judge whether removing
> the "paused" bool makes things look more complicated or not.

Agreed, it is a little difficult to consider all the cases ahead of
time. What is clear to me though is that [paused] can be inferred from
[count + enabled], and conversely [enabled] from [count + paused]. As a
result I really wouldn't store both paused and enabled in task_struct -
duplicating information is how you create inconsistent states.

We can very easily introduce helpers to get the enabled/paused status
regardless of how they're stored. Since "enabled" is what we need to
know in most cases (arch checking the status), I would rather store
"enabled" than "paused". But indeed, let's see how it turns out in practice.

>
>>> }
>>>
>>> c) With that config, common-code lazy_mmu_*() functions implement the
>>> updating of the lazy_mmu_state in task_struct and call into arch code
>>> on the transition from 0->1, 1->0 etc.
>>
>> Indeed, this is how I thought about it. There is actually quite a lot
>> that can be moved to the generic functions:
>> * Updating lazy_mmu_state
>> * Sanity checks on lazy_mmu_state (e.g. underflow/overflow)
>> * Bailing out if in_interrupt() (not done consistently across arch's at
>> the moment)
>>
>>>
>>> Maybe that can be done through the existing
>>> arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode() callbacks, maybe
>>> we need more. I feel like
>>> we might be able to implement that through the existing helpers.
>>
>> We might want to rename them to align with the new generic helpers, but
>> yes otherwise the principle should remain unchanged.
>>
>> In fact, we will also need to revive arch_flush_lazy_mmu_mode().
>
> That's okay if it's all hidden behind a sane core API.
>
>> Indeed,
>> in the nested situation, we need the following arch calls:
>>
>> enter() -> arch_enter()
>>      enter() -> [nothing]
>>      leave() -> arch_flush()
>> leave() -> arch_leave()
>>
>> leave() must always flush whatever arch state was batched, as may be
>> expected by the caller.
>>
>> How does all that sound?
>
> I am no expert on the "always flush when leaving", but it sounds
> reasonable to me.

This is a core expectation for lazy_mmu: when leave() is called, any
batched state is flushed. The fact that this currently happens
unconditionally when leave() is called is precisely what stops nesting
from exploding on arm64 with DEBUG_PAGEALLOC [1].

[1] https://lore.kernel.org/all/aEhKSq0zVaUJkomX@arm.com/
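
To make the call sequence quoted above concrete, here is a rough sketch
of the generic wrappers (illustrative only: the lazy_mmu_mode_*() names,
the task_struct member from the sketch above, and the assumption that
the arch hooks go back to taking no arguments are all placeholders):

static inline void lazy_mmu_mode_enter(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt())
		return;

	VM_WARN_ON(state->count == U8_MAX);

	if (state->count++ == 0) {	/* outermost level */
		state->enabled = true;
		arch_enter_lazy_mmu_mode();
	}
}

static inline void lazy_mmu_mode_leave(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt())
		return;

	VM_WARN_ON(state->count == 0);

	if (--state->count == 0) {	/* outermost level */
		arch_leave_lazy_mmu_mode();
		state->enabled = false;
	} else {
		/* nested: flush batched updates, but stay in lazy mode */
		arch_flush_lazy_mmu_mode();
	}
}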

>
> Which arch operations would you call from
>
> pause()
> continue()

I also wondered about that. I think the safest is to make them
respectively arch_leave() and arch_enter() - the flushing entailed by
arch_leave() might not be required, but it is safer. Additionally,
powerpc/sparc disable preemption while in lazy_mmu, so it seems like a
good idea to re-enable it while paused (by calling arch_leave()).
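
i.e., something along these lines, building on the sketch above (same
caveats: this is only an illustration and all names are made up):

static inline void lazy_mmu_mode_pause(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	/* Only valid inside a section, and pause() does not nest. */
	VM_WARN_ON(state->count == 0 || !state->enabled);

	state->enabled = false;
	/* Flushes batched state; re-enables preemption on powerpc/sparc. */
	arch_leave_lazy_mmu_mode();
}

static inline void lazy_mmu_mode_resume(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	VM_WARN_ON(state->count == 0 || state->enabled);

	arch_enter_lazy_mmu_mode();
	state->enabled = true;
}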

- Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 1 day, 20 hours ago
> I also wondered about that. I think the safest is to make them
> respectively arch_leave() and arch_enter() - the flushing entailed by
> arch_leave() might not be required, but it is safer. Additionally,
> powerpc/sparc disable preemption while in lazy_mmu, so it seems like a
> good idea to re-enable it while paused (by calling arch_leave()).

Great, looking forward to seeing this all getting cleaned up and done 
properly for good.

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 1 day, 16 hours ago
On Fri, Sep 12, 2025 at 10:55:50AM +0200, David Hildenbrand wrote:

Hi David, Kevin,

> Great, looking forward to seeing this all getting cleaned up and done
> properly for good.

I am currently working on lazy mmu for s390 and this nesting
initiative kind of interferes. Well, in fact it looks like
it does not, but I am a bit lost in the last couple of iterations ;)

The prerequisite for s390 would be something like the change
below. With that change I can store the context in a per-cpu
structure and use it later in arch-specific ptep_* primitives.

Moreover, with a further (experimental) rework we could use
a custom kasan sanitizer to spot invalid direct (compiled-in)
PTE accesses, as opposed to the set_pte()/ptep_get() accessors.

I am not quite sure whether this could be derailed by
the new lazy mmu API. At least I do not immediately see any
obvious problem, but maybe you do?


[PATCH] mm: Make lazy MMU mode context-aware

The lazy MMU mode is assumed to be context-independent, in the
sense that the MMU does not need any additional data in lazy mode.
Yet, the s390 architecture may benefit strongly if it knows the
exact page table entries being changed while in lazy mode.

Introduce arch_enter_lazy_mmu_mode_pte(), which is provided
with the process memory space and the page table being
operated on, as a prerequisite for the s390 optimization.
It is expected to be called only against PTE page tables
and never to cross the page table boundary.

There is no change for architectures that do not need any
context.

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 fs/proc/task_mmu.c      | 2 +-
 include/linux/pgtable.h | 8 ++++++++
 mm/madvise.c            | 8 ++++----
 mm/memory.c             | 8 ++++----
 mm/mprotect.c           | 2 +-
 mm/mremap.c             | 2 +-
 mm/vmalloc.c            | 6 +++---
 7 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 751479eb128f..02fcd2771b2a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2493,7 +2493,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 		return 0;
 	}
 
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(vma->vm_mm, start, end, start_pte);
 
 	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
 		/* Fast path for performing exclusive WP */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 0b6e1f781d86..16235c198bcb 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -235,6 +235,14 @@ static inline int pmd_dirty(pmd_t pmd)
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
 #define arch_leave_lazy_mmu_mode()	do {} while (0)
 #define arch_flush_lazy_mmu_mode()	do {} while (0)
+
+static inline void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
+						unsigned long addr,
+						unsigned long end,
+						pte_t *ptep)
+{
+	arch_enter_lazy_mmu_mode(); 
+}
 #endif
 
 #ifndef pte_batch_hint
diff --git a/mm/madvise.c b/mm/madvise.c
index 1d44a35ae85c..d36d4dc42378 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -448,7 +448,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, addr, end, start_pte);
 	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -509,7 +509,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				arch_enter_lazy_mmu_mode();
+				arch_enter_lazy_mmu_mode_pte(mm, addr, end, start_pte);
 				if (!err)
 					nr = 0;
 				continue;
@@ -678,7 +678,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!start_pte)
 		return 0;
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, addr, end, start_pte);
 	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
 		nr = 1;
 		ptent = ptep_get(pte);
@@ -743,7 +743,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				if (!start_pte)
 					break;
 				flush_tlb_batched_pending(mm);
-				arch_enter_lazy_mmu_mode();
+				arch_enter_lazy_mmu_mode_pte(mm, addr, end, pte);
 				if (!err)
 					nr = 0;
 				continue;
diff --git a/mm/memory.c b/mm/memory.c
index b0cda5aab398..93c0b8457eb0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1131,7 +1131,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 	orig_src_pte = src_pte;
 	orig_dst_pte = dst_pte;
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(src_mm, addr, end, src_pte);
 
 	do {
 		nr = 1;
@@ -1723,7 +1723,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		return addr;
 
 	flush_tlb_batched_pending(mm);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, addr, end, start_pte);
 	do {
 		bool any_skipped = false;
 
@@ -2707,7 +2707,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, addr, end, mapped_pte);
 	do {
 		BUG_ON(!pte_none(ptep_get(pte)));
 		if (!pfn_modify_allowed(pfn, prot)) {
@@ -3024,7 +3024,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			return -EINVAL;
 	}
 
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, addr, end, mapped_pte);
 
 	if (fn) {
 		do {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88608d0dc2c2..919c1dedff87 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -106,7 +106,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 		target_node = numa_node_id();
 
 	flush_tlb_batched_pending(vma->vm_mm);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(vma->vm_mm, addr, end, pte);
 	do {
 		oldpte = ptep_get(pte);
 		if (pte_present(oldpte)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 60f6b8d0d5f0..08b9cb3bb9ef 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -233,7 +233,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 	flush_tlb_batched_pending(vma->vm_mm);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(mm, old_addr, old_end, old_pte);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6dbcdceecae1..29cfc64970a5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(&init_mm, addr, end, pte);
 
 	do {
 		if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -359,7 +359,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	unsigned long size = PAGE_SIZE;
 
 	pte = pte_offset_kernel(pmd, addr);
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(&init_mm, addr, end, pte);
 
 	do {
 #ifdef CONFIG_HUGETLB_PAGE
@@ -526,7 +526,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	arch_enter_lazy_mmu_mode_pte(&init_mm, addr, end, pte);
 
 	do {
 		struct page *page = pages[*nr];

> David / dhildenb

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 1 day, 16 hours ago
On 12.09.25 14:37, Alexander Gordeev wrote:
> On Fri, Sep 12, 2025 at 10:55:50AM +0200, David Hildenbrand wrote:
> 
> Hi David, Kevin,
> 
>> Great, looking forward to seeing this all getting cleaned up and done
>> properly for good.
> 
> I am currently working on lazy mmu for s390 and this nesting
> initiative kind of interferes. Well, in fact it looks like
> it does not, but I am a bit lost in the last couple of iterations ;)
> 
> The prerequisite for s390 would be something like the change
> below. With that change I can store the context in a per-cpu
> structure and use it later in arch-specific ptep_* primitives.
> 
> Moreover, with a further (experimental) rework we could use
> a custom kasan sanitizer to spot invalid direct (compiled-in)
> PTE accesses, as opposed to the set_pte()/ptep_get() accessors.
> 
> I am not quite sure whether this could be derailed by
> the new lazy mmu API. At least I do not immediately see any
> obvious problem, but maybe you do?

It would just be passing more context down to the architecture, right?

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 1 day, 16 hours ago
On Fri, Sep 12, 2025 at 02:40:55PM +0200, David Hildenbrand wrote:
> It would just be passing more context down to the architecture, right?

Yes. Namely, this one would be arch-defined, with arch_enter_lazy_mmu_mode()
as the default.

static inline void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
						unsigned long addr,
						unsigned long end,
						pte_t *ptep)
{
	...
}

> David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 1 day, 16 hours ago
On 12.09.25 14:56, Alexander Gordeev wrote:
> On Fri, Sep 12, 2025 at 02:40:55PM +0200, David Hildenbrand wrote:
>> It would just be passing more context down to the architecture, right?
> 
> Yes. Namely, this one would be arch-defined, with arch_enter_lazy_mmu_mode()
> as the default.
> 

How would that work with nesting? I feel like there is a fundamental
problem with nesting in what you describe, but I might be wrong.

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Alexander Gordeev 1 day, 15 hours ago
On Fri, Sep 12, 2025 at 03:02:15PM +0200, David Hildenbrand wrote:
> How would that work with nesting? I feel like there is a fundamental problem
> with nesting in what you describe, but I might be wrong.

My picture is: flush on each lazy_mmu_disable(), pause on lazy_mmu_pause(),
and honour only the top-level arch_enter_lazy_mmu_mode_pte(mm, start, end, ptep)
context on all nested levels.

In theory (and if I got it right, you leave the door open for this possibility)
every (mm, start, end, ptep) context could be stored for each nesting level
(as opaque arch-specific data?).

But I do not really expect that to ever happen, since arch_enter_lazy_mmu_mode_pte()
is only to be called in PTE walkers that never span more than one page
table and follow the pattern:

	ptep = pte_offset_map_lock(...);
	arch_enter_lazy_mmu_mode_pte(mm, start, end, ptep);

	for (...; ptep++) {
		/*
		 * set_pte(ptep, ...) or something
		 */
	}

	arch_leave_lazy_mmu_mode();                                             
	pte_unmap_unlock(...);                                         

As a result, the lazy mmu mode is only "bound" to a single PTE table on s390,
while arch_enter_lazy_mmu_mode() is going to stay a NOP.

So when you say you feel there is a fundamental problem - what could that be?

> David / dhildenb

Thanks!
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by David Hildenbrand 1 day, 14 hours ago
On 12.09.25 16:05, Alexander Gordeev wrote:
> On Fri, Sep 12, 2025 at 03:02:15PM +0200, David Hildenbrand wrote:
>> How would that work with nesting? I feel like there is a fundamental problem
>> with nesting in what you describe, but I might be wrong.
> 
> My picture is: flush on each lazy_mmu_disable(), pause on lazy_mmu_pause(),
> and honour only the top-level arch_enter_lazy_mmu_mode_pte(mm, start, end, ptep)
> context on all nested levels.
> 
> In theory (and if I got it right, you leave the door open for this possibility)
> every (mm, start, end, ptep) context could be stored for each nesting level
> (as opaque arch-specific data?).

Yes, I explained that we could do that, for example, by returning a 
"struct arch_lazy_mmu_state" from enable() and feeding it into disable().

I just wish that we could avoid that ...

As an alternative, you could store it somewhere else as an array (percpu 
variable? task_struct) and support only a limited number of nesting 
levels. The current nesting level could always be retrieved from the 
task_struct, for example.

Maybe s390x really wouldn't need support for more than one nesting level 
right now.
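
For instance, something along these lines - completely made-up names,
just to illustrate the fixed-depth idea, and assuming core code exposes
the current nesting level via some hypothetical lazy_mmu_nesting_level()
helper:

/* Per-CPU storage as Alexander suggested; depends on <linux/percpu.h>. */
#define LAZY_MMU_MAX_NESTING	2	/* arbitrary limit */

struct s390_lazy_mmu_ctx {
	struct mm_struct	*mm;
	unsigned long		start;
	unsigned long		end;
	pte_t			*ptep;
};

static DEFINE_PER_CPU(struct s390_lazy_mmu_ctx,
		      lazy_mmu_ctx[LAZY_MMU_MAX_NESTING]);

static struct s390_lazy_mmu_ctx *lazy_mmu_cur_ctx(void)
{
	struct s390_lazy_mmu_ctx *ctx = this_cpu_ptr(lazy_mmu_ctx);

	return &ctx[lazy_mmu_nesting_level()];	/* hypothetical core helper */
}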

> 
> But I do not really expect that to ever happen, since arch_enter_lazy_mmu_mode_pte()
> is only to be called in PTE walkers that never span more than one page
> table and follow the pattern:

Well, the cover letter here states:

"Unfortunately, a corner case (DEBUG_PAGEALLOC) may still cause nesting 
to occur on arm64. Ryan proposed [2] to address that corner case at the 
generic level but this approach received pushback; [3] then attempted to 
solve the issue on arm64 only, but it was deemed too fragile."

So I guess we should support nesting cleanly, at least on the core-mm side.

I guess we could start with saying "well, s390x doesn't fully support 
nesting yet but doing so just requires changing the way we manage this 
per-nesting-level state internally".

s390 is trying to do something different than the other archs here, so 
that naturally concerns me :)

But if it's really just about forwarding that data and having s390 store 
it somewhere (task_struct, percpu variable, etc), fine with me.

-- 
Cheers

David / dhildenb
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 1 day, 14 hours ago
On 12/09/2025 16:25, David Hildenbrand wrote:
>
>>
>> But I do not really expect it ever, since arch_enter_lazy_mmu_mode_pte()
>> is only to be called in PTE walkers that never span more than one page
>> table and follow the pattern:
>
> Well, the cover letter here states:
>
> "Unfortunately, a corner case (DEBUG_PAGEALLOC) may still cause
> nesting to occur on arm64. Ryan proposed [2] to address that corner
> case at the generic level but this approach received pushback; [3]
> then attempted to solve the issue on arm64 only, but it was deemed too
> fragile."
>
> So I guess we should support nesting cleanly, at least on the core-mm
> side.

Nesting remains a rare occurrence though. I think it would be plausible
to require this new interface to be used in a region where no nesting
can occur, just like pause()/resume().

In fact, I think this is a requirement if we go for the approach we have
been discussing, because nested enter()/leave() calls are not meant to
call arch_enter()/arch_leave(), and I really wouldn't want to use a
different logic for this variant.

>
> I guess we could start with saying "well, s390x doesn't fully support
> nesting yet but doing so just requires changing the way we manage this
> per-nesting-level state internally".
>
> s390 is trying to do something different than the other archs here, so
> that naturally concerns me :)
>
> But if it's really just about forwarding that data and having s390
> store it somewhere (task_struct, percpu variable, etc), fine with me. 

Yes I think this is fine, with the restriction above. The extra
arguments are directly forwarded to arch code and otherwise ignored by
core code, and unless the arch defines some __HAVE_ARCH... or CONFIG,
the extended interface falls back to regular enter()/leave().
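
i.e. something along the lines of the generic fallback in Alexander's
diff above, but guarded by an arch opt-in - illustrative only, the guard
macro name below is made up:

#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE_PTE
static inline lazy_mmu_state_t
arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm, unsigned long addr,
			     unsigned long end, pte_t *ptep)
{
	/* Extra context is ignored unless the arch opts in. */
	return arch_enter_lazy_mmu_mode();
}
#endif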

- Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 4 days, 15 hours ago
On 09/09/2025 15:49, Kevin Brodsky wrote:
> On 09/09/2025 13:54, David Hildenbrand wrote:
>> On 09.09.25 13:45, Alexander Gordeev wrote:
>>> On Tue, Sep 09, 2025 at 12:09:48PM +0200, David Hildenbrand wrote:
>>>> On 09.09.25 11:40, Alexander Gordeev wrote:
>>>>> On Tue, Sep 09, 2025 at 11:07:36AM +0200, David Hildenbrand wrote:
>>>>>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>>>>>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>>>>>>> (taking and returning no value). This is proving problematic in
>>>>>>> situations where leave() needs to restore some context back to its
>>>>>>> original state (before enter() was called). In particular, this
>>>>>>> makes it difficult to support the nesting of lazy_mmu sections -
>>>>>>> leave() does not know whether the matching enter() call occurred
>>>>>>> while lazy_mmu was already enabled, and whether to disable it or
>>>>>>> not.
>>>>>>>
>>>>>>> This patch gives all architectures the chance to store local state
>>>>>>> while inside a lazy_mmu section by making enter() return some value,
>>>>>>> storing it in a local variable, and having leave() take that value.
>>>>>>> That value is typed lazy_mmu_state_t - each architecture defining
>>>>>>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>>>>>>> For now we define it as int everywhere, which is sufficient to
>>>>>>> support nesting.
>>>>> ...
>>>>>>> {
>>>>>>> + lazy_mmu_state_t lazy_mmu_state;
>>>>>>> ...
>>>>>>> - arch_enter_lazy_mmu_mode();
>>>>>>> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
>>>>>>> ...
>>>>>>> - arch_leave_lazy_mmu_mode();
>>>>>>> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
>>>>>>> ...
>>>>>>> }
>>>>>>>
>>>>>>> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>>>>>>>      lazy_mmu is already enabled, and it temporarily disables it by
>>>>>>>      calling leave() and then enter() again. Here we want to ensure
>>>>>>>      that any operation between the leave() and enter() calls is
>>>>>>>      completed immediately; for that reason we pass
>>>>>>> LAZY_MMU_DEFAULT to
>>>>>>>      leave() to fully disable lazy_mmu. enter() will then
>>>>>>> re-enable it
>>>>>>>      - this achieves the expected behaviour, whether nesting
>>>>>>> occurred
>>>>>>>      before that function was called or not.
>>>>>>>
>>>>>>> Note: it is difficult to provide a default definition of
>>>>>>> lazy_mmu_state_t for architectures implementing lazy_mmu, because
>>>>>>> that definition would need to be available in
>>>>>>> arch/x86/include/asm/paravirt_types.h and adding a new generic
>>>>>>>     #include there is very tricky due to the existing header soup.
>>>>>> Yeah, I was wondering about exactly that.
>>>>>>
>>>>>> In particular because LAZY_MMU_DEFAULT etc resides somewhere
>>>>>> completely different.
>>>>>>
>>>>>> Which raises the question: is using a new type really of any
>>>>>> benefit here?
>>>>>>
>>>>>> Can't we just use an "enum lazy_mmu_state" and call it a day?
>>>>> I could envision something completely different for this type on s390,
>>>>> e.g. a pointer to a per-cpu structure. So I would really ask to stick
>>>>> with the current approach.
> This is indeed the motivation - let every arch do whatever it sees fit.
> lazy_mmu_state_t is basically an opaque type as far as generic code is
> concerned, which also means that this API change is the first and last
> one we need (famous last words, I know). 
>
> I mentioned in the cover letter that the pkeys-based page table
> protection series [1] would have an immediate use for lazy_mmu_state_t.
> In that proposal, any helper writing to pgtables needs to modify the
> pkey register and then restore it. To reduce the overhead, lazy_mmu is
> used to set the pkey register only once in enter(), and then restore it
> in leave() [2]. This currently relies on storing the original pkey
> register value in thread_struct, which is suboptimal and most
> importantly doesn't work if lazy_mmu sections nest. With this series, we
> could instead store the pkey register value in lazy_mmu_state_t
> (enlarging it to 64 bits or more).

Forgot the references, sorry...

[1]
https://lore.kernel.org/linux-hardening/20250815085512.2182322-1-kevin.brodsky@arm.com/
[2]
https://lore.kernel.org/linux-hardening/20250815085512.2182322-19-kevin.brodsky@arm.com/

> I also considered going further and making lazy_mmu_state_t a pointer as
> Alexander suggested - more complex to manage, but also a lot more flexible.
>
>>>> Would that integrate well with LAZY_MMU_DEFAULT etc?
>>> Hmm... I thought the idea is to use LAZY_MMU_* by architectures that
>>> want to use it - at least that is how I read the description above.
>>>
>>> It is only kasan_populate|depopulate_vmalloc_pte() in generic code
>>> that do not follow this pattern, and that looks like a problem to me.
> This discussion also made me realise that this is problematic, as the
> LAZY_MMU_{DEFAULT,NESTED} macros were meant only for architectures'
> convenience, not for generic code (where lazy_mmu_state_t should ideally
> be an opaque type as mentioned above). It almost feels like the kasan
> case deserves a different API, because this is not how enter() and
> leave() are meant to be used. This would mean quite a bit of churn
> though, so maybe just introduce another arch-defined value to pass to
> leave() for such a situation - for instance,
> arch_leave_lazy_mmu_mode(LAZY_MMU_FLUSH)?
>
>> Yes, that's why I am asking.
>>
>> What kind of information (pointer to a per-cpu structure) would you
>> want to return, and would handling it similar to how
>> pagefault_disable()/pagefault_enable() e.g., using a variable in
>> "current" to track the nesting level avoid having s390x to do that?
> The pagefault_disabled approach works fine for simple use-cases, but it
> doesn't scale well. The space allocated in task_struct/thread_struct to
> track that state is wasted (unused) most of the time. Worse, it does not
> truly enable states to be nested: it allows the outermost section to
> store some state, but nested sections cannot allocate extra space. This
> is really what the stack is for.
>
> - Kevin
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Andrew Morton 4 days, 23 hours ago
On Mon,  8 Sep 2025 08:39:26 +0100 Kevin Brodsky <kevin.brodsky@arm.com> wrote:

> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> (taking and returning no value). This is proving problematic in
> situations where leave() needs to restore some context back to its
> original state (before enter() was called). In particular, this
> makes it difficult to support the nesting of lazy_mmu sections -
> leave() does not know whether the matching enter() call occurred
> while lazy_mmu was already enabled, and whether to disable it or
> not.
> 
> This patch gives all architectures the chance to store local state
> while inside a lazy_mmu section by making enter() return some value,
> storing it in a local variable, and having leave() take that value.
> That value is typed lazy_mmu_state_t - each architecture defining
> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> For now we define it as int everywhere, which is sufficient to
> support nesting.
> 
> The diff is unfortunately rather large as all the API changes need
> to be done atomically. Main parts:

This has a build error:

  CC      arch/x86/kernel/asm-offsets.s
In file included from ./arch/x86/include/asm/irqflags.h:102,
                 from ./include/linux/irqflags.h:18,
                 from ./include/linux/spinlock.h:59,
                 from ./include/linux/swait.h:7,
                 from ./include/linux/completion.h:12,
                 from ./include/linux/crypto.h:15,
                 from arch/x86/kernel/asm-offsets.c:9:
./arch/x86/include/asm/paravirt.h: In function 'arch_enter_lazy_mmu_mode':
./arch/x86/include/asm/paravirt.h:534:16: error: 'LAZY_MMU_DEFAULT' undeclared (first use in this function)
  534 |         return LAZY_MMU_DEFAULT;
      |                ^~~~~~~~~~~~~~~~
./arch/x86/include/asm/paravirt.h:534:16: note: each undeclared identifier is re

which gets fixed up later in the series.
Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Kevin Brodsky 4 days, 20 hours ago
On 09/09/2025 07:40, Andrew Morton wrote:
> On Mon,  8 Sep 2025 08:39:26 +0100 Kevin Brodsky <kevin.brodsky@arm.com> wrote:
>
>> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
>> (taking and returning no value). This is proving problematic in
>> situations where leave() needs to restore some context back to its
>> original state (before enter() was called). In particular, this
>> makes it difficult to support the nesting of lazy_mmu sections -
>> leave() does not know whether the matching enter() call occurred
>> while lazy_mmu was already enabled, and whether to disable it or
>> not.
>>
>> This patch gives all architectures the chance to store local state
>> while inside a lazy_mmu section by making enter() return some value,
>> storing it in a local variable, and having leave() take that value.
>> That value is typed lazy_mmu_state_t - each architecture defining
>> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
>> For now we define it as int everywhere, which is sufficient to
>> support nesting.
>>
>> The diff is unfortunately rather large as all the API changes need
>> to be done atomically. Main parts:
> This has a build error:
>
>   CC      arch/x86/kernel/asm-offsets.s
> In file included from ./arch/x86/include/asm/irqflags.h:102,
>                  from ./include/linux/irqflags.h:18,
>                  from ./include/linux/spinlock.h:59,
>                  from ./include/linux/swait.h:7,
>                  from ./include/linux/completion.h:12,
>                  from ./include/linux/crypto.h:15,
>                  from arch/x86/kernel/asm-offsets.c:9:
> ./arch/x86/include/asm/paravirt.h: In function 'arch_enter_lazy_mmu_mode':
> ./arch/x86/include/asm/paravirt.h:534:16: error: 'LAZY_MMU_DEFAULT' undeclared (first use in this function)
>   534 |         return LAZY_MMU_DEFAULT;
>       |                ^~~~~~~~~~~~~~~~
> ./arch/x86/include/asm/paravirt.h:534:16: note: each undeclared identifier is re
>
> which gets fixed up later in the series.

Oh indeed good catch! I don't think there's an easy way to fix this
cleanly due to the header soup. Since it's just a temporary change, I
suggest:

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 65a0d394fba1..67b9549b4255 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -531,7 +531,7 @@ static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
 {
     PVOP_VCALL0(mmu.lazy_mode.enter);
 
-    return LAZY_MMU_DEFAULT;
+    return 0; /* LAZY_MMU_DEFAULT */
 }
 
 static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)


That will generate a trivial conflict with patch 4, naturally.

Should I send a v3 with that change?

- Kevin

Re: [PATCH v2 2/7] mm: introduce local state for lazy_mmu sections
Posted by Yeoreum Yun 5 days, 19 hours ago
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>

On Mon, Sep 08, 2025 at 08:39:26AM +0100, Kevin Brodsky wrote:
> arch_{enter,leave}_lazy_mmu_mode() currently have a stateless API
> (taking and returning no value). This is proving problematic in
> situations where leave() needs to restore some context back to its
> original state (before enter() was called). In particular, this
> makes it difficult to support the nesting of lazy_mmu sections -
> leave() does not know whether the matching enter() call occurred
> while lazy_mmu was already enabled, and whether to disable it or
> not.
>
> This patch gives all architectures the chance to store local state
> while inside a lazy_mmu section by making enter() return some value,
> storing it in a local variable, and having leave() take that value.
> That value is typed lazy_mmu_state_t - each architecture defining
> __HAVE_ARCH_ENTER_LAZY_MMU_MODE is free to define it as it sees fit.
> For now we define it as int everywhere, which is sufficient to
> support nesting.
>
> The diff is unfortunately rather large as all the API changes need
> to be done atomically. Main parts:
>
> * Changing the prototypes of arch_{enter,leave}_lazy_mmu_mode()
>   in generic and arch code, and introducing lazy_mmu_state_t.
>
> * Introducing LAZY_MMU_{DEFAULT,NESTED} for future support of
>   nesting. enter() always returns LAZY_MMU_DEFAULT for now.
>   (linux/mm_types.h is not the most natural location for defining
>   those constants, but there is no other obvious header that is
>   accessible where arch's implement the helpers.)
>
> * Changing all lazy_mmu sections to introduce a lazy_mmu_state
>   local variable, having enter() set it and leave() take it. Most of
>   these changes were generated using the following Coccinelle script:
>
> @@
> @@
> {
> + lazy_mmu_state_t lazy_mmu_state;
> ...
> - arch_enter_lazy_mmu_mode();
> + lazy_mmu_state = arch_enter_lazy_mmu_mode();
> ...
> - arch_leave_lazy_mmu_mode();
> + arch_leave_lazy_mmu_mode(lazy_mmu_state);
> ...
> }
>
> * In a few cases (e.g. xen_flush_lazy_mmu()), a function knows that
>   lazy_mmu is already enabled, and it temporarily disables it by
>   calling leave() and then enter() again. Here we want to ensure
>   that any operation between the leave() and enter() calls is
>   completed immediately; for that reason we pass LAZY_MMU_DEFAULT to
>   leave() to fully disable lazy_mmu. enter() will then re-enable it
>   - this achieves the expected behaviour, whether nesting occurred
>   before that function was called or not.
>
> Note: it is difficult to provide a default definition of
> lazy_mmu_state_t for architectures implementing lazy_mmu, because
> that definition would need to be available in
> arch/x86/include/asm/paravirt_types.h and adding a new generic
>  #include there is very tricky due to the existing header soup.
>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
> ---
>  arch/arm64/include/asm/pgtable.h              | 10 +++++++---
>  .../include/asm/book3s/64/tlbflush-hash.h     |  9 ++++++---
>  arch/powerpc/mm/book3s64/hash_tlb.c           | 10 ++++++----
>  arch/powerpc/mm/book3s64/subpage_prot.c       |  5 +++--
>  arch/sparc/include/asm/tlbflush_64.h          |  5 +++--
>  arch/sparc/mm/tlb.c                           |  6 ++++--
>  arch/x86/include/asm/paravirt.h               |  6 ++++--
>  arch/x86/include/asm/paravirt_types.h         |  2 ++
>  arch/x86/xen/enlighten_pv.c                   |  2 +-
>  arch/x86/xen/mmu_pv.c                         |  2 +-
>  fs/proc/task_mmu.c                            |  5 +++--
>  include/linux/mm_types.h                      |  3 +++
>  include/linux/pgtable.h                       |  6 ++++--
>  mm/kasan/shadow.c                             |  4 ++--
>  mm/madvise.c                                  | 20 ++++++++++---------
>  mm/memory.c                                   | 20 +++++++++++--------
>  mm/migrate_device.c                           |  5 +++--
>  mm/mprotect.c                                 |  5 +++--
>  mm/mremap.c                                   |  5 +++--
>  mm/userfaultfd.c                              |  5 +++--
>  mm/vmalloc.c                                  | 15 ++++++++------
>  mm/vmscan.c                                   | 15 ++++++++------
>  22 files changed, 102 insertions(+), 63 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 728d7b6ed20a..816197d08165 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -81,7 +81,9 @@ static inline void queue_pte_barriers(void)
>  }
>
>  #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> -static inline void arch_enter_lazy_mmu_mode(void)
> +typedef int lazy_mmu_state_t;
> +
> +static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
>  {
>  	/*
>  	 * lazy_mmu_mode is not supposed to permit nesting. But in practice this
> @@ -96,12 +98,14 @@ static inline void arch_enter_lazy_mmu_mode(void)
>  	 */
>
>  	if (in_interrupt())
> -		return;
> +		return LAZY_MMU_DEFAULT;
>
>  	set_thread_flag(TIF_LAZY_MMU);
> +
> +	return LAZY_MMU_DEFAULT;
>  }
>
> -static inline void arch_leave_lazy_mmu_mode(void)
> +static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
>  {
>  	if (in_interrupt())
>  		return;
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> index 176d7fd79eeb..c9f1e819e567 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
> @@ -25,13 +25,14 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
>  extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
>
>  #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +typedef int lazy_mmu_state_t;
>
> -static inline void arch_enter_lazy_mmu_mode(void)
> +static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
>  {
>  	struct ppc64_tlb_batch *batch;
>
>  	if (radix_enabled())
> -		return;
> +		return LAZY_MMU_DEFAULT;
>  	/*
>  	 * apply_to_page_range can call us this preempt enabled when
>  	 * operating on kernel page tables.
> @@ -39,9 +40,11 @@ static inline void arch_enter_lazy_mmu_mode(void)
>  	preempt_disable();
>  	batch = this_cpu_ptr(&ppc64_tlb_batch);
>  	batch->active = 1;
> +
> +	return LAZY_MMU_DEFAULT;
>  }
>
> -static inline void arch_leave_lazy_mmu_mode(void)
> +static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
>  {
>  	struct ppc64_tlb_batch *batch;
>
> diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
> index 21fcad97ae80..ee664f88e679 100644
> --- a/arch/powerpc/mm/book3s64/hash_tlb.c
> +++ b/arch/powerpc/mm/book3s64/hash_tlb.c
> @@ -189,6 +189,7 @@ void hash__tlb_flush(struct mmu_gather *tlb)
>   */
>  void __flush_hash_table_range(unsigned long start, unsigned long end)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int hugepage_shift;
>  	unsigned long flags;
>
> @@ -205,7 +206,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
>  	 * way to do things but is fine for our needs here.
>  	 */
>  	local_irq_save(flags);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	for (; start < end; start += PAGE_SIZE) {
>  		pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
>  		unsigned long pte;
> @@ -217,12 +218,13 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
>  			continue;
>  		hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
>  	}
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	local_irq_restore(flags);
>  }
>
>  void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte;
>  	pte_t *start_pte;
>  	unsigned long flags;
> @@ -237,7 +239,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long
>  	 * way to do things but is fine for our needs here.
>  	 */
>  	local_irq_save(flags);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	start_pte = pte_offset_map(pmd, addr);
>  	if (!start_pte)
>  		goto out;
> @@ -249,6 +251,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long
>  	}
>  	pte_unmap(start_pte);
>  out:
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	local_irq_restore(flags);
>  }
> diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
> index ec98e526167e..4720f9f321af 100644
> --- a/arch/powerpc/mm/book3s64/subpage_prot.c
> +++ b/arch/powerpc/mm/book3s64/subpage_prot.c
> @@ -53,6 +53,7 @@ void subpage_prot_free(struct mm_struct *mm)
>  static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
>  			     int npages)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
> @@ -73,13 +74,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
>  	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
>  	if (!pte)
>  		return;
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	for (; npages > 0; --npages) {
>  		pte_update(mm, addr, pte, 0, 0, 0);
>  		addr += PAGE_SIZE;
>  		++pte;
>  	}
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(pte - 1, ptl);
>  }
>
> diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h
> index cd144eb31bdd..02c93a4e6af5 100644
> --- a/arch/sparc/include/asm/tlbflush_64.h
> +++ b/arch/sparc/include/asm/tlbflush_64.h
> @@ -40,10 +40,11 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
>  void flush_tlb_kernel_range(unsigned long start, unsigned long end);
>
>  #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +typedef int lazy_mmu_state_t;
>
>  void flush_tlb_pending(void);
> -void arch_enter_lazy_mmu_mode(void);
> -void arch_leave_lazy_mmu_mode(void);
> +lazy_mmu_state_t arch_enter_lazy_mmu_mode(void);
> +void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state);
>
>  /* Local cpu only.  */
>  void __flush_tlb_all(void);
> diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
> index a35ddcca5e76..bf5094b770af 100644
> --- a/arch/sparc/mm/tlb.c
> +++ b/arch/sparc/mm/tlb.c
> @@ -50,16 +50,18 @@ void flush_tlb_pending(void)
>  	put_cpu_var(tlb_batch);
>  }
>
> -void arch_enter_lazy_mmu_mode(void)
> +lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
>  {
>  	struct tlb_batch *tb;
>
>  	preempt_disable();
>  	tb = this_cpu_ptr(&tlb_batch);
>  	tb->active = 1;
> +
> +	return LAZY_MMU_DEFAULT;
>  }
>
> -void arch_leave_lazy_mmu_mode(void)
> +void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
>  {
>  	struct tlb_batch *tb = this_cpu_ptr(&tlb_batch);
>
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index b5e59a7ba0d0..65a0d394fba1 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -527,12 +527,14 @@ static inline void arch_end_context_switch(struct task_struct *next)
>  }
>
>  #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> -static inline void arch_enter_lazy_mmu_mode(void)
> +static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
>  {
>  	PVOP_VCALL0(mmu.lazy_mode.enter);
> +
> +	return LAZY_MMU_DEFAULT;
>  }
>
> -static inline void arch_leave_lazy_mmu_mode(void)
> +static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
>  {
>  	PVOP_VCALL0(mmu.lazy_mode.leave);
>  }
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 37a8627d8277..bc1af86868a3 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -41,6 +41,8 @@ struct pv_info {
>  };
>
>  #ifdef CONFIG_PARAVIRT_XXL
> +typedef int lazy_mmu_state_t;
> +
>  struct pv_lazy_ops {
>  	/* Set deferred update mode, used for batching operations. */
>  	void (*enter)(void);
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 26bbaf4b7330..a245ba47a631 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -426,7 +426,7 @@ static void xen_start_context_switch(struct task_struct *prev)
>  	BUG_ON(preemptible());
>
>  	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
> -		arch_leave_lazy_mmu_mode();
> +		arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
>  		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
>  	}
>  	enter_lazy(XEN_LAZY_CPU);
> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
> index 2a4a8deaf612..2039d5132ca3 100644
> --- a/arch/x86/xen/mmu_pv.c
> +++ b/arch/x86/xen/mmu_pv.c
> @@ -2140,7 +2140,7 @@ static void xen_flush_lazy_mmu(void)
>  	preempt_disable();
>
>  	if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
> -		arch_leave_lazy_mmu_mode();
> +		arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
>  		arch_enter_lazy_mmu_mode();
>  	}
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index ced01cf3c5ab..02aa55f83bae 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2682,6 +2682,7 @@ static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
>  static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>  				  unsigned long end, struct mm_walk *walk)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct pagemap_scan_private *p = walk->private;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long addr, flush_end = 0;
> @@ -2700,7 +2701,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>  		return 0;
>  	}
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
>  		/* Fast path for performing exclusive WP */
> @@ -2770,7 +2771,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>  	if (flush_end)
>  		flush_tlb_range(vma, start, addr);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(start_pte, ptl);
>
>  	cond_resched();
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 275e8060d918..143d819c1386 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1489,6 +1489,9 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
>  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
>  extern void tlb_finish_mmu(struct mmu_gather *tlb);
>
> +#define LAZY_MMU_DEFAULT	0
> +#define LAZY_MMU_NESTED		1
> +
>  struct vm_fault;
>
>  /**
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 8d6007123cdf..df0eb898b3fc 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -232,8 +232,10 @@ static inline int pmd_dirty(pmd_t pmd)
>   * and the mode cannot be used in interrupt context.
>   */
>  #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> -#define arch_enter_lazy_mmu_mode()	do {} while (0)
> -#define arch_leave_lazy_mmu_mode()	do {} while (0)
> +typedef int lazy_mmu_state_t;
> +
> +#define arch_enter_lazy_mmu_mode()	(LAZY_MMU_DEFAULT)
> +#define arch_leave_lazy_mmu_mode(state)	((void)(state))
>  #endif
>
>  #ifndef pte_batch_hint
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index 5d2a876035d6..60b1b72f5ce1 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>  	pte_t pte;
>  	int index;
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
>
>  	index = PFN_DOWN(addr - data->start);
>  	page = data->pages[index];
> @@ -482,7 +482,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>  	pte_t pte;
>  	int none;
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(LAZY_MMU_DEFAULT);
>
>  	spin_lock(&init_mm.page_table_lock);
>  	pte = ptep_get(ptep);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 35ed4ab0d7c5..72c032f2cf56 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -357,6 +357,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  				unsigned long addr, unsigned long end,
>  				struct mm_walk *walk)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct madvise_walk_private *private = walk->private;
>  	struct mmu_gather *tlb = private->tlb;
>  	bool pageout = private->pageout;
> @@ -455,7 +456,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  	if (!start_pte)
>  		return 0;
>  	flush_tlb_batched_pending(mm);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
>  		nr = 1;
>  		ptent = ptep_get(pte);
> @@ -463,7 +464,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		if (++batch_count == SWAP_CLUSTER_MAX) {
>  			batch_count = 0;
>  			if (need_resched()) {
> -				arch_leave_lazy_mmu_mode();
> +				arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  				pte_unmap_unlock(start_pte, ptl);
>  				cond_resched();
>  				goto restart;
> @@ -499,7 +500,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  				if (!folio_trylock(folio))
>  					continue;
>  				folio_get(folio);
> -				arch_leave_lazy_mmu_mode();
> +				arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  				pte_unmap_unlock(start_pte, ptl);
>  				start_pte = NULL;
>  				err = split_folio(folio);
> @@ -510,7 +511,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  				if (!start_pte)
>  					break;
>  				flush_tlb_batched_pending(mm);
> -				arch_enter_lazy_mmu_mode();
> +				lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  				if (!err)
>  					nr = 0;
>  				continue;
> @@ -558,7 +559,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  	}
>
>  	if (start_pte) {
> -		arch_leave_lazy_mmu_mode();
> +		arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  		pte_unmap_unlock(start_pte, ptl);
>  	}
>  	if (pageout)
> @@ -657,6 +658,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>
>  {
>  	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct mmu_gather *tlb = walk->private;
>  	struct mm_struct *mm = tlb->mm;
>  	struct vm_area_struct *vma = walk->vma;
> @@ -677,7 +679,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  	if (!start_pte)
>  		return 0;
>  	flush_tlb_batched_pending(mm);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
>  		nr = 1;
>  		ptent = ptep_get(pte);
> @@ -727,7 +729,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  				if (!folio_trylock(folio))
>  					continue;
>  				folio_get(folio);
> -				arch_leave_lazy_mmu_mode();
> +				arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  				pte_unmap_unlock(start_pte, ptl);
>  				start_pte = NULL;
>  				err = split_folio(folio);
> @@ -738,7 +740,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  				if (!start_pte)
>  					break;
>  				flush_tlb_batched_pending(mm);
> -				arch_enter_lazy_mmu_mode();
> +				lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  				if (!err)
>  					nr = 0;
>  				continue;
> @@ -778,7 +780,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  	if (nr_swap)
>  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
>  	if (start_pte) {
> -		arch_leave_lazy_mmu_mode();
> +		arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  		pte_unmap_unlock(start_pte, ptl);
>  	}
>  	cond_resched();
> diff --git a/mm/memory.c b/mm/memory.c
> index d9de6c056179..a60aae069f1e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1207,6 +1207,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>  	       unsigned long end)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
>  	struct mm_struct *src_mm = src_vma->vm_mm;
>  	pte_t *orig_src_pte, *orig_dst_pte;
> @@ -1254,7 +1255,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>  	orig_src_pte = src_pte;
>  	orig_dst_pte = dst_pte;
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	do {
>  		nr = 1;
> @@ -1323,7 +1324,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
>  		 addr != end);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(orig_src_pte, src_ptl);
>  	add_mm_rss_vec(dst_mm, rss);
>  	pte_unmap_unlock(orig_dst_pte, dst_ptl);
> @@ -1822,6 +1823,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  				unsigned long addr, unsigned long end,
>  				struct zap_details *details)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	bool force_flush = false, force_break = false;
>  	struct mm_struct *mm = tlb->mm;
>  	int rss[NR_MM_COUNTERS];
> @@ -1842,7 +1844,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  		return addr;
>
>  	flush_tlb_batched_pending(mm);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	do {
>  		bool any_skipped = false;
>
> @@ -1874,7 +1876,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  		direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
>
>  	add_mm_rss_vec(mm, rss);
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>
>  	/* Do the actual TLB flush before dropping ptl */
>  	if (force_flush) {
> @@ -2811,6 +2813,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  			unsigned long addr, unsigned long end,
>  			unsigned long pfn, pgprot_t prot)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte, *mapped_pte;
>  	spinlock_t *ptl;
>  	int err = 0;
> @@ -2818,7 +2821,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
>  	if (!pte)
>  		return -ENOMEM;
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	do {
>  		BUG_ON(!pte_none(ptep_get(pte)));
>  		if (!pfn_modify_allowed(pfn, prot)) {
> @@ -2828,7 +2831,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
>  		pfn++;
>  	} while (pte++, addr += PAGE_SIZE, addr != end);
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(mapped_pte, ptl);
>  	return err;
>  }
> @@ -3117,6 +3120,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  				     pte_fn_t fn, void *data, bool create,
>  				     pgtbl_mod_mask *mask)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte, *mapped_pte;
>  	int err = 0;
>  	spinlock_t *ptl;
> @@ -3135,7 +3139,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  			return -EINVAL;
>  	}
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	if (fn) {
>  		do {
> @@ -3148,7 +3152,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
>  	}
>  	*mask |= PGTBL_PTE_MODIFIED;
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>
>  	if (mm != &init_mm)
>  		pte_unmap_unlock(mapped_pte, ptl);
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index abd9f6850db6..833ce5eafa40 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -59,6 +59,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  				   unsigned long end,
>  				   struct mm_walk *walk)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct migrate_vma *migrate = walk->private;
>  	struct folio *fault_folio = migrate->fault_page ?
>  		page_folio(migrate->fault_page) : NULL;
> @@ -110,7 +111,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>  	if (!ptep)
>  		goto again;
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	for (; addr < end; addr += PAGE_SIZE, ptep++) {
>  		struct dev_pagemap *pgmap;
> @@ -287,7 +288,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  	if (unmapped)
>  		flush_tlb_range(walk->vma, start, end);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(ptep - 1, ptl);
>
>  	return 0;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 113b48985834..7bba651e5aa3 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -273,6 +273,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
>  		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte, oldpte;
>  	spinlock_t *ptl;
>  	long pages = 0;
> @@ -293,7 +294,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  		target_node = numa_node_id();
>
>  	flush_tlb_batched_pending(vma->vm_mm);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  	do {
>  		nr_ptes = 1;
>  		oldpte = ptep_get(pte);
> @@ -439,7 +440,7 @@ static long change_pte_range(struct mmu_gather *tlb,
>  			}
>  		}
>  	} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(pte - 1, ptl);
>
>  	return pages;
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 35de0a7b910e..a562d8cf1eee 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -193,6 +193,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
>  static int move_ptes(struct pagetable_move_control *pmc,
>  		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	struct vm_area_struct *vma = pmc->old;
>  	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
>  	struct mm_struct *mm = vma->vm_mm;
> @@ -256,7 +257,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  	if (new_ptl != old_ptl)
>  		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>  	flush_tlb_batched_pending(vma->vm_mm);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
>  		new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
> @@ -301,7 +302,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
>  		}
>  	}
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	if (force_flush)
>  		flush_tlb_range(vma, old_end - len, old_end);
>  	if (new_ptl != old_ptl)
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 50aaa8dcd24c..6ee71ba68b12 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -1076,6 +1076,7 @@ static long move_present_ptes(struct mm_struct *mm,
>  			      struct folio **first_src_folio, unsigned long len,
>  			      struct anon_vma *src_anon_vma)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int err = 0;
>  	struct folio *src_folio = *first_src_folio;
>  	unsigned long src_start = src_addr;
> @@ -1100,7 +1101,7 @@ static long move_present_ptes(struct mm_struct *mm,
>  	/* It's safe to drop the reference now as the page-table is holding one. */
>  	folio_put(*first_src_folio);
>  	*first_src_folio = NULL;
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	while (true) {
>  		orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> @@ -1138,7 +1139,7 @@ static long move_present_ptes(struct mm_struct *mm,
>  			break;
>  	}
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	if (src_addr > src_start)
>  		flush_tlb_range(src_vma, src_start, src_addr);
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 4249e1e01947..9fc86ddf1711 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -95,6 +95,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
>  			unsigned int max_page_shift, pgtbl_mod_mask *mask)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte;
>  	u64 pfn;
>  	struct page *page;
> @@ -105,7 +106,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	if (!pte)
>  		return -ENOMEM;
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	do {
>  		if (unlikely(!pte_none(ptep_get(pte)))) {
> @@ -131,7 +132,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  		pfn++;
>  	} while (pte += PFN_DOWN(size), addr += size, addr != end);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	*mask |= PGTBL_PTE_MODIFIED;
>  	return 0;
>  }
> @@ -354,12 +355,13 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
>  static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  			     pgtbl_mod_mask *mask)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	pte_t *pte;
>  	pte_t ptent;
>  	unsigned long size = PAGE_SIZE;
>
>  	pte = pte_offset_kernel(pmd, addr);
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	do {
>  #ifdef CONFIG_HUGETLB_PAGE
> @@ -378,7 +380,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
>  	} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	*mask |= PGTBL_PTE_MODIFIED;
>  }
>
> @@ -514,6 +516,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
>  		pgtbl_mod_mask *mask)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int err = 0;
>  	pte_t *pte;
>
> @@ -526,7 +529,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  	if (!pte)
>  		return -ENOMEM;
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	do {
>  		struct page *page = pages[*nr];
> @@ -548,7 +551,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		(*nr)++;
>  	} while (pte++, addr += PAGE_SIZE, addr != end);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	*mask |= PGTBL_PTE_MODIFIED;
>
>  	return err;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ca9e1cd3cd68..2872497a0453 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3514,6 +3514,7 @@ static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
>  static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  			   struct mm_walk *args)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int i;
>  	bool dirty;
>  	pte_t *pte;
> @@ -3543,7 +3544,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  		return false;
>  	}
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>  restart:
>  	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
>  		unsigned long pfn;
> @@ -3584,7 +3585,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
>  		goto restart;
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	pte_unmap_unlock(pte, ptl);
>
>  	return suitable_to_scan(total, young);
> @@ -3593,6 +3594,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
>  static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
>  				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int i;
>  	bool dirty;
>  	pmd_t *pmd;
> @@ -3625,7 +3627,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
>  	if (!spin_trylock(ptl))
>  		goto done;
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	do {
>  		unsigned long pfn;
> @@ -3672,7 +3674,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
>
>  	walk_update_folio(walk, last, gen, dirty);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>  	spin_unlock(ptl);
>  done:
>  	*first = -1;
> @@ -4220,6 +4222,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>   */
>  bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>  {
> +	lazy_mmu_state_t lazy_mmu_state;
>  	int i;
>  	bool dirty;
>  	unsigned long start;
> @@ -4271,7 +4274,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>  		}
>  	}
>
> -	arch_enter_lazy_mmu_mode();
> +	lazy_mmu_state = arch_enter_lazy_mmu_mode();
>
>  	pte -= (addr - start) / PAGE_SIZE;
>
> @@ -4305,7 +4308,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
>
>  	walk_update_folio(walk, last, gen, dirty);
>
> -	arch_leave_lazy_mmu_mode();
> +	arch_leave_lazy_mmu_mode(lazy_mmu_state);
>
>  	/* feedback from rmap walkers to page table walkers */
>  	if (mm_state && suitable_to_scan(i, young))
> --
> 2.47.0
>
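
For reference, every hunk above follows the same shape: declare a local
lazy_mmu_state, assign it from arch_enter_lazy_mmu_mode(), and pass it back to
arch_leave_lazy_mmu_mode(). A minimal sketch of what a converted lazy_mmu
section looks like with this change applied - the helper below
(example_clear_ptes) is made up purely for illustration and is not part of the
series:

static void example_clear_ptes(struct mm_struct *mm, unsigned long addr,
			       pte_t *ptep, unsigned int nr)
{
	lazy_mmu_state_t lazy_mmu_state;
	unsigned int i;

	/* enter() now returns a value that is kept in a local variable */
	lazy_mmu_state = arch_enter_lazy_mmu_mode();

	/* PTE updates issued here may be batched by the architecture */
	for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE)
		pte_clear(mm, addr, ptep);

	/* the saved value is handed back when leaving the section */
	arch_leave_lazy_mmu_mode(lazy_mmu_state);
}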

--
Sincerely,
Yeoreum Yun