As a follow-up to commit 691ee97e1a9d ("mm: fix lazy mmu docs and
usage"), take a step forward and protect not only user but also kernel
mappings with a lock before entering the lazy MMU mode. With that, the
semantics of the arch_enter|leave_lazy_mmu_mode() callbacks are
consolidated, which allows further simplifications.

The effect of this consolidation is that kernels which are not fully
preemptible (i.e. not Real-Time) can no longer context switch while the
lazy MMU mode is active - which is easier to comprehend.
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
include/linux/pgtable.h | 12 ++++++------
mm/kasan/shadow.c | 5 -----
mm/memory.c | 5 ++++-
mm/vmalloc.c | 6 ++++++
4 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 0b6e1f781d86..33bf2b13c219 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -224,12 +224,12 @@ static inline int pmd_dirty(pmd_t pmd)
* a raw PTE pointer after it has been modified are not guaranteed to be
* up to date.
*
- * In the general case, no lock is guaranteed to be held between entry and exit
- * of the lazy mode. So the implementation must assume preemption may be enabled
- * and cpu migration is possible; it must take steps to be robust against this.
- * (In practice, for user PTE updates, the appropriate page table lock(s) are
- * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
- * and the mode cannot be used in interrupt context.
+ * For PREEMPT_RT kernels implementation must assume that preemption may
+ * be enabled and cpu migration is possible between entry and exit of the
+ * lazy MMU mode; it must take steps to be robust against this. There is
+ * no such assumption for non-PREEMPT_RT kernels, since both kernel and
+ * user page tables are protected with a spinlock while in lazy MMU mode.
+ * Nesting is not permitted and the mode cannot be used in interrupt context.
*/
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode() do {} while (0)
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d2c70cd2afb1..45115bd770a9 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -313,12 +313,10 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
__memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
- spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
data->pages[index] = NULL;
}
- spin_unlock(&init_mm.page_table_lock);
return 0;
}
@@ -465,13 +463,10 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
- spin_lock(&init_mm.page_table_lock);
-
if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
- spin_unlock(&init_mm.page_table_lock);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 71b3d3f98999..1ddc532b1f13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3017,6 +3017,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte = pte_offset_kernel(pmd, addr);
if (!pte)
return err;
+ spin_lock(&init_mm.page_table_lock);
} else {
if (create)
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
@@ -3042,7 +3043,9 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_leave_lazy_mmu_mode();
- if (mm != &init_mm)
+ if (mm == &init_mm)
+ spin_unlock(&init_mm.page_table_lock);
+ else
pte_unmap_unlock(mapped_pte, ptl);
*mask |= PGTBL_PTE_MODIFIED;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab986dd09b6a..57b11000ae36 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -105,6 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
+ spin_lock(&init_mm.page_table_lock);
arch_enter_lazy_mmu_mode();
do {
@@ -132,6 +133,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
} while (pte += PFN_DOWN(size), addr += size, addr != end);
arch_leave_lazy_mmu_mode();
+ spin_unlock(&init_mm.page_table_lock);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -359,6 +361,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ spin_lock(&init_mm.page_table_lock);
arch_enter_lazy_mmu_mode();
do {
@@ -379,6 +382,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
arch_leave_lazy_mmu_mode();
+ spin_unlock(&init_mm.page_table_lock);
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -525,6 +529,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte)
return -ENOMEM;
+ spin_lock(&init_mm.page_table_lock);
arch_enter_lazy_mmu_mode();
do {
@@ -542,6 +547,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
+ spin_unlock(&init_mm.page_table_lock);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
--
2.48.1
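To summarise the shape of the change for kernel mappings: every walk over init_mm PTEs now takes init_mm.page_table_lock around the lazy MMU section. Below is a minimal, illustrative sketch of the resulting pattern; the function name and body are made up for illustration and are not code from the patch:

	/*
	 * Illustrative sketch of the locking pattern this patch establishes
	 * for kernel page table walks; not actual tree code.
	 */
	static int example_kernel_pte_walk(pmd_t *pmd, unsigned long addr,
					   unsigned long end)
	{
		pte_t *pte = pte_offset_kernel(pmd, addr);

		if (!pte)
			return -ENOMEM;

		spin_lock(&init_mm.page_table_lock);	/* new: taken before entering lazy MMU mode */
		arch_enter_lazy_mmu_mode();

		do {
			/* set_pte_at()/pte_clear() against &init_mm go here */
		} while (pte++, addr += PAGE_SIZE, addr != end);

		arch_leave_lazy_mmu_mode();
		spin_unlock(&init_mm.page_table_lock);	/* dropped only after leaving lazy mode */

		return 0;
	}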
Hi Alexander,

kernel test robot noticed the following build warnings:

url:    https://github.com/intel-lab-lkp/linux/commits/Alexander-Gordeev/mm-Cleanup-apply_to_pte_range-routine/20250613-013835
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/7bd3a45dbc375dc2c15cebae09cb2bb972d6039f.1749747752.git.agordeev%40linux.ibm.com
patch subject: [PATCH 2/6] mm: Lock kernel page tables before entering lazy MMU mode
config: x86_64-randconfig-161-20250613 (https://download.01.org/0day-ci/archive/20250613/202506132017.T1l1l6ME-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202506132017.T1l1l6ME-lkp@intel.com/

smatch warnings:
mm/vmalloc.c:552 vmap_pages_pte_range() warn: inconsistent returns 'global &init_mm.page_table_lock'.

vim +552 mm/vmalloc.c

0a264884046f1ab Nicholas Piggin   2021-04-29  517  static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
2ba3e6947aed9bb Joerg Roedel      2020-06-01  518  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
2ba3e6947aed9bb Joerg Roedel      2020-06-01  519  		pgtbl_mod_mask *mask)
^1da177e4c3f415 Linus Torvalds    2005-04-16  520  {
^1da177e4c3f415 Linus Torvalds    2005-04-16  521  	pte_t *pte;
^1da177e4c3f415 Linus Torvalds    2005-04-16  522  
db64fe02258f150 Nicholas Piggin   2008-10-18  523  	/*
db64fe02258f150 Nicholas Piggin   2008-10-18  524  	 * nr is a running index into the array which helps higher level
db64fe02258f150 Nicholas Piggin   2008-10-18  525  	 * callers keep track of where we're up to.
db64fe02258f150 Nicholas Piggin   2008-10-18  526  	 */
db64fe02258f150 Nicholas Piggin   2008-10-18  527  
2ba3e6947aed9bb Joerg Roedel      2020-06-01  528  	pte = pte_alloc_kernel_track(pmd, addr, mask);
^1da177e4c3f415 Linus Torvalds    2005-04-16  529  	if (!pte)
^1da177e4c3f415 Linus Torvalds    2005-04-16  530  		return -ENOMEM;
44562c71e2cfc9e Ryan Roberts      2025-04-22  531  
dac0cc793368851 Alexander Gordeev 2025-06-12  532  	spin_lock(&init_mm.page_table_lock);
44562c71e2cfc9e Ryan Roberts      2025-04-22  533  	arch_enter_lazy_mmu_mode();
44562c71e2cfc9e Ryan Roberts      2025-04-22  534  
^1da177e4c3f415 Linus Torvalds    2005-04-16  535  	do {
db64fe02258f150 Nicholas Piggin   2008-10-18  536  		struct page *page = pages[*nr];
db64fe02258f150 Nicholas Piggin   2008-10-18  537  
c33c794828f2121 Ryan Roberts      2023-06-12  538  		if (WARN_ON(!pte_none(ptep_get(pte))))
db64fe02258f150 Nicholas Piggin   2008-10-18  539  			return -EBUSY;
db64fe02258f150 Nicholas Piggin   2008-10-18  540  		if (WARN_ON(!page))
^1da177e4c3f415 Linus Torvalds    2005-04-16  541  			return -ENOMEM;
4fcdcc12915c707 Yury Norov        2022-04-28  542  		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
4fcdcc12915c707 Yury Norov        2022-04-28  543  			return -EINVAL;

These error paths don't unlock &init_mm.page_table_lock?

4fcdcc12915c707 Yury Norov        2022-04-28  544  
^1da177e4c3f415 Linus Torvalds    2005-04-16  545  		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
db64fe02258f150 Nicholas Piggin   2008-10-18  546  		(*nr)++;
^1da177e4c3f415 Linus Torvalds    2005-04-16  547  	} while (pte++, addr += PAGE_SIZE, addr != end);
44562c71e2cfc9e Ryan Roberts      2025-04-22  548  
44562c71e2cfc9e Ryan Roberts      2025-04-22  549  	arch_leave_lazy_mmu_mode();
dac0cc793368851 Alexander Gordeev 2025-06-12  550  	spin_unlock(&init_mm.page_table_lock);
2ba3e6947aed9bb Joerg Roedel      2020-06-01  551  	*mask |= PGTBL_PTE_MODIFIED;
^1da177e4c3f415 Linus Torvalds    2005-04-16 @552  	return 0;
^1da177e4c3f415 Linus Torvalds    2005-04-16  553  }

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
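The early returns Dan points at are the WARN_ON() paths inside the mapping loop, which would now leave the function with init_mm.page_table_lock held and lazy MMU mode active. One possible shape of a fix is sketched below; the `err` variable and the `out` label are illustrative, not taken from any posted patch:

	int err = 0;

	spin_lock(&init_mm.page_table_lock);
	arch_enter_lazy_mmu_mode();

	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(ptep_get(pte)))) {
			err = -EBUSY;
			goto out;
		}
		if (WARN_ON(!page)) {
			err = -ENOMEM;
			goto out;
		}
		if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
			err = -EINVAL;
			goto out;
		}

		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

out:
	/* Leave lazy MMU mode and drop the lock on all paths, including errors. */
	arch_leave_lazy_mmu_mode();
	spin_unlock(&init_mm.page_table_lock);
	if (!err)
		*mask |= PGTBL_PTE_MODIFIED;
	return err;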
On Wed, Jun 18, 2025 at 08:32:28PM +0300, Dan Carpenter wrote:
> Hi Alexander,
>
> kernel test robot noticed the following build warnings:
>
> smatch warnings:
> mm/vmalloc.c:552 vmap_pages_pte_range() warn: inconsistent returns 'global &init_mm.page_table_lock'.
>
[...]
>
> These error paths don't unlock &init_mm.page_table_lock?
>
This patch introduces a huge performance degradation when tested with the
test_vmalloc.sh performance tool. We are back to a single, non-serialized
global spinlock where we spend 90% of the cycles:

<snip>
+   91.01%     1.67%  [kernel]  [k] _raw_spin_lock
-   89.29%    89.25%  [kernel]  [k] native_queued_spin_lock_slowpath
     69.82% ret_from_fork_asm
      - ret_from_fork
         - 69.81% kthread
            - 69.66% test_func
               - 26.31% full_fit_alloc_test
                  - 19.11% __vmalloc_node_noprof
                     - __vmalloc_node_range_noprof
                        - 13.73% vmap_small_pages_range_noflush
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
                        - 5.38% __get_vm_area_node
                             alloc_vmap_area
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
                  - 13.32% vfree.part.0
                     - 13.31% remove_vm_area
                        - 13.27% __vunmap_range_noflush
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
               - 25.57% fix_size_alloc_test
                  - 22.59% __vmalloc_node_noprof
                     - __vmalloc_node_range_noprof
                        - 17.34% vmap_small_pages_range_noflush
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
                        - 5.25% __get_vm_area_node
                             alloc_vmap_area
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
                  - 11.59% vfree.part.0
                     - remove_vm_area
                        - 11.55% __vunmap_range_noflush
                             _raw_spin_lock
                             native_queued_spin_lock_slowpath
               - 17.78% long_busy_list_alloc_test
                  - 13.90% __vmalloc_node_noprof
                     - __vmalloc_node_range_noprof
                        - 9.95% vmap_small_pages_range_noflush
                             _raw_spin_lock
<snip>

No, we cannot take this patch.

--
Uladzislau Rezki
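For anyone who wants to reproduce the profile, the numbers above come from the in-tree vmalloc stress harness that Uladzislau names. A rough sketch of the kind of run involved follows; the script location and the `performance` mode are assumptions, so verify them against your tree before relying on them:

	# Assumed path and mode -- check tools/testing/selftests/mm/ in your tree.
	cd tools/testing/selftests/mm
	sudo perf record -a -g -- ./test_vmalloc.sh performance
	sudo perf report   # look for _raw_spin_lock / native_queued_spin_lock_slowpath near the top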
On Thu, Jun 12, 2025 at 07:36:09PM +0200, Alexander Gordeev wrote:
[...]
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ab986dd09b6a..57b11000ae36 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -105,6 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> if (!pte)
> return -ENOMEM;
>
> + spin_lock(&init_mm.page_table_lock);
>
This is not good. We introduce another bottleneck.

--
Uladzislau Rezki
On 12/06/2025 18:36, Alexander Gordeev wrote:
[...]
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 0b6e1f781d86..33bf2b13c219 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -224,12 +224,12 @@ static inline int pmd_dirty(pmd_t pmd)
> * a raw PTE pointer after it has been modified are not guaranteed to be
> * up to date.
> *
> - * In the general case, no lock is guaranteed to be held between entry and exit
> - * of the lazy mode. So the implementation must assume preemption may be enabled
> - * and cpu migration is possible; it must take steps to be robust against this.
> - * (In practice, for user PTE updates, the appropriate page table lock(s) are
> - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
> - * and the mode cannot be used in interrupt context.
> + * For PREEMPT_RT kernels implementation must assume that preemption may
> + * be enabled and cpu migration is possible between entry and exit of the
> + * lazy MMU mode; it must take steps to be robust against this. There is
> + * no such assumption for non-PREEMPT_RT kernels, since both kernel and
> + * user page tables are protected with a spinlock while in lazy MMU mode.
> + * Nesting is not permitted and the mode cannot be used in interrupt context.

While I agree that the spec for lazy MMU mode is not well defined, and welcome
changes to clarify and unify the implementations across arches, I think this
is a step in the wrong direction.

First, the major one: you are serializing kernel pgtable operations that don't
need to be serialized. This, surely, can only lead to performance loss?
vmalloc could previously (mostly) run in parallel; the only part that was
serialized was the allocation of the VA space. Once that's done, operations on
the VA space can be done in parallel because each is only operating on the
area it allocated. With your change I think all pte operations are serialized
with the single init_mm.page_table_lock.

Additionally, some arches (including arm64) use apply_to_page_range() to
modify the permissions of regions of kernel VA space. Again, we used to be
able to modify multiple regions in parallel, but you are now serializing this
for no good reason.

Secondly, the lazy mmu handler still needs to handle the
preemption-while-in-lazy-mmu case because, as you mention, it can still be
preempted for PREEMPT_RT kernels where the spin lock is converted to a
sleepable lock. So I think the handler needs to either explicitly disable
preemption (as powerpc and sparc do), or handle it by plugging into the
arch-specific context switch code (as x86 does), or only maintain per-task
state in the first place (as arm64 does).

Thanks,
Ryan
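To make Ryan's first option concrete, here is a rough sketch of the powerpc/sparc-style approach that pins the task to a CPU while batching. The struct, per-CPU variable and the flush helper named in the comment are invented for illustration and are not the actual arch code:

	/* Sketch of the "explicitly disable preemption" style (cf. powerpc/sparc). */
	struct lazy_mmu_batch {
		bool active;
		/* ... pending PTE updates would be queued here ... */
	};

	static DEFINE_PER_CPU(struct lazy_mmu_batch, lazy_mmu_batch);	/* illustrative */

	static inline void arch_enter_lazy_mmu_mode(void)
	{
		struct lazy_mmu_batch *batch;

		preempt_disable();			/* no migration while batching per-CPU state */
		batch = this_cpu_ptr(&lazy_mmu_batch);
		batch->active = true;
	}

	static inline void arch_leave_lazy_mmu_mode(void)
	{
		struct lazy_mmu_batch *batch = this_cpu_ptr(&lazy_mmu_batch);

		/* flush_pending(batch); -- flush whatever was queued up (name illustrative) */
		batch->active = false;
		preempt_enable();
	}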