arch_sync_kernel_mappings() is an optional hook for arches to allow them
to synchronize certain levels of the kernel pgtables after modification.
But arm64 could benefit from a hook similar to this, paired with a call
prior to starting the batch of modifications.
So let's introduce arch_update_kernel_mappings_begin() and
arch_update_kernel_mappings_end(). Both have a default implementation
which can be overridden by the arch code. The default for the former is
a nop, and the default for the latter is to call
arch_sync_kernel_mappings(), so the latter replaces the previous
arch_sync_kernel_mappings() callsites. By default, then, the resulting
behaviour is unchanged.
To avoid include hell, the pgtbl_mod_mask type and its associated
macros are moved to their own header.
In a future patch, arm64 will opt-in to overriding both functions.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
include/linux/pgtable.h | 24 +----------------
include/linux/pgtable_modmask.h | 32 ++++++++++++++++++++++
include/linux/vmalloc.h | 47 +++++++++++++++++++++++++++++++++
mm/memory.c | 5 ++--
mm/vmalloc.c | 15 ++++++-----
5 files changed, 92 insertions(+), 31 deletions(-)
create mode 100644 include/linux/pgtable_modmask.h
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 94d267d02372..7f70786a73b3 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -4,6 +4,7 @@
#include <linux/pfn.h>
#include <asm/pgtable.h>
+#include <linux/pgtable_modmask.h>
#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
@@ -1786,29 +1787,6 @@ static inline bool arch_has_pfn_modify_check(void)
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
-/*
- * Page Table Modification bits for pgtbl_mod_mask.
- *
- * These are used by the p?d_alloc_track*() set of functions an in the generic
- * vmalloc/ioremap code to track at which page-table levels entries have been
- * modified. Based on that the code can better decide when vmalloc and ioremap
- * mapping changes need to be synchronized to other page-tables in the system.
- */
-#define __PGTBL_PGD_MODIFIED 0
-#define __PGTBL_P4D_MODIFIED 1
-#define __PGTBL_PUD_MODIFIED 2
-#define __PGTBL_PMD_MODIFIED 3
-#define __PGTBL_PTE_MODIFIED 4
-
-#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
-#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
-#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
-#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
-#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
-
-/* Page-Table Modification Mask */
-typedef unsigned int pgtbl_mod_mask;
-
#endif /* !__ASSEMBLY__ */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
diff --git a/include/linux/pgtable_modmask.h b/include/linux/pgtable_modmask.h
new file mode 100644
index 000000000000..5a21b1bb8df3
--- /dev/null
+++ b/include/linux/pgtable_modmask.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGTABLE_MODMASK_H
+#define _LINUX_PGTABLE_MODMASK_H
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Page Table Modification bits for pgtbl_mod_mask.
+ *
+ * These are used by the p?d_alloc_track*() set of functions an in the generic
+ * vmalloc/ioremap code to track at which page-table levels entries have been
+ * modified. Based on that the code can better decide when vmalloc and ioremap
+ * mapping changes need to be synchronized to other page-tables in the system.
+ */
+#define __PGTBL_PGD_MODIFIED 0
+#define __PGTBL_P4D_MODIFIED 1
+#define __PGTBL_PUD_MODIFIED 2
+#define __PGTBL_PMD_MODIFIED 3
+#define __PGTBL_PTE_MODIFIED 4
+
+#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
+#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
+#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
+#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
+#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
+
+/* Page-Table Modification Mask */
+typedef unsigned int pgtbl_mod_mask;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _LINUX_PGTABLE_MODMASK_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 16dd4cba64f2..cb5d8f1965a1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -11,6 +11,7 @@
#include <asm/page.h> /* pgprot_t */
#include <linux/rbtree.h>
#include <linux/overflow.h>
+#include <linux/pgtable_modmask.h>
#include <asm/vmalloc.h>
@@ -213,6 +214,26 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
struct page **pages, unsigned int page_shift);
+#ifndef arch_update_kernel_mappings_begin
+/**
+ * arch_update_kernel_mappings_begin - A batch of kernel pgtable mappings are
+ * about to be updated.
+ * @start: Virtual address of start of range to be updated.
+ * @end: Virtual address of end of range to be updated.
+ *
+ * An optional hook to allow architecture code to prepare for a batch of kernel
+ * pgtable mapping updates. An architecture may use this to enter a lazy mode
+ * where some operations can be deferred until the end of the batch.
+ *
+ * Context: Called in task context and may be preemptible.
+ */
+static inline void arch_update_kernel_mappings_begin(unsigned long start,
+ unsigned long end)
+{
+}
+#endif
+
+#ifndef arch_update_kernel_mappings_end
/*
* Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
* and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
@@ -229,6 +250,32 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
*/
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
+/**
+ * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
+ * been updated.
+ * @start: Virtual address of start of range that was updated.
+ * @end: Virtual address of end of range that was updated.
+ *
+ * An optional hook to inform architecture code that a batch update is complete.
+ * This balances a previous call to arch_update_kernel_mappings_begin().
+ *
+ * An architecture may override this for any purpose, such as exiting a lazy
+ * mode previously entered with arch_update_kernel_mappings_begin() or syncing
+ * kernel mappings to a secondary pgtable. The default implementation calls an
+ * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
+ * was updated.
+ *
+ * Context: Called in task context and may be preemptible.
+ */
+static inline void arch_update_kernel_mappings_end(unsigned long start,
+ unsigned long end,
+ pgtbl_mod_mask mask)
+{
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+}
+#endif
+
/*
* Lowlevel-APIs (not for driver use!)
*/
diff --git a/mm/memory.c b/mm/memory.c
index a15f7dd500ea..f80930bc19f6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3035,6 +3035,8 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (WARN_ON(addr >= end))
return -EINVAL;
+ arch_update_kernel_mappings_begin(start, end);
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -3055,8 +3057,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
break;
} while (pgd++, addr = next, addr != end);
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, start + size);
+ arch_update_kernel_mappings_end(start, end, mask);
return err;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 50fd44439875..c5c51d86ef78 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -312,10 +312,10 @@ int vmap_page_range(unsigned long addr, unsigned long end,
pgtbl_mod_mask mask = 0;
int err;
+ arch_update_kernel_mappings_begin(addr, end);
err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
ioremap_max_page_shift, &mask);
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(addr, end);
+ arch_update_kernel_mappings_end(addr, end, mask);
flush_cache_vmap(addr, end);
if (!err)
@@ -463,6 +463,9 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
+
+ arch_update_kernel_mappings_begin(start, end);
+
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
@@ -473,8 +476,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
+ arch_update_kernel_mappings_end(start, end, mask);
}
void vunmap_range_noflush(unsigned long start, unsigned long end)
@@ -625,6 +627,8 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
WARN_ON(page_shift < PAGE_SHIFT);
+ arch_update_kernel_mappings_begin(start, end);
+
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
page_shift == PAGE_SHIFT) {
err = vmap_small_pages_range_noflush(addr, end, prot, pages,
@@ -642,8 +646,7 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
}
}
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
+ arch_update_kernel_mappings_end(start, end, mask);
return err;
}
--
2.43.0
On 2/5/25 20:39, Ryan Roberts wrote:
> arch_sync_kernel_mappings() is an optional hook for arches to allow them
> to synchonize certain levels of the kernel pgtables after modification.
> But arm64 could benefit from a hook similar to this, paired with a call
> prior to starting the batch of modifications.
>
> So let's introduce arch_update_kernel_mappings_begin() and
> arch_update_kernel_mappings_end(). Both have a default implementation
> which can be overridden by the arch code. The default for the former is
> a nop, and the default for the latter is to call
> arch_sync_kernel_mappings(), so the latter replaces previous
> arch_sync_kernel_mappings() callsites. So by default, the resulting
> behaviour is unchanged.
>
> To avoid include hell, the pgtbl_mod_mask type and it's associated
> macros are moved to their own header.
>
> In a future patch, arm64 will opt-in to overriding both functions.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
> include/linux/pgtable.h | 24 +----------------
> include/linux/pgtable_modmask.h | 32 ++++++++++++++++++++++
> include/linux/vmalloc.h | 47 +++++++++++++++++++++++++++++++++
> mm/memory.c | 5 ++--
> mm/vmalloc.c | 15 ++++++-----
> 5 files changed, 92 insertions(+), 31 deletions(-)
> create mode 100644 include/linux/pgtable_modmask.h
>
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 94d267d02372..7f70786a73b3 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -4,6 +4,7 @@
>
> #include <linux/pfn.h>
> #include <asm/pgtable.h>
> +#include <linux/pgtable_modmask.h>
>
> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
> #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
> @@ -1786,29 +1787,6 @@ static inline bool arch_has_pfn_modify_check(void)
> # define PAGE_KERNEL_EXEC PAGE_KERNEL
> #endif
>
> -/*
> - * Page Table Modification bits for pgtbl_mod_mask.
> - *
> - * These are used by the p?d_alloc_track*() set of functions an in the generic
> - * vmalloc/ioremap code to track at which page-table levels entries have been
> - * modified. Based on that the code can better decide when vmalloc and ioremap
> - * mapping changes need to be synchronized to other page-tables in the system.
> - */
> -#define __PGTBL_PGD_MODIFIED 0
> -#define __PGTBL_P4D_MODIFIED 1
> -#define __PGTBL_PUD_MODIFIED 2
> -#define __PGTBL_PMD_MODIFIED 3
> -#define __PGTBL_PTE_MODIFIED 4
> -
> -#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
> -#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
> -#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
> -#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
> -#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
> -
> -/* Page-Table Modification Mask */
> -typedef unsigned int pgtbl_mod_mask;
> -
> #endif /* !__ASSEMBLY__ */
>
> #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
> diff --git a/include/linux/pgtable_modmask.h b/include/linux/pgtable_modmask.h
> new file mode 100644
> index 000000000000..5a21b1bb8df3
> --- /dev/null
> +++ b/include/linux/pgtable_modmask.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PGTABLE_MODMASK_H
> +#define _LINUX_PGTABLE_MODMASK_H
> +
> +#ifndef __ASSEMBLY__
> +
> +/*
> + * Page Table Modification bits for pgtbl_mod_mask.
> + *
> + * These are used by the p?d_alloc_track*() set of functions an in the generic
> + * vmalloc/ioremap code to track at which page-table levels entries have been
> + * modified. Based on that the code can better decide when vmalloc and ioremap
> + * mapping changes need to be synchronized to other page-tables in the system.
> + */
> +#define __PGTBL_PGD_MODIFIED 0
> +#define __PGTBL_P4D_MODIFIED 1
> +#define __PGTBL_PUD_MODIFIED 2
> +#define __PGTBL_PMD_MODIFIED 3
> +#define __PGTBL_PTE_MODIFIED 4
> +
> +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
> +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
> +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
> +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
> +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
> +
> +/* Page-Table Modification Mask */
> +typedef unsigned int pgtbl_mod_mask;
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +#endif /* _LINUX_PGTABLE_MODMASK_H */
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 16dd4cba64f2..cb5d8f1965a1 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -11,6 +11,7 @@
> #include <asm/page.h> /* pgprot_t */
> #include <linux/rbtree.h>
> #include <linux/overflow.h>
> +#include <linux/pgtable_modmask.h>
>
> #include <asm/vmalloc.h>
>
> @@ -213,6 +214,26 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
> int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
> struct page **pages, unsigned int page_shift);
>
> +#ifndef arch_update_kernel_mappings_begin
> +/**
> + * arch_update_kernel_mappings_begin - A batch of kernel pgtable mappings are
> + * about to be updated.
> + * @start: Virtual address of start of range to be updated.
> + * @end: Virtual address of end of range to be updated.
> + *
> + * An optional hook to allow architecture code to prepare for a batch of kernel
> + * pgtable mapping updates. An architecture may use this to enter a lazy mode
> + * where some operations can be deferred until the end of the batch.
> + *
> + * Context: Called in task context and may be preemptible.
> + */
> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
> + unsigned long end)
> +{
> +}
> +#endif
> +
> +#ifndef arch_update_kernel_mappings_end
> /*
> * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
> * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
> @@ -229,6 +250,32 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
> */
> void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
>
> +/**
> + * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
> + * been updated.
> + * @start: Virtual address of start of range that was updated.
> + * @end: Virtual address of end of range that was updated.
> + *
> + * An optional hook to inform architecture code that a batch update is complete.
> + * This balances a previous call to arch_update_kernel_mappings_begin().
> + *
> + * An architecture may override this for any purpose, such as exiting a lazy
> + * mode previously entered with arch_update_kernel_mappings_begin() or syncing
> + * kernel mappings to a secondary pgtable. The default implementation calls an
> + * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
> + * was updated.
> + *
> + * Context: Called in task context and may be preemptible.
> + */
> +static inline void arch_update_kernel_mappings_end(unsigned long start,
> + unsigned long end,
> + pgtbl_mod_mask mask)
> +{
> + if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
> + arch_sync_kernel_mappings(start, end);
> +}
One arch callback calling yet another arch callback sounds a bit odd. Also,
shouldn't ARCH_PAGE_TABLE_SYNC_MASK be checked for both the __begin and __end
callbacks when a platform subscribes to this framework? Instead, the
following changes sound more reasonable, but will also require some more
updates for the current platforms using arch_sync_kernel_mappings().
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_update_kernel_mappings_begin()
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_update_kernel_mappings_end()
Basically, when a platform defines ARCH_PAGE_TABLE_SYNC_MASK and subscribes to
this framework, it will also provide the arch_update_kernel_mappings_begin/end()
callbacks as required.
On 10/02/2025 07:45, Anshuman Khandual wrote:
>
>
> On 2/5/25 20:39, Ryan Roberts wrote:
>> arch_sync_kernel_mappings() is an optional hook for arches to allow them
>> to synchonize certain levels of the kernel pgtables after modification.
>> But arm64 could benefit from a hook similar to this, paired with a call
>> prior to starting the batch of modifications.
>>
>> So let's introduce arch_update_kernel_mappings_begin() and
>> arch_update_kernel_mappings_end(). Both have a default implementation
>> which can be overridden by the arch code. The default for the former is
>> a nop, and the default for the latter is to call
>> arch_sync_kernel_mappings(), so the latter replaces previous
>> arch_sync_kernel_mappings() callsites. So by default, the resulting
>> behaviour is unchanged.
>>
>> To avoid include hell, the pgtbl_mod_mask type and it's associated
>> macros are moved to their own header.
>>
>> In a future patch, arm64 will opt-in to overriding both functions.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>> include/linux/pgtable.h | 24 +----------------
>> include/linux/pgtable_modmask.h | 32 ++++++++++++++++++++++
>> include/linux/vmalloc.h | 47 +++++++++++++++++++++++++++++++++
>> mm/memory.c | 5 ++--
>> mm/vmalloc.c | 15 ++++++-----
>> 5 files changed, 92 insertions(+), 31 deletions(-)
>> create mode 100644 include/linux/pgtable_modmask.h
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 94d267d02372..7f70786a73b3 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -4,6 +4,7 @@
>>
>> #include <linux/pfn.h>
>> #include <asm/pgtable.h>
>> +#include <linux/pgtable_modmask.h>
>>
>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
>> #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
>> @@ -1786,29 +1787,6 @@ static inline bool arch_has_pfn_modify_check(void)
>> # define PAGE_KERNEL_EXEC PAGE_KERNEL
>> #endif
>>
>> -/*
>> - * Page Table Modification bits for pgtbl_mod_mask.
>> - *
>> - * These are used by the p?d_alloc_track*() set of functions an in the generic
>> - * vmalloc/ioremap code to track at which page-table levels entries have been
>> - * modified. Based on that the code can better decide when vmalloc and ioremap
>> - * mapping changes need to be synchronized to other page-tables in the system.
>> - */
>> -#define __PGTBL_PGD_MODIFIED 0
>> -#define __PGTBL_P4D_MODIFIED 1
>> -#define __PGTBL_PUD_MODIFIED 2
>> -#define __PGTBL_PMD_MODIFIED 3
>> -#define __PGTBL_PTE_MODIFIED 4
>> -
>> -#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
>> -#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
>> -#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
>> -#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
>> -#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
>> -
>> -/* Page-Table Modification Mask */
>> -typedef unsigned int pgtbl_mod_mask;
>> -
>> #endif /* !__ASSEMBLY__ */
>>
>> #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
>> diff --git a/include/linux/pgtable_modmask.h b/include/linux/pgtable_modmask.h
>> new file mode 100644
>> index 000000000000..5a21b1bb8df3
>> --- /dev/null
>> +++ b/include/linux/pgtable_modmask.h
>> @@ -0,0 +1,32 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_PGTABLE_MODMASK_H
>> +#define _LINUX_PGTABLE_MODMASK_H
>> +
>> +#ifndef __ASSEMBLY__
>> +
>> +/*
>> + * Page Table Modification bits for pgtbl_mod_mask.
>> + *
>> + * These are used by the p?d_alloc_track*() set of functions an in the generic
>> + * vmalloc/ioremap code to track at which page-table levels entries have been
>> + * modified. Based on that the code can better decide when vmalloc and ioremap
>> + * mapping changes need to be synchronized to other page-tables in the system.
>> + */
>> +#define __PGTBL_PGD_MODIFIED 0
>> +#define __PGTBL_P4D_MODIFIED 1
>> +#define __PGTBL_PUD_MODIFIED 2
>> +#define __PGTBL_PMD_MODIFIED 3
>> +#define __PGTBL_PTE_MODIFIED 4
>> +
>> +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
>> +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
>> +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
>> +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
>> +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
>> +
>> +/* Page-Table Modification Mask */
>> +typedef unsigned int pgtbl_mod_mask;
>> +
>> +#endif /* !__ASSEMBLY__ */
>> +
>> +#endif /* _LINUX_PGTABLE_MODMASK_H */
>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 16dd4cba64f2..cb5d8f1965a1 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -11,6 +11,7 @@
>> #include <asm/page.h> /* pgprot_t */
>> #include <linux/rbtree.h>
>> #include <linux/overflow.h>
>> +#include <linux/pgtable_modmask.h>
>>
>> #include <asm/vmalloc.h>
>>
>> @@ -213,6 +214,26 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
>> int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>> struct page **pages, unsigned int page_shift);
>>
>> +#ifndef arch_update_kernel_mappings_begin
>> +/**
>> + * arch_update_kernel_mappings_begin - A batch of kernel pgtable mappings are
>> + * about to be updated.
>> + * @start: Virtual address of start of range to be updated.
>> + * @end: Virtual address of end of range to be updated.
>> + *
>> + * An optional hook to allow architecture code to prepare for a batch of kernel
>> + * pgtable mapping updates. An architecture may use this to enter a lazy mode
>> + * where some operations can be deferred until the end of the batch.
>> + *
>> + * Context: Called in task context and may be preemptible.
>> + */
>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>> + unsigned long end)
>> +{
>> +}
>> +#endif
>> +
>> +#ifndef arch_update_kernel_mappings_end
>> /*
>> * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
>> * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
>> @@ -229,6 +250,32 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>> */
>> void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
>>
>> +/**
>> + * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
>> + * been updated.
>> + * @start: Virtual address of start of range that was updated.
>> + * @end: Virtual address of end of range that was updated.
>> + *
>> + * An optional hook to inform architecture code that a batch update is complete.
>> + * This balances a previous call to arch_update_kernel_mappings_begin().
>> + *
>> + * An architecture may override this for any purpose, such as exiting a lazy
>> + * mode previously entered with arch_update_kernel_mappings_begin() or syncing
>> + * kernel mappings to a secondary pgtable. The default implementation calls an
>> + * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
>> + * was updated.
>> + *
>> + * Context: Called in task context and may be preemptible.
>> + */
>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>> + unsigned long end,
>> + pgtbl_mod_mask mask)
>> +{
>> + if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>> + arch_sync_kernel_mappings(start, end);
>> +}
>
> One arch call back calling yet another arch call back sounds bit odd.
It's no different from the default implementation of arch_make_huge_pte()
calling pte_mkhuge(), is it?
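For reference, that existing pattern is roughly the following (simplified from
include/linux/hugetlb.h, shown only to illustrate the "generic default of one
arch hook calls another arch-overridable helper" shape):

#ifndef arch_make_huge_pte
static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
				       vm_flags_t flags)
{
	/* generic default of one arch hook implemented via another arch helper */
	return pte_mkhuge(entry);
}
#endif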
> Also
> should not ARCH_PAGE_TABLE_SYNC_MASK be checked both for __begin and __end
> callbacks in case a platform subscribes into this framework.
I'm not sure how that would work? The mask is accumulated during the pgtable
walk. So we don't have a mask until we get to the end.
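For example, the p?d_alloc_track() helpers (see include/linux/mm.h) OR a
PGTBL_*_MODIFIED bit into the caller's mask each time the walk has to populate
a level, roughly like this:

static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long address,
				     pgtbl_mod_mask *mod_mask)
{
	if (unlikely(pgd_none(*pgd))) {
		if (__p4d_alloc(mm, pgd, address))
			return NULL;
		/* a new PGD entry was installed; record it in the mask */
		*mod_mask |= PGTBL_PGD_MODIFIED;
	}

	return p4d_offset(pgd, address);
}

So the accumulated mask only becomes meaningful once the walk has finished.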
> Instead the
> following changes sound more reasonable, but will also require some more
> updates for the current platforms using arch_sync_kernel_mappings().
>
> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
> arch_update_kernel_mappings_begin()
This makes no sense. mask is always 0 before doing the walk.
>
> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
> arch_update_kernel_mappings_end()
>
> Basically when any platform defines ARCH_PAGE_TABLE_SYNC_MASK and subscribes
> this framework, it will also provide arch_update_kernel_mappings_begin/end()
> callbacks as required.
Personally I think it's cleaner to just pass mask to
arch_update_kernel_mappings_end() and let the function decide what it wants to do.
But it's a good question as to whether we should refactor x86 and arm to
directly implement arch_update_kernel_mappings_end() instead of
arch_sync_kernel_mappings(). Personally I thought it was better to avoid the
churn. But I'm interested in others' opinions.
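For illustration, such a refactor might look roughly like the below. This is a
hypothetical sketch only; sync_kernel_pgds() is a made-up placeholder for
whatever sync logic the arch's arch_sync_kernel_mappings() contains today, not
a real function:

/* Hypothetical arch header snippet: implement the new hook directly. */
#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
static inline void arch_update_kernel_mappings_end(unsigned long start,
						   unsigned long end,
						   pgtbl_mod_mask mask)
{
	if (mask & (PGTBL_PGD_MODIFIED | PGTBL_P4D_MODIFIED))
		sync_kernel_pgds(start, end);	/* arch's existing sync logic */
}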
Thanks,
Ryan
On 2/10/25 16:34, Ryan Roberts wrote:
> On 10/02/2025 07:45, Anshuman Khandual wrote:
>>
>>
>> On 2/5/25 20:39, Ryan Roberts wrote:
>>> arch_sync_kernel_mappings() is an optional hook for arches to allow them
>>> to synchonize certain levels of the kernel pgtables after modification.
>>> But arm64 could benefit from a hook similar to this, paired with a call
>>> prior to starting the batch of modifications.
>>>
>>> So let's introduce arch_update_kernel_mappings_begin() and
>>> arch_update_kernel_mappings_end(). Both have a default implementation
>>> which can be overridden by the arch code. The default for the former is
>>> a nop, and the default for the latter is to call
>>> arch_sync_kernel_mappings(), so the latter replaces previous
>>> arch_sync_kernel_mappings() callsites. So by default, the resulting
>>> behaviour is unchanged.
>>>
>>> To avoid include hell, the pgtbl_mod_mask type and it's associated
>>> macros are moved to their own header.
>>>
>>> In a future patch, arm64 will opt-in to overriding both functions.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>> include/linux/pgtable.h | 24 +----------------
>>> include/linux/pgtable_modmask.h | 32 ++++++++++++++++++++++
>>> include/linux/vmalloc.h | 47 +++++++++++++++++++++++++++++++++
>>> mm/memory.c | 5 ++--
>>> mm/vmalloc.c | 15 ++++++-----
>>> 5 files changed, 92 insertions(+), 31 deletions(-)
>>> create mode 100644 include/linux/pgtable_modmask.h
>>>
>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>> index 94d267d02372..7f70786a73b3 100644
>>> --- a/include/linux/pgtable.h
>>> +++ b/include/linux/pgtable.h
>>> @@ -4,6 +4,7 @@
>>>
>>> #include <linux/pfn.h>
>>> #include <asm/pgtable.h>
>>> +#include <linux/pgtable_modmask.h>
>>>
>>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
>>> #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
>>> @@ -1786,29 +1787,6 @@ static inline bool arch_has_pfn_modify_check(void)
>>> # define PAGE_KERNEL_EXEC PAGE_KERNEL
>>> #endif
>>>
>>> -/*
>>> - * Page Table Modification bits for pgtbl_mod_mask.
>>> - *
>>> - * These are used by the p?d_alloc_track*() set of functions an in the generic
>>> - * vmalloc/ioremap code to track at which page-table levels entries have been
>>> - * modified. Based on that the code can better decide when vmalloc and ioremap
>>> - * mapping changes need to be synchronized to other page-tables in the system.
>>> - */
>>> -#define __PGTBL_PGD_MODIFIED 0
>>> -#define __PGTBL_P4D_MODIFIED 1
>>> -#define __PGTBL_PUD_MODIFIED 2
>>> -#define __PGTBL_PMD_MODIFIED 3
>>> -#define __PGTBL_PTE_MODIFIED 4
>>> -
>>> -#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
>>> -#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
>>> -#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
>>> -#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
>>> -#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
>>> -
>>> -/* Page-Table Modification Mask */
>>> -typedef unsigned int pgtbl_mod_mask;
>>> -
>>> #endif /* !__ASSEMBLY__ */
>>>
>>> #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
>>> diff --git a/include/linux/pgtable_modmask.h b/include/linux/pgtable_modmask.h
>>> new file mode 100644
>>> index 000000000000..5a21b1bb8df3
>>> --- /dev/null
>>> +++ b/include/linux/pgtable_modmask.h
>>> @@ -0,0 +1,32 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#ifndef _LINUX_PGTABLE_MODMASK_H
>>> +#define _LINUX_PGTABLE_MODMASK_H
>>> +
>>> +#ifndef __ASSEMBLY__
>>> +
>>> +/*
>>> + * Page Table Modification bits for pgtbl_mod_mask.
>>> + *
>>> + * These are used by the p?d_alloc_track*() set of functions an in the generic
>>> + * vmalloc/ioremap code to track at which page-table levels entries have been
>>> + * modified. Based on that the code can better decide when vmalloc and ioremap
>>> + * mapping changes need to be synchronized to other page-tables in the system.
>>> + */
>>> +#define __PGTBL_PGD_MODIFIED 0
>>> +#define __PGTBL_P4D_MODIFIED 1
>>> +#define __PGTBL_PUD_MODIFIED 2
>>> +#define __PGTBL_PMD_MODIFIED 3
>>> +#define __PGTBL_PTE_MODIFIED 4
>>> +
>>> +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
>>> +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
>>> +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
>>> +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
>>> +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
>>> +
>>> +/* Page-Table Modification Mask */
>>> +typedef unsigned int pgtbl_mod_mask;
>>> +
>>> +#endif /* !__ASSEMBLY__ */
>>> +
>>> +#endif /* _LINUX_PGTABLE_MODMASK_H */
>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>> index 16dd4cba64f2..cb5d8f1965a1 100644
>>> --- a/include/linux/vmalloc.h
>>> +++ b/include/linux/vmalloc.h
>>> @@ -11,6 +11,7 @@
>>> #include <asm/page.h> /* pgprot_t */
>>> #include <linux/rbtree.h>
>>> #include <linux/overflow.h>
>>> +#include <linux/pgtable_modmask.h>
>>>
>>> #include <asm/vmalloc.h>
>>>
>>> @@ -213,6 +214,26 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
>>> int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>>> struct page **pages, unsigned int page_shift);
>>>
>>> +#ifndef arch_update_kernel_mappings_begin
>>> +/**
>>> + * arch_update_kernel_mappings_begin - A batch of kernel pgtable mappings are
>>> + * about to be updated.
>>> + * @start: Virtual address of start of range to be updated.
>>> + * @end: Virtual address of end of range to be updated.
>>> + *
>>> + * An optional hook to allow architecture code to prepare for a batch of kernel
>>> + * pgtable mapping updates. An architecture may use this to enter a lazy mode
>>> + * where some operations can be deferred until the end of the batch.
>>> + *
>>> + * Context: Called in task context and may be preemptible.
>>> + */
>>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>>> + unsigned long end)
>>> +{
>>> +}
>>> +#endif
>>> +
>>> +#ifndef arch_update_kernel_mappings_end
>>> /*
>>> * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
>>> * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
>>> @@ -229,6 +250,32 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>>> */
>>> void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
>>>
>>> +/**
>>> + * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
>>> + * been updated.
>>> + * @start: Virtual address of start of range that was updated.
>>> + * @end: Virtual address of end of range that was updated.
>>> + *
>>> + * An optional hook to inform architecture code that a batch update is complete.
>>> + * This balances a previous call to arch_update_kernel_mappings_begin().
>>> + *
>>> + * An architecture may override this for any purpose, such as exiting a lazy
>>> + * mode previously entered with arch_update_kernel_mappings_begin() or syncing
>>> + * kernel mappings to a secondary pgtable. The default implementation calls an
>>> + * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
>>> + * was updated.
>>> + *
>>> + * Context: Called in task context and may be preemptible.
>>> + */
>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>> + unsigned long end,
>>> + pgtbl_mod_mask mask)
>>> +{
>>> + if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> + arch_sync_kernel_mappings(start, end);
>>> +}
>>
>> One arch call back calling yet another arch call back sounds bit odd.
>
> It's no different from the default implementation of arch_make_huge_pte()
> calling pte_mkhuge() is it?
Agreed. arch_make_huge_pte() ---> pte_mkhuge(), where either helper can be
customized by the platform, is another such example, but unless necessary we
should probably avoid following that pattern. Anyway, it's not a big deal I guess.
>
>> Also
>> should not ARCH_PAGE_TABLE_SYNC_MASK be checked both for __begin and __end
>> callbacks in case a platform subscribes into this framework.
>
> I'm not sure how that would work? The mask is accumulated during the pgtable
> walk. So we don't have a mask until we get to the end.
A non-zero ARCH_PAGE_TABLE_SYNC_MASK indicates that a platform is subscribing
to this mechanism. So could ARCH_PAGE_TABLE_SYNC_MASK != 0 be used instead?
>
>> Instead the
>> following changes sound more reasonable, but will also require some more
>> updates for the current platforms using arch_sync_kernel_mappings().
>>
>> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>> arch_update_kernel_mappings_begin()
>
> This makes no sense. mask is always 0 before doing the walk.
Got it.
>
>>
>> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>> arch_update_kernel_mappings_end()
>>
>> Basically when any platform defines ARCH_PAGE_TABLE_SYNC_MASK and subscribes
>> this framework, it will also provide arch_update_kernel_mappings_begin/end()
>> callbacks as required.
>
> Personally I think it's cleaner to just pass mask to
> arch_update_kernel_mappings_end() and it the function decide what it wants to do.
>
> But it's a good question as to whether we should refactor x86 and arm to
> directly implement arch_update_kernel_mappings_end() instead of
> arch_sync_kernel_mappings(). Personally I thought it was better to avoid the
> churn. But interested in others opinions.
>
> Thanks,
> Ryan
>
>>>> +/**
>>>> + * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
>>>> + * been updated.
>>>> + * @start: Virtual address of start of range that was updated.
>>>> + * @end: Virtual address of end of range that was updated.
>>>> + *
>>>> + * An optional hook to inform architecture code that a batch update is complete.
>>>> + * This balances a previous call to arch_update_kernel_mappings_begin().
>>>> + *
>>>> + * An architecture may override this for any purpose, such as exiting a lazy
>>>> + * mode previously entered with arch_update_kernel_mappings_begin() or syncing
>>>> + * kernel mappings to a secondary pgtable. The default implementation calls an
>>>> + * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
>>>> + * was updated.
>>>> + *
>>>> + * Context: Called in task context and may be preemptible.
>>>> + */
>>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>>> + unsigned long end,
>>>> + pgtbl_mod_mask mask)
>>>> +{
>>>> + if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>>> + arch_sync_kernel_mappings(start, end);
>>>> +}
>>>
>>> One arch call back calling yet another arch call back sounds bit odd.
>>
>> It's no different from the default implementation of arch_make_huge_pte()
>> calling pte_mkhuge() is it?
>
> Agreed. arch_make_huge_pte() ---> pte_mkhuge() where either helpers can be
> customized in the platform is another such example but unless necessary we
> should probably avoid following that. Anyways it's not a big deal I guess.
>
>>
>>> Also
>>> should not ARCH_PAGE_TABLE_SYNC_MASK be checked both for __begin and __end
>>> callbacks in case a platform subscribes into this framework.
>>
>> I'm not sure how that would work? The mask is accumulated during the pgtable
>> walk. So we don't have a mask until we get to the end.
>
> A non-zero ARCH_PAGE_TABLE_SYNC_MASK indicates that a platform is subscribing
> to this mechanism. So could ARCH_PAGE_TABLE_SYNC_MASK != 0 be used instead ?
There are now 2 levels of mechanism:
Either: arch defines ARCH_PAGE_TABLE_SYNC_MASK to be non-zero and provides
arch_sync_kernel_mappings(). This is unchanged from how it was before.
Or: arch defines its own version of one or both of
arch_update_kernel_mappings_begin() and arch_update_kernel_mappings_end().
So a non-zero ARCH_PAGE_TABLE_SYNC_MASK indicates that a platform is subscribing
to the *first* mechanism. It has nothing to do with the second mechanism. If the
platform defines arch_update_kernel_mappings_begin() it wants it to be called.
If it doesn't define it, then it doesn't get called.
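Roughly, the two opt-in shapes are as follows (sketch only; the hook bodies are
hypothetical placeholders, not the actual arm64 implementation from the later
patch):

/* Mechanism 1 (unchanged): declare interest via the mask and provide the sync hook. */
#define ARCH_PAGE_TABLE_SYNC_MASK	(PGTBL_PGD_MODIFIED | PGTBL_P4D_MODIFIED)
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

/* Mechanism 2 (new): override one or both batch hooks; independent of the mask. */
#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin
static inline void arch_update_kernel_mappings_begin(unsigned long start,
						     unsigned long end)
{
	/* e.g. enter a lazy mode so some work can be deferred to _end() */
}

#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
static inline void arch_update_kernel_mappings_end(unsigned long start,
						   unsigned long end,
						   pgtbl_mod_mask mask)
{
	/* e.g. leave lazy mode and issue any deferred maintenance */
}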
Thanks,
Ryan