arm64 currently changes permissions on vmalloc objects locklessly, via
apply_to_page_range(), whose limitation is that it refuses to change
permissions on block mappings. Therefore, move over to the generic
pagewalk API, paving the way for enabling huge mappings by default on
kernel space mappings and hence more efficient TLB usage. However, the
API currently requires init_mm.mmap_lock to be held. To avoid this
unnecessary bottleneck for our use case, extend the generic API so that
it can be used locklessly, retaining the existing behaviour for
changing permissions. Apart from this, it is noted at [1] that KFENCE
can manipulate kernel pgtable entries during softirqs, by calling
set_memory_valid() -> __change_memory_common(). That being a
non-sleepable context, we cannot take the init_mm mmap lock there.

Add comments to highlight the conditions under which the lockless
variant may be used: no underlying VMA, and the caller having exclusive
control over the range, thus guaranteeing no concurrent access.

Since arm64 cannot handle splitting of live kernel mappings without
BBML2, require that the start and end of a given range lie on block
mapping boundaries. Return -EINVAL if a partial block mapping is
detected, and add a corresponding comment in ___change_memory_common()
noting that eliminating such a condition is the caller's responsibility.

apply_to_page_range() currently performs all pte-level callbacks while
in lazy MMU mode. Since arm64 can optimize performance by batching
barriers when modifying kernel pgtables in lazy MMU mode, we would like
to keep benefiting from this optimisation. Unfortunately,
walk_kernel_page_table_range() does not use lazy MMU mode. However,
since the pagewalk framework does not allocate any memory, we can
safely bracket the whole operation in lazy MMU mode ourselves.
Therefore, wrap the call to walk_kernel_page_table_range() with the
lazy MMU helpers.

[1] https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
arch/arm64/mm/pageattr.c | 157 +++++++++++++++++++++++++++++++--------
include/linux/pagewalk.h | 3 +
mm/pagewalk.c | 26 +++++++
3 files changed, 154 insertions(+), 32 deletions(-)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 04d4a8f676db..cfc5279f27a2 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/pagewalk.h>
#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
@@ -20,6 +21,99 @@ struct page_change_data {
pgprot_t clear_mask;
};
+static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
+{
+ struct page_change_data *masks = walk->private;
+
+ val &= ~(pgprot_val(masks->clear_mask));
+ val |= (pgprot_val(masks->set_mask));
+
+ return val;
+}
+
+static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pgd_t val = pgdp_get(pgd);
+
+ if (pgd_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
+ return -EINVAL;
+ val = __pgd(set_pageattr_masks(pgd_val(val), walk));
+ set_pgd(pgd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ p4d_t val = p4dp_get(p4d);
+
+ if (p4d_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != P4D_SIZE))
+ return -EINVAL;
+ val = __p4d(set_pageattr_masks(p4d_val(val), walk));
+ set_p4d(p4d, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = pudp_get(pud);
+
+ if (pud_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
+ return -EINVAL;
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = pmdp_get(pmd);
+
+ if (pmd_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
+ return -EINVAL;
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = __ptep_get(pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ __set_pte(pte, val);
+
+ return 0;
+}
+
+static const struct mm_walk_ops pageattr_ops = {
+ .pgd_entry = pageattr_pgd_entry,
+ .p4d_entry = pageattr_p4d_entry,
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+};
+
bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
bool can_set_direct_map(void)
@@ -37,22 +131,7 @@ bool can_set_direct_map(void)
arm64_kfence_can_set_direct_map() || is_realm_world();
}
-static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
-{
- struct page_change_data *cdata = data;
- pte_t pte = __ptep_get(ptep);
-
- pte = clear_pte_bit(pte, cdata->clear_mask);
- pte = set_pte_bit(pte, cdata->set_mask);
-
- __set_pte(ptep, pte);
- return 0;
-}
-
-/*
- * This function assumes that the range is mapped with PAGE_SIZE pages.
- */
-static int __change_memory_common(unsigned long start, unsigned long size,
+static int ___change_memory_common(unsigned long start, unsigned long size,
pgprot_t set_mask, pgprot_t clear_mask)
{
struct page_change_data data;
@@ -61,9 +140,28 @@ static int __change_memory_common(unsigned long start, unsigned long size,
data.set_mask = set_mask;
data.clear_mask = clear_mask;
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
+ arch_enter_lazy_mmu_mode();
+
+ /*
+ * The caller must ensure that the range we are operating on does not
+ * partially overlap a block mapping. Any such case should either not
+ * exist, or must be eliminated by splitting the mapping - which for
+ * kernel mappings can be done only on BBML2 systems.
+ *
+ */
+ ret = walk_kernel_page_table_range_lockless(start, start + size,
+ &pageattr_ops, NULL, &data);
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+static int __change_memory_common(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
+{
+ int ret;
+
+ ret = ___change_memory_common(start, size, set_mask, clear_mask);
/*
* If the memory is being made valid without changing any other bits
* then a TLBI isn't required as a non-valid entry cannot be cached in
@@ -71,6 +169,7 @@ static int __change_memory_common(unsigned long start, unsigned long size,
*/
if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
flush_tlb_kernel_range(start, start + size);
+
return ret;
}
@@ -174,32 +273,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
int set_direct_map_invalid_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(0),
- .clear_mask = __pgprot(PTE_VALID),
- };
+ pgprot_t clear_mask = __pgprot(PTE_VALID);
+ pgprot_t set_mask = __pgprot(0);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return ___change_memory_common((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
int set_direct_map_default_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(PTE_VALID | PTE_WRITE),
- .clear_mask = __pgprot(PTE_RDONLY),
- };
+ pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
+ pgprot_t clear_mask = __pgprot(PTE_RDONLY);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return ___change_memory_common((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
static int __set_memory_enc_dec(unsigned long addr,
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 8ac2f6d6d2a3..79ab8c754dff 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -132,6 +132,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
int walk_kernel_page_table_range(unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd, void *private);
+int walk_kernel_page_table_range_lockless(unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ff5299eca687..7446984b2154 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -632,6 +632,32 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
return walk_pgd_range(start, end, &walk);
}
+/*
+ * Use this function to walk the kernel page tables locklessly. It should be
+ * guaranteed that the caller has exclusive access over the range they are
+ * operating on - that there should be no concurrent access, for example,
+ * changing permissions for vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ struct mm_struct *mm = &init_mm;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end)
+ return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ return walk_pgd_range(start, end, &walk);
+}
+
/**
* walk_page_range_debug - walk a range of pagetables not backed by a vma
* @mm: mm_struct representing the target process of page table walk
--
2.30.2
On 13/06/25 7:13 pm, Dev Jain wrote:
> +static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long next, struct mm_walk *walk)
> +{
> + pte_t val = __ptep_get(pte);
> +
> + val = __pte(set_pageattr_masks(pte_val(val), walk));
> + __set_pte(pte, val);
> +
> + return 0;
> +}

I was wondering, now that we have vmalloc contpte support, do we need
to ensure in this pte level callback that we don't partially cover a
contpte block?
On 26/06/2025 06:47, Dev Jain wrote:
> I was wondering, now that we have vmalloc contpte support, do we need
> to ensure in this pte level callback that we don't partially cover a
> contpte block?

Yes good point!
On 13/06/2025 14:43, Dev Jain wrote:
> Signed-off-by: Dev Jain <dev.jain@arm.com>

LGTM; with the nits raised by other folks addressed:

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> + ret = walk_kernel_page_table_range_lockless(start, start + size,
> + &pageattr_ops, NULL, &data);

x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
no concurrency in kernel page table updates. I think arm64 has to have
such a lock as well.

--
Sincerely yours,
Mike.
On 15/06/2025 08:32, Mike Rapoport wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>> + ret = walk_kernel_page_table_range_lockless(start, start + size,
>> + &pageattr_ops, NULL, &data);
>
> x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
> no concurrency in kernel page table updates. I think arm64 has to have
> such a lock as well.

We don't have a lock today, using apply_to_page_range(); we are
expecting that the caller has exclusive ownership of the portion of
virtual memory - i.e. the vmalloc region or linear map. So I don't
think this patch changes that requirement?

Where it does get a bit more hairy is when we introduce the support for
splitting. In that case, 2 non-overlapping areas of virtual memory may
share a large leaf mapping that needs to be split. But I've been
discussing that with Yang Shi at [1] and I think we can handle that
locklessly too.

Perhaps I'm misunderstanding something?

[1] https://lore.kernel.org/all/f036acea-1bd1-48a7-8600-75ddd504b8db@arm.com/

Thanks,
Ryan
On 6/25/25 4:04 AM, Ryan Roberts wrote:
> We don't have a lock today, using apply_to_page_range(); we are
> expecting that the caller has exclusive ownership of the portion of
> virtual memory - i.e. the vmalloc region or linear map. So I don't
> think this patch changes that requirement?
>
> Where it does get a bit more hairy is when we introduce the support for
> splitting. In that case, 2 non-overlapping areas of virtual memory may
> share a large leaf mapping that needs to be split. But I've been
> discussing that with Yang Shi at [1] and I think we can handle that
> locklessly too.

If the split is serialized by a lock, changing permission can be
lockless. But if the split is lockless, changing permission may be a
little bit tricky, particularly for CONT mappings. The implementation
in my split patch assumes the whole range has the cont bit cleared if
the first PTE in the range has the cont bit cleared, because the lock
guarantees two concurrent splits are serialized.

But a lockless split may trigger the below race:

CPU A is splitting the page table, CPU B is changing the permission for
one PTE entry in the same table. Clearing the cont bit is RMW, changing
permission is RMW too, but neither of them is atomic.

CPU A                              CPU B
read the PTE                       read the PTE
clear the cont bit for the PTE     change the PTE permission from RW to RO
                                   store the new PTE

store the new PTE  <- it will overwrite the PTE value stored by CPU B
                      and result in misprogrammed cont PTEs

We would need to do one of the following to avoid the race, off the top
of my head:
1. Serialize the split with a lock
2. Make page table RMW atomic in both split and permission change
3. Check whether the PTE is cont or not for every PTE in the range
   instead of just the first PTE, before clearing the cont bit if they
   are
4. Retry if the cont bit is not cleared in permission change, but we
   need to distinguish this from changing permission for the whole CONT
   PTE range, because we keep the cont bit in that case

Thanks,
Yang
On 25/06/2025 21:40, Yang Shi wrote:
> But a lockless split may trigger the below race:
>
> CPU A is splitting the page table, CPU B is changing the permission
> for one PTE entry in the same table. Clearing the cont bit is RMW,
> changing permission is RMW too, but neither of them is atomic.

Ahh yes, good point! I missed that. When I was thinking about this, I
had assumed that *both* CPUs racing to split would (non-atomically) RMW
to remove the cont bit on the whole block. That is safe as long as
nothing else in the PTE changes. But of course you're right that the
first one to complete that may then go on to modify the permissions in
their portion of the now-split VA space. So there is definitely a
problem.

> We would need to do one of the following to avoid the race, off the
> top of my head:
> 1. Serialize the split with a lock

I guess this is certainly the simplest as per your original proposal.

> 2. Make page table RMW atomic in both split and permission change

I don't think we would need atomic RMW for the permission change - we
would only need it for removing the cont bit? My reasoning is that by
the time a thread is doing the permission change it must have already
finished splitting the cont block. The permission change will only be
for PTEs that we know we have exclusive access to. The other CPU may
still be "splitting" the cont block, but since we already won, it will
just be reading the PTEs and noticing that cont is already clear? I
guess split_contpte()/split_contpmd() becomes a loop doing READ_ONCE()
to test if the bit is set, followed by an atomic bit clear if it was
set (avoid the atomic where we can)?

> 3. Check whether the PTE is cont or not for every PTE in the range
>    instead of just the first PTE, before clearing the cont bit if they
>    are

Ahh perhaps this is what I'm actually describing above?

> 4. Retry if the cont bit is not cleared in permission change, but we
>    need to distinguish this from changing permission for the whole
>    CONT PTE range, because we keep the cont bit in that case

I'd prefer to keep the splitting decoupled from the permission change
if we can.

Personally, I'd prefer to take the lockless approach. I think it has
the least chance of contention issues. But if you prefer to use a lock,
then I'm ok with that as a starting point. I'd prefer to use a new
separate lock though (like x86 does) rather than risking extra
contention with the init_mm PTL.

Thanks,
Ryan
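As a standalone illustration of the READ_ONCE()-then-atomic-clear loop
sketched above: this is a userspace model only, using C11 atomics and
made-up helper names and bit positions rather than the real arm64
pgtable helpers. It shows how a retrying compare-exchange keeps the
cont-bit clear from clobbering a racing update (the model also makes
the permission change an atomic RMW, purely to keep the demonstration
self-contained).

/*
 * Userspace model of the lockless cont-bit clearing discussed above.
 * Bit positions and helper names are illustrative, not arm64's.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_CONT_BIT	(1ULL << 52)	/* illustrative */
#define PTE_RDONLY_BIT	(1ULL << 7)	/* illustrative */

/* Clear the contiguous hint; retry if another update raced with us. */
static void split_cont_entry(_Atomic uint64_t *pte)
{
	uint64_t old = atomic_load_explicit(pte, memory_order_relaxed);

	while (old & PTE_CONT_BIT) {
		uint64_t new = old & ~PTE_CONT_BIT;

		/* On failure, 'old' is refreshed and CONT is re-checked. */
		if (atomic_compare_exchange_weak_explicit(pte, &old, new,
							  memory_order_relaxed,
							  memory_order_relaxed))
			break;
	}
}

/* A racing permission change, also an atomic RMW in this model. */
static void make_readonly(_Atomic uint64_t *pte)
{
	atomic_fetch_or_explicit(pte, PTE_RDONLY_BIT, memory_order_relaxed);
}

int main(void)
{
	_Atomic uint64_t pte = 0xdeadb000ULL | PTE_CONT_BIT;

	make_readonly(&pte);	/* "CPU B" */
	split_cont_entry(&pte);	/* "CPU A" */

	/* Both updates survive: RDONLY set, CONT clear. */
	printf("pte = 0x%llx\n", (unsigned long long)atomic_load(&pte));
	return 0;
}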
On 6/26/25 1:47 AM, Ryan Roberts wrote:
>> 1. Serialize the split with a lock
>
> I guess this is certainly the simplest as per your original proposal.

Yeah

>> 2. Make page table RMW atomic in both split and permission change
>
> I don't think we would need atomic RMW for the permission change - we
> would only need it for removing the cont bit? My reasoning is that by
> the time a thread is doing the permission change it must have already
> finished splitting the cont block. The permission change will only be
> for PTEs that we know we have exclusive access to. The other CPU may
> still be "splitting" the cont block, but since we already won, it will
> just be reading the PTEs and noticing that cont is already clear? I
> guess split_contpte()/split_contpmd() becomes a loop doing READ_ONCE()
> to test if the bit is set, followed by an atomic bit clear if it was
> set (avoid the atomic where we can)?
>
>> 3. Check whether the PTE is cont or not for every PTE in the range
>>    instead of just the first PTE, before clearing the cont bit if
>>    they are
>
> Ahh perhaps this is what I'm actually describing above?

Yes

>> 4. Retry if the cont bit is not cleared in permission change, but we
>>    need to distinguish this from changing permission for the whole
>>    CONT PTE range, because we keep the cont bit in that case
>
> I'd prefer to keep the splitting decoupled from the permission change
> if we can.

I agree.

> Personally, I'd prefer to take the lockless approach. I think it has
> the least chance of contention issues. But if you prefer to use a
> lock, then I'm ok with that as a starting point. I'd prefer to use a
> new separate lock though (like x86 does) rather than risking extra
> contention with the init_mm PTL.

A separate lock is fine to me. I think it will make our life easier to
use a lock. We can always optimize it if the lock contention turns out
to be a problem.

Thanks,
Yang
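To make the "separate lock" option concrete, here is a sketch modelled
on x86's cpa_lock. Every identifier in it (split_lock,
split_kernel_leaf_mapping() and its unlocked helper) is hypothetical -
the split series is separate from this patch - and the stub body stands
in for the real leaf-rewriting work:

#include <linux/spinlock.h>

/* Hypothetical: a dedicated lock for kernel pgtable splits. */
static DEFINE_SPINLOCK(split_lock);

/* Placeholder: the real series would rewrite block/contig entries here. */
static int split_kernel_leaf_mapping_unlocked(unsigned long start,
					      unsigned long end)
{
	return 0;
}

static int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
{
	int ret;

	/*
	 * A spinlock rather than a mutex, since callers may be in a
	 * non-sleepable context, and a dedicated lock rather than
	 * init_mm's PTL, to avoid extra contention. Serializing splitters
	 * means the non-atomic RMW that clears the contiguous hint cannot
	 * race with another CPU updating the same entries, while
	 * permission changes themselves stay lockless because each caller
	 * owns its VA range exclusively. A real implementation would also
	 * need to consider the softirq (KFENCE) caller, e.g. spin_lock_bh.
	 */
	spin_lock(&split_lock);
	ret = split_kernel_leaf_mapping_unlocked(start, end);
	spin_unlock(&split_lock);

	return ret;
}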
On 15/06/25 1:02 pm, Mike Rapoport wrote:
> x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
> no concurrency in kernel page table updates. I think arm64 has to have
> such a lock as well.

My understanding is that it is guaranteed that the set_memory_* caller
has exclusive access to the range it is changing permissions for. The
x86 comment is:

    Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity
    mappings) using cpa_lock. So that we don't allow any other cpu, with
    stale large tlb entries change the page attribute in parallel to
    some other cpu splitting a large page entry along with changing the
    attribute.

On arm64 we are doing flush_tlb_kernel_range in __change_memory_common;
and also, the caller of __change_memory_common is required to first
split the start and end before changing permissions, so the splitting
and permission change won't happen in parallel as described by the
comment.
On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> arm64 currently changes permissions on vmalloc objects locklessly, via
> apply_to_page_range, whose limitation is to deny changing permissions for
> block mappings. Therefore, we move away to use the generic pagewalk API,
> thus paving the path for enabling huge mappings by default on kernel space
> mappings, thus leading to more efficient TLB usage. However, the API
> currently enforces the init_mm.mmap_lock to be held. To avoid the
> unnecessary bottleneck of the mmap_lock for our usecase, this patch
> extends this generic API to be used locklessly, so as to retain the
> existing behaviour for changing permissions. Apart from this reason, it is
> noted at [1] that KFENCE can manipulate kernel pgtable entries during
> softirqs. It does this by calling set_memory_valid() -> __change_memory_common().
> This being a non-sleepable context, we cannot take the init_mm mmap lock.
>
> Add comments to highlight the conditions under which we can use the
> lockless variant - no underlying VMA, and the user having exclusive control
> over the range, thus guaranteeing no concurrent access.
>
> Since arm64 cannot handle kernel live mapping splitting without BBML2,
> we require that the start and end of a given range lie on block mapping
> boundaries. Return -EINVAL in case a partial block mapping is detected;
> add a corresponding comment in ___change_memory_common() to warn that
> eliminating such a condition is the responsibility of the caller.
>
> apply_to_page_range() currently performs all pte level callbacks while in
> lazy mmu mode. Since arm64 can optimize performance by batching barriers
> when modifying kernel pgtables in lazy mmu mode, we would like to continue
> to benefit from this optimisation. Unfortunately walk_kernel_page_table_range()
> does not use lazy mmu mode. However, since the pagewalk framework is not
> allocating any memory, we can safely bracket the whole operation inside
> lazy mmu mode ourselves. Therefore, wrap the call to
> walk_kernel_page_table_range() with the lazy MMU helpers.

Thanks this is a great commit message!

>
> [1] https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  arch/arm64/mm/pageattr.c | 157 +++++++++++++++++++++++++++++++--------
>  include/linux/pagewalk.h |   3 +
>  mm/pagewalk.c            |  26 +++++++
>  3 files changed, 154 insertions(+), 32 deletions(-)
>
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index 04d4a8f676db..cfc5279f27a2 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -8,6 +8,7 @@
>  #include <linux/mem_encrypt.h>
>  #include <linux/sched.h>
>  #include <linux/vmalloc.h>
> +#include <linux/pagewalk.h>
>
>  #include <asm/cacheflush.h>
>  #include <asm/pgtable-prot.h>
> @@ -20,6 +21,99 @@ struct page_change_data {
>  	pgprot_t clear_mask;
>  };
>
> +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
> +{
> +	struct page_change_data *masks = walk->private;
> +
> +	val &= ~(pgprot_val(masks->clear_mask));
> +	val |= (pgprot_val(masks->set_mask));
> +
> +	return val;
> +}
> +
> +static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pgd_t val = pgdp_get(pgd);
> +
> +	if (pgd_leaf(val)) {

Does arm have PGD entry level leaves? :) just curious :P

> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
> +			return -EINVAL;

I guess the point here is to assert that the searched range _entirely
spans_ the folio that the higher order leaf page table entry describes.

I'm guessing this is desired.

But I'm not sure this should be a warning?

What if you happen to walk a range that isn't aligned like this?

(Same comment goes for the other instances of the same pattern)

> +		val = __pgd(set_pageattr_masks(pgd_val(val), walk));
> +		set_pgd(pgd, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	p4d_t val = p4dp_get(p4d);
> +
> +	if (p4d_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != P4D_SIZE))
> +			return -EINVAL;
> +		val = __p4d(set_pageattr_masks(p4d_val(val), walk));
> +		set_p4d(p4d, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pud_t val = pudp_get(pud);
> +
> +	if (pud_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
> +			return -EINVAL;
> +		val = __pud(set_pageattr_masks(pud_val(val), walk));
> +		set_pud(pud, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pmd_t val = pmdp_get(pmd);
> +
> +	if (pmd_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
> +			return -EINVAL;
> +		val = __pmd(set_pageattr_masks(pmd_val(val), walk));
> +		set_pmd(pmd, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pte_t val = __ptep_get(pte);
> +
> +	val = __pte(set_pageattr_masks(pte_val(val), walk));
> +	__set_pte(pte, val);
> +
> +	return 0;
> +}
> +
> +static const struct mm_walk_ops pageattr_ops = {
> +	.pgd_entry = pageattr_pgd_entry,
> +	.p4d_entry = pageattr_p4d_entry,
> +	.pud_entry = pageattr_pud_entry,
> +	.pmd_entry = pageattr_pmd_entry,
> +	.pte_entry = pageattr_pte_entry,
> +};
> +
>  bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
>
>  bool can_set_direct_map(void)
> @@ -37,22 +131,7 @@ bool can_set_direct_map(void)
>  		arm64_kfence_can_set_direct_map() || is_realm_world();
>  }
>
> -static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
> -{
> -	struct page_change_data *cdata = data;
> -	pte_t pte = __ptep_get(ptep);
> -
> -	pte = clear_pte_bit(pte, cdata->clear_mask);
> -	pte = set_pte_bit(pte, cdata->set_mask);
> -
> -	__set_pte(ptep, pte);
> -	return 0;
> -}
> -
> -/*
> - * This function assumes that the range is mapped with PAGE_SIZE pages.
> - */
> -static int __change_memory_common(unsigned long start, unsigned long size,
> +static int ___change_memory_common(unsigned long start, unsigned long size,
>  				   pgprot_t set_mask, pgprot_t clear_mask)

I wouldn't presume to comment on conventions in arm64 arch code, but that's
a lot of underscores :P

I wonder if this is the best name for it as you're not only invoking it
from __change_memory_common()?

And yes this is a pedantic comment, I realise :)

>  {
>  	struct page_change_data data;
> @@ -61,9 +140,28 @@ static int __change_memory_common(unsigned long start, unsigned long size,
>  	data.set_mask = set_mask;
>  	data.clear_mask = clear_mask;
>
> -	ret = apply_to_page_range(&init_mm, start, size, change_page_range,
> -				  &data);
> +	arch_enter_lazy_mmu_mode();
> +
> +	/*
> +	 * The caller must ensure that the range we are operating on does not
> +	 * partially overlap a block mapping. Any such case should either not
> +	 * exist, or must be eliminated by splitting the mapping - which for
> +	 * kernel mappings can be done only on BBML2 systems.
> +	 *

EVEN MORE RIDICULOUS NIT: extra line in comment here!

> +	 */

OK so this probably answers the comment above re: the warnings.

> +	ret = walk_kernel_page_table_range_lockless(start, start + size,
> +						    &pageattr_ops, NULL, &data);
> +	arch_leave_lazy_mmu_mode();
> +
> +	return ret;
> +}
>
> +static int __change_memory_common(unsigned long start, unsigned long size,
> +				  pgprot_t set_mask, pgprot_t clear_mask)
> +{
> +	int ret;
> +
> +	ret = ___change_memory_common(start, size, set_mask, clear_mask);
>  	/*
>  	 * If the memory is being made valid without changing any other bits
>  	 * then a TLBI isn't required as a non-valid entry cannot be cached in
> @@ -71,6 +169,7 @@ static int __change_memory_common(unsigned long start, unsigned long size,
>  	 */
>  	if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
>  		flush_tlb_kernel_range(start, start + size);
> +
>  	return ret;
>  }
>
> @@ -174,32 +273,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
>
>  int set_direct_map_invalid_noflush(struct page *page)
>  {
> -	struct page_change_data data = {
> -		.set_mask = __pgprot(0),
> -		.clear_mask = __pgprot(PTE_VALID),
> -	};
> +	pgprot_t clear_mask = __pgprot(PTE_VALID);
> +	pgprot_t set_mask = __pgprot(0);
>
>  	if (!can_set_direct_map())
>  		return 0;
>
> -	return apply_to_page_range(&init_mm,
> -				   (unsigned long)page_address(page),
> -				   PAGE_SIZE, change_page_range, &data);
> +	return ___change_memory_common((unsigned long)page_address(page),
> +				       PAGE_SIZE, set_mask, clear_mask);
>  }
>
>  int set_direct_map_default_noflush(struct page *page)
>  {
> -	struct page_change_data data = {
> -		.set_mask = __pgprot(PTE_VALID | PTE_WRITE),
> -		.clear_mask = __pgprot(PTE_RDONLY),
> -	};
> +	pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
> +	pgprot_t clear_mask = __pgprot(PTE_RDONLY);
>
>  	if (!can_set_direct_map())
>  		return 0;
>
> -	return apply_to_page_range(&init_mm,
> -				   (unsigned long)page_address(page),
> -				   PAGE_SIZE, change_page_range, &data);
> +	return ___change_memory_common((unsigned long)page_address(page),
> +				       PAGE_SIZE, set_mask, clear_mask);
>  }
>
>  static int __set_memory_enc_dec(unsigned long addr,
> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
> index 8ac2f6d6d2a3..79ab8c754dff 100644
> --- a/include/linux/pagewalk.h
> +++ b/include/linux/pagewalk.h
> @@ -132,6 +132,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
>  int walk_kernel_page_table_range(unsigned long start,
>  		unsigned long end, const struct mm_walk_ops *ops,
>  		pgd_t *pgd, void *private);
> +int walk_kernel_page_table_range_lockless(unsigned long start,
> +		unsigned long end, const struct mm_walk_ops *ops,
> +		pgd_t *pgd, void *private);
>  int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
>  		unsigned long end, const struct mm_walk_ops *ops,
>  		void *private);
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index ff5299eca687..7446984b2154 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -632,6 +632,32 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
>  	return walk_pgd_range(start, end, &walk);
>  }
>
> +/*
> + * Use this function to walk the kernel page tables locklessly. It should be
> + * guaranteed that the caller has exclusive access over the range they are
> + * operating on - that there should be no concurrent access, for example,
> + * changing permissions for vmalloc objects.
> + */
> +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
> +		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)

Don't really see the point in having a pgd parameter?

> +{
> +	struct mm_struct *mm = &init_mm;

No real need to split out mm here, just reference &init_mm direct below.

> +	struct mm_walk walk = {
> +		.ops		= ops,
> +		.mm		= mm,
> +		.pgd		= pgd,
> +		.private	= private,
> +		.no_vma		= true
> +	};
> +
> +	if (start >= end)
> +		return -EINVAL;
> +	if (!check_ops_valid(ops))
> +		return -EINVAL;
> +
> +	return walk_pgd_range(start, end, &walk);
> +}
> +
>  /**
>   * walk_page_range_debug - walk a range of pagetables not backed by a vma
>   * @mm: mm_struct representing the target process of page table walk
> --
> 2.30.2
>

Other than nits, etc. this looks fine. Thanks for the refactorings!
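
For illustration, a minimal sketch of what the helper might look like with both
nits folded in (pgd parameter dropped, &init_mm referenced directly). This is
not the posted patch; it assumes the existing check_ops_valid() and
walk_pgd_range() internals of mm/pagewalk.c as used above:

int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= &init_mm,	/* kernel page tables only */
		.private	= private,
		.no_vma		= true,		/* no VMAs back this range */
	};

	/* No mmap_lock is taken: the caller guarantees exclusive access. */
	if (start >= end)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	return walk_pgd_range(start, end, &walk);
}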
On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> > -/*
> > - * This function assumes that the range is mapped with PAGE_SIZE pages.
> > - */
> > -static int __change_memory_common(unsigned long start, unsigned long size,
> > +static int ___change_memory_common(unsigned long start, unsigned long size,
> >  				   pgprot_t set_mask, pgprot_t clear_mask)
>
> I wouldn't presume to comment on conventions in arm64 arch code, but that's
> a lot of underscores :P
>
> I wonder if this is the best name for it as you're not only invoking it
> from __change_memory_common()?

Could update_range_pgport() work?

> And yes this is a pedantic comment, I realise :)
>

--
Sincerely yours,
Mike.
On 15/06/2025 08:25, Mike Rapoport wrote:
> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>> -/*
>>> - * This function assumes that the range is mapped with PAGE_SIZE pages.
>>> - */
>>> -static int __change_memory_common(unsigned long start, unsigned long size,
>>> +static int ___change_memory_common(unsigned long start, unsigned long size,
>>>  				   pgprot_t set_mask, pgprot_t clear_mask)
>>
>> I wouldn't presume to comment on conventions in arm64 arch code, but that's
>> a lot of underscores :P
>>
>> I wonder if this is the best name for it as you're not only invoking it
>> from __change_memory_common()?
>
> Could update_range_pgport() work?

Sounds good to me!

>
>> And yes this is a pedantic comment, I realise :)
>>
>
On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> > +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
> > +			return -EINVAL;
>
> I guess the point here is to assert that the searched range _entirely
> spans_ the folio that the higher order leaf page table entry describes.
>
> I'm guessing this is desired.
>
> But I'm not sure this should be a warning?
>
> What if you happen to walk a range that isn't aligned like this?

My understanding is that the caller must ensure that addr is
pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
the WARN_ON_ONCE() is needed.

--
~karim
On 14/06/25 8:20 pm, Karim Manaouil wrote:
> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
>>> +			return -EINVAL;
>> I guess the point here is to assert that the searched range _entirely
>> spans_ the folio that the higher order leaf page table entry describes.
>>
>> I'm guessing this is desired.
>>
>> But I'm not sure this should be a warning?
>>
>> What if you happen to walk a range that isn't aligned like this?
> My understanding is that the caller must ensure that addr is
> pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
> the WARN_ON_ONCE() is needed.

I don't really have a strong opinion on this. Ryan may be better placed
to answer.

>
On 19/06/2025 05:03, Dev Jain wrote:
>
> On 14/06/25 8:20 pm, Karim Manaouil wrote:
>> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>>> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
>>>> +			return -EINVAL;
>>> I guess the point here is to assert that the searched range _entirely
>>> spans_ the folio that the higher order leaf page table entry describes.
>>>
>>> I'm guessing this is desired.
>>>
>>> But I'm not sure this should be a warning?
>>>
>>> What if you happen to walk a range that isn't aligned like this?
>> My understanding is that the caller must ensure that addr is
>> pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
>> the WARN_ON_ONCE() is needed.
>
> I don't really have a strong opinion on this. Ryan may be better placed
> to answer.

IMHO it's a pretty serious programming cock-up if we ever find this
situation, so I'd prefer to emit the warning.

apply_to_page_range() (which we are replacing here) emits WARN_ON_ONCE()
if it arrives at any non-page leaf mappings, which is stronger than what
we have here; it expects only to modify permissions on regions that are
pte-mapped. Here we are relaxing that requirement to only require that
the region begin and end on a leaf mapping boundary, where the leaf can
be at any level. So for now, we still only expect this code to get called
for regions that are fully pte-mapped.

This series is an enabler to allow us to change that in future though.
Yang Shi is working on a series which will ensure that a region that we
want to change permissions for has its start and end on a leaf boundary
by dynamically splitting the leaf mappings as needed (which can be done
safely on arm64 when FEAT_BBM level 2 is supported). This will then open
up the door to mapping the linear map and vmalloc with large leaf
mappings by default. But due to the splitting we ensure never to trigger
the warning; if we do, that is a bug.

Thanks,
Ryan
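
To make the boundary condition concrete: when the walker reaches a leaf entry,
addr is the current walk position and next is clamped to whichever comes first,
the end of the requested range or the next block boundary, so (next - addr)
equals the block size only when the range covers the whole block. A
hypothetical helper (not in the patch; the callbacks open-code the check) would
read:

/*
 * Hypothetical restatement of the check in the pageattr_*_entry()
 * callbacks: true iff the current walk window [addr, next) spans the
 * whole leaf block being visited. A partial overlap would require
 * splitting the live mapping, which arm64 cannot do without FEAT_BBM
 * level 2, hence the WARN_ON_ONCE() plus -EINVAL in the callbacks.
 */
static inline bool walk_covers_whole_block(unsigned long addr,
					   unsigned long next,
					   unsigned long block_size)
{
	return (next - addr) == block_size;
}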