arm64 currently changes permissions on vmalloc objects locklessly, via
apply_to_page_range(), whose limitation is that it refuses to change
permissions on block mappings. Therefore, move over to the generic
pagewalk API, paving the way for enabling huge mappings by default on
kernel space mappings and hence more efficient TLB usage. However, the
API currently requires init_mm.mmap_lock to be held. To avoid this
unnecessary bottleneck for our use case, extend the generic API so that
it can be used locklessly, retaining the existing behaviour for
changing permissions. Apart from this, it is noted at [1] that KFENCE
can manipulate kernel pgtable entries during softirqs, by calling
set_memory_valid() -> __change_memory_common(). That being a
non-sleepable context, we cannot take the init_mm mmap lock there.

Add comments to highlight the conditions under which the lockless
variant may be used: no underlying VMA, and the caller having exclusive
control over the range, thus guaranteeing no concurrent access.

Since arm64 cannot handle splitting of live kernel mappings without
BBML2, require that the start and end of a given range lie on block
mapping boundaries. Return -EINVAL if a partial block mapping is
detected, and add a corresponding comment in ___change_memory_common()
noting that eliminating such a condition is the caller's responsibility.

apply_to_page_range() currently performs all pte-level callbacks while
in lazy MMU mode. Since arm64 can optimize performance by batching
barriers when modifying kernel pgtables in lazy MMU mode, we would like
to keep benefiting from this optimisation. Unfortunately,
walk_kernel_page_table_range() does not use lazy MMU mode. However,
since the pagewalk framework does not allocate any memory, we can
safely bracket the whole operation in lazy MMU mode ourselves.
Therefore, wrap the call to walk_kernel_page_table_range() with the
lazy MMU helpers.

[1] https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
arch/arm64/mm/pageattr.c | 157 +++++++++++++++++++++++++++++++--------
include/linux/pagewalk.h | 3 +
mm/pagewalk.c | 26 +++++++
3 files changed, 154 insertions(+), 32 deletions(-)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 04d4a8f676db..cfc5279f27a2 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/pagewalk.h>
#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
@@ -20,6 +21,99 @@ struct page_change_data {
pgprot_t clear_mask;
};
+static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
+{
+ struct page_change_data *masks = walk->private;
+
+ val &= ~(pgprot_val(masks->clear_mask));
+ val |= (pgprot_val(masks->set_mask));
+
+ return val;
+}
+
+static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pgd_t val = pgdp_get(pgd);
+
+ if (pgd_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
+ return -EINVAL;
+ val = __pgd(set_pageattr_masks(pgd_val(val), walk));
+ set_pgd(pgd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ p4d_t val = p4dp_get(p4d);
+
+ if (p4d_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != P4D_SIZE))
+ return -EINVAL;
+ val = __p4d(set_pageattr_masks(p4d_val(val), walk));
+ set_p4d(p4d, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = pudp_get(pud);
+
+ if (pud_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
+ return -EINVAL;
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = pmdp_get(pmd);
+
+ if (pmd_leaf(val)) {
+ if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
+ return -EINVAL;
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = __ptep_get(pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ __set_pte(pte, val);
+
+ return 0;
+}
+
+static const struct mm_walk_ops pageattr_ops = {
+ .pgd_entry = pageattr_pgd_entry,
+ .p4d_entry = pageattr_p4d_entry,
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+};
+
bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
bool can_set_direct_map(void)
@@ -37,22 +131,7 @@ bool can_set_direct_map(void)
arm64_kfence_can_set_direct_map() || is_realm_world();
}
-static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
-{
- struct page_change_data *cdata = data;
- pte_t pte = __ptep_get(ptep);
-
- pte = clear_pte_bit(pte, cdata->clear_mask);
- pte = set_pte_bit(pte, cdata->set_mask);
-
- __set_pte(ptep, pte);
- return 0;
-}
-
-/*
- * This function assumes that the range is mapped with PAGE_SIZE pages.
- */
-static int __change_memory_common(unsigned long start, unsigned long size,
+static int ___change_memory_common(unsigned long start, unsigned long size,
pgprot_t set_mask, pgprot_t clear_mask)
{
struct page_change_data data;
@@ -61,9 +140,28 @@ static int __change_memory_common(unsigned long start, unsigned long size,
data.set_mask = set_mask;
data.clear_mask = clear_mask;
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
+ arch_enter_lazy_mmu_mode();
+
+ /*
+ * The caller must ensure that the range we are operating on does not
+ * partially overlap a block mapping. Any such case should either not
+ * exist, or must be eliminated by splitting the mapping - which for
+ * kernel mappings can be done only on BBML2 systems.
+ *
+ */
+ ret = walk_kernel_page_table_range_lockless(start, start + size,
+ &pageattr_ops, NULL, &data);
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+static int __change_memory_common(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
+{
+ int ret;
+
+ ret = ___change_memory_common(start, size, set_mask, clear_mask);
/*
* If the memory is being made valid without changing any other bits
* then a TLBI isn't required as a non-valid entry cannot be cached in
@@ -71,6 +169,7 @@ static int __change_memory_common(unsigned long start, unsigned long size,
*/
if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
flush_tlb_kernel_range(start, start + size);
+
return ret;
}
@@ -174,32 +273,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
int set_direct_map_invalid_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(0),
- .clear_mask = __pgprot(PTE_VALID),
- };
+ pgprot_t clear_mask = __pgprot(PTE_VALID);
+ pgprot_t set_mask = __pgprot(0);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return ___change_memory_common((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
int set_direct_map_default_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(PTE_VALID | PTE_WRITE),
- .clear_mask = __pgprot(PTE_RDONLY),
- };
+ pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
+ pgprot_t clear_mask = __pgprot(PTE_RDONLY);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return ___change_memory_common((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
static int __set_memory_enc_dec(unsigned long addr,
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 8ac2f6d6d2a3..79ab8c754dff 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -132,6 +132,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
int walk_kernel_page_table_range(unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd, void *private);
+int walk_kernel_page_table_range_lockless(unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ff5299eca687..7446984b2154 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -632,6 +632,32 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
return walk_pgd_range(start, end, &walk);
}
+/*
+ * Use this function to walk the kernel page tables locklessly. It should be
+ * guaranteed that the caller has exclusive access over the range they are
+ * operating on - that there should be no concurrent access, for example,
+ * changing permissions for vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ struct mm_struct *mm = &init_mm;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end)
+ return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ return walk_pgd_range(start, end, &walk);
+}
+
/**
* walk_page_range_debug - walk a range of pagetables not backed by a vma
* @mm: mm_struct representing the target process of page table walk
--
2.30.2
On 13/06/25 7:13 pm, Dev Jain wrote:
> +static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long next, struct mm_walk *walk)
> +{
> + pte_t val = __ptep_get(pte);
> +
> + val = __pte(set_pageattr_masks(pte_val(val), walk));
> + __set_pte(pte, val);
> +
> + return 0;
> +}

I was wondering, now that we have vmalloc contpte support, do we need
to ensure in this pte level callback that we don't partially cover a
contpte block?
On 26/06/2025 06:47, Dev Jain wrote:
> I was wondering, now that we have vmalloc contpte support, do we need
> to ensure in this pte level callback that we don't partially cover a
> contpte block?

Yes good point!
On 13/06/2025 14:43, Dev Jain wrote:
> Signed-off-by: Dev Jain <dev.jain@arm.com>

LGTM; with the nits raised by other folks addressed:

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> + ret = walk_kernel_page_table_range_lockless(start, start + size,
> + &pageattr_ops, NULL, &data);

x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
no concurrency in kernel page table updates. I think arm64 has to have
such a lock as well.

--
Sincerely yours,
Mike.
On 15/06/2025 08:32, Mike Rapoport wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>> + ret = walk_kernel_page_table_range_lockless(start, start + size,
>> + &pageattr_ops, NULL, &data);
>
> x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
> no concurrency in kernel page table updates. I think arm64 has to have
> such a lock as well.

We don't have a lock today, using apply_to_page_range(); we are
expecting that the caller has exclusive ownership of the portion of
virtual memory - i.e. the vmalloc region or linear map. So I don't
think this patch changes that requirement?

Where it does get a bit more hairy is when we introduce the support for
splitting. In that case, 2 non-overlapping areas of virtual memory may
share a large leaf mapping that needs to be split. But I've been
discussing that with Yang Shi at [1] and I think we can handle that
locklessly too.

Perhaps I'm misunderstanding something?

[1] https://lore.kernel.org/all/f036acea-1bd1-48a7-8600-75ddd504b8db@arm.com/

Thanks,
Ryan
On 6/25/25 4:04 AM, Ryan Roberts wrote:
> We don't have a lock today, using apply_to_page_range(); we are
> expecting that the caller has exclusive ownership of the portion of
> virtual memory - i.e. the vmalloc region or linear map. So I don't
> think this patch changes that requirement?
>
> Where it does get a bit more hairy is when we introduce the support for
> splitting. In that case, 2 non-overlapping areas of virtual memory may
> share a large leaf mapping that needs to be split. But I've been
> discussing that with Yang Shi at [1] and I think we can handle that
> locklessly too.

If the split is serialized by a lock, changing permission can be
lockless. But if the split is lockless, changing permission may be a
little bit tricky, particularly for CONT mappings. The implementation
in my split patch assumes the whole range has the cont bit cleared if
the first PTE in the range has the cont bit cleared, because the lock
guarantees two concurrent splits are serialized.

But a lockless split may trigger the below race:

CPU A is splitting the page table, CPU B is changing the permission for
one PTE entry in the same table. Clearing the cont bit is RMW, changing
permission is RMW too, but neither of them is atomic.

CPU A                              CPU B
read the PTE                       read the PTE
clear the cont bit for the PTE     change the PTE permission from RW to RO
                                   store the new PTE

store the new PTE  <- it will overwrite the PTE value stored by CPU B
                      and result in misprogrammed cont PTEs

We would need to do one of the following to avoid the race, off the top
of my head:
1. Serialize the split with a lock
2. Make page table RMW atomic in both split and permission change
3. Check whether the PTE is cont or not for every PTE in the range
   instead of just the first PTE, before clearing the cont bit if they
   are
4. Retry if the cont bit is not cleared in permission change, but we
   need to distinguish this from changing permission for the whole CONT
   PTE range, because we keep the cont bit in that case

Thanks,
Yang
On 25/06/2025 21:40, Yang Shi wrote:
> But a lockless split may trigger the below race:
>
> CPU A is splitting the page table, CPU B is changing the permission
> for one PTE entry in the same table. Clearing the cont bit is RMW,
> changing permission is RMW too, but neither of them is atomic.

Ahh yes, good point! I missed that. When I was thinking about this, I
had assumed that *both* CPUs racing to split would (non-atomically) RMW
to remove the cont bit on the whole block. That is safe as long as
nothing else in the PTE changes. But of course you're right that the
first one to complete that may then go on to modify the permissions in
their portion of the now-split VA space. So there is definitely a
problem.

> We would need to do one of the following to avoid the race, off the
> top of my head:
> 1. Serialize the split with a lock

I guess this is certainly the simplest as per your original proposal.

> 2. Make page table RMW atomic in both split and permission change

I don't think we would need atomic RMW for the permission change - we
would only need it for removing the cont bit? My reasoning is that by
the time a thread is doing the permission change it must have already
finished splitting the cont block. The permission change will only be
for PTEs that we know we have exclusive access to. The other CPU may
still be "splitting" the cont block, but since we already won, it will
just be reading the PTEs and noticing that cont is already clear? I
guess split_contpte()/split_contpmd() becomes a loop doing READ_ONCE()
to test if the bit is set, followed by an atomic bit clear if it was
set (avoid the atomic where we can)?

> 3. Check whether the PTE is cont or not for every PTE in the range
>    instead of just the first PTE, before clearing the cont bit if they
>    are

Ahh perhaps this is what I'm actually describing above?

> 4. Retry if the cont bit is not cleared in permission change, but we
>    need to distinguish this from changing permission for the whole
>    CONT PTE range, because we keep the cont bit in that case

I'd prefer to keep the splitting decoupled from the permission change
if we can.

Personally, I'd prefer to take the lockless approach. I think it has
the least chance of contention issues. But if you prefer to use a lock,
then I'm ok with that as a starting point. I'd prefer to use a new
separate lock though (like x86 does) rather than risking extra
contention with the init_mm PTL.

Thanks,
Ryan
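As a standalone illustration of the READ_ONCE()-then-atomic-clear loop
sketched above: this is a userspace model only, using C11 atomics and
made-up helper names and bit positions rather than the real arm64
pgtable helpers. It shows how a retrying compare-exchange keeps the
cont-bit clear from clobbering a racing update (the model also makes
the permission change an atomic RMW, purely to keep the demonstration
self-contained).

/*
 * Userspace model of the lockless cont-bit clearing discussed above.
 * Bit positions and helper names are illustrative, not arm64's.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_CONT_BIT	(1ULL << 52)	/* illustrative */
#define PTE_RDONLY_BIT	(1ULL << 7)	/* illustrative */

/* Clear the contiguous hint; retry if another update raced with us. */
static void split_cont_entry(_Atomic uint64_t *pte)
{
	uint64_t old = atomic_load_explicit(pte, memory_order_relaxed);

	while (old & PTE_CONT_BIT) {
		uint64_t new = old & ~PTE_CONT_BIT;

		/* On failure, 'old' is refreshed and CONT is re-checked. */
		if (atomic_compare_exchange_weak_explicit(pte, &old, new,
							  memory_order_relaxed,
							  memory_order_relaxed))
			break;
	}
}

/* A racing permission change, also an atomic RMW in this model. */
static void make_readonly(_Atomic uint64_t *pte)
{
	atomic_fetch_or_explicit(pte, PTE_RDONLY_BIT, memory_order_relaxed);
}

int main(void)
{
	_Atomic uint64_t pte = 0xdeadb000ULL | PTE_CONT_BIT;

	make_readonly(&pte);	/* "CPU B" */
	split_cont_entry(&pte);	/* "CPU A" */

	/* Both updates survive: RDONLY set, CONT clear. */
	printf("pte = 0x%llx\n", (unsigned long long)atomic_load(&pte));
	return 0;
}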
On 6/26/25 1:47 AM, Ryan Roberts wrote:
>> 1. Serialize the split with a lock
>
> I guess this is certainly the simplest as per your original proposal.

Yeah

>> 2. Make page table RMW atomic in both split and permission change
>
> I don't think we would need atomic RMW for the permission change - we
> would only need it for removing the cont bit? My reasoning is that by
> the time a thread is doing the permission change it must have already
> finished splitting the cont block. The permission change will only be
> for PTEs that we know we have exclusive access to. The other CPU may
> still be "splitting" the cont block, but since we already won, it will
> just be reading the PTEs and noticing that cont is already clear? I
> guess split_contpte()/split_contpmd() becomes a loop doing READ_ONCE()
> to test if the bit is set, followed by an atomic bit clear if it was
> set (avoid the atomic where we can)?
>
>> 3. Check whether the PTE is cont or not for every PTE in the range
>>    instead of just the first PTE, before clearing the cont bit if
>>    they are
>
> Ahh perhaps this is what I'm actually describing above?

Yes

>> 4. Retry if the cont bit is not cleared in permission change, but we
>>    need to distinguish this from changing permission for the whole
>>    CONT PTE range, because we keep the cont bit in that case
>
> I'd prefer to keep the splitting decoupled from the permission change
> if we can.

I agree.

> Personally, I'd prefer to take the lockless approach. I think it has
> the least chance of contention issues. But if you prefer to use a
> lock, then I'm ok with that as a starting point. I'd prefer to use a
> new separate lock though (like x86 does) rather than risking extra
> contention with the init_mm PTL.

A separate lock is fine to me. I think it will make our life easier to
use a lock. We can always optimize it if the lock contention turns out
to be a problem.

Thanks,
Yang
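To make the "separate lock" option concrete, here is a sketch modelled
on x86's cpa_lock. Every identifier in it (split_lock,
split_kernel_leaf_mapping() and its unlocked helper) is hypothetical -
the split series is separate from this patch - and the stub body stands
in for the real leaf-rewriting work:

#include <linux/spinlock.h>

/* Hypothetical: a dedicated lock for kernel pgtable splits. */
static DEFINE_SPINLOCK(split_lock);

/* Placeholder: the real series would rewrite block/contig entries here. */
static int split_kernel_leaf_mapping_unlocked(unsigned long start,
					      unsigned long end)
{
	return 0;
}

static int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
{
	int ret;

	/*
	 * A spinlock rather than a mutex, since callers may be in a
	 * non-sleepable context, and a dedicated lock rather than
	 * init_mm's PTL, to avoid extra contention. Serializing splitters
	 * means the non-atomic RMW that clears the contiguous hint cannot
	 * race with another CPU updating the same entries, while
	 * permission changes themselves stay lockless because each caller
	 * owns its VA range exclusively. A real implementation would also
	 * need to consider the softirq (KFENCE) caller, e.g. spin_lock_bh.
	 */
	spin_lock(&split_lock);
	ret = split_kernel_leaf_mapping_unlocked(start, end);
	spin_unlock(&split_lock);

	return ret;
}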
On 15/06/25 1:02 pm, Mike Rapoport wrote:
> x86 has a cpa_lock for set_memory/set_direct_map to ensure that there's
> no concurrency in kernel page table updates. I think arm64 has to have
> such a lock as well.

My understanding is that it is guaranteed that the set_memory_* caller
has exclusive access to the range it is changing permissions for. The
x86 comment is:

    Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity
    mappings) using cpa_lock. So that we don't allow any other cpu, with
    stale large tlb entries change the page attribute in parallel to
    some other cpu splitting a large page entry along with changing the
    attribute.

On arm64 we are doing flush_tlb_kernel_range in __change_memory_common;
and also, the caller of __change_memory_common is required to first
split the start and end before changing permissions, so the splitting
and permission change won't happen in parallel as described by the
comment.
On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> arm64 currently changes permissions on vmalloc objects locklessly, via
> apply_to_page_range, whose limitation is to deny changing permissions for
> block mappings. Therefore, we move away to use the generic pagewalk API,
> thus paving the path for enabling huge mappings by default on kernel space
> mappings, thus leading to more efficient TLB usage. However, the API
> currently enforces the init_mm.mmap_lock to be held. To avoid the
> unnecessary bottleneck of the mmap_lock for our usecase, this patch
> extends this generic API to be used locklessly, so as to retain the
> existing behaviour for changing permissions. Apart from this reason, it is
> noted at [1] that KFENCE can manipulate kernel pgtable entries during
> softirqs. It does this by calling set_memory_valid() -> __change_memory_common().
> This being a non-sleepable context, we cannot take the init_mm mmap lock.
>
> Add comments to highlight the conditions under which we can use the
> lockless variant - no underlying VMA, and the user having exclusive control
> over the range, thus guaranteeing no concurrent access.
>
> Since arm64 cannot handle kernel live mapping splitting without BBML2,
> we require that the start and end of a given range lie on block mapping
> boundaries. Return -EINVAL in case a partial block mapping is detected;
> add a corresponding comment in ___change_memory_common() to warn that
> eliminating such a condition is the responsibility of the caller.
>
> apply_to_page_range() currently performs all pte level callbacks while in
> lazy mmu mode. Since arm64 can optimize performance by batching barriers
> when modifying kernel pgtables in lazy mmu mode, we would like to continue
> to benefit from this optimisation. Unfortunately walk_kernel_page_table_range()
> does not use lazy mmu mode. However, since the pagewalk framework is not
> allocating any memory, we can safely bracket the whole operation inside
> lazy mmu mode ourselves. Therefore, wrap the call to
> walk_kernel_page_table_range() with the lazy MMU helpers.

Thanks this is a great commit message!

>
> [1] https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  arch/arm64/mm/pageattr.c | 157 +++++++++++++++++++++++++++++++--------
>  include/linux/pagewalk.h |   3 +
>  mm/pagewalk.c            |  26 +++++++
>  3 files changed, 154 insertions(+), 32 deletions(-)
>
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index 04d4a8f676db..cfc5279f27a2 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -8,6 +8,7 @@
>  #include <linux/mem_encrypt.h>
>  #include <linux/sched.h>
>  #include <linux/vmalloc.h>
> +#include <linux/pagewalk.h>
>
>  #include <asm/cacheflush.h>
>  #include <asm/pgtable-prot.h>
> @@ -20,6 +21,99 @@ struct page_change_data {
>  	pgprot_t clear_mask;
>  };
>
> +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
> +{
> +	struct page_change_data *masks = walk->private;
> +
> +	val &= ~(pgprot_val(masks->clear_mask));
> +	val |= (pgprot_val(masks->set_mask));
> +
> +	return val;
> +}
> +
> +static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pgd_t val = pgdp_get(pgd);
> +
> +	if (pgd_leaf(val)) {

Does arm have PGD entry level leaves? :) just curious :P

> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
> +			return -EINVAL;

I guess the point here is to assert that the searched range _entirely
spans_ the folio that the higher order leaf page table entry describes.

I'm guessing this is desired.

But I'm not sure this should be a warning?

What if you happen to walk a range that isn't aligned like this?

(Same comment goes for the other instances of the same pattern)

> +		val = __pgd(set_pageattr_masks(pgd_val(val), walk));
> +		set_pgd(pgd, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	p4d_t val = p4dp_get(p4d);
> +
> +	if (p4d_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != P4D_SIZE))
> +			return -EINVAL;
> +		val = __p4d(set_pageattr_masks(p4d_val(val), walk));
> +		set_p4d(p4d, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pud_t val = pudp_get(pud);
> +
> +	if (pud_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
> +			return -EINVAL;
> +		val = __pud(set_pageattr_masks(pud_val(val), walk));
> +		set_pud(pud, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pmd_t val = pmdp_get(pmd);
> +
> +	if (pmd_leaf(val)) {
> +		if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
> +			return -EINVAL;
> +		val = __pmd(set_pageattr_masks(pmd_val(val), walk));
> +		set_pmd(pmd, val);
> +		walk->action = ACTION_CONTINUE;
> +	}
> +
> +	return 0;
> +}
> +
> +static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
> +			      unsigned long next, struct mm_walk *walk)
> +{
> +	pte_t val = __ptep_get(pte);
> +
> +	val = __pte(set_pageattr_masks(pte_val(val), walk));
> +	__set_pte(pte, val);
> +
> +	return 0;
> +}
> +
> +static const struct mm_walk_ops pageattr_ops = {
> +	.pgd_entry = pageattr_pgd_entry,
> +	.p4d_entry = pageattr_p4d_entry,
> +	.pud_entry = pageattr_pud_entry,
> +	.pmd_entry = pageattr_pmd_entry,
> +	.pte_entry = pageattr_pte_entry,
> +};
> +
>  bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
>
>  bool can_set_direct_map(void)
> @@ -37,22 +131,7 @@ bool can_set_direct_map(void)
>  		arm64_kfence_can_set_direct_map() || is_realm_world();
>  }
>
> -static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
> -{
> -	struct page_change_data *cdata = data;
> -	pte_t pte = __ptep_get(ptep);
> -
> -	pte = clear_pte_bit(pte, cdata->clear_mask);
> -	pte = set_pte_bit(pte, cdata->set_mask);
> -
> -	__set_pte(ptep, pte);
> -	return 0;
> -}
> -
> -/*
> - * This function assumes that the range is mapped with PAGE_SIZE pages.
> - */
> -static int __change_memory_common(unsigned long start, unsigned long size,
> +static int ___change_memory_common(unsigned long start, unsigned long size,
>  				   pgprot_t set_mask, pgprot_t clear_mask)

I wouldn't presume to comment on conventions in arm64 arch code, but that's
a lot of underscores :P

I wonder if this is the best name for it as you're not only invoking it
from __change_memory_common()?

And yes this is a pedantic comment, I realise :)

>  {
>  	struct page_change_data data;
> @@ -61,9 +140,28 @@ static int __change_memory_common(unsigned long start, unsigned long size,
>  	data.set_mask = set_mask;
>  	data.clear_mask = clear_mask;
>
> -	ret = apply_to_page_range(&init_mm, start, size, change_page_range,
> -				  &data);
> +	arch_enter_lazy_mmu_mode();
> +
> +	/*
> +	 * The caller must ensure that the range we are operating on does not
> +	 * partially overlap a block mapping. Any such case should either not
> +	 * exist, or must be eliminated by splitting the mapping - which for
> +	 * kernel mappings can be done only on BBML2 systems.
> +	 *

EVEN MORE RIDICULOUS NIT: extra line in comment here!

> +	 */

OK so this probably answers the comment above re: the warnings.

> +	ret = walk_kernel_page_table_range_lockless(start, start + size,
> +						    &pageattr_ops, NULL, &data);
> +	arch_leave_lazy_mmu_mode();
> +
> +	return ret;
> +}
>
> +static int __change_memory_common(unsigned long start, unsigned long size,
> +				  pgprot_t set_mask, pgprot_t clear_mask)
> +{
> +	int ret;
> +
> +	ret = ___change_memory_common(start, size, set_mask, clear_mask);
>  	/*
>  	 * If the memory is being made valid without changing any other bits
>  	 * then a TLBI isn't required as a non-valid entry cannot be cached in
> @@ -71,6 +169,7 @@ static int __change_memory_common(unsigned long start, unsigned long size,
>  	 */
>  	if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
>  		flush_tlb_kernel_range(start, start + size);
> +
>  	return ret;
>  }
>
> @@ -174,32 +273,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
>
>  int set_direct_map_invalid_noflush(struct page *page)
>  {
> -	struct page_change_data data = {
> -		.set_mask = __pgprot(0),
> -		.clear_mask = __pgprot(PTE_VALID),
> -	};
> +	pgprot_t clear_mask = __pgprot(PTE_VALID);
> +	pgprot_t set_mask = __pgprot(0);
>
>  	if (!can_set_direct_map())
>  		return 0;
>
> -	return apply_to_page_range(&init_mm,
> -				   (unsigned long)page_address(page),
> -				   PAGE_SIZE, change_page_range, &data);
> +	return ___change_memory_common((unsigned long)page_address(page),
> +				       PAGE_SIZE, set_mask, clear_mask);
>  }
>
>  int set_direct_map_default_noflush(struct page *page)
>  {
> -	struct page_change_data data = {
> -		.set_mask = __pgprot(PTE_VALID | PTE_WRITE),
> -		.clear_mask = __pgprot(PTE_RDONLY),
> -	};
> +	pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
> +	pgprot_t clear_mask = __pgprot(PTE_RDONLY);
>
>  	if (!can_set_direct_map())
>  		return 0;
>
> -	return apply_to_page_range(&init_mm,
> -				   (unsigned long)page_address(page),
> -				   PAGE_SIZE, change_page_range, &data);
> +	return ___change_memory_common((unsigned long)page_address(page),
> +				       PAGE_SIZE, set_mask, clear_mask);
>  }
>
>  static int __set_memory_enc_dec(unsigned long addr,
> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
> index 8ac2f6d6d2a3..79ab8c754dff 100644
> --- a/include/linux/pagewalk.h
> +++ b/include/linux/pagewalk.h
> @@ -132,6 +132,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
>  int walk_kernel_page_table_range(unsigned long start,
>  		unsigned long end, const struct mm_walk_ops *ops,
>  		pgd_t *pgd, void *private);
> +int walk_kernel_page_table_range_lockless(unsigned long start,
> +		unsigned long end, const struct mm_walk_ops *ops,
> +		pgd_t *pgd, void *private);
>  int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
>  		unsigned long end, const struct mm_walk_ops *ops,
>  		void *private);
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index ff5299eca687..7446984b2154 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -632,6 +632,32 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
>  	return walk_pgd_range(start, end, &walk);
>  }
>
> +/*
> + * Use this function to walk the kernel page tables locklessly. It should be
> + * guaranteed that the caller has exclusive access over the range they are
> + * operating on - that there should be no concurrent access, for example,
> + * changing permissions for vmalloc objects.
> + */
> +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
> +		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)

Don't really see the point in having a pgd parameter?

> +{
> +	struct mm_struct *mm = &init_mm;

No real need to split out mm here, just reference &init_mm direct below.

> +	struct mm_walk walk = {
> +		.ops		= ops,
> +		.mm		= mm,
> +		.pgd		= pgd,
> +		.private	= private,
> +		.no_vma		= true
> +	};
> +
> +	if (start >= end)
> +		return -EINVAL;
> +	if (!check_ops_valid(ops))
> +		return -EINVAL;
> +
> +	return walk_pgd_range(start, end, &walk);
> +}
> +
>  /**
>   * walk_page_range_debug - walk a range of pagetables not backed by a vma
>   * @mm: mm_struct representing the target process of page table walk
> --
> 2.30.2
>

Other than nits, etc. this looks fine. Thanks for the refactorings!
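
For illustration, a minimal sketch of what the helper might look like with both
nits folded in (pgd parameter dropped, &init_mm referenced directly). This is
not the posted patch; it assumes the existing check_ops_valid() and
walk_pgd_range() internals of mm/pagewalk.c as used above:

int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= &init_mm,	/* kernel page tables only */
		.private	= private,
		.no_vma		= true,		/* no VMAs back this range */
	};

	/* No mmap_lock is taken: the caller guarantees exclusive access. */
	if (start >= end)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	return walk_pgd_range(start, end, &walk);
}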
On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> > -/*
> > - * This function assumes that the range is mapped with PAGE_SIZE pages.
> > - */
> > -static int __change_memory_common(unsigned long start, unsigned long size,
> > +static int ___change_memory_common(unsigned long start, unsigned long size,
> >  				   pgprot_t set_mask, pgprot_t clear_mask)
>
> I wouldn't presume to comment on conventions in arm64 arch code, but that's
> a lot of underscores :P
>
> I wonder if this is the best name for it as you're not only invoking it
> from __change_memory_common()?

Could update_range_pgport() work?

> And yes this is a pedantic comment, I realise :)
>

--
Sincerely yours,
Mike.
On 15/06/2025 08:25, Mike Rapoport wrote:
> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>> -/*
>>> - * This function assumes that the range is mapped with PAGE_SIZE pages.
>>> - */
>>> -static int __change_memory_common(unsigned long start, unsigned long size,
>>> +static int ___change_memory_common(unsigned long start, unsigned long size,
>>>  				   pgprot_t set_mask, pgprot_t clear_mask)
>>
>> I wouldn't presume to comment on conventions in arm64 arch code, but that's
>> a lot of underscores :P
>>
>> I wonder if this is the best name for it as you're not only invoking it
>> from __change_memory_common()?
>
> Could update_range_pgport() work?

Sounds good to me!

>
>> And yes this is a pedantic comment, I realise :)
>>
>
On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
> > +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
> > +			return -EINVAL;
>
> I guess the point here is to assert that the searched range _entirely
> spans_ the folio that the higher order leaf page table entry describes.
>
> I'm guessing this is desired.
>
> But I'm not sure this should be a warning?
>
> What if you happen to walk a range that isn't aligned like this?

My understanding is that the caller must ensure that addr is
pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
the WARN_ON_ONCE() is needed.

--
~karim
On 14/06/25 8:20 pm, Karim Manaouil wrote:
> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
>>> +			return -EINVAL;
>> I guess the point here is to assert that the searched range _entirely
>> spans_ the folio that the higher order leaf page table entry describes.
>>
>> I'm guessing this is desired.
>>
>> But I'm not sure this should be a warning?
>>
>> What if you happen to walk a range that isn't aligned like this?
> My understanding is that the caller must ensure that addr is
> pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
> the WARN_ON_ONCE() is needed.

I don't really have a strong opinion on this. Ryan may be better placed
to answer.

>
On 19/06/2025 05:03, Dev Jain wrote:
>
> On 14/06/25 8:20 pm, Karim Manaouil wrote:
>> On Fri, Jun 13, 2025 at 05:27:27PM +0100, Lorenzo Stoakes wrote:
>>> On Fri, Jun 13, 2025 at 07:13:51PM +0530, Dev Jain wrote:
>>>> +		if (WARN_ON_ONCE((next - addr) != PGDIR_SIZE))
>>>> +			return -EINVAL;
>>> I guess the point here is to assert that the searched range _entirely
>>> spans_ the folio that the higher order leaf page table entry describes.
>>>
>>> I'm guessing this is desired.
>>>
>>> But I'm not sure this should be a warning?
>>>
>>> What if you happen to walk a range that isn't aligned like this?
>> My understanding is that the caller must ensure that addr is
>> pud/pmd/pte-aligned. But, imho, since -EINVAL is returned, I don't think
>> the WARN_ON_ONCE() is needed.
>
> I don't really have a strong opinion on this. Ryan may be better placed
> to answer.

IMHO it's a pretty serious programming cock-up if we ever find this
situation, so I'd prefer to emit the warning.

apply_to_page_range() (which we are replacing here) emits WARN_ON_ONCE()
if it arrives at any non-page leaf mappings, which is stronger than what
we have here; it expects only to modify permissions on regions that are
pte-mapped. Here we are relaxing that requirement to only require that
the region begin and end on a leaf mapping boundary, where the leaf can
be at any level. So for now, we still only expect this code to get called
for regions that are fully pte-mapped.

This series is an enabler to allow us to change that in future though.
Yang Shi is working on a series which will ensure that a region that we
want to change permissions for has its start and end on a leaf boundary
by dynamically splitting the leaf mappings as needed (which can be done
safely on arm64 when FEAT_BBM level 2 is supported). This will then open
up the door to mapping the linear map and vmalloc with large leaf
mappings by default. But due to the splitting we ensure never to trigger
the warning; if we do, that is a bug.

Thanks,
Ryan
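
To make the boundary condition concrete: when the walker reaches a leaf entry,
addr is the current walk position and next is clamped to whichever comes first,
the end of the requested range or the next block boundary, so (next - addr)
equals the block size only when the range covers the whole block. A
hypothetical helper (not in the patch; the callbacks open-code the check) would
read:

/*
 * Hypothetical restatement of the check in the pageattr_*_entry()
 * callbacks: true iff the current walk window [addr, next) spans the
 * whole leaf block being visited. A partial overlap would require
 * splitting the live mapping, which arm64 cannot do without FEAT_BBM
 * level 2, hence the WARN_ON_ONCE() plus -EINVAL in the callbacks.
 */
static inline bool walk_covers_whole_block(unsigned long addr,
					   unsigned long next,
					   unsigned long block_size)
{
	return (next - addr) == block_size;
}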