[PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings

Posted by Ryan Roberts 11 months, 1 week ago
Because the kernel can't tolerate page faults for kernel mappings, when
setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
dsb(ishst) to ensure that the store to the pgtable is observed by the
table walker immediately. Additionally it emits an isb() to ensure that
any already speculatively determined invalid mapping fault gets
canceled.

We can improve the performance of vmalloc operations by batching these
barriers until the end of a set of entry updates.
arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() provide the
required hooks.

vmalloc improves by up to 30% as a result.

Two new TIF_ flags are created; TIF_LAZY_MMU tells us if the task is in
the lazy mode and can therefore defer any barriers until exit from the
lazy mode. TIF_LAZY_MMU_PENDING is used to remember if any pte operation
was performed while in the lazy mode that required barriers. Then when
leaving lazy mode, if that flag is set, we emit the barriers.

Since arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() are used
for both user and kernel mappings, we need the second flag to avoid
emitting barriers unnecessarily if only user mappings were updated.
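
As an illustrative sketch of the intended usage pattern (the function name
below is made up for the example; the real users are the vmalloc pgtable
update paths, which go through the lazy mmu hooks):

static void example_map_kernel_range(pte_t *ptep, unsigned long pfn,
				     int nr, pgprot_t prot)
{
	int i;

	arch_enter_lazy_mmu_mode();			/* sets TIF_LAZY_MMU */

	for (i = 0; i < nr; i++, ptep++, pfn++)
		__set_pte(ptep, pfn_pte(pfn, prot));	/* barriers deferred */

	/* Emits one dsb(ishst); isb() pair, iff any barrier was queued. */
	arch_leave_lazy_mmu_mode();
}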

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 arch/arm64/include/asm/pgtable.h     | 73 ++++++++++++++++++++++------
 arch/arm64/include/asm/thread_info.h |  2 +
 arch/arm64/kernel/process.c          |  9 ++--
 3 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1898c3069c43..149df945c1ab 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -40,6 +40,55 @@
 #include <linux/sched.h>
 #include <linux/page_table_check.h>
 
+static inline void emit_pte_barriers(void)
+{
+	/*
+	 * These barriers are emitted under certain conditions after a pte entry
+	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
+	 * visible to the table walker. The isb ensures that any previous
+	 * speculative "invalid translation" marker that is in the CPU's
+	 * pipeline gets cleared, so that any access to that address after
+	 * setting the pte to valid won't cause a spurious fault. If the thread
+	 * gets preempted after storing to the pgtable but before emitting these
+	 * barriers, __switch_to() emits a dsb which ensures the walker gets to
+	 * see the store. There is no guarantee of an isb being issued though.
+	 * This is safe because it will still get issued (albeit on a
+	 * potentially different CPU) when the thread starts running again,
+	 * before any access to the address.
+	 */
+	dsb(ishst);
+	isb();
+}
+
+static inline void queue_pte_barriers(void)
+{
+	if (test_thread_flag(TIF_LAZY_MMU))
+		set_thread_flag(TIF_LAZY_MMU_PENDING);
+	else
+		emit_pte_barriers();
+}
+
+#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+	VM_WARN_ON(in_interrupt());
+	VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
+
+	set_thread_flag(TIF_LAZY_MMU);
+}
+
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
+		emit_pte_barriers();
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+	arch_flush_lazy_mmu_mode();
+	clear_thread_flag(TIF_LAZY_MMU);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 
@@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
 	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
 	 * has the necessary barriers.
 	 */
-	if (pte_valid_not_user(pte)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pte_valid_not_user(pte))
+		queue_pte_barriers();
 }
 
 static inline void __set_pte(pte_t *ptep, pte_t pte)
@@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 
 	WRITE_ONCE(*pmdp, pmd);
 
-	if (pmd_valid(pmd)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pmd_valid(pmd))
+		queue_pte_barriers();
 }
 
 static inline void pmd_clear(pmd_t *pmdp)
@@ -845,10 +890,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
 
 	WRITE_ONCE(*pudp, pud);
 
-	if (pud_valid(pud)) {
-		dsb(ishst);
-		isb();
-	}
+	if (pud_valid(pud))
+		queue_pte_barriers();
 }
 
 static inline void pud_clear(pud_t *pudp)
@@ -925,8 +968,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 	}
 
 	WRITE_ONCE(*p4dp, p4d);
-	dsb(ishst);
-	isb();
+	queue_pte_barriers();
 }
 
 static inline void p4d_clear(p4d_t *p4dp)
@@ -1052,8 +1094,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 	}
 
 	WRITE_ONCE(*pgdp, pgd);
-	dsb(ishst);
-	isb();
+	queue_pte_barriers();
 }
 
 static inline void pgd_clear(pgd_t *pgdp)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 1114c1c3300a..1fdd74b7b831 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
 #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
 #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
 #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
+#define TIF_LAZY_MMU		31	/* Task in lazy mmu mode */
+#define TIF_LAZY_MMU_PENDING	32	/* Ops pending for lazy mmu mode exit */
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 42faebb7b712..45a55fe81788 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -680,10 +680,11 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	gcs_thread_switch(next);
 
 	/*
-	 * Complete any pending TLB or cache maintenance on this CPU in case
-	 * the thread migrates to a different CPU.
-	 * This full barrier is also required by the membarrier system
-	 * call.
+	 * Complete any pending TLB or cache maintenance on this CPU in case the
+	 * thread migrates to a different CPU. This full barrier is also
+	 * required by the membarrier system call. Additionally it makes any
+	 * in-progress pgtable writes visible to the table walker; See
+	 * emit_pte_barriers().
 	 */
 	dsb(ish);
 
-- 
2.43.0
Re: [PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings
Posted by Catalin Marinas 10 months ago
On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 1898c3069c43..149df945c1ab 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -40,6 +40,55 @@
>  #include <linux/sched.h>
>  #include <linux/page_table_check.h>
>  
> +static inline void emit_pte_barriers(void)
> +{
> +	/*
> +	 * These barriers are emitted under certain conditions after a pte entry
> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> +	 * visible to the table walker. The isb ensures that any previous
> +	 * speculative "invalid translation" marker that is in the CPU's
> +	 * pipeline gets cleared, so that any access to that address after
> +	 * setting the pte to valid won't cause a spurious fault. If the thread
> +	 * gets preempted after storing to the pgtable but before emitting these
> +	 * barriers, __switch_to() emits a dsb which ensure the walker gets to
> +	 * see the store. There is no guarrantee of an isb being issued though.
> +	 * This is safe because it will still get issued (albeit on a
> +	 * potentially different CPU) when the thread starts running again,
> +	 * before any access to the address.
> +	 */
> +	dsb(ishst);
> +	isb();
> +}
> +
> +static inline void queue_pte_barriers(void)
> +{
> +	if (test_thread_flag(TIF_LAZY_MMU))
> +		set_thread_flag(TIF_LAZY_MMU_PENDING);

As we can have lots of calls here, it might be slightly cheaper to test
TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.

I haven't checked - does the compiler generate multiple mrs from sp_el0
for subsequent test_thread_flag()?

> +	else
> +		emit_pte_barriers();
> +}
> +
> +#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +static inline void arch_enter_lazy_mmu_mode(void)
> +{
> +	VM_WARN_ON(in_interrupt());
> +	VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
> +
> +	set_thread_flag(TIF_LAZY_MMU);
> +}
> +
> +static inline void arch_flush_lazy_mmu_mode(void)
> +{
> +	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
> +		emit_pte_barriers();
> +}
> +
> +static inline void arch_leave_lazy_mmu_mode(void)
> +{
> +	arch_flush_lazy_mmu_mode();
> +	clear_thread_flag(TIF_LAZY_MMU);
> +}
> +
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>  
> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>  	 * has the necessary barriers.
>  	 */
> -	if (pte_valid_not_user(pte)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pte_valid_not_user(pte))
> +		queue_pte_barriers();
>  }

I think this scheme works; I couldn't find a counter-example unless
__set_pte() gets called in an interrupt context. You could add
VM_WARN_ON(in_interrupt()) in queue_pte_barriers() as well.
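
Something like this, just as a sketch:

static inline void queue_pte_barriers(void)
{
	/* As above: this path is not expected to run in interrupt context */
	VM_WARN_ON(in_interrupt());

	if (test_thread_flag(TIF_LAZY_MMU))
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}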

With preemption, the newly mapped range shouldn't be used before
arch_flush_lazy_mmu_mode() is called, so it looks safe as well. I think
x86 uses a per-CPU variable to track this but per-thread is easier to
reason about if there's no nesting.

>  static inline void __set_pte(pte_t *ptep, pte_t pte)
> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>  
>  	WRITE_ONCE(*pmdp, pmd);
>  
> -	if (pmd_valid(pmd)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pmd_valid(pmd))
> +		queue_pte_barriers();
>  }

We discussed on a previous series - for pmd/pud we end up with barriers
even for user mappings but they are at a much coarser granularity (and I
wasn't keen on 'user' attributes for the table entries).

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Re: [PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings
Posted by Ryan Roberts 10 months ago
On 14/04/2025 18:38, Catalin Marinas wrote:
> On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 1898c3069c43..149df945c1ab 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -40,6 +40,55 @@
>>  #include <linux/sched.h>
>>  #include <linux/page_table_check.h>
>>  
>> +static inline void emit_pte_barriers(void)
>> +{
>> +	/*
>> +	 * These barriers are emitted under certain conditions after a pte entry
>> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
>> +	 * visible to the table walker. The isb ensures that any previous
>> +	 * speculative "invalid translation" marker that is in the CPU's
>> +	 * pipeline gets cleared, so that any access to that address after
>> +	 * setting the pte to valid won't cause a spurious fault. If the thread
>> +	 * gets preempted after storing to the pgtable but before emitting these
>> +	 * barriers, __switch_to() emits a dsb which ensure the walker gets to
>> +	 * see the store. There is no guarrantee of an isb being issued though.
>> +	 * This is safe because it will still get issued (albeit on a
>> +	 * potentially different CPU) when the thread starts running again,
>> +	 * before any access to the address.
>> +	 */
>> +	dsb(ishst);
>> +	isb();
>> +}
>> +
>> +static inline void queue_pte_barriers(void)
>> +{
>> +	if (test_thread_flag(TIF_LAZY_MMU))
>> +		set_thread_flag(TIF_LAZY_MMU_PENDING);
> 
> As we can have lots of calls here, it might be slightly cheaper to test
> TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.

Yes, good point.

> 
> I haven't checked - does the compiler generate multiple mrs from sp_el0
> for subsequent test_thread_flag()?

It emits a single mrs but it loads from the pointer twice. I think v3 is the version we want?


void TEST_queue_pte_barriers_v1(void)
{
	if (test_thread_flag(TIF_LAZY_MMU))
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}

void TEST_queue_pte_barriers_v2(void)
{
	if (test_thread_flag(TIF_LAZY_MMU) &&
	    !test_thread_flag(TIF_LAZY_MMU_PENDING))
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}

void TEST_queue_pte_barriers_v3(void)
{
	unsigned long flags = read_thread_flags();

	if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}


000000000000101c <TEST_queue_pte_barriers_v1>:
    101c:	d5384100 	mrs	x0, sp_el0
    1020:	f9400001 	ldr	x1, [x0]
    1024:	37f80081 	tbnz	w1, #31, 1034 <TEST_queue_pte_barriers_v1+0x18>
    1028:	d5033a9f 	dsb	ishst
    102c:	d5033fdf 	isb
    1030:	d65f03c0 	ret
    1034:	14000004 	b	1044 <TEST_queue_pte_barriers_v1+0x28>
    1038:	d2c00021 	mov	x1, #0x100000000           	// #4294967296
    103c:	f821301f 	stset	x1, [x0]
    1040:	d65f03c0 	ret
    1044:	f9800011 	prfm	pstl1strm, [x0]
    1048:	c85f7c01 	ldxr	x1, [x0]
    104c:	b2600021 	orr	x1, x1, #0x100000000
    1050:	c8027c01 	stxr	w2, x1, [x0]
    1054:	35ffffa2 	cbnz	w2, 1048 <TEST_queue_pte_barriers_v1+0x2c>
    1058:	d65f03c0 	ret

000000000000105c <TEST_queue_pte_barriers_v2>:
    105c:	d5384100 	mrs	x0, sp_el0
    1060:	f9400001 	ldr	x1, [x0]
    1064:	37f80081 	tbnz	w1, #31, 1074 <TEST_queue_pte_barriers_v2+0x18>
    1068:	d5033a9f 	dsb	ishst
    106c:	d5033fdf 	isb
    1070:	d65f03c0 	ret
    1074:	f9400001 	ldr	x1, [x0]
    1078:	b707ff81 	tbnz	x1, #32, 1068 <TEST_queue_pte_barriers_v2+0xc>
    107c:	14000004 	b	108c <TEST_queue_pte_barriers_v2+0x30>
    1080:	d2c00021 	mov	x1, #0x100000000           	// #4294967296
    1084:	f821301f 	stset	x1, [x0]
    1088:	d65f03c0 	ret
    108c:	f9800011 	prfm	pstl1strm, [x0]
    1090:	c85f7c01 	ldxr	x1, [x0]
    1094:	b2600021 	orr	x1, x1, #0x100000000
    1098:	c8027c01 	stxr	w2, x1, [x0]
    109c:	35ffffa2 	cbnz	w2, 1090 <TEST_queue_pte_barriers_v2+0x34>
    10a0:	d65f03c0 	ret

00000000000010a4 <TEST_queue_pte_barriers_v3>:
    10a4:	d5384101 	mrs	x1, sp_el0
    10a8:	f9400020 	ldr	x0, [x1]
    10ac:	d2b00002 	mov	x2, #0x80000000            	// #2147483648
    10b0:	92610400 	and	x0, x0, #0x180000000
    10b4:	eb02001f 	cmp	x0, x2
    10b8:	54000080 	b.eq	10c8 <TEST_queue_pte_barriers_v3+0x24>  // b.none
    10bc:	d5033a9f 	dsb	ishst
    10c0:	d5033fdf 	isb
    10c4:	d65f03c0 	ret
    10c8:	14000004 	b	10d8 <TEST_queue_pte_barriers_v3+0x34>
    10cc:	d2c00020 	mov	x0, #0x100000000           	// #4294967296
    10d0:	f820303f 	stset	x0, [x1]
    10d4:	d65f03c0 	ret
    10d8:	f9800031 	prfm	pstl1strm, [x1]
    10dc:	c85f7c20 	ldxr	x0, [x1]
    10e0:	b2600000 	orr	x0, x0, #0x100000000
    10e4:	c8027c20 	stxr	w2, x0, [x1]
    10e8:	35ffffa2 	cbnz	w2, 10dc <TEST_queue_pte_barriers_v3+0x38>
    10ec:	d65f03c0 	ret



> 
>> +	else
>> +		emit_pte_barriers();
>> +}
>> +
>> +#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
>> +static inline void arch_enter_lazy_mmu_mode(void)
>> +{
>> +	VM_WARN_ON(in_interrupt());
>> +	VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
>> +
>> +	set_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>> +static inline void arch_flush_lazy_mmu_mode(void)
>> +{
>> +	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
>> +		emit_pte_barriers();
>> +}
>> +
>> +static inline void arch_leave_lazy_mmu_mode(void)
>> +{
>> +	arch_flush_lazy_mmu_mode();
>> +	clear_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>  
>> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>  	 * has the necessary barriers.
>>  	 */
>> -	if (pte_valid_not_user(pte)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pte_valid_not_user(pte))
>> +		queue_pte_barriers();
>>  }
> 
> I think this scheme works, I couldn't find a counter-example unless
> __set_pte() gets called in an interrupt context. You could add
> VM_WARN_ON(in_interrupt()) in queue_pte_barriers() as well.
> 
> With preemption, the newly mapped range shouldn't be used before
> arch_flush_lazy_mmu_mode() is called, so it looks safe as well. I think
> x86 uses a per-CPU variable to track this but per-thread is easier to
> reason about if there's no nesting.
> 
>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>  
>>  	WRITE_ONCE(*pmdp, pmd);
>>  
>> -	if (pmd_valid(pmd)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pmd_valid(pmd))
>> +		queue_pte_barriers();
>>  }
> 
> We discussed on a previous series - for pmd/pud we end up with barriers
> even for user mappings but they are at a much coarser granularity (and I
> wasn't keen on 'user' attributes for the table entries).
> 
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!

Ryan
Re: [PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings
Posted by Catalin Marinas 10 months ago
On Mon, Apr 14, 2025 at 07:28:46PM +0100, Ryan Roberts wrote:
> On 14/04/2025 18:38, Catalin Marinas wrote:
> > On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
> >> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> >> index 1898c3069c43..149df945c1ab 100644
> >> --- a/arch/arm64/include/asm/pgtable.h
> >> +++ b/arch/arm64/include/asm/pgtable.h
> >> @@ -40,6 +40,55 @@
> >>  #include <linux/sched.h>
> >>  #include <linux/page_table_check.h>
> >>  
> >> +static inline void emit_pte_barriers(void)
> >> +{
> >> +	/*
> >> +	 * These barriers are emitted under certain conditions after a pte entry
> >> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> >> +	 * visible to the table walker. The isb ensures that any previous
> >> +	 * speculative "invalid translation" marker that is in the CPU's
> >> +	 * pipeline gets cleared, so that any access to that address after
> >> +	 * setting the pte to valid won't cause a spurious fault. If the thread
> >> +	 * gets preempted after storing to the pgtable but before emitting these
> >> +	 * barriers, __switch_to() emits a dsb which ensure the walker gets to
> >> +	 * see the store. There is no guarrantee of an isb being issued though.
> >> +	 * This is safe because it will still get issued (albeit on a
> >> +	 * potentially different CPU) when the thread starts running again,
> >> +	 * before any access to the address.
> >> +	 */
> >> +	dsb(ishst);
> >> +	isb();
> >> +}
> >> +
> >> +static inline void queue_pte_barriers(void)
> >> +{
> >> +	if (test_thread_flag(TIF_LAZY_MMU))
> >> +		set_thread_flag(TIF_LAZY_MMU_PENDING);
> > 
> > As we can have lots of calls here, it might be slightly cheaper to test
> > TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
> 
> Yes, good point.
> 
> > I haven't checked - does the compiler generate multiple mrs from sp_el0
> > for subsequent test_thread_flag()?
> 
> It emits a single mrs but it loads from the pointer twice.

It's not that bad if we only do the set_thread_flag() once.

> I think v3 is the version we want?
> 
> 
> void TEST_queue_pte_barriers_v1(void)
> {
> 	if (test_thread_flag(TIF_LAZY_MMU))
> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
> 	else
> 		emit_pte_barriers();
> }
> 
> void TEST_queue_pte_barriers_v2(void)
> {
> 	if (test_thread_flag(TIF_LAZY_MMU) &&
> 	    !test_thread_flag(TIF_LAZY_MMU_PENDING))
> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
> 	else
> 		emit_pte_barriers();
> }
> 
> void TEST_queue_pte_barriers_v3(void)
> {
> 	unsigned long flags = read_thread_flags();
> 
> 	if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
> 	else
> 		emit_pte_barriers();
> }

Doesn't v3 emit barriers once _TIF_LAZY_MMU_PENDING has been set? We
need something like:

	if (flags & _TIF_LAZY_MMU) {
		if (!(flags & _TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}

-- 
Catalin
Re: [PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings
Posted by Ryan Roberts 9 months, 4 weeks ago
On 15/04/2025 11:51, Catalin Marinas wrote:
> On Mon, Apr 14, 2025 at 07:28:46PM +0100, Ryan Roberts wrote:
>> On 14/04/2025 18:38, Catalin Marinas wrote:
>>> On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
>>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>>> index 1898c3069c43..149df945c1ab 100644
>>>> --- a/arch/arm64/include/asm/pgtable.h
>>>> +++ b/arch/arm64/include/asm/pgtable.h
>>>> @@ -40,6 +40,55 @@
>>>>  #include <linux/sched.h>
>>>>  #include <linux/page_table_check.h>
>>>>  
>>>> +static inline void emit_pte_barriers(void)
>>>> +{
>>>> +	/*
>>>> +	 * These barriers are emitted under certain conditions after a pte entry
>>>> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
>>>> +	 * visible to the table walker. The isb ensures that any previous
>>>> +	 * speculative "invalid translation" marker that is in the CPU's
>>>> +	 * pipeline gets cleared, so that any access to that address after
>>>> +	 * setting the pte to valid won't cause a spurious fault. If the thread
>>>> +	 * gets preempted after storing to the pgtable but before emitting these
>>>> +	 * barriers, __switch_to() emits a dsb which ensure the walker gets to
>>>> +	 * see the store. There is no guarrantee of an isb being issued though.
>>>> +	 * This is safe because it will still get issued (albeit on a
>>>> +	 * potentially different CPU) when the thread starts running again,
>>>> +	 * before any access to the address.
>>>> +	 */
>>>> +	dsb(ishst);
>>>> +	isb();
>>>> +}
>>>> +
>>>> +static inline void queue_pte_barriers(void)
>>>> +{
>>>> +	if (test_thread_flag(TIF_LAZY_MMU))
>>>> +		set_thread_flag(TIF_LAZY_MMU_PENDING);
>>>
>>> As we can have lots of calls here, it might be slightly cheaper to test
>>> TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
>>
>> Yes, good point.
>>
>>> I haven't checked - does the compiler generate multiple mrs from sp_el0
>>> for subsequent test_thread_flag()?
>>
>> It emits a single mrs but it loads from the pointer twice.
> 
> It's not that bad if only do the set_thread_flag() once.
> 
>> I think v3 is the version we want?
>>
>>
>> void TEST_queue_pte_barriers_v1(void)
>> {
>> 	if (test_thread_flag(TIF_LAZY_MMU))
>> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
>> 	else
>> 		emit_pte_barriers();
>> }
>>
>> void TEST_queue_pte_barriers_v2(void)
>> {
>> 	if (test_thread_flag(TIF_LAZY_MMU) &&
>> 	    !test_thread_flag(TIF_LAZY_MMU_PENDING))
>> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
>> 	else
>> 		emit_pte_barriers();
>> }
>>
>> void TEST_queue_pte_barriers_v3(void)
>> {
>> 	unsigned long flags = read_thread_flags();
>>
>> 	if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
>> 		set_thread_flag(TIF_LAZY_MMU_PENDING);
>> 	else
>> 		emit_pte_barriers();
>> }
> 
> Doesn't v3 emit barriers once _TIF_LAZY_MMU_PENDING has been set? We
> need something like:
> 
> 	if (flags & _TIF_LAZY_MMU) {
> 		if (!(flags & _TIF_LAZY_MMU_PENDING))
> 			set_thread_flag(TIF_LAZY_MMU_PENDING);
> 	} else {
> 		emit_pte_barriers();
> 	}

Gah, yeah sorry, going too quickly. v2 is also logically incorrect.

Fixed versions:

void TEST_queue_pte_barriers_v2(void)
{
	if (test_thread_flag(TIF_LAZY_MMU)) {
		if (!test_thread_flag(TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}

void TEST_queue_pte_barriers_v3(void)
{
	unsigned long flags = read_thread_flags();

	if (flags & BIT(TIF_LAZY_MMU)) {
		if (!(flags & BIT(TIF_LAZY_MMU_PENDING)))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}

000000000000105c <TEST_queue_pte_barriers_v2>:
    105c:	d5384100 	mrs	x0, sp_el0
    1060:	f9400001 	ldr	x1, [x0]
    1064:	37f80081 	tbnz	w1, #31, 1074 <TEST_queue_pte_barriers_v2+0x18>
    1068:	d5033a9f 	dsb	ishst
    106c:	d5033fdf 	isb
    1070:	d65f03c0 	ret
    1074:	f9400001 	ldr	x1, [x0]
    1078:	b707ffc1 	tbnz	x1, #32, 1070 <TEST_queue_pte_barriers_v2+0x14>
    107c:	14000004 	b	108c <TEST_queue_pte_barriers_v2+0x30>
    1080:	d2c00021 	mov	x1, #0x100000000           	// #4294967296
    1084:	f821301f 	stset	x1, [x0]
    1088:	d65f03c0 	ret
    108c:	f9800011 	prfm	pstl1strm, [x0]
    1090:	c85f7c01 	ldxr	x1, [x0]
    1094:	b2600021 	orr	x1, x1, #0x100000000
    1098:	c8027c01 	stxr	w2, x1, [x0]
    109c:	35ffffa2 	cbnz	w2, 1090 <TEST_queue_pte_barriers_v2+0x34>
    10a0:	d65f03c0 	ret

00000000000010a4 <TEST_queue_pte_barriers_v3>:
    10a4:	d5384101 	mrs	x1, sp_el0
    10a8:	f9400020 	ldr	x0, [x1]
    10ac:	36f80060 	tbz	w0, #31, 10b8 <TEST_queue_pte_barriers_v3+0x14>
    10b0:	b60000a0 	tbz	x0, #32, 10c4 <TEST_queue_pte_barriers_v3+0x20>
    10b4:	d65f03c0 	ret
    10b8:	d5033a9f 	dsb	ishst
    10bc:	d5033fdf 	isb
    10c0:	d65f03c0 	ret
    10c4:	14000004 	b	10d4 <TEST_queue_pte_barriers_v3+0x30>
    10c8:	d2c00020 	mov	x0, #0x100000000           	// #4294967296
    10cc:	f820303f 	stset	x0, [x1]
    10d0:	d65f03c0 	ret
    10d4:	f9800031 	prfm	pstl1strm, [x1]
    10d8:	c85f7c20 	ldxr	x0, [x1]
    10dc:	b2600000 	orr	x0, x0, #0x100000000
    10e0:	c8027c20 	stxr	w2, x0, [x1]
    10e4:	35ffffa2 	cbnz	w2, 10d8 <TEST_queue_pte_barriers_v3+0x34>
    10e8:	d65f03c0 	ret

So v3 is the way to go, I think; it's a single mrs and a single ldr.
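
i.e. queue_pte_barriers() would end up roughly like this (sketch of what I
plan to post):

static inline void queue_pte_barriers(void)
{
	unsigned long flags = read_thread_flags();

	if (flags & BIT(TIF_LAZY_MMU)) {
		/* Avoid the atomic op if the pending bit is already set */
		if (!(flags & BIT(TIF_LAZY_MMU_PENDING)))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}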

I'll get this fixed up and posted early next week.

Thanks,
Ryan
Re: [PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings
Posted by Anshuman Khandual 10 months, 1 week ago
On 3/4/25 20:34, Ryan Roberts wrote:
> Because the kernel can't tolerate page faults for kernel mappings, when
> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
> dsb(ishst) to ensure that the store to the pgtable is observed by the
> table walker immediately. Additionally it emits an isb() to ensure that
> any already speculatively determined invalid mapping fault gets
> canceled.
> 
> We can improve the performance of vmalloc operations by batching these
> barriers until the end of a set of entry updates.
> arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() provide the
> required hooks.
> 
> vmalloc improves by up to 30% as a result.
> 
> Two new TIF_ flags are created; TIF_LAZY_MMU tells us if the task is in
> the lazy mode and can therefore defer any barriers until exit from the
> lazy mode. TIF_LAZY_MMU_PENDING is used to remember if any pte operation
> was performed while in the lazy mode that required barriers. Then when
> leaving lazy mode, if that flag is set, we emit the barriers.
> 
> Since arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() are used
> for both user and kernel mappings, we need the second flag to avoid
> emitting barriers unnecessarily if only user mappings were updated.

Agreed, and hence an additional TIF flag, i.e. TIF_LAZY_MMU_PENDING, can be
justified.

> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  arch/arm64/include/asm/pgtable.h     | 73 ++++++++++++++++++++++------
>  arch/arm64/include/asm/thread_info.h |  2 +
>  arch/arm64/kernel/process.c          |  9 ++--
>  3 files changed, 64 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 1898c3069c43..149df945c1ab 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -40,6 +40,55 @@
>  #include <linux/sched.h>
>  #include <linux/page_table_check.h>
>  
> +static inline void emit_pte_barriers(void)
> +{
> +	/*
> +	 * These barriers are emitted under certain conditions after a pte entry
> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> +	 * visible to the table walker. The isb ensures that any previous
> +	 * speculative "invalid translation" marker that is in the CPU's
> +	 * pipeline gets cleared, so that any access to that address after
> +	 * setting the pte to valid won't cause a spurious fault. If the thread
> +	 * gets preempted after storing to the pgtable but before emitting these
> +	 * barriers, __switch_to() emits a dsb which ensure the walker gets to
> +	 * see the store. There is no guarrantee of an isb being issued though.

typo					^^^^^^^^
 					
> +	 * This is safe because it will still get issued (albeit on a
> +	 * potentially different CPU) when the thread starts running again,
> +	 * before any access to the address.
> +	 */
> +	dsb(ishst);
> +	isb();
> +}
> +
> +static inline void queue_pte_barriers(void)
> +{
> +	if (test_thread_flag(TIF_LAZY_MMU))
> +		set_thread_flag(TIF_LAZY_MMU_PENDING);
> +	else
> +		emit_pte_barriers();
> +}
> +
> +#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +static inline void arch_enter_lazy_mmu_mode(void)
> +{
> +	VM_WARN_ON(in_interrupt());
> +	VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
> +
> +	set_thread_flag(TIF_LAZY_MMU);
> +}
> +
> +static inline void arch_flush_lazy_mmu_mode(void)
> +{
> +	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
> +		emit_pte_barriers();
> +}
> +
> +static inline void arch_leave_lazy_mmu_mode(void)
> +{
> +	arch_flush_lazy_mmu_mode();
> +	clear_thread_flag(TIF_LAZY_MMU);
> +}
> +
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>  
> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>  	 * has the necessary barriers.
>  	 */
> -	if (pte_valid_not_user(pte)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pte_valid_not_user(pte))
> +		queue_pte_barriers();
>  }
>  
>  static inline void __set_pte(pte_t *ptep, pte_t pte)
> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>  
>  	WRITE_ONCE(*pmdp, pmd);
>  
> -	if (pmd_valid(pmd)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pmd_valid(pmd))
> +		queue_pte_barriers();
>  }
>  
>  static inline void pmd_clear(pmd_t *pmdp)
> @@ -845,10 +890,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>  
>  	WRITE_ONCE(*pudp, pud);
>  
> -	if (pud_valid(pud)) {
> -		dsb(ishst);
> -		isb();
> -	}
> +	if (pud_valid(pud))
> +		queue_pte_barriers();
>  }
>  
>  static inline void pud_clear(pud_t *pudp)
> @@ -925,8 +968,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>  	}
>  
>  	WRITE_ONCE(*p4dp, p4d);
> -	dsb(ishst);
> -	isb();
> +	queue_pte_barriers();
>  }
>  
>  static inline void p4d_clear(p4d_t *p4dp)
> @@ -1052,8 +1094,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
>  	}
>  
>  	WRITE_ONCE(*pgdp, pgd);
> -	dsb(ishst);
> -	isb();
> +	queue_pte_barriers();
>  }
>  
>  static inline void pgd_clear(pgd_t *pgdp)
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index 1114c1c3300a..1fdd74b7b831 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
>  #define TIF_SME_VL_INHERIT	28	/* Inherit SME vl_onexec across exec */
>  #define TIF_KERNEL_FPSTATE	29	/* Task is in a kernel mode FPSIMD section */
>  #define TIF_TSC_SIGSEGV		30	/* SIGSEGV on counter-timer access */
> +#define TIF_LAZY_MMU		31	/* Task in lazy mmu mode */
> +#define TIF_LAZY_MMU_PENDING	32	/* Ops pending for lazy mmu mode exit */
>  
>  #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 42faebb7b712..45a55fe81788 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -680,10 +680,11 @@ struct task_struct *__switch_to(struct task_struct *prev,
>  	gcs_thread_switch(next);
>  
>  	/*
> -	 * Complete any pending TLB or cache maintenance on this CPU in case
> -	 * the thread migrates to a different CPU.
> -	 * This full barrier is also required by the membarrier system
> -	 * call.
> +	 * Complete any pending TLB or cache maintenance on this CPU in case the
> +	 * thread migrates to a different CPU. This full barrier is also
> +	 * required by the membarrier system call. Additionally it makes any
> +	 * in-progress pgtable writes visible to the table walker; See
> +	 * emit_pte_barriers().
>  	 */
>  	dsb(ish);
>  

Otherwise, LGTM.

I will try and think through again whether these deferred sync and flush
operations can cause subtle problems elsewhere.