Because the kernel can't tolerate page faults for kernel mappings, when
setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
dsb(ishst) to ensure that the store to the pgtable is observed by the
table walker immediately. Additionally it emits an isb() to ensure that
any fault already speculatively determined from the old, invalid mapping
gets canceled.
We can improve the performance of vmalloc operations by batching these
barriers until the end of a set of entry updates.
arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() provide the
required hooks.
vmalloc improves by up to 30% as a result.
Two new TIF_ flags are created; TIF_LAZY_MMU tells us if the task is in
the lazy mode and can therefore defer any barriers until exit from the
lazy mode. TIF_LAZY_MMU_PENDING is used to remember if any pte operation
was performed while in the lazy mode that required barriers. Then when
leaving lazy mode, if that flag is set, we emit the barriers.
Since arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() are used
for both user and kernel mappings, we need the second flag to avoid
emitting barriers unnecessarily if only user mappings were updated.
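As an illustration (not part of the diff below), a batched update from a
caller's perspective looks roughly like this sketch; the helper name and
loop are hypothetical, and advancing the pfn per entry is omitted for
brevity:

/* Hypothetical caller mapping a run of kernel PTEs, for illustration only. */
static void map_kernel_ptes_example(pte_t *ptep, pte_t pte, int nr)
{
	int i;

	arch_enter_lazy_mmu_mode();		/* sets TIF_LAZY_MMU */
	for (i = 0; i < nr; i++)
		__set_pte(ptep + i, pte);	/* dsb/isb deferred; TIF_LAZY_MMU_PENDING set */
	arch_leave_lazy_mmu_mode();		/* one dsb(ishst) + isb() if anything was pending */
}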
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
arch/arm64/include/asm/pgtable.h | 73 ++++++++++++++++++++++------
arch/arm64/include/asm/thread_info.h | 2 +
arch/arm64/kernel/process.c | 9 ++--
3 files changed, 64 insertions(+), 20 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1898c3069c43..149df945c1ab 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -40,6 +40,55 @@
#include <linux/sched.h>
#include <linux/page_table_check.h>
+static inline void emit_pte_barriers(void)
+{
+ /*
+ * These barriers are emitted under certain conditions after a pte entry
+ * was modified (see e.g. __set_pte_complete()). The dsb makes the store
+ * visible to the table walker. The isb ensures that any previous
+ * speculative "invalid translation" marker that is in the CPU's
+ * pipeline gets cleared, so that any access to that address after
+ * setting the pte to valid won't cause a spurious fault. If the thread
+ * gets preempted after storing to the pgtable but before emitting these
+ * barriers, __switch_to() emits a dsb which ensure the walker gets to
+ * see the store. There is no guarrantee of an isb being issued though.
+ * This is safe because it will still get issued (albeit on a
+ * potentially different CPU) when the thread starts running again,
+ * before any access to the address.
+ */
+ dsb(ishst);
+ isb();
+}
+
+static inline void queue_pte_barriers(void)
+{
+ if (test_thread_flag(TIF_LAZY_MMU))
+ set_thread_flag(TIF_LAZY_MMU_PENDING);
+ else
+ emit_pte_barriers();
+}
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ VM_WARN_ON(in_interrupt());
+ VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
+
+ set_thread_flag(TIF_LAZY_MMU);
+}
+
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
+ emit_pte_barriers();
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+ arch_flush_lazy_mmu_mode();
+ clear_thread_flag(TIF_LAZY_MMU);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
@@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
* Only if the new pte is valid and kernel, otherwise TLB maintenance
* has the necessary barriers.
*/
- if (pte_valid_not_user(pte)) {
- dsb(ishst);
- isb();
- }
+ if (pte_valid_not_user(pte))
+ queue_pte_barriers();
}
static inline void __set_pte(pte_t *ptep, pte_t pte)
@@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
WRITE_ONCE(*pmdp, pmd);
- if (pmd_valid(pmd)) {
- dsb(ishst);
- isb();
- }
+ if (pmd_valid(pmd))
+ queue_pte_barriers();
}
static inline void pmd_clear(pmd_t *pmdp)
@@ -845,10 +890,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
WRITE_ONCE(*pudp, pud);
- if (pud_valid(pud)) {
- dsb(ishst);
- isb();
- }
+ if (pud_valid(pud))
+ queue_pte_barriers();
}
static inline void pud_clear(pud_t *pudp)
@@ -925,8 +968,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
}
WRITE_ONCE(*p4dp, p4d);
- dsb(ishst);
- isb();
+ queue_pte_barriers();
}
static inline void p4d_clear(p4d_t *p4dp)
@@ -1052,8 +1094,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
}
WRITE_ONCE(*pgdp, pgd);
- dsb(ishst);
- isb();
+ queue_pte_barriers();
}
static inline void pgd_clear(pgd_t *pgdp)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 1114c1c3300a..1fdd74b7b831 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
#define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */
#define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */
#define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */
+#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */
+#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 42faebb7b712..45a55fe81788 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -680,10 +680,11 @@ struct task_struct *__switch_to(struct task_struct *prev,
gcs_thread_switch(next);
/*
- * Complete any pending TLB or cache maintenance on this CPU in case
- * the thread migrates to a different CPU.
- * This full barrier is also required by the membarrier system
- * call.
+ * Complete any pending TLB or cache maintenance on this CPU in case the
+ * thread migrates to a different CPU. This full barrier is also
+ * required by the membarrier system call. Additionally it makes any
+ * in-progress pgtable writes visible to the table walker; See
+ * emit_pte_barriers().
*/
dsb(ish);
--
2.43.0
On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 1898c3069c43..149df945c1ab 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -40,6 +40,55 @@
> #include <linux/sched.h>
> #include <linux/page_table_check.h>
>
> +static inline void emit_pte_barriers(void)
> +{
> + /*
> + * These barriers are emitted under certain conditions after a pte entry
> + * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> + * visible to the table walker. The isb ensures that any previous
> + * speculative "invalid translation" marker that is in the CPU's
> + * pipeline gets cleared, so that any access to that address after
> + * setting the pte to valid won't cause a spurious fault. If the thread
> + * gets preempted after storing to the pgtable but before emitting these
> + * barriers, __switch_to() emits a dsb which ensure the walker gets to
> + * see the store. There is no guarrantee of an isb being issued though.
> + * This is safe because it will still get issued (albeit on a
> + * potentially different CPU) when the thread starts running again,
> + * before any access to the address.
> + */
> + dsb(ishst);
> + isb();
> +}
> +
> +static inline void queue_pte_barriers(void)
> +{
> + if (test_thread_flag(TIF_LAZY_MMU))
> + set_thread_flag(TIF_LAZY_MMU_PENDING);
As we can have lots of calls here, it might be slightly cheaper to test
TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
I haven't checked - does the compiler generate multiple mrs from sp_el0
for subsequent test_thread_flag()?
> + else
> + emit_pte_barriers();
> +}
> +
> +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +static inline void arch_enter_lazy_mmu_mode(void)
> +{
> + VM_WARN_ON(in_interrupt());
> + VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
> +
> + set_thread_flag(TIF_LAZY_MMU);
> +}
> +
> +static inline void arch_flush_lazy_mmu_mode(void)
> +{
> + if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
> + emit_pte_barriers();
> +}
> +
> +static inline void arch_leave_lazy_mmu_mode(void)
> +{
> + arch_flush_lazy_mmu_mode();
> + clear_thread_flag(TIF_LAZY_MMU);
> +}
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>
> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
> * Only if the new pte is valid and kernel, otherwise TLB maintenance
> * has the necessary barriers.
> */
> - if (pte_valid_not_user(pte)) {
> - dsb(ishst);
> - isb();
> - }
> + if (pte_valid_not_user(pte))
> + queue_pte_barriers();
> }
I think this scheme works, I couldn't find a counter-example unless
__set_pte() gets called in an interrupt context. You could add
VM_WARN_ON(in_interrupt()) in queue_pte_barriers() as well.
With preemption, the newly mapped range shouldn't be used before
arch_flush_lazy_mmu_mode() is called, so it looks safe as well. I think
x86 uses a per-CPU variable to track this but per-thread is easier to
reason about if there's no nesting.
> static inline void __set_pte(pte_t *ptep, pte_t pte)
> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>
> WRITE_ONCE(*pmdp, pmd);
>
> - if (pmd_valid(pmd)) {
> - dsb(ishst);
> - isb();
> - }
> + if (pmd_valid(pmd))
> + queue_pte_barriers();
> }
We discussed on a previous series - for pmd/pud we end up with barriers
even for user mappings but they are at a much coarser granularity (and I
wasn't keen on 'user' attributes for the table entries).
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
On 14/04/2025 18:38, Catalin Marinas wrote:
> On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 1898c3069c43..149df945c1ab 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -40,6 +40,55 @@
>> #include <linux/sched.h>
>> #include <linux/page_table_check.h>
>>
>> +static inline void emit_pte_barriers(void)
>> +{
>> + /*
>> + * These barriers are emitted under certain conditions after a pte entry
>> + * was modified (see e.g. __set_pte_complete()). The dsb makes the store
>> + * visible to the table walker. The isb ensures that any previous
>> + * speculative "invalid translation" marker that is in the CPU's
>> + * pipeline gets cleared, so that any access to that address after
>> + * setting the pte to valid won't cause a spurious fault. If the thread
>> + * gets preempted after storing to the pgtable but before emitting these
>> + * barriers, __switch_to() emits a dsb which ensure the walker gets to
>> + * see the store. There is no guarrantee of an isb being issued though.
>> + * This is safe because it will still get issued (albeit on a
>> + * potentially different CPU) when the thread starts running again,
>> + * before any access to the address.
>> + */
>> + dsb(ishst);
>> + isb();
>> +}
>> +
>> +static inline void queue_pte_barriers(void)
>> +{
>> + if (test_thread_flag(TIF_LAZY_MMU))
>> + set_thread_flag(TIF_LAZY_MMU_PENDING);
>
> As we can have lots of calls here, it might be slightly cheaper to test
> TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
Yes, good point.
>
> I haven't checked - does the compiler generate multiple mrs from sp_el0
> for subsequent test_thread_flag()?
It emits a single mrs but it loads from the pointer twice. I think v3 is the version we want?
void TEST_queue_pte_barriers_v1(void)
{
	if (test_thread_flag(TIF_LAZY_MMU))
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}

void TEST_queue_pte_barriers_v2(void)
{
	if (test_thread_flag(TIF_LAZY_MMU) &&
	    !test_thread_flag(TIF_LAZY_MMU_PENDING))
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}

void TEST_queue_pte_barriers_v3(void)
{
	unsigned long flags = read_thread_flags();

	if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
		set_thread_flag(TIF_LAZY_MMU_PENDING);
	else
		emit_pte_barriers();
}
000000000000101c <TEST_queue_pte_barriers_v1>:
101c: d5384100 mrs x0, sp_el0
1020: f9400001 ldr x1, [x0]
1024: 37f80081 tbnz w1, #31, 1034 <TEST_queue_pte_barriers_v1+0x18>
1028: d5033a9f dsb ishst
102c: d5033fdf isb
1030: d65f03c0 ret
1034: 14000004 b 1044 <TEST_queue_pte_barriers_v1+0x28>
1038: d2c00021 mov x1, #0x100000000 // #4294967296
103c: f821301f stset x1, [x0]
1040: d65f03c0 ret
1044: f9800011 prfm pstl1strm, [x0]
1048: c85f7c01 ldxr x1, [x0]
104c: b2600021 orr x1, x1, #0x100000000
1050: c8027c01 stxr w2, x1, [x0]
1054: 35ffffa2 cbnz w2, 1048 <TEST_queue_pte_barriers_v1+0x2c>
1058: d65f03c0 ret
000000000000105c <TEST_queue_pte_barriers_v2>:
105c: d5384100 mrs x0, sp_el0
1060: f9400001 ldr x1, [x0]
1064: 37f80081 tbnz w1, #31, 1074 <TEST_queue_pte_barriers_v2+0x18>
1068: d5033a9f dsb ishst
106c: d5033fdf isb
1070: d65f03c0 ret
1074: f9400001 ldr x1, [x0]
1078: b707ff81 tbnz x1, #32, 1068 <TEST_queue_pte_barriers_v2+0xc>
107c: 14000004 b 108c <TEST_queue_pte_barriers_v2+0x30>
1080: d2c00021 mov x1, #0x100000000 // #4294967296
1084: f821301f stset x1, [x0]
1088: d65f03c0 ret
108c: f9800011 prfm pstl1strm, [x0]
1090: c85f7c01 ldxr x1, [x0]
1094: b2600021 orr x1, x1, #0x100000000
1098: c8027c01 stxr w2, x1, [x0]
109c: 35ffffa2 cbnz w2, 1090 <TEST_queue_pte_barriers_v2+0x34>
10a0: d65f03c0 ret
00000000000010a4 <TEST_queue_pte_barriers_v3>:
10a4: d5384101 mrs x1, sp_el0
10a8: f9400020 ldr x0, [x1]
10ac: d2b00002 mov x2, #0x80000000 // #2147483648
10b0: 92610400 and x0, x0, #0x180000000
10b4: eb02001f cmp x0, x2
10b8: 54000080 b.eq 10c8 <TEST_queue_pte_barriers_v3+0x24> // b.none
10bc: d5033a9f dsb ishst
10c0: d5033fdf isb
10c4: d65f03c0 ret
10c8: 14000004 b 10d8 <TEST_queue_pte_barriers_v3+0x34>
10cc: d2c00020 mov x0, #0x100000000 // #4294967296
10d0: f820303f stset x0, [x1]
10d4: d65f03c0 ret
10d8: f9800031 prfm pstl1strm, [x1]
10dc: c85f7c20 ldxr x0, [x1]
10e0: b2600000 orr x0, x0, #0x100000000
10e4: c8027c20 stxr w2, x0, [x1]
10e8: 35ffffa2 cbnz w2, 10dc <TEST_queue_pte_barriers_v3+0x38>
10ec: d65f03c0 ret
>
>> + else
>> + emit_pte_barriers();
>> +}
>> +
>> +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
>> +static inline void arch_enter_lazy_mmu_mode(void)
>> +{
>> + VM_WARN_ON(in_interrupt());
>> + VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
>> +
>> + set_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>> +static inline void arch_flush_lazy_mmu_mode(void)
>> +{
>> + if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
>> + emit_pte_barriers();
>> +}
>> +
>> +static inline void arch_leave_lazy_mmu_mode(void)
>> +{
>> + arch_flush_lazy_mmu_mode();
>> + clear_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>
>> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
>> * Only if the new pte is valid and kernel, otherwise TLB maintenance
>> * has the necessary barriers.
>> */
>> - if (pte_valid_not_user(pte)) {
>> - dsb(ishst);
>> - isb();
>> - }
>> + if (pte_valid_not_user(pte))
>> + queue_pte_barriers();
>> }
>
> I think this scheme works, I couldn't find a counter-example unless
> __set_pte() gets called in an interrupt context. You could add
> VM_WARN_ON(in_interrupt()) in queue_pte_barriers() as well.
>
> With preemption, the newly mapped range shouldn't be used before
> arch_flush_lazy_mmu_mode() is called, so it looks safe as well. I think
> x86 uses a per-CPU variable to track this but per-thread is easier to
> reason about if there's no nesting.
>
>> static inline void __set_pte(pte_t *ptep, pte_t pte)
>> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>
>> WRITE_ONCE(*pmdp, pmd);
>>
>> - if (pmd_valid(pmd)) {
>> - dsb(ishst);
>> - isb();
>> - }
>> + if (pmd_valid(pmd))
>> + queue_pte_barriers();
>> }
>
> We discussed on a previous series - for pmd/pud we end up with barriers
> even for user mappings but they are at a much coarser granularity (and I
> wasn't keen on 'user' attributes for the table entries).
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Thanks!
Ryan
On Mon, Apr 14, 2025 at 07:28:46PM +0100, Ryan Roberts wrote:
> On 14/04/2025 18:38, Catalin Marinas wrote:
> > On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
> >> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> >> index 1898c3069c43..149df945c1ab 100644
> >> --- a/arch/arm64/include/asm/pgtable.h
> >> +++ b/arch/arm64/include/asm/pgtable.h
> >> @@ -40,6 +40,55 @@
> >> #include <linux/sched.h>
> >> #include <linux/page_table_check.h>
> >>
> >> +static inline void emit_pte_barriers(void)
> >> +{
> >> + /*
> >> + * These barriers are emitted under certain conditions after a pte entry
> >> + * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> >> + * visible to the table walker. The isb ensures that any previous
> >> + * speculative "invalid translation" marker that is in the CPU's
> >> + * pipeline gets cleared, so that any access to that address after
> >> + * setting the pte to valid won't cause a spurious fault. If the thread
> >> + * gets preempted after storing to the pgtable but before emitting these
> >> + * barriers, __switch_to() emits a dsb which ensure the walker gets to
> >> + * see the store. There is no guarrantee of an isb being issued though.
> >> + * This is safe because it will still get issued (albeit on a
> >> + * potentially different CPU) when the thread starts running again,
> >> + * before any access to the address.
> >> + */
> >> + dsb(ishst);
> >> + isb();
> >> +}
> >> +
> >> +static inline void queue_pte_barriers(void)
> >> +{
> >> + if (test_thread_flag(TIF_LAZY_MMU))
> >> + set_thread_flag(TIF_LAZY_MMU_PENDING);
> >
> > As we can have lots of calls here, it might be slightly cheaper to test
> > TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
>
> Yes, good point.
>
> > I haven't checked - does the compiler generate multiple mrs from sp_el0
> > for subsequent test_thread_flag()?
>
> It emits a single mrs but it loads from the pointer twice.
It's not that bad if we only do the set_thread_flag() once.
> I think v3 is the version we want?
>
>
> void TEST_queue_pte_barriers_v1(void)
> {
> if (test_thread_flag(TIF_LAZY_MMU))
> set_thread_flag(TIF_LAZY_MMU_PENDING);
> else
> emit_pte_barriers();
> }
>
> void TEST_queue_pte_barriers_v2(void)
> {
> if (test_thread_flag(TIF_LAZY_MMU) &&
> !test_thread_flag(TIF_LAZY_MMU_PENDING))
> set_thread_flag(TIF_LAZY_MMU_PENDING);
> else
> emit_pte_barriers();
> }
>
> void TEST_queue_pte_barriers_v3(void)
> {
> unsigned long flags = read_thread_flags();
>
> if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
> set_thread_flag(TIF_LAZY_MMU_PENDING);
> else
> emit_pte_barriers();
> }
Doesn't v3 emit barriers once _TIF_LAZY_MMU_PENDING has been set? We
need something like:
	if (flags & _TIF_LAZY_MMU) {
		if (!(flags & _TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
--
Catalin
On 15/04/2025 11:51, Catalin Marinas wrote:
> On Mon, Apr 14, 2025 at 07:28:46PM +0100, Ryan Roberts wrote:
>> On 14/04/2025 18:38, Catalin Marinas wrote:
>>> On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
>>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>>> index 1898c3069c43..149df945c1ab 100644
>>>> --- a/arch/arm64/include/asm/pgtable.h
>>>> +++ b/arch/arm64/include/asm/pgtable.h
>>>> @@ -40,6 +40,55 @@
>>>> #include <linux/sched.h>
>>>> #include <linux/page_table_check.h>
>>>>
>>>> +static inline void emit_pte_barriers(void)
>>>> +{
>>>> + /*
>>>> + * These barriers are emitted under certain conditions after a pte entry
>>>> + * was modified (see e.g. __set_pte_complete()). The dsb makes the store
>>>> + * visible to the table walker. The isb ensures that any previous
>>>> + * speculative "invalid translation" marker that is in the CPU's
>>>> + * pipeline gets cleared, so that any access to that address after
>>>> + * setting the pte to valid won't cause a spurious fault. If the thread
>>>> + * gets preempted after storing to the pgtable but before emitting these
>>>> + * barriers, __switch_to() emits a dsb which ensure the walker gets to
>>>> + * see the store. There is no guarrantee of an isb being issued though.
>>>> + * This is safe because it will still get issued (albeit on a
>>>> + * potentially different CPU) when the thread starts running again,
>>>> + * before any access to the address.
>>>> + */
>>>> + dsb(ishst);
>>>> + isb();
>>>> +}
>>>> +
>>>> +static inline void queue_pte_barriers(void)
>>>> +{
>>>> + if (test_thread_flag(TIF_LAZY_MMU))
>>>> + set_thread_flag(TIF_LAZY_MMU_PENDING);
>>>
>>> As we can have lots of calls here, it might be slightly cheaper to test
>>> TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.
>>
>> Yes, good point.
>>
>>> I haven't checked - does the compiler generate multiple mrs from sp_el0
>>> for subsequent test_thread_flag()?
>>
>> It emits a single mrs but it loads from the pointer twice.
>
> It's not that bad if we only do the set_thread_flag() once.
>
>> I think v3 is the version we want?
>>
>>
>> void TEST_queue_pte_barriers_v1(void)
>> {
>> if (test_thread_flag(TIF_LAZY_MMU))
>> set_thread_flag(TIF_LAZY_MMU_PENDING);
>> else
>> emit_pte_barriers();
>> }
>>
>> void TEST_queue_pte_barriers_v2(void)
>> {
>> if (test_thread_flag(TIF_LAZY_MMU) &&
>> !test_thread_flag(TIF_LAZY_MMU_PENDING))
>> set_thread_flag(TIF_LAZY_MMU_PENDING);
>> else
>> emit_pte_barriers();
>> }
>>
>> void TEST_queue_pte_barriers_v3(void)
>> {
>> unsigned long flags = read_thread_flags();
>>
>> if ((flags & (_TIF_LAZY_MMU | _TIF_LAZY_MMU_PENDING)) == _TIF_LAZY_MMU)
>> set_thread_flag(TIF_LAZY_MMU_PENDING);
>> else
>> emit_pte_barriers();
>> }
>
> Doesn't v3 emit barriers once _TIF_LAZY_MMU_PENDING has been set? We
> need something like:
>
> if (flags & _TIF_LAZY_MMU) {
> if (!(flags & _TIF_LAZY_MMU_PENDING))
> set_thread_flag(TIF_LAZY_MMU_PENDING);
> } else {
> emit_pte_barriers();
> }
Gah, yeah sorry, going too quickly. v2 is also logically incorrect.
Fixed versions:
void TEST_queue_pte_barriers_v2(void)
{
	if (test_thread_flag(TIF_LAZY_MMU)) {
		if (!test_thread_flag(TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}

void TEST_queue_pte_barriers_v3(void)
{
	unsigned long flags = read_thread_flags();

	if (flags & BIT(TIF_LAZY_MMU)) {
		if (!(flags & BIT(TIF_LAZY_MMU_PENDING)))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}
000000000000105c <TEST_queue_pte_barriers_v2>:
105c: d5384100 mrs x0, sp_el0
1060: f9400001 ldr x1, [x0]
1064: 37f80081 tbnz w1, #31, 1074 <TEST_queue_pte_barriers_v2+0x18>
1068: d5033a9f dsb ishst
106c: d5033fdf isb
1070: d65f03c0 ret
1074: f9400001 ldr x1, [x0]
1078: b707ffc1 tbnz x1, #32, 1070 <TEST_queue_pte_barriers_v2+0x14>
107c: 14000004 b 108c <TEST_queue_pte_barriers_v2+0x30>
1080: d2c00021 mov x1, #0x100000000 // #4294967296
1084: f821301f stset x1, [x0]
1088: d65f03c0 ret
108c: f9800011 prfm pstl1strm, [x0]
1090: c85f7c01 ldxr x1, [x0]
1094: b2600021 orr x1, x1, #0x100000000
1098: c8027c01 stxr w2, x1, [x0]
109c: 35ffffa2 cbnz w2, 1090 <TEST_queue_pte_barriers_v2+0x34>
10a0: d65f03c0 ret
00000000000010a4 <TEST_queue_pte_barriers_v3>:
10a4: d5384101 mrs x1, sp_el0
10a8: f9400020 ldr x0, [x1]
10ac: 36f80060 tbz w0, #31, 10b8 <TEST_queue_pte_barriers_v3+0x14>
10b0: b60000a0 tbz x0, #32, 10c4 <TEST_queue_pte_barriers_v3+0x20>
10b4: d65f03c0 ret
10b8: d5033a9f dsb ishst
10bc: d5033fdf isb
10c0: d65f03c0 ret
10c4: 14000004 b 10d4 <TEST_queue_pte_barriers_v3+0x30>
10c8: d2c00020 mov x0, #0x100000000 // #4294967296
10cc: f820303f stset x0, [x1]
10d0: d65f03c0 ret
10d4: f9800031 prfm pstl1strm, [x1]
10d8: c85f7c20 ldxr x0, [x1]
10dc: b2600000 orr x0, x0, #0x100000000
10e0: c8027c20 stxr w2, x0, [x1]
10e4: 35ffffa2 cbnz w2, 10d8 <TEST_queue_pte_barriers_v3+0x34>
10e8: d65f03c0 ret
So v3 is the way to go, I think; it's a single mrs and a single ldr.
I'll get this fixed up and posted early next week.
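(For reference, folding in the v3 flag check above plus the
VM_WARN_ON(in_interrupt()) that Catalin suggested earlier, I'd expect
queue_pte_barriers() to end up looking something like the sketch below;
exact form to be confirmed in the respin.)

static inline void queue_pte_barriers(void)
{
	unsigned long flags = read_thread_flags();

	VM_WARN_ON(in_interrupt());

	if (flags & BIT(TIF_LAZY_MMU)) {
		/* Avoid the atomic set_thread_flag() if already pending. */
		if (!(flags & BIT(TIF_LAZY_MMU_PENDING)))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}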
Thanks,
Ryan
On 3/4/25 20:34, Ryan Roberts wrote:
> Because the kernel can't tolerate page faults for kernel mappings, when
> setting a valid, kernel space pte (or pmd/pud/p4d/pgd), it emits a
> dsb(ishst) to ensure that the store to the pgtable is observed by the
> table walker immediately. Additionally it emits an isb() to ensure that
> any fault already speculatively determined from the old, invalid mapping
> gets canceled.
>
> We can improve the performance of vmalloc operations by batching these
> barriers until the end of a set of entry updates.
> arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() provide the
> required hooks.
>
> vmalloc improves by up to 30% as a result.
>
> Two new TIF_ flags are created; TIF_LAZY_MMU tells us if the task is in
> the lazy mode and can therefore defer any barriers until exit from the
> lazy mode. TIF_LAZY_MMU_PENDING is used to remember if any pte operation
> was performed while in the lazy mode that required barriers. Then when
> leaving lazy mode, if that flag is set, we emit the barriers.
>
> Since arch_enter_lazy_mmu_mode() and arch_leave_lazy_mmu_mode() are used
> for both user and kernel mappings, we need the second flag to avoid
> emitting barriers unnecessarily if only user mappings were updated.
Agreed, and hence an additional TIF flag, i.e. TIF_LAZY_MMU_PENDING, is
justified.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
> arch/arm64/include/asm/pgtable.h | 73 ++++++++++++++++++++++------
> arch/arm64/include/asm/thread_info.h | 2 +
> arch/arm64/kernel/process.c | 9 ++--
> 3 files changed, 64 insertions(+), 20 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 1898c3069c43..149df945c1ab 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -40,6 +40,55 @@
> #include <linux/sched.h>
> #include <linux/page_table_check.h>
>
> +static inline void emit_pte_barriers(void)
> +{
> + /*
> + * These barriers are emitted under certain conditions after a pte entry
> + * was modified (see e.g. __set_pte_complete()). The dsb makes the store
> + * visible to the table walker. The isb ensures that any previous
> + * speculative "invalid translation" marker that is in the CPU's
> + * pipeline gets cleared, so that any access to that address after
> + * setting the pte to valid won't cause a spurious fault. If the thread
> + * gets preempted after storing to the pgtable but before emitting these
> + * barriers, __switch_to() emits a dsb which ensure the walker gets to
> + * see the store. There is no guarrantee of an isb being issued though.
typo ^^^^^^^^
> + * This is safe because it will still get issued (albeit on a
> + * potentially different CPU) when the thread starts running again,
> + * before any access to the address.
> + */
> + dsb(ishst);
> + isb();
> +}
> +
> +static inline void queue_pte_barriers(void)
> +{
> + if (test_thread_flag(TIF_LAZY_MMU))
> + set_thread_flag(TIF_LAZY_MMU_PENDING);
> + else
> + emit_pte_barriers();
> +}
> +
> +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
> +static inline void arch_enter_lazy_mmu_mode(void)
> +{
> + VM_WARN_ON(in_interrupt());
> + VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
> +
> + set_thread_flag(TIF_LAZY_MMU);
> +}
> +
> +static inline void arch_flush_lazy_mmu_mode(void)
> +{
> + if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
> + emit_pte_barriers();
> +}
> +
> +static inline void arch_leave_lazy_mmu_mode(void)
> +{
> + arch_flush_lazy_mmu_mode();
> + clear_thread_flag(TIF_LAZY_MMU);
> +}
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>
> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
> * Only if the new pte is valid and kernel, otherwise TLB maintenance
> * has the necessary barriers.
> */
> - if (pte_valid_not_user(pte)) {
> - dsb(ishst);
> - isb();
> - }
> + if (pte_valid_not_user(pte))
> + queue_pte_barriers();
> }
>
> static inline void __set_pte(pte_t *ptep, pte_t pte)
> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>
> WRITE_ONCE(*pmdp, pmd);
>
> - if (pmd_valid(pmd)) {
> - dsb(ishst);
> - isb();
> - }
> + if (pmd_valid(pmd))
> + queue_pte_barriers();
> }
>
> static inline void pmd_clear(pmd_t *pmdp)
> @@ -845,10 +890,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
>
> WRITE_ONCE(*pudp, pud);
>
> - if (pud_valid(pud)) {
> - dsb(ishst);
> - isb();
> - }
> + if (pud_valid(pud))
> + queue_pte_barriers();
> }
>
> static inline void pud_clear(pud_t *pudp)
> @@ -925,8 +968,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> }
>
> WRITE_ONCE(*p4dp, p4d);
> - dsb(ishst);
> - isb();
> + queue_pte_barriers();
> }
>
> static inline void p4d_clear(p4d_t *p4dp)
> @@ -1052,8 +1094,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
> }
>
> WRITE_ONCE(*pgdp, pgd);
> - dsb(ishst);
> - isb();
> + queue_pte_barriers();
> }
>
> static inline void pgd_clear(pgd_t *pgdp)
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index 1114c1c3300a..1fdd74b7b831 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -82,6 +82,8 @@ void arch_setup_new_exec(void);
> #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */
> #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */
> #define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */
> +#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */
> +#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */
>
> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 42faebb7b712..45a55fe81788 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -680,10 +680,11 @@ struct task_struct *__switch_to(struct task_struct *prev,
> gcs_thread_switch(next);
>
> /*
> - * Complete any pending TLB or cache maintenance on this CPU in case
> - * the thread migrates to a different CPU.
> - * This full barrier is also required by the membarrier system
> - * call.
> + * Complete any pending TLB or cache maintenance on this CPU in case the
> + * thread migrates to a different CPU. This full barrier is also
> + * required by the membarrier system call. Additionally it makes any
> + * in-progress pgtable writes visible to the table walker; See
> + * emit_pte_barriers().
> */
> dsb(ish);
>
Otherwise, LGTM.
I will try and think through again whether these deferred syncs and flushes
can cause subtle problems elsewhere.