arch/arm64/include/asm/mte.h | 6 +++++- arch/arm64/kernel/mte.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-)
In MTE synchronous mode, tag check faults are reported as immediate
Data Abort exceptions. The TFSR_EL1.TF1 bit is never set, since faults
never go through the asynchronous path. Therefore, reading TFSR_EL1
and executing data and instruction barriers on kernel entry, exit,
context switch, and suspend is unnecessary overhead in sync mode.
The assembly paths (check_mte_async_tcf / clear_mte_async_tcf in entry.S)
already had this check. Extend the same optimization to the C helpers
used on kernel entry/exit (mte_check_tfsr_entry / mte_check_tfsr_exit),
context switch (mte_thread_switch) and suspend (mte_suspend_enter).
All MTE kselftests pass. The KUnit results before and after the patch are
the same.
A selection of test_vmalloc benchmarks running on an arm64 machine.
v6.19 is the baseline. (>0 is faster, <0 is slower, (R)/(I) =
statistically significant Regression/Improvement). Based on significance
and ignoring the noise, the benchmarks improved.
* 77 result classes were considered, with 9 wins, 0 losses and 68 ties
Results of fastpath [1] on v6.19 vs this patch:
+----------------------------+----------------------------------------------------------+------------+
| Benchmark | Result Class | barriers |
+============================+==========================================================+============+
| micromm/fork | fork: p:1, d:10 (seconds) | (I) 2.75% |
| | fork: p:512, d:10 (seconds) | 0.96% |
+----------------------------+----------------------------------------------------------+------------+
| micromm/munmap | munmap: p:1, d:10 (seconds) | -1.78% |
| | munmap: p:512, d:10 (seconds) | 5.02% |
+----------------------------+----------------------------------------------------------+------------+
| micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% |
| | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 0.70% |
| | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 1.18% |
| | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | -5.01% |
| | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 13.81% |
| | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 6.51% |
| | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 32.87% |
| | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 4.17% |
| | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 8.40% |
| | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | -0.48% |
| | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | -0.74% |
| | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 0.53% |
| | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.81% |
| | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.06% |
| | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% |
| | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | -0.41% |
| | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 0.89% |
| | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 1.71% |
| | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 0.83% |
+----------------------------+----------------------------------------------------------+------------+
| schbench/thread-contention | -m 16 -t 1 -r 10 -s 1000, avg_rps (req/sec) | 0.05% |
| | -m 16 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.60% |
| | -m 16 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% |
| | -m 16 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% |
| | -m 16 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | -0.58% |
| | -m 16 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 9.09% |
| | -m 16 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.74% |
| | -m 16 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | -1.40% |
| | -m 16 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% |
| | -m 16 -t 64 -r 10 -s 1000, avg_rps (req/sec) | -0.78% |
| | -m 16 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | -0.11% |
| | -m 16 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% |
| | -m 16 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.64% |
| | -m 16 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 3.15% |
| | -m 16 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 17.54% |
| | -m 32 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -1.22% |
| | -m 32 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.85% |
| | -m 32 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% |
| | -m 32 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% |
| | -m 32 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 1.05% |
| | -m 32 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% |
| | -m 32 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.41% |
| | -m 32 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.58% |
| | -m 32 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 2.13% |
| | -m 32 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.67% |
| | -m 32 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 2.07% |
| | -m 32 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -1.28% |
| | -m 32 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 1.01% |
| | -m 32 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 0.69% |
| | -m 32 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 13.12% |
| | -m 64 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -0.25% |
| | -m 64 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | -0.48% |
| | -m 64 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 10.53% |
| | -m 64 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.06% |
| | -m 64 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 0.00% |
| | -m 64 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% |
| | -m 64 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.36% |
| | -m 64 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.52% |
| | -m 64 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% |
| | -m 64 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.52% |
| | -m 64 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 3.53% |
| | -m 64 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -0.10% |
| | -m 64 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.53% |
| | -m 64 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 1.82% |
| | -m 64 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | -5.80% |
+----------------------------+----------------------------------------------------------+------------+
| syscall/getpid | mean (ns) | (I) 15.98% |
| | p99 (ns) | (I) 11.11% |
| | p99.9 (ns) | (I) 16.13% |
+----------------------------+----------------------------------------------------------+------------+
| syscall/getppid | mean (ns) | (I) 14.82% |
| | p99 (ns) | (I) 17.86% |
| | p99.9 (ns) | (I) 9.09% |
+----------------------------+----------------------------------------------------------+------------+
| syscall/invalid | mean (ns) | (I) 17.78% |
| | p99 (ns) | (I) 11.11% |
| | p99.9 (ns) | 13.33% |
+----------------------------+----------------------------------------------------------+------------+
[1] https://gitlab.arm.com/tooling/fastpath
Signed-off-by: Muhammad Usama Anjum <usama.anjum@arm.com>
---
The patch applies on v6.19 and next-20260309.
---
arch/arm64/include/asm/mte.h | 6 +++++-
arch/arm64/kernel/mte.c | 5 +++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 6d4a78b9dc3e6..0e05d20cf2583 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -252,7 +252,8 @@ static inline void mte_check_tfsr_entry(void)
if (!kasan_hw_tags_enabled())
return;
- mte_check_tfsr_el1();
+ if (system_uses_mte_async_or_asymm_mode())
+ mte_check_tfsr_el1();
}
static inline void mte_check_tfsr_exit(void)
@@ -260,6 +261,9 @@ static inline void mte_check_tfsr_exit(void)
if (!kasan_hw_tags_enabled())
return;
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
/*
* The asynchronous faults are sync'ed automatically with
* TFSR_EL1 on kernel entry but for exit an explicit dsb()
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 32148bf09c1dc..8da2891b834d7 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -291,6 +291,8 @@ void mte_thread_switch(struct task_struct *next)
/* TCO may not have been disabled on exception entry for the current task. */
mte_disable_tco_entry(next);
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
/*
* Check if an async tag exception occurred at EL1.
*
@@ -350,6 +352,9 @@ void mte_suspend_enter(void)
if (!system_supports_mte())
return;
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
/*
* The barriers are required to guarantee that the indirect writes
* to TFSR_EL1 are synchronized before we report the state.
--
2.47.3
On 3/11/26 18:50, Muhammad Usama Anjum wrote: > In MTE synchronous mode, tag check faults are reported as immediate > Data Abort exceptions. The TFSR_EL1.TF1 bit is never set, since faults > never go through the asynchronous path. Therefore, reading TFSR_EL1 > and executing data and instruction barriers on kernel entry, exit, > context switch, and suspend is unnecessary overhead in sync mode. > > The exit path (mte_check_tfsr_exit) and the assembly paths > (check_mte_async_tcf / clear_mte_async_tcf in entry.S) already had this > check. Right, that's for user space (TFSR_EL1.TF0 IIUC). What you are adding is for KASAN. Maybe make that clearer. > Extend the same optimization on kernel entry/exit, context > switch and suspend. > > All mte kselftests pass. The kunit before and after the patch show same > results. > > A selection of test_vmalloc benchmarks running on a arm64 machine. > v6.19 is the baseline. (>0 is faster, <0 is slower, (R)/(I) = > statistically significant Regression/Improvement). Based on significance > and ignoring the noise, the benchmarks improved. 
> > * 77 result classes were considered, with 9 wins, 0 losses and 68 ties > > Results of fastpath [1] on v6.19 vs this patch: > > +----------------------------+----------------------------------------------------------+------------+ > | Benchmark | Result Class | barriers | > +============================+==========================================================+============+ > | micromm/fork | fork: p:1, d:10 (seconds) | (I) 2.75% | > | | fork: p:512, d:10 (seconds) | 0.96% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/munmap | munmap: p:1, d:10 (seconds) | -1.78% | > | | munmap: p:512, d:10 (seconds) | 5.02% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 0.70% | > | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 1.18% | > | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | -5.01% | > | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 13.81% | > | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 6.51% | > | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 32.87% | > | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 4.17% | > | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 8.40% | > | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | -0.48% | > | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | -0.74% | > | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 0.53% | > | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.81% | > | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.06% | > | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | -0.41% | > | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 0.89% | > | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 
1.71% | > | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 0.83% | > +----------------------------+----------------------------------------------------------+------------+ > | schbench/thread-contention | -m 16 -t 1 -r 10 -s 1000, avg_rps (req/sec) | 0.05% | > | | -m 16 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.60% | > | | -m 16 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 16 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | -0.58% | > | | -m 16 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 9.09% | > | | -m 16 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.74% | > | | -m 16 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | -1.40% | > | | -m 16 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 64 -r 10 -s 1000, avg_rps (req/sec) | -0.78% | > | | -m 16 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | -0.11% | > | | -m 16 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 16 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.64% | > | | -m 16 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 3.15% | > | | -m 16 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 17.54% | > | | -m 32 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -1.22% | > | | -m 32 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.85% | > | | -m 32 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 32 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 1.05% | > | | -m 32 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.41% | > | | -m 32 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.58% | > | | -m 32 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 2.13% | > | | -m 32 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.67% | > | | -m 32 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 2.07% | > | | -m 32 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -1.28% | > | | -m 32 -t 256 -r 10 -s 1000, 
avg_rps (req/sec) | 1.01% | > | | -m 32 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 0.69% | > | | -m 32 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 13.12% | > | | -m 64 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -0.25% | > | | -m 64 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | -0.48% | > | | -m 64 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 10.53% | > | | -m 64 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.06% | > | | -m 64 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 0.00% | > | | -m 64 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 64 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.36% | > | | -m 64 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.52% | > | | -m 64 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 64 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.52% | > | | -m 64 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 3.53% | > | | -m 64 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -0.10% | > | | -m 64 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.53% | > | | -m 64 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 1.82% | > | | -m 64 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | -5.80% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getpid | mean (ns) | (I) 15.98% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | (I) 16.13% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getppid | mean (ns) | (I) 14.82% | > | | p99 (ns) | (I) 17.86% | > | | p99.9 (ns) | (I) 9.09% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/invalid | mean (ns) | (I) 17.78% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | 13.33% | > +----------------------------+----------------------------------------------------------+------------+ > > [1] https://gitlab.arm.com/tooling/fastpath > > Signed-off-by: Muhammad Usama Anjum 
<usama.anjum@arm.com> > --- > The patch applies on v6.19 and next-20260309. > --- > arch/arm64/include/asm/mte.h | 6 +++++- > arch/arm64/kernel/mte.c | 5 +++++ > 2 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h > index 6d4a78b9dc3e6..0e05d20cf2583 100644 > --- a/arch/arm64/include/asm/mte.h > +++ b/arch/arm64/include/asm/mte.h > @@ -252,7 +252,8 @@ static inline void mte_check_tfsr_entry(void) > if (!kasan_hw_tags_enabled()) > return; > > - mte_check_tfsr_el1(); > + if (system_uses_mte_async_or_asymm_mode()) > + mte_check_tfsr_el1(); For symmetry, I would also write this as if (!system_uses_mte_async_or_asymm_mode()) return; mte_check_tfsr_el1(); Nothing jumped at me, but I am still new to this code + spec :) Reviewed-by: David Hildenbrand (Arm) <david@kernel.org> -- Cheers, David
Please review and share your thoughts. On 11/03/2026 5:50 pm, Muhammad Usama Anjum wrote: > In MTE synchronous mode, tag check faults are reported as immediate > Data Abort exceptions. The TFSR_EL1.TF1 bit is never set, since faults > never go through the asynchronous path. Therefore, reading TFSR_EL1 > and executing data and instruction barriers on kernel entry, exit, > context switch, and suspend is unnecessary overhead in sync mode. > > The exit path (mte_check_tfsr_exit) and the assembly paths > (check_mte_async_tcf / clear_mte_async_tcf in entry.S) already had this > check. Extend the same optimization on kernel entry/exit, context > switch and suspend. > > All mte kselftests pass. The kunit before and after the patch show same > results. > > A selection of test_vmalloc benchmarks running on a arm64 machine. > v6.19 is the baseline. (>0 is faster, <0 is slower, (R)/(I) = > statistically significant Regression/Improvement). Based on significance > and ignoring the noise, the benchmarks improved. 
> > * 77 result classes were considered, with 9 wins, 0 losses and 68 ties > > Results of fastpath [1] on v6.19 vs this patch: > > +----------------------------+----------------------------------------------------------+------------+ > | Benchmark | Result Class | barriers | > +============================+==========================================================+============+ > | micromm/fork | fork: p:1, d:10 (seconds) | (I) 2.75% | > | | fork: p:512, d:10 (seconds) | 0.96% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/munmap | munmap: p:1, d:10 (seconds) | -1.78% | > | | munmap: p:512, d:10 (seconds) | 5.02% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 0.70% | > | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 1.18% | > | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | -5.01% | > | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 13.81% | > | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 6.51% | > | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 32.87% | > | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 4.17% | > | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 8.40% | > | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | -0.48% | > | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | -0.74% | > | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 0.53% | > | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.81% | > | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.06% | > | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | -0.41% | > | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 0.89% | > | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 
1.71% | > | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 0.83% | > +----------------------------+----------------------------------------------------------+------------+ > | schbench/thread-contention | -m 16 -t 1 -r 10 -s 1000, avg_rps (req/sec) | 0.05% | > | | -m 16 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.60% | > | | -m 16 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 16 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | -0.58% | > | | -m 16 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 9.09% | > | | -m 16 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.74% | > | | -m 16 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | -1.40% | > | | -m 16 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 64 -r 10 -s 1000, avg_rps (req/sec) | -0.78% | > | | -m 16 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | -0.11% | > | | -m 16 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 16 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.64% | > | | -m 16 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 3.15% | > | | -m 16 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 17.54% | > | | -m 32 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -1.22% | > | | -m 32 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.85% | > | | -m 32 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 32 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 1.05% | > | | -m 32 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.41% | > | | -m 32 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.58% | > | | -m 32 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 2.13% | > | | -m 32 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.67% | > | | -m 32 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 2.07% | > | | -m 32 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -1.28% | > | | -m 32 -t 256 -r 10 -s 1000, 
avg_rps (req/sec) | 1.01% | > | | -m 32 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 0.69% | > | | -m 32 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 13.12% | > | | -m 64 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -0.25% | > | | -m 64 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | -0.48% | > | | -m 64 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 10.53% | > | | -m 64 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.06% | > | | -m 64 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 0.00% | > | | -m 64 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 64 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.36% | > | | -m 64 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.52% | > | | -m 64 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 64 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.52% | > | | -m 64 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 3.53% | > | | -m 64 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -0.10% | > | | -m 64 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.53% | > | | -m 64 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 1.82% | > | | -m 64 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | -5.80% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getpid | mean (ns) | (I) 15.98% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | (I) 16.13% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getppid | mean (ns) | (I) 14.82% | > | | p99 (ns) | (I) 17.86% | > | | p99.9 (ns) | (I) 9.09% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/invalid | mean (ns) | (I) 17.78% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | 13.33% | > +----------------------------+----------------------------------------------------------+------------+ > > [1] https://gitlab.arm.com/tooling/fastpath > > Signed-off-by: Muhammad Usama Anjum 
<usama.anjum@arm.com> > --- > The patch applies on v6.19 and next-20260309. > --- > arch/arm64/include/asm/mte.h | 6 +++++- > arch/arm64/kernel/mte.c | 5 +++++ > 2 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h > index 6d4a78b9dc3e6..0e05d20cf2583 100644 > --- a/arch/arm64/include/asm/mte.h > +++ b/arch/arm64/include/asm/mte.h > @@ -252,7 +252,8 @@ static inline void mte_check_tfsr_entry(void) > if (!kasan_hw_tags_enabled()) > return; > > - mte_check_tfsr_el1(); > + if (system_uses_mte_async_or_asymm_mode()) > + mte_check_tfsr_el1(); > } > > static inline void mte_check_tfsr_exit(void) > @@ -260,6 +261,9 @@ static inline void mte_check_tfsr_exit(void) > if (!kasan_hw_tags_enabled()) > return; > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > + > /* > * The asynchronous faults are sync'ed automatically with > * TFSR_EL1 on kernel entry but for exit an explicit dsb() > diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c > index 32148bf09c1dc..8da2891b834d7 100644 > --- a/arch/arm64/kernel/mte.c > +++ b/arch/arm64/kernel/mte.c > @@ -291,6 +291,8 @@ void mte_thread_switch(struct task_struct *next) > /* TCO may not have been disabled on exception entry for the current task. */ > mte_disable_tco_entry(next); > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > /* > * Check if an async tag exception occurred at EL1. > * > @@ -350,6 +352,9 @@ void mte_suspend_enter(void) > if (!system_supports_mte()) > return; > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > + > /* > * The barriers are required to guarantee that the indirect writes > * to TFSR_EL1 are synchronized before we report the state.
Look good to me. Feel free to add: Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com> On Wed, Mar 11, 2026 at 05:50:50PM +0000, Muhammad Usama Anjum wrote: > In MTE synchronous mode, tag check faults are reported as immediate > Data Abort exceptions. The TFSR_EL1.TF1 bit is never set, since faults > never go through the asynchronous path. Therefore, reading TFSR_EL1 > and executing data and instruction barriers on kernel entry, exit, > context switch, and suspend is unnecessary overhead in sync mode. > > The exit path (mte_check_tfsr_exit) and the assembly paths > (check_mte_async_tcf / clear_mte_async_tcf in entry.S) already had this > check. Extend the same optimization on kernel entry/exit, context > switch and suspend. > > All mte kselftests pass. The kunit before and after the patch show same > results. > > A selection of test_vmalloc benchmarks running on a arm64 machine. > v6.19 is the baseline. (>0 is faster, <0 is slower, (R)/(I) = > statistically significant Regression/Improvement). Based on significance > and ignoring the noise, the benchmarks improved. 
> > * 77 result classes were considered, with 9 wins, 0 losses and 68 ties > > Results of fastpath [1] on v6.19 vs this patch: > > +----------------------------+----------------------------------------------------------+------------+ > | Benchmark | Result Class | barriers | > +============================+==========================================================+============+ > | micromm/fork | fork: p:1, d:10 (seconds) | (I) 2.75% | > | | fork: p:512, d:10 (seconds) | 0.96% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/munmap | munmap: p:1, d:10 (seconds) | -1.78% | > | | munmap: p:512, d:10 (seconds) | 5.02% | > +----------------------------+----------------------------------------------------------+------------+ > | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 0.70% | > | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 1.18% | > | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | -5.01% | > | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 13.81% | > | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 6.51% | > | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 32.87% | > | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 4.17% | > | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 8.40% | > | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | -0.48% | > | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | -0.74% | > | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 0.53% | > | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.81% | > | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.06% | > | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | > | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | -0.41% | > | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 0.89% | > | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 
1.71% | > | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 0.83% | > +----------------------------+----------------------------------------------------------+------------+ > | schbench/thread-contention | -m 16 -t 1 -r 10 -s 1000, avg_rps (req/sec) | 0.05% | > | | -m 16 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.60% | > | | -m 16 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 16 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | -0.58% | > | | -m 16 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 9.09% | > | | -m 16 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.74% | > | | -m 16 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | -1.40% | > | | -m 16 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 16 -t 64 -r 10 -s 1000, avg_rps (req/sec) | -0.78% | > | | -m 16 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | -0.11% | > | | -m 16 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 16 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.64% | > | | -m 16 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 3.15% | > | | -m 16 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 17.54% | > | | -m 32 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -1.22% | > | | -m 32 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.85% | > | | -m 32 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | > | | -m 32 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 1.05% | > | | -m 32 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 32 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.41% | > | | -m 32 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.58% | > | | -m 32 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 2.13% | > | | -m 32 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.67% | > | | -m 32 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 2.07% | > | | -m 32 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -1.28% | > | | -m 32 -t 256 -r 10 -s 1000, 
avg_rps (req/sec) | 1.01% | > | | -m 32 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 0.69% | > | | -m 32 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 13.12% | > | | -m 64 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -0.25% | > | | -m 64 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | -0.48% | > | | -m 64 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 10.53% | > | | -m 64 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.06% | > | | -m 64 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 0.00% | > | | -m 64 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | > | | -m 64 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.36% | > | | -m 64 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.52% | > | | -m 64 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | > | | -m 64 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.52% | > | | -m 64 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 3.53% | > | | -m 64 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -0.10% | > | | -m 64 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.53% | > | | -m 64 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 1.82% | > | | -m 64 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | -5.80% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getpid | mean (ns) | (I) 15.98% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | (I) 16.13% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/getppid | mean (ns) | (I) 14.82% | > | | p99 (ns) | (I) 17.86% | > | | p99.9 (ns) | (I) 9.09% | > +----------------------------+----------------------------------------------------------+------------+ > | syscall/invalid | mean (ns) | (I) 17.78% | > | | p99 (ns) | (I) 11.11% | > | | p99.9 (ns) | 13.33% | > +----------------------------+----------------------------------------------------------+------------+ > > [1] https://gitlab.arm.com/tooling/fastpath > > Signed-off-by: Muhammad Usama Anjum 
<usama.anjum@arm.com> > --- > The patch applies on v6.19 and next-20260309. > --- > arch/arm64/include/asm/mte.h | 6 +++++- > arch/arm64/kernel/mte.c | 5 +++++ > 2 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h > index 6d4a78b9dc3e6..0e05d20cf2583 100644 > --- a/arch/arm64/include/asm/mte.h > +++ b/arch/arm64/include/asm/mte.h > @@ -252,7 +252,8 @@ static inline void mte_check_tfsr_entry(void) > if (!kasan_hw_tags_enabled()) > return; > > - mte_check_tfsr_el1(); > + if (system_uses_mte_async_or_asymm_mode()) > + mte_check_tfsr_el1(); > } > > static inline void mte_check_tfsr_exit(void) > @@ -260,6 +261,9 @@ static inline void mte_check_tfsr_exit(void) > if (!kasan_hw_tags_enabled()) > return; > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > + > /* > * The asynchronous faults are sync'ed automatically with > * TFSR_EL1 on kernel entry but for exit an explicit dsb() > diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c > index 32148bf09c1dc..8da2891b834d7 100644 > --- a/arch/arm64/kernel/mte.c > +++ b/arch/arm64/kernel/mte.c > @@ -291,6 +291,8 @@ void mte_thread_switch(struct task_struct *next) > /* TCO may not have been disabled on exception entry for the current task. */ > mte_disable_tco_entry(next); > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > /* > * Check if an async tag exception occurred at EL1. > * > @@ -350,6 +352,9 @@ void mte_suspend_enter(void) > if (!system_supports_mte()) > return; > > + if (!system_uses_mte_async_or_asymm_mode()) > + return; > + > /* > * The barriers are required to guarantee that the indirect writes > * to TFSR_EL1 are synchronized before we report the state. > -- > 2.47.3 > -- Sincerely, Yeoreum Yun
© 2016 - 2026 Red Hat, Inc.