Some arm64 platforms have slow per-CPU atomic operations, for example,
the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
atomic operations to interrupt-disabled non-read-modify-write-atomic
atomic_read()/atomic_set() operations. This works because
SRCU-fast-updown is not invoked from NMI handlers, which means that
srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
exclude themselves and each other by disabling interrupts.
This reduces the overhead of calls to srcu_read_lock_fast_updown() and
srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
Neoverse V2. Although this is not excellent compared to about 2ns on x86,
it sure beats 100ns.
This command was used to measure the overhead:
tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: <bpf@vger.kernel.org>
---
include/linux/srcutree.h | 56 ++++++++++++++++++++++++++++++++++++----
1 file changed, 51 insertions(+), 5 deletions(-)
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index d6f978b50472..70560dc4636c 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -253,6 +253,34 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss
return &ssp->sda->srcu_ctrs[idx];
}
+/*
+ * Non-atomic manipulation of SRCU lock counters.
+ */
+static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_na(struct srcu_struct *ssp)
+{
+ atomic_long_t *scnp;
+ struct srcu_ctr __percpu *scp;
+
+ lockdep_assert_preemption_disabled();
+ scp = READ_ONCE(ssp->srcu_ctrp);
+ scnp = raw_cpu_ptr(&scp->srcu_locks);
+ atomic_long_set(scnp, atomic_long_read(scnp) + 1);
+ return scp;
+}
+
+/*
+ * Non-atomic manipulation of SRCU unlock counters.
+ */
+static inline void notrace
+__srcu_read_unlock_fast_na(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+{
+ atomic_long_t *scnp;
+
+ lockdep_assert_preemption_disabled();
+ scnp = raw_cpu_ptr(&scp->srcu_unlocks);
+ atomic_long_set(scnp, atomic_long_read(scnp) + 1);
+}
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct. Returns a pointer that must be passed to the matching
@@ -327,12 +355,23 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
static inline
struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
{
- struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr __percpu *scp;
- if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+ if (IS_ENABLED(CONFIG_ARM64) && IS_ENABLED(CONFIG_ARM64_USE_LSE_PERCPU_ATOMICS)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ scp = __srcu_read_lock_fast_na(ssp);
+ local_irq_restore(flags); /* Avoids leaking the critical section. */
+ return scp;
+ }
+
+ scp = READ_ONCE(ssp->srcu_ctrp);
+ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) {
this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader.
- else
+ } else {
atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); // Y, and implicit RCU reader.
+ }
barrier(); /* Avoid leaking the critical section. */
return scp;
}
@@ -350,10 +389,17 @@ static inline void notrace
__srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
{
barrier(); /* Avoid leaking the critical section. */
- if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+ if (IS_ENABLED(CONFIG_ARM64)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __srcu_read_unlock_fast_na(ssp, scp);
+ local_irq_restore(flags);
+ } else if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) {
this_cpu_inc(scp->srcu_unlocks.counter); // Z, and implicit RCU reader.
- else
+ } else {
atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader.
+ }
}
void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
--
2.40.1
On 2025-11-02 16:44, Paul E. McKenney wrote:
> Some arm64 platforms have slow per-CPU atomic operations, for example,
> the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
> atomic operations to interrupt-disabled non-read-modify-write-atomic
> atomic_read()/atomic_set() operations. This works because
> SRCU-fast-updown is not invoked from NMI handlers, which means that
> srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
> exclude themselves and each other by disabling interrupts.
>
> This reduces the overhead of calls to srcu_read_lock_fast_updown() and
> srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
> Neoverse V2. Although this is not excellent compared to about 2ns on x86,
> it sure beats 100ns.
>
> This command was used to measure the overhead:
>
> tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
>
Hi Paul,
At a high level, what are you trying to achieve with this ?
AFAIU, you are trying to remove the cost of atomics on per-cpu
data from srcu-fast read lock/unlock for frequent calls for
CONFIG_NEED_SRCU_NMI_SAFE=y, am I on the right track ?
[disclaimer: I've looked only briefly at your proposed patch.]
Then there are various other less specific approaches to consider
before introducing such architecture and use-case specific work-around.
One example is the libside (user level) rcu implementation which uses
two counters per cpu [1]. One counter is the rseq fast path, and the
second counter is for atomics (as fallback).
If the typical scenario we want to optimize for is thread context, we
can probably remove the atomic from the fast path with just preempt off
by partitioning the per-cpu counters further, one possibility being:
struct percpu_srcu_fast_pair {
unsigned long lock, unlock;
};
struct percpu_srcu_fast {
struct percpu_srcu_fast_pair thread;
struct percpu_srcu_fast_pair irq;
};
And the grace period sums both thread and irq counters.
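
For illustration, here is a rough sketch of such a grace-period check.
The per-CPU array and function names below are made up for this example
(they are not actual SRCU code), and memory ordering is elided:

static DEFINE_PER_CPU(struct percpu_srcu_fast, srcu_fast_ctrs[2]);

static bool srcu_fast_period_quiescent(int period)
{
	unsigned long sum = 0;
	int cpu;

	/* Sum the unlock counts first, then subtract the lock counts. */
	for_each_possible_cpu(cpu) {
		struct percpu_srcu_fast *p = per_cpu_ptr(&srcu_fast_ctrs[period], cpu);

		sum += READ_ONCE(p->thread.unlock) + READ_ONCE(p->irq.unlock);
	}
	for_each_possible_cpu(cpu) {
		struct percpu_srcu_fast *p = per_cpu_ptr(&srcu_fast_ctrs[period], cpu);

		sum -= READ_ONCE(p->thread.lock) + READ_ONCE(p->irq.lock);
	}

	/* Quiescent once the summed lock and unlock counts match. */
	return sum == 0;
}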
Thoughts ?
Thanks,
Mathieu
[1] https://github.com/compudj/libside/blob/master/src/rcu.h#L71
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Mon, Nov 03, 2025 at 08:34:10AM -0500, Mathieu Desnoyers wrote:
> On 2025-11-02 16:44, Paul E. McKenney wrote:
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
> > atomic operations to interrupt-disabled non-read-modify-write-atomic
> > atomic_read()/atomic_set() operations. This works because
> > SRCU-fast-updown is not invoked from NMI handlers, which means that
> > srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
> > exclude themselves and each other by disabling interrupts.
> >
> > This reduces the overhead of calls to srcu_read_lock_fast_updown() and
> > srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
> > Neoverse V2. Although this is not excellent compared to about 2ns on x86,
> > it sure beats 100ns.
> >
> > This command was used to measure the overhead:
> >
> > tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
> >
> Hi Paul,
>
> At a high level, what are you trying to achieve with this ?
I am working around the high single-CPU cost of arm64 LSE instructions,
as in about 50ns per operation compared to about 5ns per operation for
non-LSE. The 50ns rules
them out for uretprobes, for example.
But Catalin's later patch is in all ways better than mine, so I will be
keeping this one only until Catalin's hits mainline. Once that happens,
I will revert this one in the following merge window. (It might be a while
because of the testing required on a wide range of platforms.)
> AFAIU, you are trying to remove the cost of atomics on per-cpu
> data from srcu-fast read lock/unlock for frequent calls for
> CONFIG_NEED_SRCU_NMI_SAFE=y, am I on the right track ?
>
> [disclaimer: I've looked only briefly at your proposed patch.]
> Then there are various other less specific approaches to consider
> before introducing such architecture and use-case specific work-around.
>
> One example is the libside (user level) rcu implementation which uses
> two counters per cpu [1]. One counter is the rseq fast path, and the
> second counter is for atomics (as fallback).
>
> If the typical scenario we want to optimize for is thread context, we
> can probably remove the atomic from the fast path with just preempt off
> by partitioning the per-cpu counters further, one possibility being:
>
> struct percpu_srcu_fast_pair {
> unsigned long lock, unlock;
> };
>
> struct percpu_srcu_fast {
> struct percpu_srcu_fast_pair thread;
> struct percpu_srcu_fast_pair irq;
> };
>
> And the grace period sums both thread and irq counters.
>
> Thoughts ?
One complication here is that we need srcu_down_read() at task level
and the matching srcu_up_read() at softirq and/or hardirq level.
Or am I missing a trick in your proposed implementation?
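
For concreteness, that pattern looks roughly like this (the srcu_struct
and handoff structure below are made up for illustration):

static DEFINE_SRCU(my_srcu);

struct my_handoff {
	int srcu_idx;			/* Cookie from srcu_down_read(). */
};

static void start_op(struct my_handoff *h)
{
	/* Task context: enter the SRCU read-side critical section. */
	h->srcu_idx = srcu_down_read(&my_srcu);
	/* ...hand h off to work that completes in irq context... */
}

static void finish_op(struct my_handoff *h)
{
	/* Softirq or hardirq context: exit the section begun above. */
	srcu_up_read(&my_srcu, h->srcu_idx);
}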
Thanx, Paul
> Thanks,
>
> Mathieu
>
> [1] https://github.com/compudj/libside/blob/master/src/rcu.h#L71
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
On 2025-11-03 12:08, Paul E. McKenney wrote:
> On Mon, Nov 03, 2025 at 08:34:10AM -0500, Mathieu Desnoyers wrote:
[...]
>> One example is the libside (user level) rcu implementation which uses
>> two counters per cpu [1]. One counter is the rseq fast path, and the
>> second counter is for atomics (as fallback).
>>
>> If the typical scenario we want to optimize for is thread context, we
>> can probably remove the atomic from the fast path with just preempt off
>> by partitioning the per-cpu counters further, one possibility being:
>>
>> struct percpu_srcu_fast_pair {
>> unsigned long lock, unlock;
>> };
>>
>> struct percpu_srcu_fast {
>> struct percpu_srcu_fast_pair thread;
>> struct percpu_srcu_fast_pair irq;
>> };
>>
>> And the grace period sums both thread and irq counters.
>>
>> Thoughts ?
>
> One complication here is that we need srcu_down_read() at task level
> and the matching srcu_up_read() at softirq and/or hardirq level.
>
> Or am I missing a trick in your proposed implementation?
I think you are indeed missing the crux of the solution here.
Each of task level and soft/hard irq level increments will be
dispatched into different counters (thread vs irq). But the
grace period will sum, for each of the two periods one after the
next, the unlock counts and then the lock counts. It will consider
the period as quiescent if the delta between the two sums is zero,
e.g.
(count[period].irq.unlock + count[period].thread.unlock -
count[period].irq.lock - count[period].thread.lock) == 0
so the sum does not care how the counters were incremented
(it just does a load-relaxed), but each counter category
has its own way of dealing with concurrency (thread: percpu
ops, irq: atomics).
This is effectively a use of split-counters, but the split
is across concurrency handling mechanisms rather than across
CPUs.
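
As a concrete illustration of that dispatch, here is a rough reader-side
sketch (the counter names are made up for this example, and the irq-side
counter is declared atomic_long_t so that it can be updated atomically):

static DEFINE_PER_CPU(unsigned long, srcu_fast_thread_lock[2]);
static DEFINE_PER_CPU(atomic_long_t, srcu_fast_irq_lock[2]);

static void srcu_fast_lock_sketch(int period)
{
	if (in_task()) {
		/* Thread context: non-atomic percpu op under preempt off. */
		preempt_disable();
		__this_cpu_inc(srcu_fast_thread_lock[period]);
		preempt_enable();
	} else {
		/* Softirq/irq/NMI context: fall back to an atomic RMW. */
		atomic_long_inc(raw_cpu_ptr(&srcu_fast_irq_lock[period]));
	}
}

The unlock side would do the same dispatch on the matching unlock
counters.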
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Mon, Nov 03, 2025 at 01:16:23PM -0500, Mathieu Desnoyers wrote:
> On 2025-11-03 12:08, Paul E. McKenney wrote:
> > On Mon, Nov 03, 2025 at 08:34:10AM -0500, Mathieu Desnoyers wrote:
> [...]
>
> > > One example is the libside (user level) rcu implementation which uses
> > > two counters per cpu [1]. One counter is the rseq fast path, and the
> > > second counter is for atomics (as fallback).
> > >
> > > If the typical scenario we want to optimize for is thread context, we
> > > can probably remove the atomic from the fast path with just preempt off
> > > by partitioning the per-cpu counters further, one possibility being:
> > >
> > > struct percpu_srcu_fast_pair {
> > > unsigned long lock, unlock;
> > > };
> > >
> > > struct percpu_srcu_fast {
> > > struct percpu_srcu_fast_pair thread;
> > > struct percpu_srcu_fast_pair irq;
> > > };
> > >
> > > And the grace period sums both thread and irq counters.
> > >
> > > Thoughts ?
> >
> > One complication here is that we need srcu_down_read() at task level
> > and the matching srcu_up_read() at softirq and/or hardirq level.
> >
> > Or am I missing a trick in your proposed implementation?
>
> I think you are indeed missing the crux of the solution here.
>
> Each of task level and soft/hard irq level increments will be
> dispatched into different counters (thread vs irq). But the
> grace period will sum, for each of the two periods one after the
> next, the unlock counts and then the lock counts. It will consider
> the period as quiescent if the delta between the two sums is zero,
> e.g.
>
> (count[period].irq.unlock + count[period].thread.unlock -
> count[period].irq.lock - count[period].thread.lock) == 0
>
> so the sum does not care how the counters were incremented
> (it just does a load-relaxed), but each counter category
> has its own way of dealing with concurrency (thread: percpu
> ops, irq: atomics).
>
> This is effectively a use of split-counters, but the split
> is across concurrency handling mechanisms rather than across
> CPUs.
Ah, got it, thank you! But we would need an additional softirq counter,
correct?
I will keep this in my back pocket in case Catalin's and Yicong's prefetch
trick turns out to be problematic, and again, thank you!
Thanx, Paul
On 2025-11-03 14:17, Paul E. McKenney wrote:
[...]
>> This is effectively a use of split-counters, but the split
>> is across concurrency handling mechanisms rather than across
>> CPUs.
>
> Ah, got it, thank you! But we would need an additional softirq counter,
> correct?

Fundamentally it depends on how you want to split frequent vs
infrequent accesses.

If the fast-paths you care about are all in thread context, then
you only need to split between percpu ops vs atomic counters.
The per-thread accesses would use percpu ops, and all the rest use
atomics. The "all the rest" can cover everything else including
softirqs, irq, and nmis.

> I will keep this in my back pocket in case Catalin's and Yicong's prefetch
> trick turns out to be problematic, and again, thank you!

You're welcome! :)

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Hi Paul,
On Sun, Nov 02, 2025 at 01:44:34PM -0800, Paul E. McKenney wrote:
> Some arm64 platforms have slow per-CPU atomic operations, for example,
> the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
> atomic operations to interrupt-disabled non-read-modify-write-atomic
> atomic_read()/atomic_set() operations. This works because
> SRCU-fast-updown is not invoked from NMI handlers, which means that
> srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
> exclude themselves and each other by disabling interrupts.
>
> This reduces the overhead of calls to srcu_read_lock_fast_updown() and
> srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
> Neoverse V2. Although this is not excellent compared to about 2ns on x86,
> it sure beats 100ns.
>
> This command was used to measure the overhead:
>
> tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: <linux-arm-kernel@lists.infradead.org>
> Cc: <bpf@vger.kernel.org>
> ---
> include/linux/srcutree.h | 56 ++++++++++++++++++++++++++++++++++++----
> 1 file changed, 51 insertions(+), 5 deletions(-)
[...]
> @@ -327,12 +355,23 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
> static inline
> struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
> {
> - struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
> + struct srcu_ctr __percpu *scp;
>
> - if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
> + if (IS_ENABLED(CONFIG_ARM64) && IS_ENABLED(CONFIG_ARM64_USE_LSE_PERCPU_ATOMICS)) {
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + scp = __srcu_read_lock_fast_na(ssp);
> + local_irq_restore(flags); /* Avoids leaking the critical section. */
> + return scp;
> + }
Do we still need to pursue this after Catalin's prefetch suggestion for the
per-cpu atomics?
https://lore.kernel.org/r/aQU7l-qMKJTx4znJ@arm.com
Although disabling/enabling interrupts on your system seems to be
significantly faster than an atomic instruction, I'm worried that it's
all very SoC-specific and on a mobile part (especially with pseudo-NMI),
the relative costs could easily be the other way around.
Will
On Mon, Nov 03, 2025 at 12:51:48PM +0000, Will Deacon wrote:
> Hi Paul,
>
> On Sun, Nov 02, 2025 at 01:44:34PM -0800, Paul E. McKenney wrote:
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
> > atomic operations to interrupt-disabled non-read-modify-write-atomic
> > atomic_read()/atomic_set() operations. This works because
> > SRCU-fast-updown is not invoked from NMI handlers, which means that
> > srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
> > exclude themselves and each other by disabling interrupts.
> >
> > This reduces the overhead of calls to srcu_read_lock_fast_updown() and
> > srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
> > Neoverse V2. Although this is not excellent compared to about 2ns on x86,
> > it sure beats 100ns.
> >
> > This command was used to measure the overhead:
> >
> > tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
> >
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > Cc: <linux-arm-kernel@lists.infradead.org>
> > Cc: <bpf@vger.kernel.org>
> > ---
> > include/linux/srcutree.h | 56 ++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 51 insertions(+), 5 deletions(-)
>
> [...]
>
> > @@ -327,12 +355,23 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
> > static inline
> > struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
> > {
> > - struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
> > + struct srcu_ctr __percpu *scp;
> >
> > - if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
> > + if (IS_ENABLED(CONFIG_ARM64) && IS_ENABLED(CONFIG_ARM64_USE_LSE_PERCPU_ATOMICS)) {
> > + unsigned long flags;
> > +
> > + local_irq_save(flags);
> > + scp = __srcu_read_lock_fast_na(ssp);
> > + local_irq_restore(flags); /* Avoids leaking the critical section. */
> > + return scp;
> > + }
>
> Do we still need to pursue this after Catalin's prefetch suggestion for the
> per-cpu atomics?
>
> https://lore.kernel.org/r/aQU7l-qMKJTx4znJ@arm.com
>
> Although disabling/enabling interrupts on your system seems to be
> significantly faster than an atomic instruction, I'm worried that it's
> all very SoC-specific and on a mobile part (especially with pseudo-NMI),
> the relative costs could easily be the other way around.
In my testing Catalin's patch wins by at least 10% on microbenchmarks.
So I am holding this one in my back pocket just in case, but yes, you
should ignore it, hopefully forever. ;-)
Thanx, Paul
On Mon, Nov 03, 2025 at 12:51:48PM +0000, Will Deacon wrote:
> On Sun, Nov 02, 2025 at 01:44:34PM -0800, Paul E. McKenney wrote:
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore moves SRCU-fast from per-CPU
> > atomic operations to interrupt-disabled non-read-modify-write-atomic
> > atomic_read()/atomic_set() operations. This works because
> > SRCU-fast-updown is not invoked from NMI handlers, which means that
> > srcu_read_lock_fast_updown() and srcu_read_unlock_fast_updown() can
> > exclude themselves and each other by disabling interrupts.
> >
> > This reduces the overhead of calls to srcu_read_lock_fast_updown() and
> > srcu_read_unlock_fast_updown() from about 100ns to about 12ns on an ARM
> > Neoverse V2. Although this is not excellent compared to about 2ns on x86,
> > it sure beats 100ns.
> >
> > This command was used to measure the overhead:
> >
> > tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --configs NOPREEMPT --kconfig "CONFIG_NR_CPUS=64 CONFIG_TASKS_TRACE_RCU=y" --bootargs "refscale.loops=100000 refscale.guest_os_delay=5 refscale.nreaders=64 refscale.holdoff=30 torture.disable_onoff_at_boot refscale.scale_type=srcu-fast-updown refscale.verbose_batched=8 torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=8 refscale.nruns=100" --trust-make
> >
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > Cc: <linux-arm-kernel@lists.infradead.org>
> > Cc: <bpf@vger.kernel.org>
> > ---
> > include/linux/srcutree.h | 56 ++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 51 insertions(+), 5 deletions(-)
>
> [...]
>
> > @@ -327,12 +355,23 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
> > static inline
> > struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp)
> > {
> > - struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
> > + struct srcu_ctr __percpu *scp;
> >
> > - if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
> > + if (IS_ENABLED(CONFIG_ARM64) && IS_ENABLED(CONFIG_ARM64_USE_LSE_PERCPU_ATOMICS)) {
> > + unsigned long flags;
> > +
> > + local_irq_save(flags);
> > + scp = __srcu_read_lock_fast_na(ssp);
> > + local_irq_restore(flags); /* Avoids leaking the critical section. */
> > + return scp;
> > + }
>
> Do we still need to pursue this after Catalin's prefetch suggestion for the
> per-cpu atomics?
>
> https://lore.kernel.org/r/aQU7l-qMKJTx4znJ@arm.com
>
> Although disabling/enabling interrupts on your system seems to be
> significantly faster than an atomic instruction, I'm worried that it's
> all very SoC-specific and on a mobile part (especially with pseudo-NMI),
> the relative costs could easily be the other way around.
My preference would be to go for the percpu atomic prefetch but we'd
need to do a bit of benchmarking to see we don't break other platforms
(unlikely though).
--
Catalin