Switch out the conditional load interfaces used by rqspinlock
to smp_cond_load_acquire_timeout().

This interface takes the timeout check as an explicit argument and does
any necessary amortization itself, so use check_timeout() directly as
the time-check expression.

Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
kernel/bpf/rqspinlock.c | 23 +++++++----------------
1 file changed, 7 insertions(+), 16 deletions(-)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..4d2c12d131ae 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -241,20 +241,14 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
}
/*
- * Do not amortize with spins when res_smp_cond_load_acquire is defined,
- * as the macro does internal amortization for us.
+ * Amortize timeout check for busy-wait loops.
*/
-#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask) \
({ \
if (!(ts).spin++) \
(ret) = check_timeout((lock), (mask), &(ts)); \
(ret); \
})
-#else
-#define RES_CHECK_TIMEOUT(ts, ret, mask) \
- ({ (ret) = check_timeout((lock), (mask), &(ts)); })
-#endif
/*
* Initialize the 'spin' member.
@@ -313,11 +307,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
*/
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
-#ifndef res_smp_cond_load_acquire
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
-#endif
-
-#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+#define res_atomic_cond_read_acquire_timeout(v, c, t) \
+ smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
/**
* resilient_queued_spin_lock_slowpath - acquire the queued spinlock
@@ -418,7 +409,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
*/
if (val & _Q_LOCKED_MASK) {
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
- res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+ smp_cond_load_acquire_timeout(&lock->locked, !VAL,
+ (ret = check_timeout(lock, _Q_LOCKED_MASK, &ts)));
}
if (ret) {
@@ -572,9 +564,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* us.
*/
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
- val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
- RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
-
+ val = res_atomic_cond_read_acquire_timeout(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
+ (ret = check_timeout(lock, _Q_LOCKED_PENDING_MASK, &ts)));
waitq_timeout:
if (ret) {
/*
--
2.43.5
On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
> Switch out the conditional load inerfaces used by rqspinlock
> to smp_cond_read_acquire_timeout().
> This interface handles the timeout check explicitly and does any
> necessary amortization, so use check_timeout() directly.

It's worth mentioning that the default smp_cond_load_acquire_timeout()
implementation (without hardware support) only spins 200 times instead
of 16K times in the rqspinlock code. That's probably fine but it would
be good to have confirmation from Kumar or Alexei.

> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> index 5ab354d55d82..4d2c12d131ae 100644
> --- a/kernel/bpf/rqspinlock.c
> +++ b/kernel/bpf/rqspinlock.c
[...]
> @@ -313,11 +307,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
>  */
>  static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
>
> -#ifndef res_smp_cond_load_acquire
> -#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
> -#endif
> -
> -#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
> +#define res_atomic_cond_read_acquire_timeout(v, c, t) \
> +	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))

BTW, we have atomic_cond_read_acquire() which accesses the 'counter' of
an atomic_t. You might as well add an atomic_cond_read_acquire_timeout()
in atomic.h than open-code the atomic_t internals here.

Otherwise the patch looks fine to me, much simpler than the previous
attempt.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
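For context, the generic fallback being discussed (the one used when there is
no hardware wait instruction) can be pictured roughly as below. This is an
illustrative sketch rather than the series' actual include/asm-generic/barrier.h
code, and the poll-count macro name is an assumption. The key point is that
time_check_expr is only evaluated once every poll-count iterations of the
busy-wait loop, which is where the 200 vs 16K spin difference comes from.

/*
 * Rough sketch of a generic smp_cond_load_relaxed_timeout()-style
 * fallback. Illustrative only: __SMP_TIMEOUT_POLL_COUNT is an assumed
 * name, not the series' actual identifier.
 */
#define __SMP_TIMEOUT_POLL_COUNT	200

#define __cond_load_relaxed_timeout(ptr, cond_expr, time_check_expr)	\
({									\
	typeof(ptr) __PTR = (ptr);					\
	__unqual_scalar_typeof(*ptr) VAL;				\
	u32 __poll = 0;							\
									\
	for (;;) {							\
		VAL = READ_ONCE(*__PTR);				\
		if (cond_expr)						\
			break;						\
		cpu_relax();						\
		/* Amortize: evaluate time_check_expr only once per	\
		 * __SMP_TIMEOUT_POLL_COUNT loop iterations.		\
		 */							\
		if (++__poll < __SMP_TIMEOUT_POLL_COUNT)		\
			continue;					\
		if (time_check_expr)					\
			break;						\
		__poll = 0;						\
	}								\
	(typeof(*ptr))VAL;						\
})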
Catalin Marinas <catalin.marinas@arm.com> writes:

> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
>> Switch out the conditional load inerfaces used by rqspinlock
>> to smp_cond_read_acquire_timeout().
>> This interface handles the timeout check explicitly and does any
>> necessary amortization, so use check_timeout() directly.
>
> It's worth mentioning that the default smp_cond_load_acquire_timeout()
> implementation (without hardware support) only spins 200 times instead
> of 16K times in the rqspinlock code. That's probably fine but it would
> be good to have confirmation from Kumar or Alexei.

As Kumar mentions, I'll redefine the count locally in rqspinlock.c to
16k.

>> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
>> index 5ab354d55d82..4d2c12d131ae 100644
>> --- a/kernel/bpf/rqspinlock.c
>> +++ b/kernel/bpf/rqspinlock.c
> [...]
>> @@ -313,11 +307,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
>>  */
>>  static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
>>
>> -#ifndef res_smp_cond_load_acquire
>> -#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
>> -#endif
>> -
>> -#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
>> +#define res_atomic_cond_read_acquire_timeout(v, c, t) \
>> +	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
>
> BTW, we have atomic_cond_read_acquire() which accesses the 'counter' of
> an atomic_t. You might as well add an atomic_cond_read_acquire_timeout()
> in atomic.h than open-code the atomic_t internals here.

Good point. That also keeps it close to the locking/qspinlock.c use of
atomic_cond_read_acquire().

Will add atomic_cond_read_acquire_timeout() (and other variants that we
define in include/linux/atomic.h).

> Otherwise the patch looks fine to me, much simpler than the previous
> attempt.
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!

--
ankur
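The wrapper mentioned above could look like the sketch below, mirroring how the
existing atomic_cond_read_acquire()/atomic_cond_read_relaxed() wrappers are
defined today. This is a hypothetical illustration, not the eventual patch; the
relaxed variant is included only by analogy.

/* Hypothetical additions to include/linux/atomic.h (illustrative only). */
#define atomic_cond_read_acquire_timeout(v, c, t) \
	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))

#define atomic_cond_read_relaxed_timeout(v, c, t) \
	smp_cond_load_relaxed_timeout(&(v)->counter, (c), (t))

With something like that in atomic.h, the open-coded
res_atomic_cond_read_acquire_timeout() macro in rqspinlock.c can be dropped and
the slowpath can call atomic_cond_read_acquire_timeout(&lock->val, ...)
directly, as the follow-up below also suggests.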
On Thu, 11 Sept 2025 at 16:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
> > Switch out the conditional load inerfaces used by rqspinlock
> > to smp_cond_read_acquire_timeout().
> > This interface handles the timeout check explicitly and does any
> > necessary amortization, so use check_timeout() directly.
>
> It's worth mentioning that the default smp_cond_load_acquire_timeout()
> implementation (without hardware support) only spins 200 times instead
> of 16K times in the rqspinlock code. That's probably fine but it would
> be good to have confirmation from Kumar or Alexei.
>
> > diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> > index 5ab354d55d82..4d2c12d131ae 100644
> > --- a/kernel/bpf/rqspinlock.c
> > +++ b/kernel/bpf/rqspinlock.c
> [...]
> > @@ -313,11 +307,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
> >  */
> >  static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
> >
> > -#ifndef res_smp_cond_load_acquire
> > -#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
> > -#endif
> > -
> > -#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
> > +#define res_atomic_cond_read_acquire_timeout(v, c, t) \
> > +	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
>
> BTW, we have atomic_cond_read_acquire() which accesses the 'counter' of
> an atomic_t. You might as well add an atomic_cond_read_acquire_timeout()
> in atomic.h than open-code the atomic_t internals here.
>

+1, and then drop res_atomic_cond_read_acquire_timeout from this file.

> Otherwise the patch looks fine to me, much simpler than the previous
> attempt.
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
On Thu, 11 Sept 2025 at 16:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
> > Switch out the conditional load inerfaces used by rqspinlock
> > to smp_cond_read_acquire_timeout().
> > This interface handles the timeout check explicitly and does any
> > necessary amortization, so use check_timeout() directly.
>
> It's worth mentioning that the default smp_cond_load_acquire_timeout()
> implementation (without hardware support) only spins 200 times instead
> of 16K times in the rqspinlock code. That's probably fine but it would
> be good to have confirmation from Kumar or Alexei.
>

This looks good, but I would still redefine the spin count from 200 to
16k for rqspinlock.c, especially because we need to keep
RES_CHECK_TIMEOUT around which still uses 16k spins to amortize
check_timeout.

>
> [...]
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Thu, 11 Sept 2025 at 16:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>
>> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
>> > Switch out the conditional load inerfaces used by rqspinlock
>> > to smp_cond_read_acquire_timeout().
>> > This interface handles the timeout check explicitly and does any
>> > necessary amortization, so use check_timeout() directly.
>>
>> It's worth mentioning that the default smp_cond_load_acquire_timeout()
>> implementation (without hardware support) only spins 200 times instead
>> of 16K times in the rqspinlock code. That's probably fine but it would
>> be good to have confirmation from Kumar or Alexei.
>>
>
> This looks good, but I would still redefine the spin count from 200 to
> 16k for rqspinlock.c, especially because we need to keep
> RES_CHECK_TIMEOUT around which still uses 16k spins to amortize
> check_timeout.

By my count that amounts to ~100us per check_timeout() on x86
systems I've tested with cpu_relax(). Which seems quite reasonable.

16k also seems safer on CPUs where cpu_relax() is basically a NOP.

--
ankur
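Assuming the generic fallback exposes its poll count as an overridable macro
(the name below is assumed, not taken from the series), the local redefinition
being discussed might look like the sketch below in kernel/bpf/rqspinlock.c.
As a rough check on the ~100us figure: 16K iterations at a handful of
nanoseconds per cpu_relax() loop is indeed on the order of 100us.

/*
 * Hypothetical override local to kernel/bpf/rqspinlock.c: restore the
 * 16K-spin amortization that RES_CHECK_TIMEOUT has historically used,
 * instead of the generic default of 200. The macro name is assumed,
 * and the actual series may wire this up differently.
 */
#undef  SMP_TIMEOUT_POLL_COUNT
#define SMP_TIMEOUT_POLL_COUNT	(16 * 1024)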
On Thu, Sep 11, 2025 at 02:58:22PM -0700, Ankur Arora wrote:
>
> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>
> > On Thu, 11 Sept 2025 at 16:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>
> >> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
> >> > Switch out the conditional load inerfaces used by rqspinlock
> >> > to smp_cond_read_acquire_timeout().
> >> > This interface handles the timeout check explicitly and does any
> >> > necessary amortization, so use check_timeout() directly.
> >>
> >> It's worth mentioning that the default smp_cond_load_acquire_timeout()
> >> implementation (without hardware support) only spins 200 times instead
> >> of 16K times in the rqspinlock code. That's probably fine but it would
> >> be good to have confirmation from Kumar or Alexei.
> >>
> >
> > This looks good, but I would still redefine the spin count from 200 to
> > 16k for rqspinlock.c, especially because we need to keep
> > RES_CHECK_TIMEOUT around which still uses 16k spins to amortize
> > check_timeout.
>
> By my count that amounts to ~100us per check_timeout() on x86
> systems I've tested with cpu_relax(). Which seems quite reasonable.
>
> 16k also seems safer on CPUs where cpu_relax() is basically a NOP.

Does this spin count work for poll_idle()? I don't remember where the
200 value came from.

--
Catalin
Catalin Marinas <catalin.marinas@arm.com> writes:

> On Thu, Sep 11, 2025 at 02:58:22PM -0700, Ankur Arora wrote:
>>
>> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>>
>> > On Thu, 11 Sept 2025 at 16:32, Catalin Marinas <catalin.marinas@arm.com> wrote:
>> >>
>> >> On Wed, Sep 10, 2025 at 08:46:55PM -0700, Ankur Arora wrote:
>> >> > Switch out the conditional load inerfaces used by rqspinlock
>> >> > to smp_cond_read_acquire_timeout().
>> >> > This interface handles the timeout check explicitly and does any
>> >> > necessary amortization, so use check_timeout() directly.
>> >>
>> >> It's worth mentioning that the default smp_cond_load_acquire_timeout()
>> >> implementation (without hardware support) only spins 200 times instead
>> >> of 16K times in the rqspinlock code. That's probably fine but it would
>> >> be good to have confirmation from Kumar or Alexei.
>> >>
>> >
>> > This looks good, but I would still redefine the spin count from 200 to
>> > 16k for rqspinlock.c, especially because we need to keep
>> > RES_CHECK_TIMEOUT around which still uses 16k spins to amortize
>> > check_timeout.
>>
>> By my count that amounts to ~100us per check_timeout() on x86
>> systems I've tested with cpu_relax(). Which seems quite reasonable.
>>
>> 16k also seems safer on CPUs where cpu_relax() is basically a NOP.
>
> Does this spin count work for poll_idle()? I don't remember where the
> 200 value came from.

Just reusing the value of POLL_IDLE_RELAX_COUNT, which is defined as 200.

For the poll_idle() case I don't think the value of 200 makes sense for
all architectures, so they'll need to redefine it (before defining
ARCH_HAS_OPTIMIZED_POLL, which gates poll_idle()).

--
ankur