Use smp_cond_load_acquire_timewait() to define
res_atomic_cond_read_acquire() and res_smp_cond_load_acquire_timewait().
Both do their timeout check via RES_CHECK_TIMEOUT().

Also define res_smp_cond_load_acquire_waiting(), which RES_CHECK_TIMEOUT()
uses to restrict the spin-based amortization of the timeout check to
spin-wait implementations.

Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/arm64/include/asm/rqspinlock.h |  3 +++
 include/asm-generic/rqspinlock.h    |  4 ++++
 kernel/bpf/rqspinlock.c             | 25 +++++++++----------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
index a385603436e9..ce8feadeb9a9 100644
--- a/arch/arm64/include/asm/rqspinlock.h
+++ b/arch/arm64/include/asm/rqspinlock.h
@@ -3,6 +3,9 @@
 #define _ASM_RQSPINLOCK_H
 
 #include <asm/barrier.h>
+
+#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
+
 #include <asm-generic/rqspinlock.h>
 
 #endif /* _ASM_RQSPINLOCK_H */
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..4b49c0ddf89a 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -247,4 +247,8 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 
 #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
 
+#ifndef res_smp_cond_load_acquire_waiting
+#define res_smp_cond_load_acquire_waiting() 0
+#endif
+
 #endif /* __ASM_GENERIC_RQSPINLOCK_H */
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..8de1395422e8 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -82,6 +82,7 @@ struct rqspinlock_timeout {
 	u64 duration;
 	u64 cur;
 	u16 spin;
+	u8 wait;
 };
 
 #define RES_TIMEOUT_VAL 2
@@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 }
 
 /*
- * Do not amortize with spins when res_smp_cond_load_acquire is defined,
- * as the macro does internal amortization for us.
+ * Only amortize with spins when we don't have a waiting implementation.
  */
-#ifndef res_smp_cond_load_acquire
 #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
 	({                                                            \
-		if (!(ts).spin++)                                     \
+		if ((ts).wait || !(ts).spin++)                        \
 			(ret) = check_timeout((lock), (mask), &(ts)); \
 		(ret);                                                \
 	})
-#else
-#define RES_CHECK_TIMEOUT(ts, ret, mask) \
-	({ (ret) = check_timeout((lock), (mask), &(ts)); })
-#endif
 
 /*
  * Initialize the 'spin' member.
  * Set spin member to 0 to trigger AA/ABBA checks immediately.
  */
-#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
 
 /*
  * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
@@ -313,11 +308,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
 
-#ifndef res_smp_cond_load_acquire
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
-#endif
-
-#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+#define res_atomic_cond_read_acquire(v, c, t) smp_cond_load_acquire_timewait(&(v)->counter, (c), (t))
+#define res_smp_cond_load_acquire_timewait(v, c, t) smp_cond_load_acquire_timewait(v, (c), (t))
 
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
@@ -418,7 +410,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 */
 	if (val & _Q_LOCKED_MASK) {
 		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		res_smp_cond_load_acquire_timewait(&lock->locked, !VAL,
+						   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
 	}
 
 	if (ret) {
@@ -572,7 +565,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 * us.
 	 */
 	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
-	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
 					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
 
 waitq_timeout:
-- 
2.31.1
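
For readers following the thread outside the kernel tree, the sketch below
models the amortization the patch implements. Everything in it (struct
timeout_state, now_ns(), wait_for()) is an illustrative stand-in rather than
the kernel code, and it only models the deadline part of check_timeout(),
not the AA/ABBA deadlock detection:

/* Stand-alone sketch of the RES_CHECK_TIMEOUT() amortization; userspace,
 * illustrative names only. Build with: cc -O2 -o timewait timewait.c */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct timeout_state {
	uint64_t timeout_end;	/* absolute deadline, in ns */
	uint16_t spin;		/* wraps at 64K, amortizes clock reads */
	uint8_t  wait;		/* non-zero if an event-driven wait bounds the loop */
};

static uint64_t now_ns(void)
{
	struct timespec tv;

	clock_gettime(CLOCK_MONOTONIC, &tv);
	return (uint64_t)tv.tv_sec * 1000000000ull + tv.tv_nsec;
}

static bool timed_out(struct timeout_state *ts)
{
	return now_ns() >= ts->timeout_end;
}

/* Spin until *val becomes non-zero or the deadline passes. */
static bool wait_for(volatile int *val, uint64_t duration_ns, bool waiting)
{
	struct timeout_state ts = {
		.timeout_end = now_ns() + duration_ns,
		.spin = 0,	/* 0 => a check fires on the very first pass */
		.wait = waiting,
	};

	while (!*val) {
		/*
		 * Check the deadline on every pass when an event-driven wait
		 * already limits how often we get here; otherwise only when
		 * the 16-bit counter wraps back to zero.
		 */
		if ((ts.wait || !ts.spin++) && timed_out(&ts))
			return false;	/* timed out */
	}
	return true;
}

int main(void)
{
	volatile int flag = 1;	/* already set: wait_for() returns at once */

	return wait_for(&flag, 1000000, false) ? 0 : 1;
}

The wait flag is the new piece: with an event-driven wait (arm64 WFE woken
by the event stream) the loop body runs rarely, so checking the deadline on
every pass is cheap, while in a pure spin loop the u16 counter keeps the
clock read off the hot path.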
On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
> diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
> index a385603436e9..ce8feadeb9a9 100644
> --- a/arch/arm64/include/asm/rqspinlock.h
> +++ b/arch/arm64/include/asm/rqspinlock.h
> @@ -3,6 +3,9 @@
>  #define _ASM_RQSPINLOCK_H
> 
>  #include <asm/barrier.h>
> +
> +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()

More on this below, I don't think we should define it.

> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> index 5ab354d55d82..8de1395422e8 100644
> --- a/kernel/bpf/rqspinlock.c
> +++ b/kernel/bpf/rqspinlock.c
> @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>  	u64 duration;
>  	u64 cur;
>  	u16 spin;
> +	u8 wait;
>  };
> 
>  #define RES_TIMEOUT_VAL 2
> @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>  }
> 
>  /*
> - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
> - * as the macro does internal amortization for us.
> + * Only amortize with spins when we don't have a waiting implementation.
>   */
> -#ifndef res_smp_cond_load_acquire
>  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>  	({                                                            \
> -		if (!(ts).spin++)                                     \
> +		if ((ts).wait || !(ts).spin++)                        \
>  			(ret) = check_timeout((lock), (mask), &(ts)); \
>  		(ret);                                                \
>  	})
> -#else
> -#define RES_CHECK_TIMEOUT(ts, ret, mask) \
> -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
> -#endif

IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
doesn't amortise the spins, as the comment suggests, but rather the
calls to check_timeout(). This is fine, it matches the behaviour of
smp_cond_load_relaxed_timewait() you introduced in the first patch. The
only difference is the number of spins - 200 (matching poll_idle) vs 64K
above. Does 200 work for the above?

>  /*
>   * Initialize the 'spin' member.
>   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>   */
> -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
> +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })

First of all, I don't really like the smp_cond_load_acquire_waiting(),
that's an implementation detail of smp_cond_load_*_timewait() that
shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
also used outside the smp_cond_load_acquire_timewait() condition. The
(ts).wait check only makes sense when used together with the WFE
waiting.

I would leave RES_CHECK_TIMEOUT() as is for the stand-alone cases and
just use check_timeout() in the smp_cond_load_acquire_timewait()
scenarios. I would also drop the res_smp_cond_load_acquire() macro since
you now defined smp_cond_load_acquire_timewait() generically and can be
used directly.

-- 
Catalin
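
To make the amortization point concrete, here is a rough userspace model
(GNU C; cond_load_relaxed_timewait(), SPIN_COUNT and VAL below are made-up
stand-ins, not the kernel API) of a smp_cond_load_relaxed_timewait()-style
helper: the caller's time-check expression is evaluated only once every
SPIN_COUNT iterations, so the caller needs no second layer of amortization
on top of it:

/* Userspace model of a smp_cond_load_relaxed_timewait()-style helper
 * (GNU C statement expressions); all names here are illustrative only. */
#include <stdint.h>
#include <time.h>

#define SPIN_COUNT 200	/* poll_idle-style granularity mentioned above */
#define VAL _val	/* names the loaded value inside cond_expr, kernel-style */

/*
 * Loop until cond_expr (which may use VAL) becomes true or until
 * time_check_expr, evaluated once every SPIN_COUNT spins, is non-zero.
 * Evaluates to the last value loaded from *ptr.
 */
#define cond_load_relaxed_timewait(ptr, cond_expr, time_check_expr)	\
({									\
	__typeof__(*(ptr)) _val;					\
	unsigned int _spin = 0;						\
									\
	for (;;) {							\
		_val = *(ptr);						\
		if (cond_expr)						\
			break;						\
		if (++_spin == SPIN_COUNT) {				\
			_spin = 0;					\
			if (time_check_expr)				\
				break;					\
		}							\
	}								\
	_val;								\
})

static uint64_t now_ns(void)
{
	struct timespec tv;

	clock_gettime(CLOCK_MONOTONIC, &tv);
	return (uint64_t)tv.tv_sec * 1000000000ull + tv.tv_nsec;
}

/* Wait for *word to drop to zero; returns -1 if timeout_ns elapses first. */
static int wait_until_clear(volatile unsigned int *word, uint64_t timeout_ns)
{
	uint64_t deadline = now_ns() + timeout_ns;

	return cond_load_relaxed_timewait(word, !VAL, now_ns() >= deadline) ? -1 : 0;
}

int main(void)
{
	volatile unsigned int word = 0;	/* condition already true */

	return wait_until_clear(&word, 1000000);
}

Under a model like this the time check is already rate limited inside the
primitive, which is why layering RES_CHECK_TIMEOUT()'s own spin counter on
top of it buys nothing; the open question above is only whether roughly 200
spins is an acceptable granularity where 64K was used before.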
Catalin Marinas <catalin.marinas@arm.com> writes:

> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
>> diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
>> index a385603436e9..ce8feadeb9a9 100644
>> --- a/arch/arm64/include/asm/rqspinlock.h
>> +++ b/arch/arm64/include/asm/rqspinlock.h
>> @@ -3,6 +3,9 @@
>>  #define _ASM_RQSPINLOCK_H
>>
>>  #include <asm/barrier.h>
>> +
>> +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>
> More on this below, I don't think we should define it.
>
>> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
>> index 5ab354d55d82..8de1395422e8 100644
>> --- a/kernel/bpf/rqspinlock.c
>> +++ b/kernel/bpf/rqspinlock.c
>> @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>>  	u64 duration;
>>  	u64 cur;
>>  	u16 spin;
>> +	u8 wait;
>>  };
>>
>>  #define RES_TIMEOUT_VAL 2
>> @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>>  }
>>
>>  /*
>> - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
>> - * as the macro does internal amortization for us.
>> + * Only amortize with spins when we don't have a waiting implementation.
>>   */
>> -#ifndef res_smp_cond_load_acquire
>>  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>>  	({                                                            \
>> -		if (!(ts).spin++)                                     \
>> +		if ((ts).wait || !(ts).spin++)                        \
>>  			(ret) = check_timeout((lock), (mask), &(ts)); \
>>  		(ret);                                                \
>>  	})
>> -#else
>> -#define RES_CHECK_TIMEOUT(ts, ret, mask) \
>> -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
>> -#endif
>
> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
> doesn't amortise the spins, as the comment suggests, but rather the
> calls to check_timeout(). This is fine, it matches the behaviour of
> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
> only difference is the number of spins - 200 (matching poll_idle) vs 64K
> above. Does 200 work for the above?

Works for me. I had added this because there seemed to be vast gulf
between 64K and 200. Happy to drop this.

>>  /*
>>   * Initialize the 'spin' member.
>>   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>>   */
>> -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
>> +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>
> First of all, I don't really like the smp_cond_load_acquire_waiting(),
> that's an implementation detail of smp_cond_load_*_timewait() that
> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
> also used outside the smp_cond_load_acquire_timewait() condition. The
> (ts).wait check only makes sense when used together with the WFE
> waiting.
>
> I would leave RES_CHECK_TIMEOUT() as is for the stand-alone cases and
> just use check_timeout() in the smp_cond_load_acquire_timewait()
> scenarios. I would also drop the res_smp_cond_load_acquire() macro since
> you now defined smp_cond_load_acquire_timewait() generically and can be
> used directly.

Sounds good.

-- 
ankur
On Mon, Sep 1, 2025 at 4:28 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
> > diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
> > index a385603436e9..ce8feadeb9a9 100644
> > --- a/arch/arm64/include/asm/rqspinlock.h
> > +++ b/arch/arm64/include/asm/rqspinlock.h
> > @@ -3,6 +3,9 @@
> >  #define _ASM_RQSPINLOCK_H
> >
> >  #include <asm/barrier.h>
> > +
> > +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>
> More on this below, I don't think we should define it.
>
> > diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> > index 5ab354d55d82..8de1395422e8 100644
> > --- a/kernel/bpf/rqspinlock.c
> > +++ b/kernel/bpf/rqspinlock.c
> > @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
> >  	u64 duration;
> >  	u64 cur;
> >  	u16 spin;
> > +	u8 wait;
> >  };
> >
> >  #define RES_TIMEOUT_VAL 2
> > @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
> >  }
> >
> >  /*
> > - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
> > - * as the macro does internal amortization for us.
> > + * Only amortize with spins when we don't have a waiting implementation.
> >   */
> > -#ifndef res_smp_cond_load_acquire
> >  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
> >  	({                                                            \
> > -		if (!(ts).spin++)                                     \
> > +		if ((ts).wait || !(ts).spin++)                        \
> >  			(ret) = check_timeout((lock), (mask), &(ts)); \
> >  		(ret);                                                \
> >  	})
> > -#else
> > -#define RES_CHECK_TIMEOUT(ts, ret, mask) \
> > -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
> > -#endif
>
> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
> doesn't amortise the spins, as the comment suggests, but rather the
> calls to check_timeout(). This is fine, it matches the behaviour of
> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
> only difference is the number of spins - 200 (matching poll_idle) vs 64K
> above. Does 200 work for the above?
>
> >  /*
> >   * Initialize the 'spin' member.
> >   * Set spin member to 0 to trigger AA/ABBA checks immediately.
> >   */
> > -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
> > +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>
> First of all, I don't really like the smp_cond_load_acquire_waiting(),
> that's an implementation detail of smp_cond_load_*_timewait() that
> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
> also used outside the smp_cond_load_acquire_timewait() condition. The
> (ts).wait check only makes sense when used together with the WFE
> waiting.

+1 to the above.

Penalizing all other architectures with pointless runtime check:

> -		if (!(ts).spin++)                                     \
> +		if ((ts).wait || !(ts).spin++)                        \

is not acceptable.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Sep 1, 2025 at 4:28 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>>
>> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
>> > diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
>> > index a385603436e9..ce8feadeb9a9 100644
>> > --- a/arch/arm64/include/asm/rqspinlock.h
>> > +++ b/arch/arm64/include/asm/rqspinlock.h
>> > @@ -3,6 +3,9 @@
>> >  #define _ASM_RQSPINLOCK_H
>> >
>> >  #include <asm/barrier.h>
>> > +
>> > +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>>
>> More on this below, I don't think we should define it.
>>
>> > diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
>> > index 5ab354d55d82..8de1395422e8 100644
>> > --- a/kernel/bpf/rqspinlock.c
>> > +++ b/kernel/bpf/rqspinlock.c
>> > @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>> >  	u64 duration;
>> >  	u64 cur;
>> >  	u16 spin;
>> > +	u8 wait;
>> >  };
>> >
>> >  #define RES_TIMEOUT_VAL 2
>> > @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>> >  }
>> >
>> >  /*
>> > - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
>> > - * as the macro does internal amortization for us.
>> > + * Only amortize with spins when we don't have a waiting implementation.
>> >   */
>> > -#ifndef res_smp_cond_load_acquire
>> >  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>> >  	({                                                            \
>> > -		if (!(ts).spin++)                                     \
>> > +		if ((ts).wait || !(ts).spin++)                        \
>> >  			(ret) = check_timeout((lock), (mask), &(ts)); \
>> >  		(ret);                                                \
>> >  	})
>> > -#else
>> > -#define RES_CHECK_TIMEOUT(ts, ret, mask) \
>> > -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
>> > -#endif
>>
>> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
>> doesn't amortise the spins, as the comment suggests, but rather the
>> calls to check_timeout(). This is fine, it matches the behaviour of
>> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
>> only difference is the number of spins - 200 (matching poll_idle) vs 64K
>> above. Does 200 work for the above?
>>
>> >  /*
>> >   * Initialize the 'spin' member.
>> >   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>> >   */
>> > -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
>> > +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>>
>> First of all, I don't really like the smp_cond_load_acquire_waiting(),
>> that's an implementation detail of smp_cond_load_*_timewait() that
>> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
>> also used outside the smp_cond_load_acquire_timewait() condition. The
>> (ts).wait check only makes sense when used together with the WFE
>> waiting.
>
> +1 to the above.

Ack.

> Penalizing all other architectures with pointless runtime check:
>
>> -		if (!(ts).spin++)                                     \
>> +		if ((ts).wait || !(ts).spin++)                        \
>
> is not acceptable.

Is it still a penalty if the context is a busy wait loop.

Oddly enough the compiler does not eliminate this check on x86 (given
that it is statically defined to be 0 and ts does not escape the
function.)

-- 
ankur
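
As a side note on that last observation, the question reduces to whether the
compiler propagates the constant initializer through the struct member. A
minimal userspace reduction (illustrative only; res_waiting(), struct
ts_state and spin_until_set() are not the kernel names) looks like this:

#include <stdint.h>

/* Generic fallback, as in the asm-generic header: a compile-time 0. */
#define res_waiting() 0

struct ts_state {
	uint16_t spin;
	uint8_t  wait;
};

/* Returns how many timeout checks the busy-wait loop performed. */
int spin_until_set(volatile int *val)
{
	struct ts_state ts = { .spin = 0, .wait = res_waiting() };
	int checks = 0;

	while (!*val) {
		/* With .wait known to be 0, this could fold to !ts.spin++. */
		if (ts.wait || !ts.spin++)
			checks++;	/* stands in for check_timeout() */
	}
	return checks;
}

int main(void)
{
	volatile int flag = 1;	/* already set: the loop exits immediately */

	return spin_until_set(&flag);
}

Compiling this at -O2 and reading the generated assembly is the quickest way
to see whether a given compiler drops the ts.wait test once the macro
expands to a constant 0.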