[v4] barrier: Add smp_cond_load_*_timewait()

[PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()

Posted by Ankur Arora 5 months, 2 weeks ago

Use smp_cond_load_acquire_timewait() to define
res_atomic_cond_read_acquire() and res_smp_cond_load_acquire_timewait().

The timeout check for both is done via RES_CHECK_TIMEOUT(). Define
res_smp_cond_load_acquire_waiting() so that RES_CHECK_TIMEOUT() amortizes
the check only for spin-wait implementations.

Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/arm64/include/asm/rqspinlock.h |  3 +++
 include/asm-generic/rqspinlock.h    |  4 ++++
 kernel/bpf/rqspinlock.c             | 25 +++++++++----------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
index a385603436e9..ce8feadeb9a9 100644
--- a/arch/arm64/include/asm/rqspinlock.h
+++ b/arch/arm64/include/asm/rqspinlock.h
@@ -3,6 +3,9 @@
 #define _ASM_RQSPINLOCK_H
 
 #include <asm/barrier.h>
+
+#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
+
 #include <asm-generic/rqspinlock.h>
 
 #endif /* _ASM_RQSPINLOCK_H */
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..4b49c0ddf89a 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -247,4 +247,8 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 
 #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
 
+#ifndef res_smp_cond_load_acquire_waiting
+#define res_smp_cond_load_acquire_waiting()	0
+#endif
+
 #endif /* __ASM_GENERIC_RQSPINLOCK_H */
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..8de1395422e8 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -82,6 +82,7 @@ struct rqspinlock_timeout {
 	u64 duration;
 	u64 cur;
 	u16 spin;
+	u8  wait;
 };
 
 #define RES_TIMEOUT_VAL	2
@@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 }
 
 /*
- * Do not amortize with spins when res_smp_cond_load_acquire is defined,
- * as the macro does internal amortization for us.
+ * Only amortize with spins when we don't have a waiting implementation.
  */
-#ifndef res_smp_cond_load_acquire
 #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
 	({                                                            \
-		if (!(ts).spin++)                                     \
+		if ((ts).wait || !(ts).spin++)		      \
 			(ret) = check_timeout((lock), (mask), &(ts)); \
 		(ret);                                                \
 	})
-#else
-#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
-	({ (ret) = check_timeout((lock), (mask), &(ts)); })
-#endif
 
 /*
  * Initialize the 'spin' member.
  * Set spin member to 0 to trigger AA/ABBA checks immediately.
  */
-#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
 
 /*
  * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
@@ -313,11 +308,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
 
-#ifndef res_smp_cond_load_acquire
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
-#endif
-
-#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+#define res_atomic_cond_read_acquire(v, c, t)		smp_cond_load_acquire_timewait(&(v)->counter, (c), (t))
+#define res_smp_cond_load_acquire_timewait(v, c, t)	smp_cond_load_acquire_timewait(v, (c), (t))
 
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
@@ -418,7 +410,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 */
 	if (val & _Q_LOCKED_MASK) {
 		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		res_smp_cond_load_acquire_timewait(&lock->locked, !VAL,
+						   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
 	}
 
 	if (ret) {
@@ -572,7 +565,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 * us.
 	 */
 	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
-	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
 					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
 
 waitq_timeout:
-- 
2.31.1

Re: [PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()

Posted by Catalin Marinas 5 months, 1 week ago

On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
> diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
> index a385603436e9..ce8feadeb9a9 100644
> --- a/arch/arm64/include/asm/rqspinlock.h
> +++ b/arch/arm64/include/asm/rqspinlock.h
> @@ -3,6 +3,9 @@
>  #define _ASM_RQSPINLOCK_H
>  
>  #include <asm/barrier.h>
> +
> +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()

More on this below, I don't think we should define it.

> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> index 5ab354d55d82..8de1395422e8 100644
> --- a/kernel/bpf/rqspinlock.c
> +++ b/kernel/bpf/rqspinlock.c
> @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>  	u64 duration;
>  	u64 cur;
>  	u16 spin;
> +	u8  wait;
>  };
>  
>  #define RES_TIMEOUT_VAL	2
> @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>  }
>  
>  /*
> - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
> - * as the macro does internal amortization for us.
> + * Only amortize with spins when we don't have a waiting implementation.
>   */
> -#ifndef res_smp_cond_load_acquire
>  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>  	({                                                            \
> -		if (!(ts).spin++)                                     \
> +		if ((ts).wait || !(ts).spin++)		      \
>  			(ret) = check_timeout((lock), (mask), &(ts)); \
>  		(ret);                                                \
>  	})
> -#else
> -#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
> -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
> -#endif

IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
doesn't amortise the spins, as the comment suggests, but rather the
calls to check_timeout(). This is fine, it matches the behaviour of
smp_cond_load_relaxed_timewait() you introduced in the first patch. The
only difference is the number of spins - 200 (matching poll_idle) vs 64K
above. Does 200 work for the above?

>  /*
>   * Initialize the 'spin' member.
>   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>   */
> -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
> +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })

First of all, I don't really like the smp_cond_load_acquire_waiting(),
that's an implementation detail of smp_cond_load_*_timewait() that
shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
also used outside the smp_cond_load_acquire_timewait() condition. The
(ts).wait check only makes sense when used together with the WFE
waiting.

I would leave RES_CHECK_TIMEOUT() as is for the stand-alone cases and
just use check_timeout() in the smp_cond_load_acquire_timewait()
scenarios. I would also drop the res_smp_cond_load_acquire() macro since
you now defined smp_cond_load_acquire_timewait() generically and can be
used directly.

-- 
Catalin

Re: [PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()

Posted by Ankur Arora 5 months, 1 week ago

Catalin Marinas <catalin.marinas@arm.com> writes:

> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
>> diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
>> index a385603436e9..ce8feadeb9a9 100644
>> --- a/arch/arm64/include/asm/rqspinlock.h
>> +++ b/arch/arm64/include/asm/rqspinlock.h
>> @@ -3,6 +3,9 @@
>>  #define _ASM_RQSPINLOCK_H
>>
>>  #include <asm/barrier.h>
>> +
>> +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>
> More on this below, I don't think we should define it.
>
>> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
>> index 5ab354d55d82..8de1395422e8 100644
>> --- a/kernel/bpf/rqspinlock.c
>> +++ b/kernel/bpf/rqspinlock.c
>> @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>>  	u64 duration;
>>  	u64 cur;
>>  	u16 spin;
>> +	u8  wait;
>>  };
>>
>>  #define RES_TIMEOUT_VAL	2
>> @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>>  }
>>
>>  /*
>> - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
>> - * as the macro does internal amortization for us.
>> + * Only amortize with spins when we don't have a waiting implementation.
>>   */
>> -#ifndef res_smp_cond_load_acquire
>>  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>>  	({                                                            \
>> -		if (!(ts).spin++)                                     \
>> +		if ((ts).wait || !(ts).spin++)		      \
>>  			(ret) = check_timeout((lock), (mask), &(ts)); \
>>  		(ret);                                                \
>>  	})
>> -#else
>> -#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
>> -	({ (ret) = check_timeout((lock), (mask), &(ts)); })
>> -#endif
>
> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
> doesn't amortise the spins, as the comment suggests, but rather the
> calls to check_timeout(). This is fine, it matches the behaviour of
> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
> only difference is the number of spins - 200 (matching poll_idle) vs 64K
> above. Does 200 work for the above?

Works for me. I had added this because there seemed to be a vast gulf
between 64K and 200. Happy to drop this.

>>  /*
>>   * Initialize the 'spin' member.
>>   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>>   */
>> -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
>> +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>
> First of all, I don't really like the smp_cond_load_acquire_waiting(),
> that's an implementation detail of smp_cond_load_*_timewait() that
> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
> also used outside the smp_cond_load_acquire_timewait() condition. The
> (ts).wait check only makes sense when used together with the WFE
> waiting.
>
> I would leave RES_CHECK_TIMEOUT() as is for the stand-alone cases and
> just use check_timeout() in the smp_cond_load_acquire_timewait()
> scenarios. I would also drop the res_smp_cond_load_acquire() macro since
> you now defined smp_cond_load_acquire_timewait() generically and can be
> used directly.

Sounds good.

--
ankur

Re: [PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()

Posted by Alexei Starovoitov 5 months, 1 week ago

On Mon, Sep 1, 2025 at 4:28 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
> > diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
> > index a385603436e9..ce8feadeb9a9 100644
> > --- a/arch/arm64/include/asm/rqspinlock.h
> > +++ b/arch/arm64/include/asm/rqspinlock.h
> > @@ -3,6 +3,9 @@
> >  #define _ASM_RQSPINLOCK_H
> >
> >  #include <asm/barrier.h>
> > +
> > +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>
> More on this below, I don't think we should define it.
>
> > diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> > index 5ab354d55d82..8de1395422e8 100644
> > --- a/kernel/bpf/rqspinlock.c
> > +++ b/kernel/bpf/rqspinlock.c
> > @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
> >       u64 duration;
> >       u64 cur;
> >       u16 spin;
> > +     u8  wait;
> >  };
> >
> >  #define RES_TIMEOUT_VAL      2
> > @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
> >  }
> >
> >  /*
> > - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
> > - * as the macro does internal amortization for us.
> > + * Only amortize with spins when we don't have a waiting implementation.
> >   */
> > -#ifndef res_smp_cond_load_acquire
> >  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
> >       ({                                                            \
> > -             if (!(ts).spin++)                                     \
> > +             if ((ts).wait || !(ts).spin++)                \
> >                       (ret) = check_timeout((lock), (mask), &(ts)); \
> >               (ret);                                                \
> >       })
> > -#else
> > -#define RES_CHECK_TIMEOUT(ts, ret, mask)                           \
> > -     ({ (ret) = check_timeout((lock), (mask), &(ts)); })
> > -#endif
>
> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
> doesn't amortise the spins, as the comment suggests, but rather the
> calls to check_timeout(). This is fine, it matches the behaviour of
> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
> only difference is the number of spins - 200 (matching poll_idle) vs 64K
> above. Does 200 work for the above?
>
> >  /*
> >   * Initialize the 'spin' member.
> >   * Set spin member to 0 to trigger AA/ABBA checks immediately.
> >   */
> > -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
> > +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>
> First of all, I don't really like the smp_cond_load_acquire_waiting(),
> that's an implementation detail of smp_cond_load_*_timewait() that
> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
> also used outside the smp_cond_load_acquire_timewait() condition. The
> (ts).wait check only makes sense when used together with the WFE
> waiting.

+1 to the above.

Penalizing all other architectures with pointless runtime check:

> -             if (!(ts).spin++)                                     \
> +             if ((ts).wait || !(ts).spin++)                \

is not acceptable.

Re: [PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()

Posted by Ankur Arora 5 months, 1 week ago

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Sep 1, 2025 at 4:28 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>>
>> On Fri, Aug 29, 2025 at 01:07:35AM -0700, Ankur Arora wrote:
>> > diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
>> > index a385603436e9..ce8feadeb9a9 100644
>> > --- a/arch/arm64/include/asm/rqspinlock.h
>> > +++ b/arch/arm64/include/asm/rqspinlock.h
>> > @@ -3,6 +3,9 @@
>> >  #define _ASM_RQSPINLOCK_H
>> >
>> >  #include <asm/barrier.h>
>> > +
>> > +#define res_smp_cond_load_acquire_waiting() arch_timer_evtstrm_available()
>>
>> More on this below, I don't think we should define it.
>>
>> > diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
>> > index 5ab354d55d82..8de1395422e8 100644
>> > --- a/kernel/bpf/rqspinlock.c
>> > +++ b/kernel/bpf/rqspinlock.c
>> > @@ -82,6 +82,7 @@ struct rqspinlock_timeout {
>> >       u64 duration;
>> >       u64 cur;
>> >       u16 spin;
>> > +     u8  wait;
>> >  };
>> >
>> >  #define RES_TIMEOUT_VAL      2
>> > @@ -241,26 +242,20 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
>> >  }
>> >
>> >  /*
>> > - * Do not amortize with spins when res_smp_cond_load_acquire is defined,
>> > - * as the macro does internal amortization for us.
>> > + * Only amortize with spins when we don't have a waiting implementation.
>> >   */
>> > -#ifndef res_smp_cond_load_acquire
>> >  #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
>> >       ({                                                            \
>> > -             if (!(ts).spin++)                                     \
>> > +             if ((ts).wait || !(ts).spin++)                \
>> >                       (ret) = check_timeout((lock), (mask), &(ts)); \
>> >               (ret);                                                \
>> >       })
>> > -#else
>> > -#define RES_CHECK_TIMEOUT(ts, ret, mask)                           \
>> > -     ({ (ret) = check_timeout((lock), (mask), &(ts)); })
>> > -#endif
>>
>> IIUC, RES_CHECK_TIMEOUT in the current res_smp_cond_load_acquire() usage
>> doesn't amortise the spins, as the comment suggests, but rather the
>> calls to check_timeout(). This is fine, it matches the behaviour of
>> smp_cond_load_relaxed_timewait() you introduced in the first patch. The
>> only difference is the number of spins - 200 (matching poll_idle) vs 64K
>> above. Does 200 work for the above?
>>
>> >  /*
>> >   * Initialize the 'spin' member.
>> >   * Set spin member to 0 to trigger AA/ABBA checks immediately.
>> >   */
>> > -#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
>> > +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; (ts).wait = res_smp_cond_load_acquire_waiting(); })
>>
>> First of all, I don't really like the smp_cond_load_acquire_waiting(),
>> that's an implementation detail of smp_cond_load_*_timewait() that
>> shouldn't leak outside. But more importantly, RES_CHECK_TIMEOUT() is
>> also used outside the smp_cond_load_acquire_timewait() condition. The
>> (ts).wait check only makes sense when used together with the WFE
>> waiting.
>
> +1 to the above.

Ack.

> Penalizing all other architectures with pointless runtime check:
>
>> -             if (!(ts).spin++)                                     \
>> +             if ((ts).wait || !(ts).spin++)                \
>
> is not acceptable.

Is it still a penalty if the context is a busy-wait loop?

Oddly enough the compiler does not eliminate this check on x86 (given
that it is statically defined to be 0 and ts does not escape the
function.)

--
ankur

[PATCH v4 1/5] asm-generic: barrier: Add smp_cond_load_relaxed_timewait()
[PATCH v4 2/5] arm64: barrier: Add smp_cond_load_relaxed_timewait()
[PATCH v4 3/5] arm64: rqspinlock: Remove private copy of smp_cond_load_acquire_timewait
[PATCH v4 4/5] asm-generic: barrier: Add smp_cond_load_acquire_timewait()
[PATCH v4 5/5] rqspinlock: use smp_cond_load_acquire_timewait()