Extend __cmpwait_relaxed() to __cmpwait_relaxed_timeout(), which takes
an additional timeout value in ns.

Lacking WFET, or with a zero or negative timeout value, we fall back
to WFE.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
arch/arm64/include/asm/barrier.h | 8 ++--
arch/arm64/include/asm/cmpxchg.h | 72 ++++++++++++++++++++++----------
2 files changed, 55 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 6190e178db51..fbd71cd4ef4e 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -224,8 +224,8 @@ do { \
extern bool arch_timer_evtstrm_available(void);
/*
- * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
- * for the ptr value to change.
+ * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
+ * __cmpwait_relaxed_timeout() for the ptr value to change.
*
* Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
* to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
@@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
#define SMP_TIMEOUT_POLL_COUNT 1
#define cpu_poll_relax(ptr, val, timeout_ns) do { \
- if (arch_timer_evtstrm_available()) \
+ if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) \
+ __cmpwait_relaxed_timeout(ptr, val, timeout_ns); \
+ else if (arch_timer_evtstrm_available()) \
__cmpwait_relaxed(ptr, val); \
else \
cpu_relax(); \
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index d7a540736741..acd01a203b62 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -12,6 +12,7 @@
#include <asm/barrier.h>
#include <asm/lse.h>
+#include <asm/delay-const.h>
/*
* We need separate acquire parameters for ll/sc and lse, since the full
@@ -208,22 +209,41 @@ __CMPXCHG_GEN(_mb)
__cmpxchg128((ptr), (o), (n)); \
})
-#define __CMPWAIT_CASE(w, sfx, sz) \
-static inline void __cmpwait_case_##sz(volatile void *ptr, \
- unsigned long val) \
-{ \
- unsigned long tmp; \
- \
- asm volatile( \
- " sevl\n" \
- " wfe\n" \
- " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
- " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
- " cbnz %" #w "[tmp], 1f\n" \
- " wfe\n" \
- "1:" \
- : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
- : [val] "r" (val)); \
+/* Re-declared here to avoid include dependency. */
+extern u64 (*arch_timer_read_counter)(void);
+
+#define __CMPWAIT_CASE(w, sfx, sz) \
+static inline void __cmpwait_case_##sz(volatile void *ptr, \
+ unsigned long val, \
+ s64 timeout_ns) \
+{ \
+ unsigned long tmp; \
+ \
+ if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) { \
+ asm volatile( \
+ " sevl\n" \
+ " wfe\n" \
+ " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
+ " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
+ " cbnz %" #w "[tmp], 1f\n" \
+ " wfe\n" \
+ "1:" \
+ : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
+ : [val] "r" (val)); \
+ } else { \
+ u64 ecycles = arch_timer_read_counter() + \
+ NSECS_TO_CYCLES(timeout_ns); \
+ asm volatile( \
+ " sevl\n" \
+ " wfe\n" \
+ " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
+ " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
+ " cbnz %" #w "[tmp], 2f\n" \
+ " msr s0_3_c1_c0_0, %[ecycles]\n" \
+ "2:" \
+ : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
+ : [val] "r" (val), [ecycles] "r" (ecycles)); \
+ } \
}
__CMPWAIT_CASE(w, b, 8);
@@ -236,17 +256,22 @@ __CMPWAIT_CASE( , , 64);
#define __CMPWAIT_GEN(sfx) \
static __always_inline void __cmpwait##sfx(volatile void *ptr, \
unsigned long val, \
+ s64 timeout_ns, \
int size) \
{ \
switch (size) { \
case 1: \
- return __cmpwait_case##sfx##_8(ptr, (u8)val); \
+ return __cmpwait_case##sfx##_8(ptr, (u8)val, \
+ timeout_ns); \
case 2: \
- return __cmpwait_case##sfx##_16(ptr, (u16)val); \
+ return __cmpwait_case##sfx##_16(ptr, (u16)val, \
+ timeout_ns); \
case 4: \
- return __cmpwait_case##sfx##_32(ptr, val); \
+ return __cmpwait_case##sfx##_32(ptr, val, \
+ timeout_ns); \
case 8: \
- return __cmpwait_case##sfx##_64(ptr, val); \
+ return __cmpwait_case##sfx##_64(ptr, val, \
+ timeout_ns); \
default: \
BUILD_BUG(); \
} \
@@ -258,7 +283,10 @@ __CMPWAIT_GEN()
#undef __CMPWAIT_GEN
-#define __cmpwait_relaxed(ptr, val) \
- __cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
+#define __cmpwait_relaxed_timeout(ptr, val, timeout_ns) \
+ __cmpwait((ptr), (unsigned long)(val), timeout_ns, sizeof(*(ptr)))
+
+#define __cmpwait_relaxed(ptr, val) \
+ __cmpwait_relaxed_timeout(ptr, val, 0)
#endif /* __ASM_CMPXCHG_H */
--
2.31.1
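
For readability, this is roughly what the 32-bit case of the new
__CMPWAIT_CASE() expands to once the stringification is resolved -- a sketch
of the generated helper, not literal preprocessor output. It assumes
NSECS_TO_CYCLES() and arch_timer_read_counter() from elsewhere in the series;
the msr s0_3_c1_c0_0 write is the WFET encoding.

/* Sketch: rough expansion of __CMPWAIT_CASE(w,  , 32) from the hunk above. */
static inline void __cmpwait_case_32(volatile void *ptr, unsigned long val,
				     s64 timeout_ns)
{
	unsigned long tmp;

	if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) {
		/* No WFET (or no usable timeout): original WFE-based wait. */
		asm volatile(
		"	sevl\n"
		"	wfe\n"
		"	ldxr	%w[tmp], %[v]\n"
		"	eor	%w[tmp], %w[tmp], %w[val]\n"
		"	cbnz	%w[tmp], 1f\n"
		"	wfe\n"
		"1:"
		: [tmp] "=&r" (tmp), [v] "+Q" (*(u32 *)ptr)
		: [val] "r" (val));
	} else {
		/* WFET: wait for an event or until the deadline, in counter cycles. */
		u64 ecycles = arch_timer_read_counter() +
			      NSECS_TO_CYCLES(timeout_ns);

		asm volatile(
		"	sevl\n"
		"	wfe\n"
		"	ldxr	%w[tmp], %[v]\n"
		"	eor	%w[tmp], %w[tmp], %w[val]\n"
		"	cbnz	%w[tmp], 2f\n"
		"	msr	s0_3_c1_c0_0, %[ecycles]\n"	/* wfet %[ecycles] */
		"2:"
		: [tmp] "=&r" (tmp), [v] "+Q" (*(u32 *)ptr)
		: [val] "r" (val), [ecycles] "r" (ecycles));
	}
}
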
On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
> Extend __cmpwait_relaxed() to __cmpwait_relaxed_timeout() which takes
> an additional timeout value in ns.
>
> Lacking WFET, or with zero or negative value of timeout we fallback
> to WFE.
>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
> ---
> arch/arm64/include/asm/barrier.h | 8 ++--
> arch/arm64/include/asm/cmpxchg.h | 72 ++++++++++++++++++++++----------
> 2 files changed, 55 insertions(+), 25 deletions(-)
Sorry, just spotted something else on this...
> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
> index 6190e178db51..fbd71cd4ef4e 100644
> --- a/arch/arm64/include/asm/barrier.h
> +++ b/arch/arm64/include/asm/barrier.h
> @@ -224,8 +224,8 @@ do { \
> extern bool arch_timer_evtstrm_available(void);
>
> /*
> - * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
> - * for the ptr value to change.
> + * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
> + * __cmpwait_relaxed_timeout() for the ptr value to change.
> *
> * Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
> * to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
> @@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
> #define SMP_TIMEOUT_POLL_COUNT 1
>
> #define cpu_poll_relax(ptr, val, timeout_ns) do { \
> - if (arch_timer_evtstrm_available()) \
> + if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) \
> + __cmpwait_relaxed_timeout(ptr, val, timeout_ns); \
> + else if (arch_timer_evtstrm_available()) \
> __cmpwait_relaxed(ptr, val); \
Don't you want to make sure that we have the event stream available for
__cmpwait_relaxed_timeout() too? Otherwise, a large timeout is going to
cause problems.
Will
Will Deacon <will@kernel.org> writes:
> On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
>> Extend __cmpwait_relaxed() to __cmpwait_relaxed_timeout() which takes
>> an additional timeout value in ns.
>>
>> Lacking WFET, or with zero or negative value of timeout we fallback
>> to WFE.
>>
>> Cc: Arnd Bergmann <arnd@arndb.de>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
>> ---
>> arch/arm64/include/asm/barrier.h | 8 ++--
>> arch/arm64/include/asm/cmpxchg.h | 72 ++++++++++++++++++++++----------
>> 2 files changed, 55 insertions(+), 25 deletions(-)
>
> Sorry, just spotted something else on this...
>
>> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
>> index 6190e178db51..fbd71cd4ef4e 100644
>> --- a/arch/arm64/include/asm/barrier.h
>> +++ b/arch/arm64/include/asm/barrier.h
>> @@ -224,8 +224,8 @@ do { \
>> extern bool arch_timer_evtstrm_available(void);
>>
>> /*
>> - * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
>> - * for the ptr value to change.
>> + * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
>> + * __cmpwait_relaxed_timeout() for the ptr value to change.
>> *
>> * Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
>> * to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
>> @@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
>> #define SMP_TIMEOUT_POLL_COUNT 1
>>
>> #define cpu_poll_relax(ptr, val, timeout_ns) do { \
>> - if (arch_timer_evtstrm_available()) \
>> + if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) \
>> + __cmpwait_relaxed_timeout(ptr, val, timeout_ns); \
>> + else if (arch_timer_evtstrm_available()) \
>> __cmpwait_relaxed(ptr, val); \
>
> Don't you want to make sure that we have the event stream available for
> __cmpwait_relaxed_timeout() too? Otherwise, a large timeout is going to
> cause problems.
Would that help though? If called from smp_cond_load_relaxed_timeout()
then we would wake up and just call __cmpwait_relaxed_timeout() again.
Or did you mean other future users of __cmpwait_relaxed_timeout()?
Also, I remember the event stream coming up earlier; there the intent
was to disable it if WFET was available: https://lore.kernel.org/lkml/aJ3d2uoKtDop_gQO@arm.com/.
--
ankur
On Fri, Jan 09, 2026 at 11:05:06AM -0800, Ankur Arora wrote:
>
> Will Deacon <will@kernel.org> writes:
>
> > On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
> >> Extend __cmpwait_relaxed() to __cmpwait_relaxed_timeout() which takes
> >> an additional timeout value in ns.
> >>
> >> Lacking WFET, or with zero or negative value of timeout we fallback
> >> to WFE.
> >>
> >> Cc: Arnd Bergmann <arnd@arndb.de>
> >> Cc: Catalin Marinas <catalin.marinas@arm.com>
> >> Cc: Will Deacon <will@kernel.org>
> >> Cc: linux-arm-kernel@lists.infradead.org
> >> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
> >> ---
> >> arch/arm64/include/asm/barrier.h | 8 ++--
> >> arch/arm64/include/asm/cmpxchg.h | 72 ++++++++++++++++++++++----------
> >> 2 files changed, 55 insertions(+), 25 deletions(-)
> >
> > Sorry, just spotted something else on this...
> >
> >> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
> >> index 6190e178db51..fbd71cd4ef4e 100644
> >> --- a/arch/arm64/include/asm/barrier.h
> >> +++ b/arch/arm64/include/asm/barrier.h
> >> @@ -224,8 +224,8 @@ do { \
> >> extern bool arch_timer_evtstrm_available(void);
> >>
> >> /*
> >> - * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
> >> - * for the ptr value to change.
> >> + * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
> >> + * __cmpwait_relaxed_timeout() for the ptr value to change.
> >> *
> >> * Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
> >> * to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
> >> @@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
> >> #define SMP_TIMEOUT_POLL_COUNT 1
> >>
> >> #define cpu_poll_relax(ptr, val, timeout_ns) do { \
> >> - if (arch_timer_evtstrm_available()) \
> >> + if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) \
> >> + __cmpwait_relaxed_timeout(ptr, val, timeout_ns); \
> >> + else if (arch_timer_evtstrm_available()) \
> >> __cmpwait_relaxed(ptr, val); \
> >
> > Don't you want to make sure that we have the event stream available for
> > __cmpwait_relaxed_timeout() too? Otherwise, a large timeout is going to
> > cause problems.
>
> Would that help though? If called from smp_cond_load_relaxed_timeout()
> then we would wake up and just call __cmpwait_relaxed_timeout() again.
Fair enough, I can see that. Is it worth capping the maximum timeout
like we do for udelay()?
Will
Will Deacon <will@kernel.org> writes:
> On Fri, Jan 09, 2026 at 11:05:06AM -0800, Ankur Arora wrote:
>>
>> Will Deacon <will@kernel.org> writes:
>>
>> > On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
>> >> Extend __cmpwait_relaxed() to __cmpwait_relaxed_timeout() which takes
>> >> an additional timeout value in ns.
>> >>
>> >> Lacking WFET, or with zero or negative value of timeout we fallback
>> >> to WFE.
>> >>
>> >> Cc: Arnd Bergmann <arnd@arndb.de>
>> >> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> >> Cc: Will Deacon <will@kernel.org>
>> >> Cc: linux-arm-kernel@lists.infradead.org
>> >> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
>> >> ---
>> >> arch/arm64/include/asm/barrier.h | 8 ++--
>> >> arch/arm64/include/asm/cmpxchg.h | 72 ++++++++++++++++++++++----------
>> >> 2 files changed, 55 insertions(+), 25 deletions(-)
>> >
>> > Sorry, just spotted something else on this...
>> >
>> >> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
>> >> index 6190e178db51..fbd71cd4ef4e 100644
>> >> --- a/arch/arm64/include/asm/barrier.h
>> >> +++ b/arch/arm64/include/asm/barrier.h
>> >> @@ -224,8 +224,8 @@ do { \
>> >> extern bool arch_timer_evtstrm_available(void);
>> >>
>> >> /*
>> >> - * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
>> >> - * for the ptr value to change.
>> >> + * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
>> >> + * __cmpwait_relaxed_timeout() for the ptr value to change.
>> >> *
>> >> * Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
>> >> * to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
>> >> @@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
>> >> #define SMP_TIMEOUT_POLL_COUNT 1
>> >>
>> >> #define cpu_poll_relax(ptr, val, timeout_ns) do { \
>> >> - if (arch_timer_evtstrm_available()) \
>> >> + if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) \
>> >> + __cmpwait_relaxed_timeout(ptr, val, timeout_ns); \
>> >> + else if (arch_timer_evtstrm_available()) \
>> >> __cmpwait_relaxed(ptr, val); \
>> >
>> > Don't you want to make sure that we have the event stream available for
>> > __cmpwait_relaxed_timeout() too? Otherwise, a large timeout is going to
>> > cause problems.
>>
>> Would that help though? If called from smp_cond_load_relaxed_timeout()
>> then we would wake up and just call __cmpwait_relaxed_timeout() again.
>
> Fair enough, I can see that. Is it worth capping the maximum timeout
> like we do for udelay()?
The DELAY_CONST_MAX thing?

So, I'm not sure whether your concern is about the overall timeout or the
timeout per WFET iteration.

For the overall limit, at least rqspinlock has a pretty large timeout
value (NSEC_PER_SEC/4).

However, it might be a good idea to attach a DELAY_CONST_MAX-like limit
when using this interface -- for architectures that do not have an
optimized way of polling (i.e. do not define ARCH_HAS_CPU_RELAX).

(Currently only x86 defines ARCH_HAS_CPU_RELAX, but I have a series meant
to go after this one that renames it to ARCH_HAS_OPTIMIZED_POLL and
selects it for x86 and arm64 [1].)

But that still might mean that we could have fairly long WFET iterations.
Do you foresee a problem with that?
[1] https://lore.kernel.org/lkml/20250218213337.377987-1-ankur.a.arora@oracle.com/
Thanks
--
ankur
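
If a cap were wanted, one way would be to apply it where cpu_poll_relax()
already picks the WFET path. The sketch below is hypothetical:
CPU_POLL_NS_MAX is an invented name and the 1 ms value is purely
illustrative, not anything from the posted series.

/*
 * Hypothetical per-iteration cap on the WFET sleep; name and value are
 * illustrative only. Assumes linux/minmax.h and linux/time64.h are visible.
 */
#define CPU_POLL_NS_MAX		NSEC_PER_MSEC

#define cpu_poll_relax(ptr, val, timeout_ns) do {			\
	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT))		\
		__cmpwait_relaxed_timeout(ptr, val,			\
			min_t(s64, (timeout_ns), CPU_POLL_NS_MAX));	\
	else if (arch_timer_evtstrm_available())			\
		__cmpwait_relaxed(ptr, val);				\
	else								\
		cpu_relax();						\
} while (0)

Since smp_cond_load_relaxed_timeout() re-checks its deadline on every
iteration, a capped WFET would just turn one long sleep into several
shorter ones rather than changing the overall timeout behaviour.
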
On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
> +#define __CMPWAIT_CASE(w, sfx, sz) \
> +static inline void __cmpwait_case_##sz(volatile void *ptr, \
> + unsigned long val, \
> + s64 timeout_ns) \
> +{ \
> + unsigned long tmp; \
> + \
> + if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) { \
> + asm volatile( \
> + " sevl\n" \
> + " wfe\n" \
> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
> + " cbnz %" #w "[tmp], 1f\n" \
> + " wfe\n" \
> + "1:" \
> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
> + : [val] "r" (val)); \
> + } else { \
> + u64 ecycles = arch_timer_read_counter() + \
> + NSECS_TO_CYCLES(timeout_ns); \
> + asm volatile( \
> + " sevl\n" \
> + " wfe\n" \
> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
> + " cbnz %" #w "[tmp], 2f\n" \
> + " msr s0_3_c1_c0_0, %[ecycles]\n" \
> + "2:" \
> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
> + : [val] "r" (val), [ecycles] "r" (ecycles)); \
> + } \
Why not have a separate helper for the WFXT version and avoid the runtime
check on timeout_ns?
Will
Will Deacon <will@kernel.org> writes:
> On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
>> +#define __CMPWAIT_CASE(w, sfx, sz) \
>> +static inline void __cmpwait_case_##sz(volatile void *ptr, \
>> + unsigned long val, \
>> + s64 timeout_ns) \
>> +{ \
>> + unsigned long tmp; \
>> + \
>> + if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) { \
>> + asm volatile( \
>> + " sevl\n" \
>> + " wfe\n" \
>> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
>> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
>> + " cbnz %" #w "[tmp], 1f\n" \
>> + " wfe\n" \
>> + "1:" \
>> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
>> + : [val] "r" (val)); \
>> + } else { \
>> + u64 ecycles = arch_timer_read_counter() + \
>> + NSECS_TO_CYCLES(timeout_ns); \
>> + asm volatile( \
>> + " sevl\n" \
>> + " wfe\n" \
>> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
>> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
>> + " cbnz %" #w "[tmp], 2f\n" \
>> + " msr s0_3_c1_c0_0, %[ecycles]\n" \
>> + "2:" \
>> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
>> + : [val] "r" (val), [ecycles] "r" (ecycles)); \
>> + } \
>
> Why not have a separate helper for the WFXT version and avoid the runtime
> check on timeout_ns?
My main reason for keeping them together was that a separate helper
needed duplication of a lot of the __CMPWAIT_CASE and __CMPWAIT_GEN
stuff.
Relooking at it, I think we could get by without duplicating
__CMPWAIT_GEN (the WFE helper just needs to take an unused timeout_ns
parameter).

But it seems overkill just to get rid of the unnecessary check on
timeout_ns (which AFAICT should be well predicted) and the duplicate
static branch.
--
ankur
On Fri, Jan 09, 2026 at 01:05:57AM -0800, Ankur Arora wrote:
>
> Will Deacon <will@kernel.org> writes:
>
> > On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
> >> +#define __CMPWAIT_CASE(w, sfx, sz) \
> >> +static inline void __cmpwait_case_##sz(volatile void *ptr, \
> >> + unsigned long val, \
> >> + s64 timeout_ns) \
> >> +{ \
> >> + unsigned long tmp; \
> >> + \
> >> + if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) { \
> >> + asm volatile( \
> >> + " sevl\n" \
> >> + " wfe\n" \
> >> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
> >> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
> >> + " cbnz %" #w "[tmp], 1f\n" \
> >> + " wfe\n" \
> >> + "1:" \
> >> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
> >> + : [val] "r" (val)); \
> >> + } else { \
> >> + u64 ecycles = arch_timer_read_counter() + \
> >> + NSECS_TO_CYCLES(timeout_ns); \
> >> + asm volatile( \
> >> + " sevl\n" \
> >> + " wfe\n" \
> >> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
> >> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
> >> + " cbnz %" #w "[tmp], 2f\n" \
> >> + " msr s0_3_c1_c0_0, %[ecycles]\n" \
> >> + "2:" \
> >> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
> >> + : [val] "r" (val), [ecycles] "r" (ecycles)); \
> >> + } \
> >
> > Why not have a separate helper for the WFXT version and avoid the runtime
> > check on timeout_ns?
>
> My main reason for keeping them together was that a separate helper
> needed duplication of a lot of the __CMPWAIT_CASE and __CMPWAIT_GEN
> stuff.
>
> Relooking at it, I think we could get by without duplicating the
> __CMPWAIT_GEN (the WFE helper just needs to take an unused timeout_ns
> paramter).
>
> But, it seems overkill to get rid of the unnecessary check on timeout_ns
> (which AFAICT should be well predicted) and the duplicate static branch.
tbh, I was actually struggling to see what the check achieves. In fact,
why is 'timeout_ns' signed in the first place? Has BPF invented time
travel now? :p
If the requested timeout is 0 then we should return immediately (or issue
a WFET which will wake up immediately).
If the requested timeout is negative, then WFET may end up with a really
long timeout, but that should still be no worse than a plain WFE.
If the check is only there to de-multiplex __cmpwait() vs
__cmpwait_relaxed_timeout() as the caller, then I think we should just
avoid muxing them in the first place. The duplication argument doesn't
hold a lot of water when the asm block is already mostly copy-paste.
Will
Will Deacon <will@kernel.org> writes:
> On Fri, Jan 09, 2026 at 01:05:57AM -0800, Ankur Arora wrote:
>>
>> Will Deacon <will@kernel.org> writes:
>>
>> > On Sun, Dec 14, 2025 at 08:49:11PM -0800, Ankur Arora wrote:
>> >> +#define __CMPWAIT_CASE(w, sfx, sz) \
>> >> +static inline void __cmpwait_case_##sz(volatile void *ptr, \
>> >> + unsigned long val, \
>> >> + s64 timeout_ns) \
>> >> +{ \
>> >> + unsigned long tmp; \
>> >> + \
>> >> + if (!alternative_has_cap_unlikely(ARM64_HAS_WFXT) || timeout_ns <= 0) { \
>> >> + asm volatile( \
>> >> + " sevl\n" \
>> >> + " wfe\n" \
>> >> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
>> >> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
>> >> + " cbnz %" #w "[tmp], 1f\n" \
>> >> + " wfe\n" \
>> >> + "1:" \
>> >> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
>> >> + : [val] "r" (val)); \
>> >> + } else { \
>> >> + u64 ecycles = arch_timer_read_counter() + \
>> >> + NSECS_TO_CYCLES(timeout_ns); \
>> >> + asm volatile( \
>> >> + " sevl\n" \
>> >> + " wfe\n" \
>> >> + " ldxr" #sfx "\t%" #w "[tmp], %[v]\n" \
>> >> + " eor %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n" \
>> >> + " cbnz %" #w "[tmp], 2f\n" \
>> >> + " msr s0_3_c1_c0_0, %[ecycles]\n" \
>> >> + "2:" \
>> >> + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
>> >> + : [val] "r" (val), [ecycles] "r" (ecycles)); \
>> >> + } \
>> >
>> > Why not have a separate helper for the WFXT version and avoid the runtime
>> > check on timeout_ns?
>>
>> My main reason for keeping them together was that a separate helper
>> needed duplication of a lot of the __CMPWAIT_CASE and __CMPWAIT_GEN
>> stuff.
>>
>> Relooking at it, I think we could get by without duplicating the
>> __CMPWAIT_GEN (the WFE helper just needs to take an unused timeout_ns
>> paramter).
>>
>> But, it seems overkill to get rid of the unnecessary check on timeout_ns
>> (which AFAICT should be well predicted) and the duplicate static branch.
>
> tbh, I was actually struggling to see what the check achieves. In fact,
> why is 'timeout_ns' signed in the first place? Has BPF invented time
> travel now? :p
Worse. I had to invent it for them :D.
The BPF rqspinlock needs to be able to return failure (-EDEADLOCK, -ETIMEDOUT).
What seemed to me the most natural way was for the clock used by
rqspinlock itself to return either the clock value or an error value.
So that necessitated the top-level timeout_ns being signed.

Now the error gets filtered out by smp_cond_load_relaxed_timeout(),
so cpu_poll_relax() should only be called with a positive value of
timeout_ns.
> If the requested timeout is 0 then we should return immediately (or issue
> a WFET which will wake up immediately).
>
> If the requested timeout is negative, then WFET may end up with a really
> long timeout, but that should still be no worse than a plain WFE.
cpu_poll_relax() should not be called with 0 or a negative value.
I'll add a comment to that effect.
> If the check is only there to de-multiplex __cmpwait() vs
> __cmpwait_relaxed_timeout() as the caller, then I think we should just
> avoid muxing them in the first place.
Yes, that's a good point. That's the only reason the check exists. And we
can do without it, since cpu_poll_relax() has all the information it needs
to do the de-multiplexing.
> The duplication argument doesn't
> hold a lot of water when the asm block is already mostly copy-paste.
Fair enough.
--
ankur
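
For reference, the direction the thread converges on could look something
like the sketch below: a dedicated WFET helper with no timeout_ns check,
leaving the de-multiplexing to cpu_poll_relax(), which already knows whether
ARM64_HAS_WFXT is set and that the timeout is positive. The
__CMPWAIT_TIMEOUT_CASE/__cmpwait_timeout_case_##sz names are hypothetical,
not from the posted series.

/* Hypothetical WFET-only variant; the WFE helper keeps its original form. */
#define __CMPWAIT_TIMEOUT_CASE(w, sfx, sz)				\
static inline void __cmpwait_timeout_case_##sz(volatile void *ptr,	\
					       unsigned long val,	\
					       u64 timeout_ns)		\
{									\
	unsigned long tmp;						\
	u64 ecycles = arch_timer_read_counter() +			\
		      NSECS_TO_CYCLES(timeout_ns);			\
									\
	asm volatile(							\
	"	sevl\n"							\
	"	wfe\n"							\
	"	ldxr" #sfx "\t%" #w "[tmp], %[v]\n"			\
	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
	"	cbnz	%" #w "[tmp], 1f\n"				\
	"	msr	s0_3_c1_c0_0, %[ecycles]\n"	/* wfet */	\
	"1:"								\
	: [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr)			\
	: [val] "r" (val), [ecycles] "r" (ecycles));			\
}

The instantiations would mirror the existing __CMPWAIT_CASE(w, b, 8) etc.,
and __cmpwait_relaxed_timeout() would dispatch to these helpers while
__cmpwait_relaxed() keeps using the unmodified WFE-only ones.
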