[PATCH] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()

Uros Bizjak posted 1 patch 1 year, 10 months ago
There is a newer version of this series
kernel/locking/qspinlock.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
[PATCH] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()
Posted by Uros Bizjak 1 year, 10 months ago
Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of
atomic_cmpxchg_relaxed(*ptr, old, new) == old in xchg_tail().
The x86 CMPXCHG instruction returns success in the ZF flag,
so this change saves a compare after CMPXCHG.

No functional change intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
 kernel/locking/qspinlock.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb3..1df5fef8a656 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
  */
 static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
-	u32 old, new, val = atomic_read(&lock->val);
+	u32 old, new;
 
-	for (;;) {
-		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+	old = atomic_read(&lock->val);
+	do {
+		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
 		/*
 		 * We can use relaxed semantics since the caller ensures that
 		 * the MCS node is properly initialized before updating the
 		 * tail.
 		 */
-		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
-		if (old == val)
-			break;
+	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
 
-		val = old;
-	}
 	return old;
 }
 #endif /* _Q_PENDING_BITS == 8 */
-- 
2.42.0
Re: [PATCH] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()
Posted by Waiman Long 1 year, 10 months ago
On 3/21/24 15:52, Uros Bizjak wrote:
> Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of
> atomic_cmpxchg_relaxed (*ptr, old, new) == old in xchg_tail().
> x86 CMPXCHG instruction returns success in ZF flag,
> so this change saves a compare after cmpxchg.
>
> No functional change intended.
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> ---
>   kernel/locking/qspinlock.c | 13 +++++--------
>   1 file changed, 5 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ebe6b8ec7cb3..1df5fef8a656 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>    */
>   static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
>   {
> -	u32 old, new, val = atomic_read(&lock->val);
> +	u32 old, new;
>   
> -	for (;;) {
> -		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> +	old = atomic_read(&lock->val);
> +	do {
> +		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
>   		/*
>   		 * We can use relaxed semantics since the caller ensures that
>   		 * the MCS node is properly initialized before updating the
>   		 * tail.
>   		 */
> -		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
> -		if (old == val)
> -			break;
> +	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
>   
> -		val = old;
> -	}
>   	return old;
>   }
>   #endif /* _Q_PENDING_BITS == 8 */

LGTM; note that this xchg_tail() variant is not used in any of the distros
that I am aware of, as it requires NR_CPUS >= 16k.

Reviewed-by: Waiman Long <longman@redhat.com>
Re: [PATCH] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()
Posted by Uros Bizjak 1 year, 10 months ago
On Fri, Mar 22, 2024 at 12:52 AM Waiman Long <longman@redhat.com> wrote:
>
> On 3/21/24 15:52, Uros Bizjak wrote:
> > Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of
> > atomic_cmpxchg_relaxed (*ptr, old, new) == old in xchg_tail().
> > x86 CMPXCHG instruction returns success in ZF flag,
> > so this change saves a compare after cmpxchg.
> >
> > No functional change intended.
> >
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: Boqun Feng <boqun.feng@gmail.com>
> > ---
> >   kernel/locking/qspinlock.c | 13 +++++--------
> >   1 file changed, 5 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> > index ebe6b8ec7cb3..1df5fef8a656 100644
> > --- a/kernel/locking/qspinlock.c
> > +++ b/kernel/locking/qspinlock.c
> > @@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
> >    */
> >   static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> >   {
> > -     u32 old, new, val = atomic_read(&lock->val);
> > +     u32 old, new;
> >
> > -     for (;;) {
> > -             new = (val & _Q_LOCKED_PENDING_MASK) | tail;
> > +     old = atomic_read(&lock->val);
> > +     do {
> > +             new = (old & _Q_LOCKED_PENDING_MASK) | tail;
> >               /*
> >                * We can use relaxed semantics since the caller ensures that
> >                * the MCS node is properly initialized before updating the
> >                * tail.
> >                */
> > -             old = atomic_cmpxchg_relaxed(&lock->val, val, new);
> > -             if (old == val)
> > -                     break;
> > +     } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
> >
> > -             val = old;
> > -     }
> >       return old;
> >   }
> >   #endif /* _Q_PENDING_BITS == 8 */
>
> LGTM, note that this xchg_tail() variant is not used in all the distros
> that I am aware of as it requires NR_CPUS >= 16k.

Yes, I am aware of this. I have tested the patch simply by
unconditionally setting _Q_PENDING_BITS to 1 in
include/asm-generic/qspinlock_types.h.
>
> Reviewed-by: Waiman Long <longman@redhat.com>

Thanks,
Uros.
[tip: locking/core] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()
Posted by tip-bot2 for Uros Bizjak 1 year, 10 months ago
The following commit has been merged into the locking/core branch of tip:

Commit-ID:     79a34e3d8411050c3c7550c5163d6f9dc41e8f66
Gitweb:        https://git.kernel.org/tip/79a34e3d8411050c3c7550c5163d6f9dc41e8f66
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Thu, 21 Mar 2024 20:52:47 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Thu, 11 Apr 2024 15:14:54 +02:00

locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail()

Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of
atomic_cmpxchg_relaxed(*ptr, old, new) == old in xchg_tail().

The x86 CMPXCHG instruction returns success in the ZF flag,
so this change saves a compare after CMPXCHG.

No functional change intended.

Since this code requires NR_CPUS >= 16k, I have tested it
by unconditionally setting _Q_PENDING_BITS to 1 in
<asm-generic/qspinlock_types.h>.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240321195309.484275-1-ubizjak@gmail.com
---
 kernel/locking/qspinlock.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8e..1df5fef 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
  */
 static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
-	u32 old, new, val = atomic_read(&lock->val);
+	u32 old, new;
 
-	for (;;) {
-		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+	old = atomic_read(&lock->val);
+	do {
+		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
 		/*
 		 * We can use relaxed semantics since the caller ensures that
 		 * the MCS node is properly initialized before updating the
 		 * tail.
 		 */
-		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
-		if (old == val)
-			break;
+	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
 
-		val = old;
-	}
 	return old;
 }
 #endif /* _Q_PENDING_BITS == 8 */