[PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions

Posted by Uros Bizjak 1 year, 10 months ago
Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to
use arch_atomic64_try_cmpxchg.  This implementation avoids one extra
trip through the cmpxchg loop.

The value preload before the cmpxchg loop does not need to be atomic,
but should use READ_ONCE to prevent the compiler from merging,
refetching or reordering the read.

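At the C level, the change replaces the old "guess zero, then retry"
cmpxchg loop with a try_cmpxchg loop primed by a plain (possibly torn)
load. A simplified sketch of the and() case (the real code is in the
diff below):

	/* old: start from a guess of 0, so the first CAS usually fails */
	s64 old, c = 0;

	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
		c = old;

	/* new: prime the loop with the current value; try_cmpxchg()
	 * refreshes 'val' with the observed value whenever the CAS fails */
	s64 val = __READ_ONCE(v->counter);

	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
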
The generated code improves from:

  1917d5:	31 c9                	xor    %ecx,%ecx
  1917d7:	31 db                	xor    %ebx,%ebx
  1917d9:	89 4c 24 3c          	mov    %ecx,0x3c(%esp)
  1917dd:	8b 74 24 24          	mov    0x24(%esp),%esi
  1917e1:	89 c8                	mov    %ecx,%eax
  1917e3:	89 5c 24 34          	mov    %ebx,0x34(%esp)
  1917e7:	8b 7c 24 28          	mov    0x28(%esp),%edi
  1917eb:	21 ce                	and    %ecx,%esi
  1917ed:	89 74 24 4c          	mov    %esi,0x4c(%esp)
  1917f1:	21 df                	and    %ebx,%edi
  1917f3:	89 de                	mov    %ebx,%esi
  1917f5:	89 7c 24 50          	mov    %edi,0x50(%esp)
  1917f9:	8b 54 24 4c          	mov    0x4c(%esp),%edx
  1917fd:	8b 7c 24 2c          	mov    0x2c(%esp),%edi
  191801:	8b 4c 24 50          	mov    0x50(%esp),%ecx
  191805:	89 d3                	mov    %edx,%ebx
  191807:	89 f2                	mov    %esi,%edx
  191809:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
  19180d:	89 c1                	mov    %eax,%ecx
  19180f:	8b 74 24 34          	mov    0x34(%esp),%esi
  191813:	89 d3                	mov    %edx,%ebx
  191815:	89 44 24 4c          	mov    %eax,0x4c(%esp)
  191819:	8b 44 24 3c          	mov    0x3c(%esp),%eax
  19181d:	89 df                	mov    %ebx,%edi
  19181f:	89 54 24 44          	mov    %edx,0x44(%esp)
  191823:	89 ca                	mov    %ecx,%edx
  191825:	31 de                	xor    %ebx,%esi
  191827:	31 c8                	xor    %ecx,%eax
  191829:	09 f0                	or     %esi,%eax
  19182b:	75 ac                	jne    1917d9 <...>

to:

  1912ba:	8b 06                	mov    (%esi),%eax
  1912bc:	8b 56 04             	mov    0x4(%esi),%edx
  1912bf:	89 44 24 3c          	mov    %eax,0x3c(%esp)
  1912c3:	89 c1                	mov    %eax,%ecx
  1912c5:	23 4c 24 34          	and    0x34(%esp),%ecx
  1912c9:	89 d3                	mov    %edx,%ebx
  1912cb:	23 5c 24 38          	and    0x38(%esp),%ebx
  1912cf:	89 54 24 40          	mov    %edx,0x40(%esp)
  1912d3:	89 4c 24 2c          	mov    %ecx,0x2c(%esp)
  1912d7:	89 5c 24 30          	mov    %ebx,0x30(%esp)
  1912db:	8b 5c 24 2c          	mov    0x2c(%esp),%ebx
  1912df:	8b 4c 24 30          	mov    0x30(%esp),%ecx
  1912e3:	f0 0f c7 0e          	lock cmpxchg8b (%esi)
  1912e7:	0f 85 f3 02 00 00    	jne    1915e0 <...>

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 11e817dab44a..84affd7a5d1c 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -201,69 +201,61 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 
 static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_and arch_atomic64_fetch_and
 
 static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_or arch_atomic64_fetch_or
 
 static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
 
 static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = __READ_ONCE(v->counter);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
-		c = old;
-
-	return old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
+	return val;
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 
-- 
2.44.0
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Mark Rutland 1 year, 10 months ago
On Tue, Apr 09, 2024 at 12:03:53PM +0200, Uros Bizjak wrote:
> Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to
> use arch_atomic64_try_cmpxchg.  This implementation avoids one extra
> trip through the cmpxchg loop.
> 
> The value preload before the cmpxchg loop does not need to be atomic,
> but should use READ_ONCE to prevent the compiler from merging,
> refetching or reordering the read.
> 
> The generated code improves from:
> 
>   1917d5:	31 c9                	xor    %ecx,%ecx
>   1917d7:	31 db                	xor    %ebx,%ebx
>   1917d9:	89 4c 24 3c          	mov    %ecx,0x3c(%esp)
>   1917dd:	8b 74 24 24          	mov    0x24(%esp),%esi
>   1917e1:	89 c8                	mov    %ecx,%eax
>   1917e3:	89 5c 24 34          	mov    %ebx,0x34(%esp)
>   1917e7:	8b 7c 24 28          	mov    0x28(%esp),%edi
>   1917eb:	21 ce                	and    %ecx,%esi
>   1917ed:	89 74 24 4c          	mov    %esi,0x4c(%esp)
>   1917f1:	21 df                	and    %ebx,%edi
>   1917f3:	89 de                	mov    %ebx,%esi
>   1917f5:	89 7c 24 50          	mov    %edi,0x50(%esp)
>   1917f9:	8b 54 24 4c          	mov    0x4c(%esp),%edx
>   1917fd:	8b 7c 24 2c          	mov    0x2c(%esp),%edi
>   191801:	8b 4c 24 50          	mov    0x50(%esp),%ecx
>   191805:	89 d3                	mov    %edx,%ebx
>   191807:	89 f2                	mov    %esi,%edx
>   191809:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
>   19180d:	89 c1                	mov    %eax,%ecx
>   19180f:	8b 74 24 34          	mov    0x34(%esp),%esi
>   191813:	89 d3                	mov    %edx,%ebx
>   191815:	89 44 24 4c          	mov    %eax,0x4c(%esp)
>   191819:	8b 44 24 3c          	mov    0x3c(%esp),%eax
>   19181d:	89 df                	mov    %ebx,%edi
>   19181f:	89 54 24 44          	mov    %edx,0x44(%esp)
>   191823:	89 ca                	mov    %ecx,%edx
>   191825:	31 de                	xor    %ebx,%esi
>   191827:	31 c8                	xor    %ecx,%eax
>   191829:	09 f0                	or     %esi,%eax
>   19182b:	75 ac                	jne    1917d9 <...>
> 
> to:
> 
>   1912ba:	8b 06                	mov    (%esi),%eax
>   1912bc:	8b 56 04             	mov    0x4(%esi),%edx
>   1912bf:	89 44 24 3c          	mov    %eax,0x3c(%esp)
>   1912c3:	89 c1                	mov    %eax,%ecx
>   1912c5:	23 4c 24 34          	and    0x34(%esp),%ecx
>   1912c9:	89 d3                	mov    %edx,%ebx
>   1912cb:	23 5c 24 38          	and    0x38(%esp),%ebx
>   1912cf:	89 54 24 40          	mov    %edx,0x40(%esp)
>   1912d3:	89 4c 24 2c          	mov    %ecx,0x2c(%esp)
>   1912d7:	89 5c 24 30          	mov    %ebx,0x30(%esp)
>   1912db:	8b 5c 24 2c          	mov    0x2c(%esp),%ebx
>   1912df:	8b 4c 24 30          	mov    0x30(%esp),%ecx
>   1912e3:	f0 0f c7 0e          	lock cmpxchg8b (%esi)
>   1912e7:	0f 85 f3 02 00 00    	jne    1915e0 <...>
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> ---
>  arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++------------------
>  1 file changed, 18 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
> index 11e817dab44a..84affd7a5d1c 100644
> --- a/arch/x86/include/asm/atomic64_32.h
> +++ b/arch/x86/include/asm/atomic64_32.h
> @@ -201,69 +201,61 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
>  
>  static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);

I reckon it's worth placing this in a helper with a big comment, e.g.

static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
{
	/*
	 * TODO: explain that this might be torn, but it occurs *once*, and can
	 * safely be consumed by atomic64_try_cmpxchg().
	 *
	 * TODO: point to the existing commentary regarding why we use
	 * __READ_ONCE() for KASAN reasons.
	 */
	return __READ_ONCE(v->counter);
}

... and then use that in each of the instances below.
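
For example, arch_atomic64_and() would then become (untested, just to
show the shape):

	static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
	{
		s64 val = arch_atomic64_read_tearable(v);

		do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
	}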

That way the subtlety is clearly documented, and it'd more clearly align with
the x86_64 versions.

Mark.

>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_and arch_atomic64_fetch_and
>  
>  static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_or arch_atomic64_fetch_or
>  
>  static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
>  
>  static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
> -		c = old;
> -
> -	return old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
> +	return val;
>  }
>  #define arch_atomic64_fetch_add arch_atomic64_fetch_add
>  
> -- 
> 2.44.0
> 
>
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Uros Bizjak 1 year, 10 months ago
On Tue, Apr 9, 2024 at 1:13 PM Mark Rutland <mark.rutland@arm.com> wrote:

> >  static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
> >  {
> > -     s64 old, c = 0;
> > +     s64 val = __READ_ONCE(v->counter);
>
> I reckon it's worth placing this in a helper with a big comment, e.g.
>
> static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> {
>         /*
>          * TODO: explain that this might be torn, but it occurs *once*, and can
>          * safely be consumed by atomic64_try_cmpxchg().
>          *
>          * TODO: point to the existing commentary regarding why we use
>          * __READ_ONCE() for KASAN reasons.
>          */
>         return __READ_ONCE(v->counter);
> }
>
> ... and then use that in each of the instances below.
>
> That way the subtlety is clearly documented, and it'd more clearly align with
> the x86_64 versions.

This is an excellent idea. The separate definitions need to be placed
in atomic64_32.h and atomic64_64.h (due to the use of the atomic64_t
typedef), but it will allow the same unification of functions between
x86_32 and x86_64 as the approach with __READ_ONCE().

Thanks,
Uros.
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Uros Bizjak 1 year, 10 months ago
On Tue, Apr 9, 2024 at 2:03 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Apr 9, 2024 at 1:13 PM Mark Rutland <mark.rutland@arm.com> wrote:
>
> > >  static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
> > >  {
> > > -     s64 old, c = 0;
> > > +     s64 val = __READ_ONCE(v->counter);
> >
> > I reckon it's worth placing this in a helper with a big comment, e.g.
> >
> > static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> > {
> >         /*
> >          * TODO: explain that this might be torn, but it occurs *once*, and can
> >          * safely be consumed by atomic64_try_cmpxchg().
> >          *
> >          * TODO: point to the existing commentary regarding why we use
> >          * __READ_ONCE() for KASAN reasons.
> >          */
> >         return __READ_ONCE(v->counter);
> > }
> >
> > ... and then use that in each of the instances below.
> >
> > That way the subtlety is clearly documented, and it'd more clearly align with
> > the x86_64 versions.
>
> This is an excellent idea. The separate definitions need to be placed
> in atomic64_32.h and atomic64_64.h (due to the use of the atomic64_t
> typedef), but it will allow the same unification of functions between
> x86_32 and x86_64 as the approach with __READ_ONCE().

Something like this:

--cut here--
/*
 * This function is intended to preload the value from an atomic64_t
 * location in a non-atomic way. The read might be torn, but it can
 * safely be consumed by the compare-and-swap loop.
 */
static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
{
    /*
     * See the comment in arch_atomic_read() on why we use
     * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
     */
    return __READ_ONCE(v->counter);
}
--cut here--

Thanks,
Uros.
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Mark Rutland 1 year, 10 months ago
On Tue, Apr 09, 2024 at 02:50:19PM +0200, Uros Bizjak wrote:
> On Tue, Apr 9, 2024 at 2:03 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Tue, Apr 9, 2024 at 1:13 PM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > > >  static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
> > > >  {
> > > > -     s64 old, c = 0;
> > > > +     s64 val = __READ_ONCE(v->counter);
> > >
> > > I reckon it's worth placing this in a helper with a big comment, e.g.
> > >
> > > static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> > > {
> > >         /*
> > >          * TODO: explain that this might be torn, but it occurs *once*, and can
> > >          * safely be consumed by atomic64_try_cmpxchg().
> > >          *
> > >          * TODO: point to the existing commentary regarding why we use
> > >          * __READ_ONCE() for KASAN reasons.
> > >          */
> > >         return __READ_ONCE(v->counter);
> > > }
> > >
> > > ... and then use that in each of the instances below.
> > >
> > > That way the subtlety is clearly documented, and it'd more clearly align with
> > > the x86_64 versions.
> >
> > This is an excellent idea. The separate definitions need to be placed
> > in atomic64_32.h and atomic64_64.h (due to the use of the atomic64_t
> > typedef), but it will allow the same unification of functions between
> > x86_32 and x86_64 as the approach with __READ_ONCE().
> 
> Something like this:
> 
> --cut here--
> /*
>  * This function is intended to preload the value from an atomic64_t
>  * location in a non-atomic way. The read might be torn, but it can
>  * safely be consumed by the compare-and-swap loop.
>  */
> static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> {
>     /*
>      * See the comment in arch_atomic_read() on why we use
>      * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
>      */
>     return __READ_ONCE(v->counter);
> }
> --cut here--
> 
> Thanks,
> Uros.

Yeah, something of that shape.

Having thought for a bit longer, it's probably better to use '_torn' rather
than '_tearable' (i.e. name this arch_atomic64_read_torn()).

It'd be nice if we could specify the usage restrictions a bit more clearly,
since this can only be used for compare-and-swap loops that implement
unconditional atomics. (e.g. arch_atomic64_and(), but not
arch_atomic_add_unless()).

So I'd suggest:

/*
 * Read an atomic64_t non-atomically.
 *
 * This is intended to be used in cases where a subsequent atomic operation
 * will handle the torn value, and can be used to prime the first iteration of
 * unconditional try_cmpxchg() loops, e.g.
 *
 * 	s64 val = arch_atomic64_read_torn(v);
 * 	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
 *
 * This is NOT safe to use where the value is not always checked by a
 * subsequent atomic operation, such as in conditional try_cmpxchg() loops that
 * can break before the atomic, e.g.
 *
 * 	s64 val = arch_atomic64_read_torn(v);
 * 	do {
 * 		if (condition(val))
 * 			break;
 * 	} while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
 */
static __always_inline s64 arch_atomic64_read_torn(atomic64_t *v)
{
    /* See comment in arch_atomic_read() */
    return __READ_ONCE(v->counter);
}

Mark.
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Uros Bizjak 1 year, 10 months ago
On Tue, Apr 9, 2024 at 6:34 PM Mark Rutland <mark.rutland@arm.com> wrote:

> > Something like this:
> >
> > --cut here--
> > /*
> >  * This function is intended to preload the value from an atomic64_t
> >  * location in a non-atomic way. The read might be torn, but it can
> >  * safely be consumed by the compare-and-swap loop.
> >  */
> > static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> > {
> >     /*
> >      * See the comment in arch_atomic_read() on why we use
> >      * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
> >      */
> >     return __READ_ONCE(v->counter);
> > }
> > --cut here--
>
> Yeah, something of that shape.
>
> Having thought for a bit longer, it's probably better to use '_torn' rather
> than '_tearable' (i.e. name this arch_atomic64_read_torn()).

How about we simply name the function

arch_atomic64_read_nonatomic()

in the sense that it reads atomic64_t variables in a non-atomic way?
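
I.e. the same helper as posted above, just renamed (sketch):

/*
 * This function is intended to preload the value from an atomic64_t
 * location in a non-atomic way. The read might be torn, but it can
 * safely be consumed by the compare-and-swap loop.
 */
static __always_inline s64 arch_atomic64_read_nonatomic(atomic64_t *v)
{
    /*
     * See the comment in arch_atomic_read() on why we use
     * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
     */
    return __READ_ONCE(v->counter);
}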

Uros.
Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
Posted by Uros Bizjak 1 year, 10 months ago
On Tue, Apr 9, 2024 at 6:34 PM Mark Rutland <mark.rutland@arm.com> wrote:

> > > > ... and then use that in each of the instances below.
> > > >
> > > > That way the subtlety is clearly documented, and it'd more clearly align with
> > > > the x86_64 versions.
> > >
> > > This is an excellent idea. The separate definitions need to be placed
> > > in atomic64_32.h and atomic64_64.h (due to the use of the atomic64_t
> > > typedef), but it will allow the same unification of functions between
> > > x86_32 and x86_64 as the approach with __READ_ONCE().
> >
> > Something like this:
> >
> > --cut here--
> > /*
> >  * This function is intended to preload the value from an atomic64_t
> >  * location in a non-atomic way. The read might be torn, but it can
> >  * safely be consumed by the compare-and-swap loop.
> >  */
> > static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
> > {
> >     /*
> >      * See the comment in arch_atomic_read() on why we use
> >      * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
> >      */
> >     return __READ_ONCE(v->counter);
> > }
> > --cut here--
> >
> > Thanks,
> > Uros.
>
> Yeah, something of that shape.
>
> Having thought for a bit longer, it's probably better to use '_torn' rather
> than '_tearable' (i.e. name this arch_atomic64_read_torn()).
>
> It'd be nice if we could specify the usage restrictions a bit more clearly,
> since this can only be used for compare-and-swap loops that implement
> unconditional atomics. (e.g. arch_atomic64_and(), but not
> arch_atomic_add_unless()).
>
> So I'd suggest:

Eh, I just sent a v2 a second before I received your mail. I'll respin
the patchset tomorrow to include your suggested text. Please note that
the v2 patch set avoids all cosmetic changes.

Thanks,
Uros.

>
> /*
>  * Read an atomic64_t non-atomically.
>  *
>  * This is intended to be used in cases where a subsequent atomic operation
>  * will handle the torn value, and can be used to prime the first iteration of
>  * unconditional try_cmpxchg() loops, e.g.
>  *
>  *      s64 val = arch_atomic64_read_torn(v);
>  *      do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
>  *
>  * This is NOT safe to use where the value is not always checked by a
>  * subsequent atomic operation, such as in conditional try_cmpxchg() loops that
>  * can break before the atomic, e.g.
>  *
>  *      s64 val = arch_atomic64_read_torn(v);
>  *      do {
>  *              if (condition(val))
>  *                      break;
>  *      } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
>  */
> static __always_inline s64 arch_atomic64_read_torn(atomic64_t *v)
> {
>     /* See comment in arch_atomic_read() */
>     return __READ_ONCE(v->counter);
> }
>
> Mark.