Use the new cmpxchg_emu_u8() to emulate one-byte cmpxchg() on sh.
[ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]
[ paulmck: Apply feedback from Naresh Kamboju. ]
[ Apply Geert Uytterhoeven feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Andi Shyti <andi.shyti@linux.intel.com>
Cc: Palmer Dabbelt <palmer@rivosinc.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: <linux-sh@vger.kernel.org>
---
arch/sh/Kconfig | 1 +
arch/sh/include/asm/cmpxchg.h | 3 +++
2 files changed, 4 insertions(+)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2ad3e29f0ebec..f47e9ccf4efd2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
select ARCH_HIBERNATION_POSSIBLE if MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_WANT_IPC_PARSE_VERSION
+ select ARCH_NEED_CMPXCHG_1_EMU
select CPU_NO_EFFICIENT_FFS
select DMA_DECLARE_COHERENT
select GENERIC_ATOMIC64
diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
index 5d617b3ef78f7..1e5dc5ccf7bf5 100644
--- a/arch/sh/include/asm/cmpxchg.h
+++ b/arch/sh/include/asm/cmpxchg.h
@@ -9,6 +9,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
+#include <linux/cmpxchg-emu.h>
#if defined(CONFIG_GUSA_RB)
#include <asm/cmpxchg-grb.h>
@@ -56,6 +57,8 @@ static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
unsigned long new, int size)
{
switch (size) {
+ case 1:
+ return cmpxchg_emu_u8(ptr, old, new);
case 4:
return __cmpxchg_u32(ptr, old, new);
}
--
2.40.1
Hi Paul,
On Wed, 2024-05-01 at 16:01 -0700, Paul E. McKenney wrote:
> Use the new cmpxchg_emu_u8() to emulate one-byte cmpxchg() on sh.
>
> [ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]
> [ paulmck: Apply feedback from Naresh Kamboju. ]
> [ Apply Geert Uytterhoeven feedback. ]
>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> Cc: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Palmer Dabbelt <palmer@rivosinc.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: <linux-sh@vger.kernel.org>
> ---
> arch/sh/Kconfig | 1 +
> arch/sh/include/asm/cmpxchg.h | 3 +++
> 2 files changed, 4 insertions(+)
>
> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
> index 2ad3e29f0ebec..f47e9ccf4efd2 100644
> --- a/arch/sh/Kconfig
> +++ b/arch/sh/Kconfig
> @@ -16,6 +16,7 @@ config SUPERH
> select ARCH_HIBERNATION_POSSIBLE if MMU
> select ARCH_MIGHT_HAVE_PC_PARPORT
> select ARCH_WANT_IPC_PARSE_VERSION
> + select ARCH_NEED_CMPXCHG_1_EMU
> select CPU_NO_EFFICIENT_FFS
> select DMA_DECLARE_COHERENT
> select GENERIC_ATOMIC64
> diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
> index 5d617b3ef78f7..1e5dc5ccf7bf5 100644
> --- a/arch/sh/include/asm/cmpxchg.h
> +++ b/arch/sh/include/asm/cmpxchg.h
> @@ -9,6 +9,7 @@
>
> #include <linux/compiler.h>
> #include <linux/types.h>
> +#include <linux/cmpxchg-emu.h>
>
> #if defined(CONFIG_GUSA_RB)
> #include <asm/cmpxchg-grb.h>
> @@ -56,6 +57,8 @@ static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
> unsigned long new, int size)
> {
> switch (size) {
> + case 1:
> + return cmpxchg_emu_u8(ptr, old, new);
> case 4:
> return __cmpxchg_u32(ptr, old, new);
> }
Thanks for the patch. However, I don't quite understand its purpose.
There is already a case for 8-byte cmpxchg in the switch statement below:
case 1: \
__xchg__res = xchg_u8(__xchg_ptr, x); \
break;
Does cmpxchg_emu_u8() have any advantages over the native xchg_u8()?
Thanks,
Adrian
--
.''`. John Paul Adrian Glaubitz
: :' : Debian Developer
`. `' Physicist
`- GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
On Thu, May 02, 2024 at 06:52:53AM +0200, John Paul Adrian Glaubitz wrote:
> Hi Paul,
>
> On Wed, 2024-05-01 at 16:01 -0700, Paul E. McKenney wrote:
> > Use the new cmpxchg_emu_u8() to emulate one-byte cmpxchg() on sh.
> >
> > [ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]
> > [ paulmck: Apply feedback from Naresh Kamboju. ]
> > [ Apply Geert Uytterhoeven feedback. ]
> >
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > Cc: Andi Shyti <andi.shyti@linux.intel.com>
> > Cc: Palmer Dabbelt <palmer@rivosinc.com>
> > Cc: Masami Hiramatsu <mhiramat@kernel.org>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > Cc: <linux-sh@vger.kernel.org>
> > ---
> > arch/sh/Kconfig | 1 +
> > arch/sh/include/asm/cmpxchg.h | 3 +++
> > 2 files changed, 4 insertions(+)
> >
> > diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
> > index 2ad3e29f0ebec..f47e9ccf4efd2 100644
> > --- a/arch/sh/Kconfig
> > +++ b/arch/sh/Kconfig
> > @@ -16,6 +16,7 @@ config SUPERH
> > select ARCH_HIBERNATION_POSSIBLE if MMU
> > select ARCH_MIGHT_HAVE_PC_PARPORT
> > select ARCH_WANT_IPC_PARSE_VERSION
> > + select ARCH_NEED_CMPXCHG_1_EMU
> > select CPU_NO_EFFICIENT_FFS
> > select DMA_DECLARE_COHERENT
> > select GENERIC_ATOMIC64
> > diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
> > index 5d617b3ef78f7..1e5dc5ccf7bf5 100644
> > --- a/arch/sh/include/asm/cmpxchg.h
> > +++ b/arch/sh/include/asm/cmpxchg.h
> > @@ -9,6 +9,7 @@
> >
> > #include <linux/compiler.h>
> > #include <linux/types.h>
> > +#include <linux/cmpxchg-emu.h>
> >
> > #if defined(CONFIG_GUSA_RB)
> > #include <asm/cmpxchg-grb.h>
> > @@ -56,6 +57,8 @@ static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
> > unsigned long new, int size)
> > {
> > switch (size) {
> > + case 1:
> > + return cmpxchg_emu_u8(ptr, old, new);
> > case 4:
> > return __cmpxchg_u32(ptr, old, new);
> > }
>
> Thanks for the patch. However, I don't quite understand its purpose.
>
> There is already a case for 8-byte cmpxchg in the switch statement below:
>
> case 1: \
> __xchg__res = xchg_u8(__xchg_ptr, x); \
> break;
>
> Does cmpxchg_emu_u8() have any advantages over the native xchg_u8()?
That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?
Or am I missing something subtle here that makes sh also support one-byte
(8-bit) cmpxchg()?
Thanx, Paul
On May 2, 2024, at 14:07, Paul E. McKenney <paulmck@kernel.org> wrote:

> That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?
>
> Or am I missing something subtle here that makes sh also support one-byte
> (8-bit) cmpxchg()?

The native SH atomic operation is test and set TAS.B. J2 adds a
compare and swap CAS.L instruction, carefully chosen for patent free
prior art (s360, IIRC).

The (relatively expensive) encoding space we allocated for CAS.L does
not contain size bits.

Not all SH4 patents had expired when J2 was under development, but now
have (watch this space). Not sure (me myself) if there are more atomic
operations in sh4.

Cheers,
J

> Thanx, Paul
On Thu, May 2, 2024, at 07:42, D. Jeff Dionne wrote:
> On May 2, 2024, at 14:07, Paul E. McKenney <paulmck@kernel.org> wrote:
>
>> That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?
>>
>> Or am I missing something subtle here that makes sh also support one-byte
>> (8-bit) cmpxchg()?
>
> The native SH atomic operation is test and set TAS.B. J2 adds a
> compare and swap CAS.L instruction, carefully chosen for patent free
> prior art (s360, IIRC).
>
> The (relatively expensive) encoding space we allocated for CAS.L does
> not contain size bits.
>
> Not all SH4 patents had expired when J2 was under development, but now
> have (watch this space). Not sure (me myself) if there are more atomic
> operations in sh4.
SH4A supports MIPS R4000 style LL/SC instructions, but it looks like
the older SH4 does not.
Arnd
On Wed, 2024-05-01 at 22:06 -0700, Paul E. McKenney wrote:
> > Does cmpxchg_emu_u8() have any advantages over the native xchg_u8()?
>
> That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?

Indeed. I realized this after sending my reply.

> Or am I missing something subtle here that makes sh also support one-byte
> (8-bit) cmpxchg()?

Is there an explanation available that explains the rationale behind the
series, so I can learn more about it?

Also, I am opposed to removing Alpha entirely as it's still being actively
maintained in Debian and Gentoo and works well.

Adrian

--
 .''`.  John Paul Adrian Glaubitz
: :' :  Debian Developer
`. `'   Physicist
  `-    GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
On Thu, May 02, 2024 at 07:11:52AM +0200, John Paul Adrian Glaubitz wrote:
> On Wed, 2024-05-01 at 22:06 -0700, Paul E. McKenney wrote:
> > > Does cmpxchg_emu_u8() have any advantages over the native xchg_u8()?
> >
> > That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?
>
> Indeed. I realized this after sending my reply.

No problem, as I do know that feeling!

> > Or am I missing something subtle here that makes sh also support one-byte
> > (8-bit) cmpxchg()?
>
> Is there an explanation available that explains the rationale behind the
> series, so I can learn more about it?

We have some places in mainline that need one-byte cmpxchg(), so this
series provides emulation for architectures that do not support this
notion.

> Also, I am opposed to removing Alpha entirely as it's still being actively
> maintained in Debian and Gentoo and works well.

Understood, and this sort of compatibility consideration is why this
version of this patchset does not emulate two-byte (16-bit) cmpxchg()
operations.  The original (RFC) series did emulate these, which does
not work on a few architectures that do not provide 16-bit load/store
instructions, hence no 16-bit support in this series.

So this one-byte-only series affects only Alpha systems lacking
single-byte load/store instructions.  If I understand correctly, Alpha
21164A (EV56) and later *do* have single-byte load/store instructions,
and thus are still just fine.  In fact, it looks like EV56 also has
two-byte load/store instructions, and so would have been OK with
the original one-/two-byte RFC series.

Arnd will not be shy about correcting me if I am wrong.  ;-)

> Adrian
>
> --
>  .''`.  John Paul Adrian Glaubitz
> : :' :  Debian Developer
> `. `'   Physicist
>   `-    GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
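For reference, the emulation provided by cmpxchg_emu_u8() boils down to
performing the byte-sized compare-and-exchange on the aligned 32-bit word
that contains the byte, using the architecture's native four-byte
cmpxchg().  The following is a minimal sketch of that technique only: the
cmpxchg_u32() helper and the little-endian byte offset are assumptions for
illustration, and this is not the mainline lib/cmpxchg-emu.c code.

#include <stdint.h>

/* Hypothetical stand-in for the native four-byte cmpxchg(); returns the
   value found at *p. */
extern uint32_t cmpxchg_u32(volatile uint32_t *p, uint32_t old, uint32_t new);

static uint8_t sketch_cmpxchg_u8(volatile uint8_t *p, uint8_t old, uint8_t new)
{
	volatile uint32_t *p32 =
		(volatile uint32_t *)((uintptr_t)p & ~(uintptr_t)3);
	unsigned int shift = ((uintptr_t)p & 3) * 8;	/* little-endian offset */
	uint32_t mask = (uint32_t)0xff << shift;

	for (;;) {
		uint32_t old32 = *p32;
		uint8_t cur = (old32 & mask) >> shift;

		if (cur != old)
			return cur;	/* comparison failed */

		uint32_t new32 = (old32 & ~mask) | ((uint32_t)new << shift);

		if (cmpxchg_u32(p32, old32, new32) == old32)
			return old;	/* full-word cmpxchg succeeded */
		/* Some other byte in the word changed under us; retry. */
	}
}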
On Thu, May 2, 2024, at 15:33, Paul E. McKenney wrote:
> On Thu, May 02, 2024 at 07:11:52AM +0200, John Paul Adrian Glaubitz wrote:
>> On Wed, 2024-05-01 at 22:06 -0700, Paul E. McKenney wrote:
>> > > Does cmpxchg_emu_u8() have any advantages over the native xchg_u8()?
>> >
>> > That would be 8-bit xchg() rather than 8-byte cmpxchg(), correct?
>>
>> Indeed. I realized this after sending my reply.
>
> So this one-byte-only series affects only Alpha systems lacking
> single-byte load/store instructions. If I understand correctly, Alpha
> 21164A (EV56) and later *do* have single-byte load/store instructions,
> and thus are still just fine. In fact, it looks like EV56 also has
> two-byte load/store instructions, and so would have been OK with
> the original one-/two-byte RFC series.
Correct, the only other architecture I'm aware of that is missing
16-bit load/store entirely is ARMv3.
> Arnd will not be shy about correcting me if I am wrong. ;-)
I'll take this as a reminder to send out my EV4/EV5 removal
series. I've merged my patches with Al's bugfixes and rebased
all on top of 6.9-rc now. It's a bit late now, so I'll
send this tomorrow:
https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git/log/?h=alpha-cleanup-6.9
Arnd
On Thu, May 02, 2024 at 06:33:49AM -0700, Paul E. McKenney wrote:
> Understood, and this sort of compatibility consideration is why this
> version of this patchset does not emulate two-byte (16-bit) cmpxchg()
> operations. The original (RFC) series did emulate these, which does
> not work on a few architectures that do not provide 16-bit load/store
> instructions, hence no 16-bit support in this series.
>
> So this one-byte-only series affects only Alpha systems lacking
> single-byte load/store instructions. If I understand correctly, Alpha
> 21164A (EV56) and later *do* have single-byte load/store instructions,
> and thus are still just fine. In fact, it looks like EV56 also has
> two-byte load/store instructions, and so would have been OK with
> the original one-/two-byte RFC series.
Wait a sec. On Alpha we already implement 16bit and 8bit xchg and cmpxchg.
See arch/alpha/include/asm/xchg.h:
static inline unsigned long
____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new)
{
unsigned long prev, tmp, cmp, addr64;
__asm__ __volatile__(
" andnot %5,7,%4\n"
" inswl %1,%5,%1\n"
"1: ldq_l %2,0(%4)\n"
" extwl %2,%5,%0\n"
" cmpeq %0,%6,%3\n"
" beq %3,2f\n"
" mskwl %2,%5,%2\n"
" or %1,%2,%2\n"
" stq_c %2,0(%4)\n"
" beq %2,3f\n"
"2:\n"
".subsection 2\n"
"3: br 1b\n"
".previous"
: "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
: "r" ((long)m), "Ir" (old), "1" (new) : "memory");
return prev;
}
Load-locked and store-conditional are done on 64bit value, with
16bit operations done in registers. This is what 16bit store
(assignment to unsigned short *) turns into with -mbwx
stw $17,0($16) // *(u16*)r16 = r17
and without -mbwx
insql $17,$16,$17 // r17 = r17 << (8 * (r16 & 7))
ldq_u $1,0($16) // r1 = *(u64 *)(r16 & ~7)
mskwl $1,$16,$1 // r1 &= ~(0xffff << (8 * (r16 & 7)))
bis $17,$1,$17 // r17 |= r1
stq_u $17,0($16) // *(u64 *)(r16 & ~7) = r17
What's more, load-locked/store-conditional doesn't have 16bit and 8bit
variants on any Alphas - it's always 32bit (ldl_l) or 64bit (ldq_l).
What BWX adds is load/store byte/word, load/store byte/word unaligned
and sign-extend byte/word. IOW, it's absolutely irrelevant for
cmpxchg (or xchg) purposes.
On Thu, May 02, 2024 at 09:53:45PM +0100, Al Viro wrote:
> On Thu, May 02, 2024 at 06:33:49AM -0700, Paul E. McKenney wrote:
>
> > Understood, and this sort of compatibility consideration is why this
> > version of this patchset does not emulate two-byte (16-bit) cmpxchg()
> > operations. The original (RFC) series did emulate these, which does
> > not work on a few architectures that do not provide 16-bit load/store
> > instructions, hence no 16-bit support in this series.
> >
> > So this one-byte-only series affects only Alpha systems lacking
> > single-byte load/store instructions. If I understand correctly, Alpha
> > 21164A (EV56) and later *do* have single-byte load/store instructions,
> > and thus are still just fine. In fact, it looks like EV56 also has
> > two-byte load/store instructions, and so would have been OK with
> > the original one-/two-byte RFC series.
>
> Wait a sec. On Alpha we already implement 16bit and 8bit xchg and cmpxchg.
> See arch/alpha/include/asm/xchg.h:
> static inline unsigned long
> ____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new)
> {
> unsigned long prev, tmp, cmp, addr64;
>
> __asm__ __volatile__(
> " andnot %5,7,%4\n"
> " inswl %1,%5,%1\n"
> "1: ldq_l %2,0(%4)\n"
> " extwl %2,%5,%0\n"
> " cmpeq %0,%6,%3\n"
> " beq %3,2f\n"
> " mskwl %2,%5,%2\n"
> " or %1,%2,%2\n"
> " stq_c %2,0(%4)\n"
> " beq %2,3f\n"
> "2:\n"
> ".subsection 2\n"
> "3: br 1b\n"
> ".previous"
> : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
> : "r" ((long)m), "Ir" (old), "1" (new) : "memory");
>
> return prev;
> }
>
> Load-locked and store-conditional are done on 64bit value, with
> 16bit operations done in registers. This is what 16bit store
> (assignment to unsigned short *) turns into with -mbwx
> stw $17,0($16) // *(u16*)r16 = r17
> and without -mbwx
> insql $17,$16,$17 // r17 = r17 << (8 * (r16 & 7))
> ldq_u $1,0($16) // r1 = *(u64 *)(r16 & ~7)
> mskwl $1,$16,$1 // r1 &= ~(0xffff << (8 * (r16 & 7)))
> bis $17,$1,$17 // r17 |= r1
> stq_u $17,0($16) // *(u64 *)(r16 & ~7) = r17
>
> What's more, load-locked/store-conditional doesn't have 16bit and 8bit
> variants on any Alphas - it's always 32bit (ldl_l) or 64bit (ldq_l).
>
> What BWX adds is load/store byte/word, load/store byte/word unaligned
> and sign-extend byte/word. IOW, it's absolutely irrelevant for
> cmpxchg (or xchg) purposes.
If you are only ever doing atomic read-modify-write operations on the
byte in question, then agreed, you don't care about byte loads and stores.
But there are use cases that do mix smp_store_release() with cmpxchg(),
and those use cases won't work unless at least byte store is implemented.
Or I suppose that we could use cmpxchg() instead of smp_store_release(),
but that is wasteful for architectures that do support byte stores.
So EV56 adds the byte loads and stores needed for those use cases.
Or am I missing your point?
Thanx, Paul
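To make that mixed usage concrete, here is a hedged sketch of the pattern
in question, with hypothetical structure and function names: one path
claims a one-byte state field with cmpxchg(), while another path publishes
a final value with a plain smp_store_release() of that same byte, which
only works if the CPU can store a single byte without rewriting its
neighbours.

#include <linux/atomic.h>
#include <linux/types.h>

/* Hypothetical example; not taken from any particular mainline user. */
struct widget {
	u8 state;	/* 0 = idle, 1 = busy, 2 = done */
	/* other fields are filled in before state is set to 2 */
};

static bool widget_try_claim(struct widget *w)
{
	/* One-byte cmpxchg(): atomically move idle -> busy. */
	return cmpxchg(&w->state, 0, 1) == 0;
}

static void widget_publish_done(struct widget *w)
{
	/*
	 * One-byte release store.  Without byte store instructions this
	 * would have to read-modify-write the containing word
	 * non-atomically, racing with updates to neighbouring bytes.
	 */
	smp_store_release(&w->state, 2);
}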
On Thu, May 02, 2024 at 09:53:45PM +0100, Al Viro wrote:
> What's more, load-locked/store-conditional doesn't have 16bit and 8bit
> variants on any Alphas - it's always 32bit (ldl_l) or 64bit (ldq_l).
>
> What BWX adds is load/store byte/word, load/store byte/word unaligned
> and sign-extend byte/word. IOW, it's absolutely irrelevant for
> cmpxchg (or xchg) purposes.
FWIW, I do have a cmpxchg-related patch for alpha - the mess with xchg.h
(parametrized double include) is no longer needed, and hasn't been since
2018 (fbfcd0199170 "locking/xchg/alpha: Remove superfluous memory barriers
from the _local() variants" was the point when things settled down).
Only tangentially related to your stuff, but it makes the damn thing
easier to follow.
commit e992b5436ccd504b07a390118cf2be686355b957
Author: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon Apr 8 17:43:37 2024 -0400
alpha: no need to include asm/xchg.h twice
We used to generate different helpers for local and full
{cmp,}xchg(); these days the barriers are in arch_{cmp,}xchg()
instead and generated helpers are identical for local and
full cases. No need for those parametrized includes of
asm/xchg.h - we might as well insert its contents directly
in asm/cmpxchg.h and do it only once.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h
index 91d4a4d9258c..ae1b96479d0c 100644
--- a/arch/alpha/include/asm/cmpxchg.h
+++ b/arch/alpha/include/asm/cmpxchg.h
@@ -3,17 +3,232 @@
#define _ALPHA_CMPXCHG_H
/*
- * Atomic exchange routines.
+ * Atomic exchange.
+ * Since it can be used to implement critical sections
+ * it must clobber "memory" (also for interrupts in UP).
*/
-#define ____xchg(type, args...) __arch_xchg ## type ## _local(args)
-#define ____cmpxchg(type, args...) __cmpxchg ## type ## _local(args)
-#include <asm/xchg.h>
+static inline unsigned long
+____xchg_u8(volatile char *m, unsigned long val)
+{
+ unsigned long ret, tmp, addr64;
+
+ __asm__ __volatile__(
+ " andnot %4,7,%3\n"
+ " insbl %1,%4,%1\n"
+ "1: ldq_l %2,0(%3)\n"
+ " extbl %2,%4,%0\n"
+ " mskbl %2,%4,%2\n"
+ " or %1,%2,%2\n"
+ " stq_c %2,0(%3)\n"
+ " beq %2,2f\n"
+ ".subsection 2\n"
+ "2: br 1b\n"
+ ".previous"
+ : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
+ : "r" ((long)m), "1" (val) : "memory");
+
+ return ret;
+}
+
+static inline unsigned long
+____xchg_u16(volatile short *m, unsigned long val)
+{
+ unsigned long ret, tmp, addr64;
+
+ __asm__ __volatile__(
+ " andnot %4,7,%3\n"
+ " inswl %1,%4,%1\n"
+ "1: ldq_l %2,0(%3)\n"
+ " extwl %2,%4,%0\n"
+ " mskwl %2,%4,%2\n"
+ " or %1,%2,%2\n"
+ " stq_c %2,0(%3)\n"
+ " beq %2,2f\n"
+ ".subsection 2\n"
+ "2: br 1b\n"
+ ".previous"
+ : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
+ : "r" ((long)m), "1" (val) : "memory");
+
+ return ret;
+}
+
+static inline unsigned long
+____xchg_u32(volatile int *m, unsigned long val)
+{
+ unsigned long dummy;
+
+ __asm__ __volatile__(
+ "1: ldl_l %0,%4\n"
+ " bis $31,%3,%1\n"
+ " stl_c %1,%2\n"
+ " beq %1,2f\n"
+ ".subsection 2\n"
+ "2: br 1b\n"
+ ".previous"
+ : "=&r" (val), "=&r" (dummy), "=m" (*m)
+ : "rI" (val), "m" (*m) : "memory");
+
+ return val;
+}
+
+static inline unsigned long
+____xchg_u64(volatile long *m, unsigned long val)
+{
+ unsigned long dummy;
+
+ __asm__ __volatile__(
+ "1: ldq_l %0,%4\n"
+ " bis $31,%3,%1\n"
+ " stq_c %1,%2\n"
+ " beq %1,2f\n"
+ ".subsection 2\n"
+ "2: br 1b\n"
+ ".previous"
+ : "=&r" (val), "=&r" (dummy), "=m" (*m)
+ : "rI" (val), "m" (*m) : "memory");
+
+ return val;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+ if something tries to do an invalid xchg(). */
+extern void __xchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long
+____xchg(volatile void *ptr, unsigned long x, int size)
+{
+ return
+ size == 1 ? ____xchg_u8(ptr, x) :
+ size == 2 ? ____xchg_u16(ptr, x) :
+ size == 4 ? ____xchg_u32(ptr, x) :
+ size == 8 ? ____xchg_u64(ptr, x) :
+ (__xchg_called_with_bad_pointer(), x);
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+static inline unsigned long
+____cmpxchg_u8(volatile char *m, unsigned char old, unsigned char new)
+{
+ unsigned long prev, tmp, cmp, addr64;
+
+ __asm__ __volatile__(
+ " andnot %5,7,%4\n"
+ " insbl %1,%5,%1\n"
+ "1: ldq_l %2,0(%4)\n"
+ " extbl %2,%5,%0\n"
+ " cmpeq %0,%6,%3\n"
+ " beq %3,2f\n"
+ " mskbl %2,%5,%2\n"
+ " or %1,%2,%2\n"
+ " stq_c %2,0(%4)\n"
+ " beq %2,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
+ : "r" ((long)m), "Ir" (old), "1" (new) : "memory");
+
+ return prev;
+}
+
+static inline unsigned long
+____cmpxchg_u16(volatile short *m, unsigned short old, unsigned short new)
+{
+ unsigned long prev, tmp, cmp, addr64;
+
+ __asm__ __volatile__(
+ " andnot %5,7,%4\n"
+ " inswl %1,%5,%1\n"
+ "1: ldq_l %2,0(%4)\n"
+ " extwl %2,%5,%0\n"
+ " cmpeq %0,%6,%3\n"
+ " beq %3,2f\n"
+ " mskwl %2,%5,%2\n"
+ " or %1,%2,%2\n"
+ " stq_c %2,0(%4)\n"
+ " beq %2,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
+ : "r" ((long)m), "Ir" (old), "1" (new) : "memory");
+
+ return prev;
+}
+
+static inline unsigned long
+____cmpxchg_u32(volatile int *m, int old, int new)
+{
+ unsigned long prev, cmp;
+
+ __asm__ __volatile__(
+ "1: ldl_l %0,%5\n"
+ " cmpeq %0,%3,%1\n"
+ " beq %1,2f\n"
+ " mov %4,%1\n"
+ " stl_c %1,%2\n"
+ " beq %1,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r"(prev), "=&r"(cmp), "=m"(*m)
+ : "r"((long) old), "r"(new), "m"(*m) : "memory");
+
+ return prev;
+}
+
+static inline unsigned long
+____cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
+{
+ unsigned long prev, cmp;
+
+ __asm__ __volatile__(
+ "1: ldq_l %0,%5\n"
+ " cmpeq %0,%3,%1\n"
+ " beq %1,2f\n"
+ " mov %4,%1\n"
+ " stq_c %1,%2\n"
+ " beq %1,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r"(prev), "=&r"(cmp), "=m"(*m)
+ : "r"((long) old), "r"(new), "m"(*m) : "memory");
+
+ return prev;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+ if something tries to do an invalid cmpxchg(). */
+extern void __cmpxchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long
+____cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+ int size)
+{
+ return
+ size == 1 ? ____cmpxchg_u8(ptr, old, new) :
+ size == 2 ? ____cmpxchg_u16(ptr, old, new) :
+ size == 4 ? ____cmpxchg_u32(ptr, old, new) :
+ size == 8 ? ____cmpxchg_u64(ptr, old, new) :
+ (__cmpxchg_called_with_bad_pointer(), old);
+}
#define xchg_local(ptr, x) \
({ \
__typeof__(*(ptr)) _x_ = (x); \
- (__typeof__(*(ptr))) __arch_xchg_local((ptr), (unsigned long)_x_,\
+ (__typeof__(*(ptr))) ____xchg((ptr), (unsigned long)_x_, \
sizeof(*(ptr))); \
})
@@ -21,7 +236,7 @@
({ \
__typeof__(*(ptr)) _o_ = (o); \
__typeof__(*(ptr)) _n_ = (n); \
- (__typeof__(*(ptr))) __cmpxchg_local((ptr), (unsigned long)_o_, \
+ (__typeof__(*(ptr))) ____cmpxchg((ptr), (unsigned long)_o_, \
(unsigned long)_n_, \
sizeof(*(ptr))); \
})
@@ -32,12 +247,6 @@
cmpxchg_local((ptr), (o), (n)); \
})
-#undef ____xchg
-#undef ____cmpxchg
-#define ____xchg(type, args...) __arch_xchg ##type(args)
-#define ____cmpxchg(type, args...) __cmpxchg ##type(args)
-#include <asm/xchg.h>
-
/*
* The leading and the trailing memory barriers guarantee that these
* operations are fully ordered.
@@ -48,7 +257,7 @@
__typeof__(*(ptr)) _x_ = (x); \
smp_mb(); \
__ret = (__typeof__(*(ptr))) \
- __arch_xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
+ ____xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
smp_mb(); \
__ret; \
})
@@ -59,7 +268,7 @@
__typeof__(*(ptr)) _o_ = (o); \
__typeof__(*(ptr)) _n_ = (n); \
smp_mb(); \
- __ret = (__typeof__(*(ptr))) __cmpxchg((ptr), \
+ __ret = (__typeof__(*(ptr))) ____cmpxchg((ptr), \
(unsigned long)_o_, (unsigned long)_n_, sizeof(*(ptr)));\
smp_mb(); \
__ret; \
@@ -71,6 +280,4 @@
arch_cmpxchg((ptr), (o), (n)); \
})
-#undef ____cmpxchg
-
#endif /* _ALPHA_CMPXCHG_H */
diff --git a/arch/alpha/include/asm/xchg.h b/arch/alpha/include/asm/xchg.h
deleted file mode 100644
index 7adb80c6746a..000000000000
--- a/arch/alpha/include/asm/xchg.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ALPHA_CMPXCHG_H
-#error Do not include xchg.h directly!
-#else
-/*
- * xchg/xchg_local and cmpxchg/cmpxchg_local share the same code
- * except that local version do not have the expensive memory barrier.
- * So this file is included twice from asm/cmpxchg.h.
- */
-
-/*
- * Atomic exchange.
- * Since it can be used to implement critical sections
- * it must clobber "memory" (also for interrupts in UP).
- */
-
-static inline unsigned long
-____xchg(_u8, volatile char *m, unsigned long val)
-{
- unsigned long ret, tmp, addr64;
-
- __asm__ __volatile__(
- " andnot %4,7,%3\n"
- " insbl %1,%4,%1\n"
- "1: ldq_l %2,0(%3)\n"
- " extbl %2,%4,%0\n"
- " mskbl %2,%4,%2\n"
- " or %1,%2,%2\n"
- " stq_c %2,0(%3)\n"
- " beq %2,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
- : "r" ((long)m), "1" (val) : "memory");
-
- return ret;
-}
-
-static inline unsigned long
-____xchg(_u16, volatile short *m, unsigned long val)
-{
- unsigned long ret, tmp, addr64;
-
- __asm__ __volatile__(
- " andnot %4,7,%3\n"
- " inswl %1,%4,%1\n"
- "1: ldq_l %2,0(%3)\n"
- " extwl %2,%4,%0\n"
- " mskwl %2,%4,%2\n"
- " or %1,%2,%2\n"
- " stq_c %2,0(%3)\n"
- " beq %2,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
- : "r" ((long)m), "1" (val) : "memory");
-
- return ret;
-}
-
-static inline unsigned long
-____xchg(_u32, volatile int *m, unsigned long val)
-{
- unsigned long dummy;
-
- __asm__ __volatile__(
- "1: ldl_l %0,%4\n"
- " bis $31,%3,%1\n"
- " stl_c %1,%2\n"
- " beq %1,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=&r" (val), "=&r" (dummy), "=m" (*m)
- : "rI" (val), "m" (*m) : "memory");
-
- return val;
-}
-
-static inline unsigned long
-____xchg(_u64, volatile long *m, unsigned long val)
-{
- unsigned long dummy;
-
- __asm__ __volatile__(
- "1: ldq_l %0,%4\n"
- " bis $31,%3,%1\n"
- " stq_c %1,%2\n"
- " beq %1,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=&r" (val), "=&r" (dummy), "=m" (*m)
- : "rI" (val), "m" (*m) : "memory");
-
- return val;
-}
-
-/* This function doesn't exist, so you'll get a linker error
- if something tries to do an invalid xchg(). */
-extern void __xchg_called_with_bad_pointer(void);
-
-static __always_inline unsigned long
-____xchg(, volatile void *ptr, unsigned long x, int size)
-{
- switch (size) {
- case 1:
- return ____xchg(_u8, ptr, x);
- case 2:
- return ____xchg(_u16, ptr, x);
- case 4:
- return ____xchg(_u32, ptr, x);
- case 8:
- return ____xchg(_u64, ptr, x);
- }
- __xchg_called_with_bad_pointer();
- return x;
-}
-
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
-
-static inline unsigned long
-____cmpxchg(_u8, volatile char *m, unsigned char old, unsigned char new)
-{
- unsigned long prev, tmp, cmp, addr64;
-
- __asm__ __volatile__(
- " andnot %5,7,%4\n"
- " insbl %1,%5,%1\n"
- "1: ldq_l %2,0(%4)\n"
- " extbl %2,%5,%0\n"
- " cmpeq %0,%6,%3\n"
- " beq %3,2f\n"
- " mskbl %2,%5,%2\n"
- " or %1,%2,%2\n"
- " stq_c %2,0(%4)\n"
- " beq %2,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
- : "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
- return prev;
-}
-
-static inline unsigned long
-____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new)
-{
- unsigned long prev, tmp, cmp, addr64;
-
- __asm__ __volatile__(
- " andnot %5,7,%4\n"
- " inswl %1,%5,%1\n"
- "1: ldq_l %2,0(%4)\n"
- " extwl %2,%5,%0\n"
- " cmpeq %0,%6,%3\n"
- " beq %3,2f\n"
- " mskwl %2,%5,%2\n"
- " or %1,%2,%2\n"
- " stq_c %2,0(%4)\n"
- " beq %2,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r" (prev), "=&r" (new), "=&r" (tmp), "=&r" (cmp), "=&r" (addr64)
- : "r" ((long)m), "Ir" (old), "1" (new) : "memory");
-
- return prev;
-}
-
-static inline unsigned long
-____cmpxchg(_u32, volatile int *m, int old, int new)
-{
- unsigned long prev, cmp;
-
- __asm__ __volatile__(
- "1: ldl_l %0,%5\n"
- " cmpeq %0,%3,%1\n"
- " beq %1,2f\n"
- " mov %4,%1\n"
- " stl_c %1,%2\n"
- " beq %1,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r"(prev), "=&r"(cmp), "=m"(*m)
- : "r"((long) old), "r"(new), "m"(*m) : "memory");
-
- return prev;
-}
-
-static inline unsigned long
-____cmpxchg(_u64, volatile long *m, unsigned long old, unsigned long new)
-{
- unsigned long prev, cmp;
-
- __asm__ __volatile__(
- "1: ldq_l %0,%5\n"
- " cmpeq %0,%3,%1\n"
- " beq %1,2f\n"
- " mov %4,%1\n"
- " stq_c %1,%2\n"
- " beq %1,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r"(prev), "=&r"(cmp), "=m"(*m)
- : "r"((long) old), "r"(new), "m"(*m) : "memory");
-
- return prev;
-}
-
-/* This function doesn't exist, so you'll get a linker error
- if something tries to do an invalid cmpxchg(). */
-extern void __cmpxchg_called_with_bad_pointer(void);
-
-static __always_inline unsigned long
-____cmpxchg(, volatile void *ptr, unsigned long old, unsigned long new,
- int size)
-{
- switch (size) {
- case 1:
- return ____cmpxchg(_u8, ptr, old, new);
- case 2:
- return ____cmpxchg(_u16, ptr, old, new);
- case 4:
- return ____cmpxchg(_u32, ptr, old, new);
- case 8:
- return ____cmpxchg(_u64, ptr, old, new);
- }
- __cmpxchg_called_with_bad_pointer();
- return old;
-}
-
-#endif
On Thu, 2 May 2024 at 14:01, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> +static inline unsigned long
> +____xchg_u8(volatile char *m, unsigned long val)
> +{
> + unsigned long ret, tmp, addr64;
> +
> + __asm__ __volatile__(
> + " andnot %4,7,%3\n"
> + " insbl %1,%4,%1\n"
> + "1: ldq_l %2,0(%3)\n"
> + " extbl %2,%4,%0\n"
> + " mskbl %2,%4,%2\n"
> + " or %1,%2,%2\n"
> + " stq_c %2,0(%3)\n"
> + " beq %2,2f\n"
> + ".subsection 2\n"
> + "2: br 1b\n"
> + ".previous"
> + : "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
> + : "r" ((long)m), "1" (val) : "memory");
> +
> + return ret;
> +}
Side note: if you move this around, I think you should just uninline
it too and turn it into a function call.
This inline asm doesn't actually take any advantage of the inlining.
The main reason to inline something like this is that you could then
deal with different compile-time alignments better than using the
generic software sequence. But that's not what the inline asm actually
does, and it uses the worst-case code sequence for inserting the byte.
Put that together with "byte and word xchg are rare", and it really
smells to me like we shouldn't be inlining this.
Now, the 32-bit and 64-bit cases are different - more common, but also
much simpler code sequences. They seem worth inlining.
That said, maybe for alpha, the "just move code around" is better than
"fix up old bad decisions" just because the effort is lower.
Linus
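As a sketch of what that uninlining might look like (the file placement
and exact naming below are assumptions, not part of any posted patch),
the header would keep only a declaration, and the worst-case byte
sequence would be compiled once, out of line:

/* In asm/cmpxchg.h: declaration only. */
extern unsigned long ____xchg_u8(volatile char *m, unsigned long val);

/* In a C file compiled once, e.g. somewhere under arch/alpha/: the same
   ldq_l/stq_c sequence as in the patch above, no longer inlined. */
unsigned long ____xchg_u8(volatile char *m, unsigned long val)
{
	unsigned long ret, tmp, addr64;

	__asm__ __volatile__(
	"	andnot	%4,7,%3\n"
	"	insbl	%1,%4,%1\n"
	"1:	ldq_l	%2,0(%3)\n"
	"	extbl	%2,%4,%0\n"
	"	mskbl	%2,%4,%2\n"
	"	or	%1,%2,%2\n"
	"	stq_c	%2,0(%3)\n"
	"	beq	%2,2f\n"
	".subsection 2\n"
	"2:	br	1b\n"
	".previous"
	: "=&r" (ret), "=&r" (val), "=&r" (tmp), "=&r" (addr64)
	: "r" ((long)m), "1" (val) : "memory");

	return ret;
}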