[PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI

Yeoreum Yun posted 6 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Yeoreum Yun 1 month, 2 weeks ago
Current futex atomic operations are implemented with ll/sc instructions
and clearing PSTATE.PAN.

Since Armv9.6, FEAT_LSUI supplies not only load/store instructions but
also atomic operations for user memory access from the kernel, so it no
longer needs to clear the PSTATE.PAN bit.

With these instructions, some futex atomic operations no longer need to
be implemented with an ldxr/stlxr pair; instead they can be implemented
with a single atomic operation supplied by FEAT_LSUI.

However, some futex atomic operations still need to use the ll/sc method
via the ldtxr/stltxr instructions supplied by FEAT_LSUI, since there is
either no corresponding atomic instruction or no word-size variant of it
(i.e. eor, cas{mb}t).

Still, it is beneficial to operate without clearing the PSTATE.PAN bit.

Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
---
 arch/arm64/include/asm/futex.h | 130 ++++++++++++++++++++++++++++++++-
 1 file changed, 129 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 22a6301a9f3d..ece35ca9b5d9 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -9,6 +9,8 @@
 #include <linux/uaccess.h>
 #include <linux/stringify.h>

+#include <asm/alternative.h>
+#include <asm/alternative-macros.h>
 #include <asm/errno.h>

 #define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
@@ -115,11 +117,137 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
 	return ret;
 }

+#ifdef CONFIG_AS_HAS_LSUI
+
+#define __LSUI_PREAMBLE	".arch_extension lsui\n"
+
+#define LSUI_FUTEX_ATOMIC_OP(op, asm_op, mb)				\
+static __always_inline int						\
+__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
+{									\
+	int ret = 0;							\
+	int oldval;							\
+									\
+	uaccess_ttbr0_enable();						\
+	asm volatile("// __lsui_futex_atomic_" #op "\n"			\
+	__LSUI_PREAMBLE							\
+"1:	" #asm_op #mb "	%w3, %w2, %1\n"					\
+"2:\n"									\
+	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)				\
+	: "+r" (ret), "+Q" (*uaddr), "=r" (oldval)			\
+	: "r" (oparg)							\
+	: "memory");							\
+	uaccess_ttbr0_disable();					\
+									\
+	if (!ret)							\
+		*oval = oldval;						\
+									\
+	return ret;							\
+}
+
+LSUI_FUTEX_ATOMIC_OP(add, ldtadd, al)
+LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
+LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
+LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
+
+static __always_inline int
+__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
+{
+	return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
+}
+
+static __always_inline int
+__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
+{
+	unsigned int loops = LLSC_MAX_LOOPS;
+	int ret, oldval, tmp;
+
+	uaccess_ttbr0_enable();
+	/*
+	 * there are no ldteor/stteor instructions...
+	 */
+	asm volatile("// __lsui_futex_atomic_eor\n"
+	__LSUI_PREAMBLE
+"	prfm	pstl1strm, %2\n"
+"1:	ldtxr	%w1, %2\n"
+"	eor	%w3, %w1, %w5\n"
+"2:	stltxr	%w0, %w3, %2\n"
+"	cbz	%w0, 3f\n"
+"	sub	%w4, %w4, %w0\n"
+"	cbnz	%w4, 1b\n"
+"	mov	%w0, %w6\n"
+"3:\n"
+"	dmb	ish\n"
+	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
+	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
+	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
+	  "+r" (loops)
+	: "r" (oparg), "Ir" (-EAGAIN)
+	: "memory");
+	uaccess_ttbr0_disable();
+
+	if (!ret)
+		*oval = oldval;
+
+	return ret;
+}
+
+static __always_inline int
+__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	int ret = 0;
+	unsigned int loops = LLSC_MAX_LOOPS;
+	u32 val, tmp;
+
+	uaccess_ttbr0_enable();
+	/*
+	 * cas{al}t doesn't support word size...
+	 */
+	asm volatile("//__lsui_futex_cmpxchg\n"
+	__LSUI_PREAMBLE
+"	prfm	pstl1strm, %2\n"
+"1:	ldtxr	%w1, %2\n"
+"	eor	%w3, %w1, %w5\n"
+"	cbnz	%w3, 4f\n"
+"2:	stltxr	%w3, %w6, %2\n"
+"	cbz	%w3, 3f\n"
+"	sub	%w4, %w4, %w3\n"
+"	cbnz	%w4, 1b\n"
+"	mov	%w0, %w7\n"
+"3:\n"
+"	dmb	ish\n"
+"4:\n"
+	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
+	_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
+	: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
+	: "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
+	: "memory");
+	uaccess_ttbr0_disable();
+
+	if (!ret)
+		*oval = oldval;
+
+	return ret;
+}
+
+#define __lsui_llsc_body(op, ...)					\
+({									\
+	alternative_has_cap_likely(ARM64_HAS_LSUI) ?			\
+		__lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__);	\
+})
+
+#else	/* CONFIG_AS_HAS_LSUI */
+
+#define __lsui_llsc_body(op, ...)	__llsc_##op(__VA_ARGS__)
+
+#endif	/* CONFIG_AS_HAS_LSUI */
+
+
 #define FUTEX_ATOMIC_OP(op)						\
 static __always_inline int						\
 __futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
 {									\
-	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
+	return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval);	\
 }

 FUTEX_ATOMIC_OP(add)
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Catalin Marinas 3 weeks ago
On Sat, Aug 16, 2025 at 04:19:29PM +0100, Yeoreum Yun wrote:
> @@ -115,11 +117,137 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
>  	return ret;
>  }
> 
> +#ifdef CONFIG_AS_HAS_LSUI
> +
> +#define __LSUI_PREAMBLE	".arch_extension lsui\n"
> +
> +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op, mb)				\
> +static __always_inline int						\
> +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> +{									\
> +	int ret = 0;							\
> +	int oldval;							\
> +									\
> +	uaccess_ttbr0_enable();						\

I think we can drop uaccess_ttbr0_*() from these functions. At the
kconfig level, TTBR0_PAN selects PAN. Hardware with LSUI will also
have PAN (since 8.1), so the above is an unnecessary branch or nop,
depending on how the alternatives play out. But add a comment instead.

> +static __always_inline int
> +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> +{
> +	unsigned int loops = LLSC_MAX_LOOPS;
> +	int ret, oldval, tmp;
> +
> +	uaccess_ttbr0_enable();
> +	/*
> +	 * there are no ldteor/stteor instructions...
> +	 */
> +	asm volatile("// __lsui_futex_atomic_eor\n"
> +	__LSUI_PREAMBLE
> +"	prfm	pstl1strm, %2\n"
> +"1:	ldtxr	%w1, %2\n"
> +"	eor	%w3, %w1, %w5\n"
> +"2:	stltxr	%w0, %w3, %2\n"
> +"	cbz	%w0, 3f\n"
> +"	sub	%w4, %w4, %w0\n"
> +"	cbnz	%w4, 1b\n"
> +"	mov	%w0, %w6\n"
> +"3:\n"
> +"	dmb	ish\n"
> +	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
> +	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
> +	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
> +	  "+r" (loops)
> +	: "r" (oparg), "Ir" (-EAGAIN)
> +	: "memory");
> +	uaccess_ttbr0_disable();
> +
> +	if (!ret)
> +		*oval = oldval;
> +
> +	return ret;
> +}

That's an unfortunate omission from the architecture.

> +#define __lsui_llsc_body(op, ...)					\
> +({									\
> +	alternative_has_cap_likely(ARM64_HAS_LSUI) ?			\
> +		__lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__);	\
> +})
> +
> +#else	/* CONFIG_AS_HAS_LSUI */
> +
> +#define __lsui_llsc_body(op, ...)	__llsc_##op(__VA_ARGS__)
> +
> +#endif	/* CONFIG_AS_HAS_LSUI */
> +
> +
>  #define FUTEX_ATOMIC_OP(op)						\
>  static __always_inline int						\
>  __futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
>  {									\
> -	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
> +	return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval);	\
>  }

That's what I got confused about. It looks fine:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Yeoreum Yun 2 weeks, 5 days ago
Hi Catalin,

> On Sat, Aug 16, 2025 at 04:19:29PM +0100, Yeoreum Yun wrote:
> > @@ -115,11 +117,137 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> >  	return ret;
> >  }
> >
> > +#ifdef CONFIG_AS_HAS_LSUI
> > +
> > +#define __LSUI_PREAMBLE	".arch_extension lsui\n"
> > +
> > +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op, mb)				\
> > +static __always_inline int						\
> > +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> > +{									\
> > +	int ret = 0;							\
> > +	int oldval;							\
> > +									\
> > +	uaccess_ttbr0_enable();						\
>
> I think we can drop uaccess_ttbr0_*() from these functions. At the
> kconfig level, TTBR0_PAN selects PAN. Hardware with LSUI will also
> have PAN (since 8.1), so the above is an unnecessary branch or nop,
> depending on how the alternatives play out. But add a comment instead.

Thanks to point out this.
I'll change it.

>
> > +static __always_inline int
> > +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> > +{
> > +	unsigned int loops = LLSC_MAX_LOOPS;
> > +	int ret, oldval, tmp;
> > +
> > +	uaccess_ttbr0_enable();
> > +	/*
> > +	 * there are no ldteor/stteor instructions...
> > +	 */
> > +	asm volatile("// __lsui_futex_atomic_eor\n"
> > +	__LSUI_PREAMBLE
> > +"	prfm	pstl1strm, %2\n"
> > +"1:	ldtxr	%w1, %2\n"
> > +"	eor	%w3, %w1, %w5\n"
> > +"2:	stltxr	%w0, %w3, %2\n"
> > +"	cbz	%w0, 3f\n"
> > +"	sub	%w4, %w4, %w0\n"
> > +"	cbnz	%w4, 1b\n"
> > +"	mov	%w0, %w6\n"
> > +"3:\n"
> > +"	dmb	ish\n"
> > +	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
> > +	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
> > +	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
> > +	  "+r" (loops)
> > +	: "r" (oparg), "Ir" (-EAGAIN)
> > +	: "memory");
> > +	uaccess_ttbr0_disable();
> > +
> > +	if (!ret)
> > +		*oval = oldval;
> > +
> > +	return ret;
> > +}
>
> That's an unfortunate omission from the architecture.
>
> > +#define __lsui_llsc_body(op, ...)					\
> > +({									\
> > +	alternative_has_cap_likely(ARM64_HAS_LSUI) ?			\
> > +		__lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__);	\
> > +})
> > +
> > +#else	/* CONFIG_AS_HAS_LSUI */
> > +
> > +#define __lsui_llsc_body(op, ...)	__llsc_##op(__VA_ARGS__)
> > +
> > +#endif	/* CONFIG_AS_HAS_LSUI */
> > +
> > +
> >  #define FUTEX_ATOMIC_OP(op)						\
> >  static __always_inline int						\
> >  __futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
> >  {									\
> > -	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
> > +	return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval);	\
> >  }
>
> That's what I got confused about. It looks fine:
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!
--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Will Deacon 3 weeks, 1 day ago
On Sat, Aug 16, 2025 at 04:19:29PM +0100, Yeoreum Yun wrote:
> Current futex atomic operations are implemented with ll/sc instructions
> and clearing PSTATE.PAN.
> 
> Since Armv9.6, FEAT_LSUI supplies not only load/store instructions but
> also atomic operation for user memory access in kernel it doesn't need
> to clear PSTATE.PAN bit anymore.
> 
> With theses instructions some of futex atomic operations don't need to
> be implmented with ldxr/stlxr pair instead can be implmented with
> one atomic operation supplied by FEAT_LSUI.
> 
> However, some of futex atomic operations still need to use ll/sc way
> via ldtxr/stltxr supplied by FEAT_LSUI since there is no correspondant
> atomic instruction or doesn't support word size operation.
> (i.e) eor, cas{mb}t
> 
> But It's good to work without clearing PSTATE.PAN bit.
> 
> Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
> ---
>  arch/arm64/include/asm/futex.h | 130 ++++++++++++++++++++++++++++++++-
>  1 file changed, 129 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index 22a6301a9f3d..ece35ca9b5d9 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -9,6 +9,8 @@
>  #include <linux/uaccess.h>
>  #include <linux/stringify.h>
> 
> +#include <asm/alternative.h>
> +#include <asm/alternative-macros.h>
>  #include <asm/errno.h>
> 
>  #define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> @@ -115,11 +117,137 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
>  	return ret;
>  }
> 
> +#ifdef CONFIG_AS_HAS_LSUI
> +
> +#define __LSUI_PREAMBLE	".arch_extension lsui\n"
> +
> +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op, mb)				\
> +static __always_inline int						\
> +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> +{									\
> +	int ret = 0;							\
> +	int oldval;							\
> +									\
> +	uaccess_ttbr0_enable();						\
> +	asm volatile("// __lsui_futex_atomic_" #op "\n"			\
> +	__LSUI_PREAMBLE							\
> +"1:	" #asm_op #mb "	%w3, %w2, %1\n"					\
> +"2:\n"									\
> +	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)				\
> +	: "+r" (ret), "+Q" (*uaddr), "=r" (oldval)			\
> +	: "r" (oparg)							\
> +	: "memory");							\
> +	uaccess_ttbr0_disable();					\
> +									\
> +	if (!ret)							\
> +		*oval = oldval;						\
> +									\
> +	return ret;							\
> +}
> +
> +LSUI_FUTEX_ATOMIC_OP(add, ldtadd, al)
> +LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
> +LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
> +LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> +
> +static __always_inline int
> +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
> +{
> +	return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
> +}
> +
> +static __always_inline int
> +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> +{
> +	unsigned int loops = LLSC_MAX_LOOPS;
> +	int ret, oldval, tmp;
> +
> +	uaccess_ttbr0_enable();
> +	/*
> +	 * there are no ldteor/stteor instructions...
> +	 */

*sigh*

Were these new instructions not added with futex in mind?

I wonder whether CAS would be better than exclusives for xor...

> +static __always_inline int
> +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +	int ret = 0;
> +	unsigned int loops = LLSC_MAX_LOOPS;
> +	u32 val, tmp;
> +
> +	uaccess_ttbr0_enable();
> +	/*
> +	 * cas{al}t doesn't support word size...
> +	 */

What about just aligning down and doing a 64-bit cas in that case?

Will
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Catalin Marinas 3 weeks ago
On Thu, Sep 11, 2025 at 04:22:24PM +0100, Will Deacon wrote:
> On Sat, Aug 16, 2025 at 04:19:29PM +0100, Yeoreum Yun wrote:
> > +static __always_inline int
> > +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> > +{
> > +	unsigned int loops = LLSC_MAX_LOOPS;
> > +	int ret, oldval, tmp;
> > +
> > +	uaccess_ttbr0_enable();
> > +	/*
> > +	 * there are no ldteor/stteor instructions...
> > +	 */
> 
> *sigh*
> 
> Were these new instructions not added with futex in mind?

I guess it was _most_ of the futex.

> I wonder whether CAS would be better than exclusives for xor...

I was first thinking we could share some of the code with
__futex_cmpxchg() but...

> > +static __always_inline int
> > +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > +{
> > +	int ret = 0;
> > +	unsigned int loops = LLSC_MAX_LOOPS;
> > +	u32 val, tmp;
> > +
> > +	uaccess_ttbr0_enable();
> > +	/*
> > +	 * cas{al}t doesn't support word size...
> > +	 */
> 
> What about just aligning down and doing a 64-bit cas in that case?

I think it gets more complicated. Here we get the oldval from the
caller, so no need to do a read. With CAS, we'd need to read the full
64-bit, replace half of it with oldval and newval just to be able to do
the operation. On top of this, we need to check which half of the 64-bit
value. I think it is too hairy for little benefit.

-- 
Catalin
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Yeoreum Yun 2 weeks, 5 days ago
Hi,

> On Thu, Sep 11, 2025 at 04:22:24PM +0100, Will Deacon wrote:
> > On Sat, Aug 16, 2025 at 04:19:29PM +0100, Yeoreum Yun wrote:
> > > +static __always_inline int
> > > +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> > > +{
> > > +	unsigned int loops = LLSC_MAX_LOOPS;
> > > +	int ret, oldval, tmp;
> > > +
> > > +	uaccess_ttbr0_enable();
> > > +	/*
> > > +	 * there are no ldteor/stteor instructions...
> > > +	 */
> >
> > *sigh*
> >
> > Were these new instructions not added with futex in mind?
>
> I guess it was _most_ of the futex.
>
> > I wonder whether CAS would be better than exclusives for xor...
>
> I was first thinking we could share some of the code with
> __futex_cmpxchg() but...
>
> > > +static __always_inline int
> > > +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > > +{
> > > +	int ret = 0;
> > > +	unsigned int loops = LLSC_MAX_LOOPS;
> > > +	u32 val, tmp;
> > > +
> > > +	uaccess_ttbr0_enable();
> > > +	/*
> > > +	 * cas{al}t doesn't support word size...
> > > +	 */
> >
> > What about just aligning down and doing a 64-bit cas in that case?
>
> I think it gets more complicated. Here we get the oldval from the
> caller, so no need to do a read. With CAS, we'd need to read the full
> 64-bit, replace half of it with oldval and newval just to be able to do
> the operation. On top of this, we need to check which half of the 64-bit
> value. I think it to hairy for little benefit.

Agreed. Also, an unrelated change to the other 32 bits could cause the
futex atomic operation to fail.

So, I'll keep the ll/sc method for cmpxchg and eor even when using LSUI.

Thanks!
--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 6/6] arm64: futex: support futex with FEAT_LSUI
Posted by Yeoreum Yun 3 weeks, 1 day ago
Hi Will,

> > Current futex atomic operations are implemented with ll/sc instructions
> > and clearing PSTATE.PAN.
> >
> > Since Armv9.6, FEAT_LSUI supplies not only load/store instructions but
> > also atomic operation for user memory access in kernel it doesn't need
> > to clear PSTATE.PAN bit anymore.
> >
> > With theses instructions some of futex atomic operations don't need to
> > be implmented with ldxr/stlxr pair instead can be implmented with
> > one atomic operation supplied by FEAT_LSUI.
> >
> > However, some of futex atomic operations still need to use ll/sc way
> > via ldtxr/stltxr supplied by FEAT_LSUI since there is no correspondant
> > atomic instruction or doesn't support word size operation.
> > (i.e) eor, cas{mb}t
> >
> > But It's good to work without clearing PSTATE.PAN bit.
> >
> > Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
> > ---
> >  arch/arm64/include/asm/futex.h | 130 ++++++++++++++++++++++++++++++++-
> >  1 file changed, 129 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > index 22a6301a9f3d..ece35ca9b5d9 100644
> > --- a/arch/arm64/include/asm/futex.h
> > +++ b/arch/arm64/include/asm/futex.h
> > @@ -9,6 +9,8 @@
> >  #include <linux/uaccess.h>
> >  #include <linux/stringify.h>
> >
> > +#include <asm/alternative.h>
> > +#include <asm/alternative-macros.h>
> >  #include <asm/errno.h>
> >
> >  #define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > @@ -115,11 +117,137 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> >  	return ret;
> >  }
> >
> > +#ifdef CONFIG_AS_HAS_LSUI
> > +
> > +#define __LSUI_PREAMBLE	".arch_extension lsui\n"
> > +
> > +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op, mb)				\
> > +static __always_inline int						\
> > +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> > +{									\
> > +	int ret = 0;							\
> > +	int oldval;							\
> > +									\
> > +	uaccess_ttbr0_enable();						\
> > +	asm volatile("// __lsui_futex_atomic_" #op "\n"			\
> > +	__LSUI_PREAMBLE							\
> > +"1:	" #asm_op #mb "	%w3, %w2, %1\n"					\
> > +"2:\n"									\
> > +	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)				\
> > +	: "+r" (ret), "+Q" (*uaddr), "=r" (oldval)			\
> > +	: "r" (oparg)							\
> > +	: "memory");							\
> > +	uaccess_ttbr0_disable();					\
> > +									\
> > +	if (!ret)							\
> > +		*oval = oldval;						\
> > +									\
> > +	return ret;							\
> > +}
> > +
> > +LSUI_FUTEX_ATOMIC_OP(add, ldtadd, al)
> > +LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
> > +LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
> > +LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> > +
> > +static __always_inline int
> > +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
> > +{
> > +	return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
> > +}
> > +
> > +static __always_inline int
> > +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> > +{
> > +	unsigned int loops = LLSC_MAX_LOOPS;
> > +	int ret, oldval, tmp;
> > +
> > +	uaccess_ttbr0_enable();
> > +	/*
> > +	 * there are no ldteor/stteor instructions...
> > +	 */
>
> *sigh*
>
> Were these new instructions not added with futex in mind?

Rather than futex, these instructions seem to have been designed for
atomic_op() (like a user-memory version of LSE)...

That's probably why there is no "eor" among them...

> I wonder whether CAS would be better than exclusives for xor...
>
> > +static __always_inline int
> > +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > +{
> > +	int ret = 0;
> > +	unsigned int loops = LLSC_MAX_LOOPS;
> > +	u32 val, tmp;
> > +
> > +	uaccess_ttbr0_enable();
> > +	/*
> > +	 * cas{al}t doesn't support word size...
> > +	 */
>
> What about just aligning down and doing a 64-bit cas in that case?

Even if cas{al}t were applied to futex_eor() and futex_cmpxchg(), I
think it would still need to check that the old value is the same as it
was at the time of the load. That means the routine would end up the
same as the LL/SC way, like:

again:
   oldval = uaddr;
   oldval2 = oldval
   cas uaddr, oldval2, newval
   if (oldval != oldval2)
     goto again;

With the CAS approach, attempting cmpxchg and returning -EAGAIN
immediately when the old value was different does not seem to match the
behavior of the former __llsc_futex_atomic_op().

This patch's intention is "not to change the former behavior" but only
to remove the change of PSTATE.

If this behavior change is acceptable,
I'll gladly replace them with the CAS-based implementation :)

Thanks!

--
Sincerely,
Yeoreum Yun