[PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation

Posted by Yeoreum Yun 1 month, 2 weeks ago
Refactor the futex atomic operations, which currently use the LL/SC
method with PSTATE.PAN cleared, to prepare for applying FEAT_LSUI to
them.

Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
---
 arch/arm64/include/asm/futex.h | 132 +++++++++++++++++++++------------
 1 file changed, 84 insertions(+), 48 deletions(-)

diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index bc06691d2062..ab7003cb4724 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -7,17 +7,21 @@

 #include <linux/futex.h>
 #include <linux/uaccess.h>
+#include <linux/stringify.h>

 #include <asm/errno.h>

-#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
+#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */

-#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)		\
-do {									\
-	unsigned int loops = FUTEX_MAX_LOOPS;				\
+#define LLSC_FUTEX_ATOMIC_OP(op, insn)					\
+static __always_inline int						\
+__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
+{									\
+	unsigned int loops = LLSC_MAX_LOOPS;				\
+	int ret, oldval, tmp;						\
 									\
 	uaccess_enable_privileged();					\
-	asm volatile(							\
+	asm volatile("// __llsc_futex_atomic_" #op "\n"			\
 "	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%w1, %2\n"						\
 	insn "\n"							\
@@ -35,45 +39,103 @@ do {									\
 	: "r" (oparg), "Ir" (-EAGAIN)					\
 	: "memory");							\
 	uaccess_disable_privileged();					\
-} while (0)
+									\
+	if (!ret)							\
+		*oval = oldval;						\
+									\
+	return ret;							\
+}
+
+LLSC_FUTEX_ATOMIC_OP(add, "add	%w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(or, "orr	%w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(and, "and	%w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(eor, "eor	%w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(set, "mov	%w3, %w5")
+
+static __always_inline int
+__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	int ret = 0;
+	unsigned int loops = LLSC_MAX_LOOPS;
+	u32 val, tmp;
+
+	uaccess_enable_privileged();
+	asm volatile("//__llsc_futex_cmpxchg\n"
+"	prfm	pstl1strm, %2\n"
+"1:	ldxr	%w1, %2\n"
+"	eor	%w3, %w1, %w5\n"
+"	cbnz	%w3, 4f\n"
+"2:	stlxr	%w3, %w6, %2\n"
+"	cbz	%w3, 3f\n"
+"	sub	%w4, %w4, %w3\n"
+"	cbnz	%w4, 1b\n"
+"	mov	%w0, %w7\n"
+"3:\n"
+"	dmb	ish\n"
+"4:\n"
+	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
+	_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
+	: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
+	: "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
+	: "memory");
+	uaccess_disable_privileged();
+
+	if (!ret)
+		*oval = val;
+
+	return ret;
+}
+
+#define FUTEX_ATOMIC_OP(op)						\
+static __always_inline int						\
+__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
+{									\
+	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
+}
+
+FUTEX_ATOMIC_OP(add)
+FUTEX_ATOMIC_OP(or)
+FUTEX_ATOMIC_OP(and)
+FUTEX_ATOMIC_OP(eor)
+FUTEX_ATOMIC_OP(set)
+
+static __always_inline int
+__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
+}

 static inline int
 arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
 {
-	int oldval = 0, ret, tmp;
-	u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
+	int ret;
+	u32 __user *uaddr;

 	if (!access_ok(_uaddr, sizeof(u32)))
 		return -EFAULT;

+	uaddr = __uaccess_mask_ptr(_uaddr);
+
 	switch (op) {
 	case FUTEX_OP_SET:
-		__futex_atomic_op("mov	%w3, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_set(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_ADD:
-		__futex_atomic_op("add	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_add(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_OR:
-		__futex_atomic_op("orr	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_or(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_ANDN:
-		__futex_atomic_op("and	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, ~oparg);
+		ret = __futex_atomic_and(~oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_XOR:
-		__futex_atomic_op("eor	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_eor(oparg, uaddr, oval);
 		break;
 	default:
 		ret = -ENOSYS;
 	}

-	if (!ret)
-		*oval = oldval;
-
 	return ret;
 }

@@ -81,40 +143,14 @@ static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
 			      u32 oldval, u32 newval)
 {
-	int ret = 0;
-	unsigned int loops = FUTEX_MAX_LOOPS;
-	u32 val, tmp;
 	u32 __user *uaddr;

 	if (!access_ok(_uaddr, sizeof(u32)))
 		return -EFAULT;

 	uaddr = __uaccess_mask_ptr(_uaddr);
-	uaccess_enable_privileged();
-	asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-"	prfm	pstl1strm, %2\n"
-"1:	ldxr	%w1, %2\n"
-"	sub	%w3, %w1, %w5\n"
-"	cbnz	%w3, 4f\n"
-"2:	stlxr	%w3, %w6, %2\n"
-"	cbz	%w3, 3f\n"
-"	sub	%w4, %w4, %w3\n"
-"	cbnz	%w4, 1b\n"
-"	mov	%w0, %w7\n"
-"3:\n"
-"	dmb	ish\n"
-"4:\n"
-	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
-	_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
-	: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
-	: "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
-	: "memory");
-	uaccess_disable_privileged();
-
-	if (!ret)
-		*uval = val;

-	return ret;
+	return __futex_cmpxchg(uaddr, oldval, newval, uval);
 }

 #endif /* __ASM_FUTEX_H */
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 3 weeks ago
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index bc06691d2062..ab7003cb4724 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -7,17 +7,21 @@
> 
>  #include <linux/futex.h>
>  #include <linux/uaccess.h>
> +#include <linux/stringify.h>
> 
>  #include <asm/errno.h>
> 
> -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */

I just noticed - you might as well leave the name as is here, especially
if in patch 6 you align down the address and use CAS on a 64-bit value as
per Will's comment (and it's no longer LL/SC). I think renaming this is
unnecessary.

-- 
Catalin
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 5 days ago
Hi Catalin,

> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > index bc06691d2062..ab7003cb4724 100644
> > --- a/arch/arm64/include/asm/futex.h
> > +++ b/arch/arm64/include/asm/futex.h
> > @@ -7,17 +7,21 @@
> >
> >  #include <linux/futex.h>
> >  #include <linux/uaccess.h>
> > +#include <linux/stringify.h>
> >
> >  #include <asm/errno.h>
> >
> > -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
>
> I just noticed - you might as well leave the name as is here, especially
> if in patch 6 you align down address and use CAS on a 64-bit value as
> per Will's comment (and it's no longer LLSC). I think renaming this is
> unnecessary.

Okay. I'll restore the original name.
But I think LSUI wouldn't be used with CAS, according to your comments
on patch 6, and additionally I think switching to CAS could cause a
failure because of a change to an unrelated field, e.g.:

struct user_structure {
	uint32 futex;
	uint32 some_value;
};

In this case, a change to some_value from the user side could make the
futex atomic operation fail.

So I think it would be better to keep the current LL/SC implementation
for LSUI.

Thanks.

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 2 weeks, 4 days ago
On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > > index bc06691d2062..ab7003cb4724 100644
> > > --- a/arch/arm64/include/asm/futex.h
> > > +++ b/arch/arm64/include/asm/futex.h
> > > @@ -7,17 +7,21 @@
> > >
> > >  #include <linux/futex.h>
> > >  #include <linux/uaccess.h>
> > > +#include <linux/stringify.h>
> > >
> > >  #include <asm/errno.h>
> > >
> > > -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > > +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> >
> > I just noticed - you might as well leave the name as is here, especially
> > if in patch 6 you align down address and use CAS on a 64-bit value as
> > per Will's comment (and it's no longer LLSC). I think renaming this is
> > unnecessary.
> 
> Okay. I'll restore to use origin name.
> But I think LSUI wouldn't be used with CAS according to patch 6's
> comments from you and additionally i think
> chaning the CAS would make a failure because of
> change of unrelated field. i.e)
> 
> struct user_structure{
>   uint32 futex;
>   uint32 some_value;
> };
> 
> In this case, the change of some_value from user side could make a
> failure of futex atomic operation.

Yes, but the loop would read 'some_value' again, fold in 'futex' and
retry. It would eventually succeed, or fail after 128 iterations if the
user keeps changing that location. Note that's also the case with LL/SC:
the exclusive monitor would be cleared by some store to the same cache
line (well, depending on the hardware implementation) and the STXR would
fail. From this perspective, CAS has a better chance of succeeding.

> So I think it would be better to keep the current LLSC implementation
> in LSUI.

I think the code would look simpler with LL/SC but you can give it a try
and post the code sample here (not in a new series).

BTW, is there a test suite for all the futex operations? The cover
letter did not mention any.

-- 
Catalin
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 4 days ago
Hi Catalin,

> > > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > > > index bc06691d2062..ab7003cb4724 100644
> > > > --- a/arch/arm64/include/asm/futex.h
> > > > +++ b/arch/arm64/include/asm/futex.h
> > > > @@ -7,17 +7,21 @@
> > > >
> > > >  #include <linux/futex.h>
> > > >  #include <linux/uaccess.h>
> > > > +#include <linux/stringify.h>
> > > >
> > > >  #include <asm/errno.h>
> > > >
> > > > -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > > > +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > >
> > > I just noticed - you might as well leave the name as is here, especially
> > > if in patch 6 you align down address and use CAS on a 64-bit value as
> > > per Will's comment (and it's no longer LLSC). I think renaming this is
> > > unnecessary.
> >
> > Okay. I'll restore to use origin name.
> > But I think LSUI wouldn't be used with CAS according to patch 6's
> > comments from you and additionally i think
> > chaning the CAS would make a failure because of
> > change of unrelated field. i.e)
> >
> > struct user_structure{
> >   uint32 futex;
> >   uint32 some_value;
> > };
> >
> > In this case, the change of some_value from user side could make a
> > failure of futex atomic operation.
>
> Yes but the loop would read 'some_value' again, fold in 'futex' and
> retry. It would eventually succeed or fail after 128 iterations if the
> user keeps changing that location. Note that's also the case with LL/SC,
> the exclusive monitor would be cleared by some store in the same cache
> line (well, depending on the hardware implementation) and the STXR fail.
> From this perspective, CAS has better chance of succeeding.

Oh, I see. Thanks for the insight ;)

>
> > So I think it would be better to keep the current LLSC implementation
> > in LSUI.
>
> I think the code would look simpler with LL/SC but you can give it a try
> and post the code sample here (not in a new series).

Okay. I'll try.

>
> BTW, is there a test suite for all the futex operations? The cover
> letter did not mention any.

I've tested with the selftests' futex testcases.
But since there is no test for each operation, even in LTP,
I tested it with an additional test of my own.

>
> --
> Catalin

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 2 weeks, 3 days ago
On Mon, Sep 15, 2025 at 11:34:07PM +0100, Yeoreum Yun wrote:
> with selftest's futex testcase, I've tested.
> But Since there is no test for each operation even in LTP,
> I tested it with additional test from me.

As a separate series, you may improve the futex kselftest to cover all
the cases.

-- 
Catalin
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Will Deacon 2 weeks, 4 days ago
On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > > > index bc06691d2062..ab7003cb4724 100644
> > > > --- a/arch/arm64/include/asm/futex.h
> > > > +++ b/arch/arm64/include/asm/futex.h
> > > > @@ -7,17 +7,21 @@
> > > >
> > > >  #include <linux/futex.h>
> > > >  #include <linux/uaccess.h>
> > > > +#include <linux/stringify.h>
> > > >
> > > >  #include <asm/errno.h>
> > > >
> > > > -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > > > +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> > >
> > > I just noticed - you might as well leave the name as is here, especially
> > > if in patch 6 you align down address and use CAS on a 64-bit value as
> > > per Will's comment (and it's no longer LLSC). I think renaming this is
> > > unnecessary.
> > 
> > Okay. I'll restore to use origin name.
> > But I think LSUI wouldn't be used with CAS according to patch 6's
> > comments from you and additionally i think
> > chaning the CAS would make a failure because of
> > change of unrelated field. i.e)
> > 
> > struct user_structure{
> >   uint32 futex;
> >   uint32 some_value;
> > };
> > 
> > In this case, the change of some_value from user side could make a
> > failure of futex atomic operation.
> 
> Yes but the loop would read 'some_value' again, fold in 'futex' and
> retry. It would eventually succeed or fail after 128 iterations if the
> user keeps changing that location. Note that's also the case with LL/SC,
> the exclusive monitor would be cleared by some store in the same cache
> line (well, depending on the hardware implementation) and the STXR fail.
> From this perspective, CAS has better chance of succeeding.
> 
> > So I think it would be better to keep the current LLSC implementation
> > in LSUI.
> 
> I think the code would look simpler with LL/SC but you can give it a try
> and post the code sample here (not in a new series).

If you stick the cas*t instruction in its own helper, say cmpxchg_user(),
then you can do all the shifting/masking in C and I don't reckon it's
that bad. It means we (a) get rid of exclusives, which is the whole
point of this and (b) don't have to mess around with PAN.

> BTW, is there a test suite for all the futex operations? The cover
> letter did not mention any.

I was thinking that too. I'm sure I remember a 'futextest' kicking
around when we did the arm64 port but nowadays there's something in
tools/testing/selftests/futex/ which might be better.

Will
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 2 weeks, 4 days ago
On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > So I think it would be better to keep the current LLSC implementation
> > > in LSUI.
> > 
> > I think the code would look simpler with LL/SC but you can give it a try
> > and post the code sample here (not in a new series).
> 
> If you stick the cas*t instruction in its own helper say, cmpxchg_user(),
> then you can do all the shifting/masking in C and I don't reckon it's
> that bad. It means we (a) get rid of exclusives, which is the whole
> point of this and (b) don't have to mess around with PAN.

We get rid of PAN toggling already since FEAT_LSUI introduces
LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
I think if we do a get_user() of a u64 and combine it with the futex u32
while taking care of CPU endianness. All in a loop. Hopefully the
compiler is smart enough to reduce masking/or'ing to fewer instructions.

-- 
Catalin
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 4 days ago
Hi,
> On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > > So I think it would be better to keep the current LLSC implementation
> > > > in LSUI.
> > >
> > > I think the code would look simpler with LL/SC but you can give it a try
> > > and post the code sample here (not in a new series).
> >
> > If you stick the cas*t instruction in its own helper say, cmpxchg_user(),
> > then you can do all the shifting/masking in C and I don't reckon it's
> > that bad. It means we (a) get rid of exclusives, which is the whole
> > point of this and (b) don't have to mess around with PAN.
>
> We get rid of PAN toggling already since FEAT_LSUI introduces
> LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
> I think if we do a get_user() of a u64 and combine it with the futex u32
> while taking care of CPU endianness. All in a loop. Hopefully the
> compiler is smart enough to reduce masking/or'ing to fewer instructions.
>

Sorry, my previous email was wrong.

Here is what I intended with the CAS operation:

diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 1d6d9f856ac5..0aeda7ced2c0 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
 LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
 LSUI_FUTEX_ATOMIC_OP(set, swpt, al)

+
+#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
+static __always_inline int                                                     \
+__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
+{                                                                              \
+       int ret = 0;                                                            \
+       u64 oval, nval, tmp;                                                    \
+                                                                               \
+       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
+       __LSUI_PREAMBLE                                                         \
+"      prfm    pstl1strm, %2\n"                                                \
+"1:    ldtr    %x1, %2\n"                                                      \
+"      mov     %x3, %x1\n"                                                     \
+"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
+"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
+"      mov     %x4, %x1\n"                                                     \
+"2:    caslt   %x1, %x3, %2\n"                                                 \
+"      sub     %x1, %x1, %x4\n"                                                \
+"      cbz     %x1, 3f\n"                                                      \
+"      mov     %w0, %w7\n"                                                     \
+"3:\n"                                                                         \
+"      dmb     ish\n"                                                          \
+"4:\n"                                                                         \
+       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
+       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
+       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
+       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
+       : "memory");                                                            \
+                                                                               \
+       return ret;                                                             \
+}
+
+LSUI_CMPXCHG_HELPER(lo, 0)
+LSUI_CMPXCHG_HELPER(hi, 32)
+
+static __always_inline int
+__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+       int ret;
+       unsigned long uaddr_al;
+
+       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
+
+       if (uaddr_al != (unsigned long)uaddr)
+               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
+       else
+               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
+
+       if (!ret)
+               *oval = oldval;
+
+       return ret;
+}
+
 static __always_inline int
 __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
 {
@@ -135,71 +189,25 @@ __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
 static __always_inline int
 __lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
 {
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       int ret, oldval, tmp;
+       int ret = -EAGAIN;
+       u32 oldval, newval;

        /*
         * there are no ldteor/stteor instructions...
         */
-       asm volatile("// __lsui_futex_atomic_eor\n"
-       __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"2:    stltxr  %w0, %w3, %2\n"
-"      cbz     %w0, 3f\n"
-"      sub     %w4, %w4, %w0\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w6\n"
-"3:\n"
-"      dmb     ish\n"
-       _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
-       : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
-         "+r" (loops)
-       : "r" (oparg), "Ir" (-EAGAIN)
-       : "memory");
+       unsafe_get_user(oldval, uaddr, err_fault);
+       newval = oldval ^ oparg;

-       if (!ret)
-               *oval = oldval;
+       ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);

+err_fault:
        return ret;
 }

 static __always_inline int
 __lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
 {
-       int ret = 0;
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       u32 val, tmp;
-
-       /*
-        * cas{al}t doesn't support word size...
-        */
-       asm volatile("//__lsui_futex_cmpxchg\n"
-       __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"      cbnz    %w3, 4f\n"
-"2:    stltxr  %w3, %w6, %2\n"
-"      cbz     %w3, 3f\n"
-"      sub     %w4, %w4, %w3\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w7\n"
-"3:\n"
-"      dmb     ish\n"
-"4:\n"
-       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
-       : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
-       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
-       : "memory");
-
-       if (!ret)
-               *oval = oldval;
-
-       return ret;
+       return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
 }

 #define __lsui_llsc_body(op, ...)                                      \

Am I missing something?

Thanks!

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Mark Rutland 2 weeks, 3 days ago
On Tue, Sep 16, 2025 at 11:02:19AM +0100, Yeoreum Yun wrote:
> Hi,
> > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > > > So I think it would be better to keep the current LLSC implementation
> > > > > in LSUI.
> > > >
> > > > I think the code would look simpler with LL/SC but you can give it a try
> > > > and post the code sample here (not in a new series).
> > >
> > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(),
> > > then you can do all the shifting/masking in C and I don't reckon it's
> > > that bad. It means we (a) get rid of exclusives, which is the whole
> > > point of this and (b) don't have to mess around with PAN.
> >
> > We get rid of PAN toggling already since FEAT_LSUI introduces
> > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
> > I think if we do a get_user() of a u64 and combine it with the futex u32
> > while taking care of CPU endianness. All in a loop. Hopefully the
> > compiler is smart enough to reduce masking/or'ing to fewer instructions.
> >
> 
> Sorry for my wrong previous email.
> 
> Here is what i intened with CAS operation:
> 
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index 1d6d9f856ac5..0aeda7ced2c0 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
>  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
>  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> 
> +
> +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> +static __always_inline int                                                     \
> +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> +{                                                                              \
> +       int ret = 0;                                                            \
> +       u64 oval, nval, tmp;                                                    \
> +                                                                               \
> +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> +       __LSUI_PREAMBLE                                                         \
> +"      prfm    pstl1strm, %2\n"                                                \
> +"1:    ldtr    %x1, %2\n"                                                      \
> +"      mov     %x3, %x1\n"                                                     \
> +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> +"      mov     %x4, %x1\n"                                                     \
> +"2:    caslt   %x1, %x3, %2\n"                                                 \
> +"      sub     %x1, %x1, %x4\n"                                                \
> +"      cbz     %x1, 3f\n"                                                      \
> +"      mov     %w0, %w7\n"                                                     \
> +"3:\n"                                                                         \
> +"      dmb     ish\n"                                                          \
> +"4:\n"                                                                         \
> +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> +       : "memory");                                                            \
> +                                                                               \
> +       return ret;                                                             \
> +}
> +
> +LSUI_CMPXCHG_HELPER(lo, 0)
> +LSUI_CMPXCHG_HELPER(hi, 32)
> +
> +static __always_inline int
> +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +       int ret;
> +       unsigned long uaddr_al;
> +
> +       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> +
> +       if (uaddr_al != (unsigned long)uaddr)
> +               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
> +       else
> +               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
> +
> +       if (!ret)
> +               *oval = oldval;
> +
> +       return ret;
> +}

I think Will expects that you do more of this in C, e.g. have a basic
user cmpxchg on a 64-bit type, e.g.

/*
 * NOTE: *oldp is NOT updated if a fault is taken.
 */
static __always_inline int
user_cmpxchg64_release(u64 __user *addr, u64 *oldp, u64 new)
{
	int err = 0;

	asm volatile(
	__LSUI_PREAMBLE
	"1:	caslt	%x[old], %x[new], %[addr]\n"
	"2:\n"
	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[err])
	: [addr] "+Q" (*addr),
	  [old] "+r" (*oldp),
	  [err] "+r" (err)
	: [new] "r" (new)
	: "memory"
	);

	return err;
}

That should be the *only* assembly you need to implement.

Atop that, have a wrapper that uses get_user() and that helper above to
implement the 32-bit user cmpxchg, with all the bit manipulation in C:

/*
 * NOTE: *oldp is NOT updated if a fault is taken.
 */
static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new)
{
	u64 __user *addr64 = PTR_ALIGN_DOWN(addr, sizeof(u64));
	u64 old64, new64;
	int ret;

	if (get_user(old64, addr64))
		return -EFAULT;

	/*
	 * TODO: merge '*oldp' into 'old64', and 'new' into 'new64',
	 * taking into account endianness and alignment
	 */

	ret = user_cmpxchg64_release(addr64, &old64, new64);
	if (ret)
		return ret;

	/*
	 * TODO: extract '*oldp' from 'old64', taking into account
	 * endianness and alignment.
	 */

	return 0;
}

Then use that 32-bit user cmpxchg for any cases which need it.

As the compiler will have visibility of all of the bit manipulation, it
will hopefully be able to fold that together appropriately.
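
For illustration only, one way those two TODOs might be filled in (a
sketch; the endianness/shift handling below is an assumption, not
something spelled out in this thread, and it relies on the
user_cmpxchg64_release() helper above):

static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new)
{
	u64 __user *addr64 = PTR_ALIGN_DOWN(addr, sizeof(u64));
	/* which half of the u64 holds the futex word */
	unsigned int shift = ((unsigned long)addr & sizeof(u32)) ? 32 : 0;
	u64 mask, old64, new64;
	int ret;

	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		shift ^= 32;	/* the lower address maps to the upper bits on BE */

	mask = (u64)U32_MAX << shift;

	if (get_user(old64, addr64))
		return -EFAULT;

	/* merge '*oldp' and 'new' into the surrounding 64-bit word */
	new64 = (old64 & ~mask) | ((u64)new << shift);
	old64 = (old64 & ~mask) | ((u64)*oldp << shift);

	ret = user_cmpxchg64_release(addr64, &old64, new64);
	if (ret)
		return ret;

	/* extract the futex value that the CAS observed */
	*oldp = (u32)(old64 >> shift);

	return 0;
}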

Mark.
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 3 days ago
Hi Mark,

[...]
> > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > index 1d6d9f856ac5..0aeda7ced2c0 100644
> > --- a/arch/arm64/include/asm/futex.h
> > +++ b/arch/arm64/include/asm/futex.h
> > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
> >  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
> >  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> >
> > +
> > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> > +static __always_inline int                                                     \
> > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> > +{                                                                              \
> > +       int ret = 0;                                                            \
> > +       u64 oval, nval, tmp;                                                    \
> > +                                                                               \
> > +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> > +       __LSUI_PREAMBLE                                                         \
> > +"      prfm    pstl1strm, %2\n"                                                \
> > +"1:    ldtr    %x1, %2\n"                                                      \
> > +"      mov     %x3, %x1\n"                                                     \
> > +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> > +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> > +"      mov     %x4, %x1\n"                                                     \
> > +"2:    caslt   %x1, %x3, %2\n"                                                 \
> > +"      sub     %x1, %x1, %x4\n"                                                \
> > +"      cbz     %x1, 3f\n"                                                      \
> > +"      mov     %w0, %w7\n"                                                     \
> > +"3:\n"                                                                         \
> > +"      dmb     ish\n"                                                          \
> > +"4:\n"                                                                         \
> > +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> > +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> > +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> > +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> > +       : "memory");                                                            \
> > +                                                                               \
> > +       return ret;                                                             \
> > +}
> > +
> > +LSUI_CMPXCHG_HELPER(lo, 0)
> > +LSUI_CMPXCHG_HELPER(hi, 32)
> > +
> > +static __always_inline int
> > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > +{
> > +       int ret;
> > +       unsigned long uaddr_al;
> > +
> > +       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> > +
> > +       if (uaddr_al != (unsigned long)uaddr)
> > +               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
> > +       else
> > +               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
> > +
> > +       if (!ret)
> > +               *oval = oldval;
> > +
> > +       return ret;
> > +}
>
> I think Will expects that you do more of this in C, e.g. have a basic
> user cmpxchg on a 64-bit type, e.g.
>
> /*
>  * NOTE: *oldp is NOT updated if a fault is taken.
>  */
> static __always_inline int
> user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new)
> {
> 	int err = 0;
>
> 	asm volatile(
> 	__LSUI_PREAMBLE
> 	"1:	caslt	%x[old], %x[new], %[addr]\n"
> 	"2:\n"
> 	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
> 	: [addr] "+Q" (addr),
> 	  [old] "+r" (*oldp)
> 	: [new] "r" (new)
> 	: "memory"
> 	);
>
> 	return err;
> }
>
> That should be the *only* assembly you need to implement.
>
> Atop that, have a wrapper that uses get_user() and that helper above to
> implement the 32-bit user cmpxchg, with all the bit manipulation in C:

Thanks for your suggestion, but a small question:
I think it's enough to use unsafe_get_user() instead of get_user() here,
since when FEAT_LSUI is enabled it doesn't need to call
uaccess_ttbr0_enable()/disable().

Or am I missing something?

>
> /*
>  * NOTE: *oldp is NOT updated if a fault is taken.
>  */
> static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new)
> {
> 	u64 __user *addr64 = PTR_ALIGN_DOWN(addr, sizeof(u64));
> 	u64 old64, new64;
> 	int ret;
>
> 	if (get_user(old64, uaddr64))
> 		return -EFAULT;
>
> 	/*
> 	 * TODO: merge '*oldp' into 'old64', and 'new' into 'new64',
> 	 * taking into account endianness and alignment
> 	 */
>
> 	ret = user_cmpxchg64_release(uaddr64, &old64, new64);
> 	if (ret)
> 		return ret;
>
> 	/*
> 	 * TODO: extract '*oldp' from 'old64', taking into account
> 	 * endianness and alignment.
> 	 */

I think the extraction isn't needed, since the futex cmpxchg op passes
the "oldval" argument, which has already been fetched and passed in.
So when it succeeds, it seems enough to return the passed argument.
But I'm not sure which is better: splitting the cmpxchg64 and
cmpxchg32 interfaces, or making a single helper -- user_cmpxchg_release
(__lsui_cmpxchg_helper).

[...]

Thanks.

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Mark Rutland 2 weeks, 3 days ago
On Tue, Sep 16, 2025 at 02:27:37PM +0100, Yeoreum Yun wrote:
> Hi Mark,
> 
> [...]
> > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > > index 1d6d9f856ac5..0aeda7ced2c0 100644
> > > --- a/arch/arm64/include/asm/futex.h
> > > +++ b/arch/arm64/include/asm/futex.h
> > > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
> > >  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
> > >  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> > >
> > > +
> > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> > > +static __always_inline int                                                     \
> > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> > > +{                                                                              \
> > > +       int ret = 0;                                                            \
> > > +       u64 oval, nval, tmp;                                                    \
> > > +                                                                               \
> > > +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> > > +       __LSUI_PREAMBLE                                                         \
> > > +"      prfm    pstl1strm, %2\n"                                                \
> > > +"1:    ldtr    %x1, %2\n"                                                      \
> > > +"      mov     %x3, %x1\n"                                                     \
> > > +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> > > +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> > > +"      mov     %x4, %x1\n"                                                     \
> > > +"2:    caslt   %x1, %x3, %2\n"                                                 \
> > > +"      sub     %x1, %x1, %x4\n"                                                \
> > > +"      cbz     %x1, 3f\n"                                                      \
> > > +"      mov     %w0, %w7\n"                                                     \
> > > +"3:\n"                                                                         \
> > > +"      dmb     ish\n"                                                          \
> > > +"4:\n"                                                                         \
> > > +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> > > +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> > > +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> > > +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> > > +       : "memory");                                                            \
> > > +                                                                               \
> > > +       return ret;                                                             \
> > > +}
> > > +
> > > +LSUI_CMPXCHG_HELPER(lo, 0)
> > > +LSUI_CMPXCHG_HELPER(hi, 32)
> > > +
> > > +static __always_inline int
> > > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > > +{
> > > +       int ret;
> > > +       unsigned long uaddr_al;
> > > +
> > > +       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> > > +
> > > +       if (uaddr_al != (unsigned long)uaddr)
> > > +               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
> > > +       else
> > > +               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
> > > +
> > > +       if (!ret)
> > > +               *oval = oldval;
> > > +
> > > +       return ret;
> > > +}
> >
> > I think Will expects that you do more of this in C, e.g. have a basic
> > user cmpxchg on a 64-bit type, e.g.
> >
> > /*
> >  * NOTE: *oldp is NOT updated if a fault is taken.
> >  */
> > static __always_inline int
> > user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new)
> > {
> > 	int err = 0;
> >
> > 	asm volatile(
> > 	__LSUI_PREAMBLE
> > 	"1:	caslt	%x[old], %x[new], %[addr]\n"
> > 	"2:\n"
> > 	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
> > 	: [addr] "+Q" (addr),
> > 	  [old] "+r" (*oldp)
> > 	: [new] "r" (new)
> > 	: "memory"
> > 	);
> >
> > 	return err;
> > }
> >
> > That should be the *only* assembly you need to implement.
> >
> > Atop that, have a wrapper that uses get_user() and that helper above to
> > implement the 32-bit user cmpxchg, with all the bit manipulation in C:
> 
> Thanks for your suggestion. But small question.
> I think it's enough to use usafe_get_user() instead of get_user() in here
> since when FEAT_LSUI enabled, it doeesn't need to call
> uaccess_ttbr0_enable()/disable().

Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable()
specifically, API-wise unsafe_get_user() is only supposed to be called
between user_access_begin() and user_access_end(), and there's some
stuff we probably want to add there (e.g. might_fault(), which
unsafe_get_user() lacks today).

Do we call those?
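
(For reference, the usual shape of the unsafe-accessor API -- illustrative
only, not code from this series:)

	if (!user_access_begin(uaddr, sizeof(u32)))
		return -EFAULT;
	unsafe_get_user(val, uaddr, Efault);
	/* ... further unsafe accesses / the user cmpxchg, same window ... */
	user_access_end();
	return 0;
Efault:
	user_access_end();
	return -EFAULT;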

Mark.
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 3 days ago
Hi Mark,

[...]
> > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > > > index 1d6d9f856ac5..0aeda7ced2c0 100644
> > > > --- a/arch/arm64/include/asm/futex.h
> > > > +++ b/arch/arm64/include/asm/futex.h
> > > > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
> > > >  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
> > > >  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> > > >
> > > > +
> > > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> > > > +static __always_inline int                                                     \
> > > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> > > > +{                                                                              \
> > > > +       int ret = 0;                                                            \
> > > > +       u64 oval, nval, tmp;                                                    \
> > > > +                                                                               \
> > > > +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> > > > +       __LSUI_PREAMBLE                                                         \
> > > > +"      prfm    pstl1strm, %2\n"                                                \
> > > > +"1:    ldtr    %x1, %2\n"                                                      \
> > > > +"      mov     %x3, %x1\n"                                                     \
> > > > +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> > > > +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> > > > +"      mov     %x4, %x1\n"                                                     \
> > > > +"2:    caslt   %x1, %x3, %2\n"                                                 \
> > > > +"      sub     %x1, %x1, %x4\n"                                                \
> > > > +"      cbz     %x1, 3f\n"                                                      \
> > > > +"      mov     %w0, %w7\n"                                                     \
> > > > +"3:\n"                                                                         \
> > > > +"      dmb     ish\n"                                                          \
> > > > +"4:\n"                                                                         \
> > > > +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> > > > +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> > > > +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> > > > +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> > > > +       : "memory");                                                            \
> > > > +                                                                               \
> > > > +       return ret;                                                             \
> > > > +}
> > > > +
> > > > +LSUI_CMPXCHG_HELPER(lo, 0)
> > > > +LSUI_CMPXCHG_HELPER(hi, 32)
> > > > +
> > > > +static __always_inline int
> > > > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > > > +{
> > > > +       int ret;
> > > > +       unsigned long uaddr_al;
> > > > +
> > > > +       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> > > > +
> > > > +       if (uaddr_al != (unsigned long)uaddr)
> > > > +               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
> > > > +       else
> > > > +               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
> > > > +
> > > > +       if (!ret)
> > > > +               *oval = oldval;
> > > > +
> > > > +       return ret;
> > > > +}
> > >
> > > I think Will expects that you do more of this in C, e.g. have a basic
> > > user cmpxchg on a 64-bit type, e.g.
> > >
> > > /*
> > >  * NOTE: *oldp is NOT updated if a fault is taken.
> > >  */
> > > static __always_inline int
> > > user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new)
> > > {
> > > 	int err = 0;
> > >
> > > 	asm volatile(
> > > 	__LSUI_PREAMBLE
> > > 	"1:	caslt	%x[old], %x[new], %[addr]\n"
> > > 	"2:\n"
> > > 	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
> > > 	: [addr] "+Q" (addr),
> > > 	  [old] "+r" (*oldp)
> > > 	: [new] "r" (new)
> > > 	: "memory"
> > > 	);
> > >
> > > 	return err;
> > > }
> > >
> > > That should be the *only* assembly you need to implement.
> > >
> > > Atop that, have a wrapper that uses get_user() and that helper above to
> > > implement the 32-bit user cmpxchg, with all the bit manipulation in C:
> >
> > Thanks for your suggestion. But small question.
> > I think it's enough to use usafe_get_user() instead of get_user() in here
> > since when FEAT_LSUI enabled, it doeesn't need to call
> > uaccess_ttbr0_enable()/disable().
>
> Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable()
> specifically, API-wise unsafe_get_user() is only supposed to be called
> between user_access_begin() and user_access_end(), and there's some
> stuff we probably want to add there (e.g. might_fault(), which
> unsafe_get_user() lacks today).
>
> Do we call those?

Yes when you're available.
As you mention, the difference seems to be might_fault(),
but I'm not sure whether that would be a reason to justify using
get_user() instead of unsafe_get_user(), given the extra "nop"
instructions from uaccess_ttbr0_enable()/disable() under LSUI,
other than for DEBUG purposes.

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Mark Rutland 2 weeks, 3 days ago
On Tue, Sep 16, 2025 at 02:58:16PM +0100, Yeoreum Yun wrote:
> Hi Mark,
> 
> [...]
> > > I think it's enough to use usafe_get_user() instead of get_user() in here
> > > since when FEAT_LSUI enabled, it doeesn't need to call
> > > uaccess_ttbr0_enable()/disable().
> >
> > Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable()
> > specifically, API-wise unsafe_get_user() is only supposed to be called
> > between user_access_begin() and user_access_end(), and there's some
> > stuff we probably want to add there (e.g. might_fault(), which
> > unsafe_get_user() lacks today).
> >
> > Do we call those?
> 
> Yes when you're available.
> As you mention, the difference seems might_fault(),
> But I'm not sure whether that would be a reason to validate to use
> get_user() instead of unsafe_get_user() taking a increase of instruction
> of "nop" -- uaccess_ttbr0_enable()/disable() in LSUI
> except the reason for DEUBG purpose.

I think the practical impact of those NOPs is going to be negligible, and
not worth optimizing for unless/until we have data demonstrating
otherwise.

If we want to strictly avoid those NOPs, I think that we should do a
more general cleanup, and e.g. have variants of user_access_begin() and
user_access_end() that do not mess with TTBR0. I don't think we need to
do that for this series.

For now, I think that you should either:

* Use get_user().

* Use user_access_begin() .. user_access_end() wrapping both
  unsafe_get_user() and the user cmpxchg.

Thanks,
Mark.
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 3 days ago
On Tue, Sep 16, 2025 at 03:07:09PM +0100, Mark Rutland wrote:
> On Tue, Sep 16, 2025 at 02:58:16PM +0100, Yeoreum Yun wrote:
> > Hi Mark,
> >
> > [...]
> > > > I think it's enough to use usafe_get_user() instead of get_user() in here
> > > > since when FEAT_LSUI enabled, it doeesn't need to call
> > > > uaccess_ttbr0_enable()/disable().
> > >
> > > Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable()
> > > specifically, API-wise unsafe_get_user() is only supposed to be called
> > > between user_access_begin() and user_access_end(), and there's some
> > > stuff we probably want to add there (e.g. might_fault(), which
> > > unsafe_get_user() lacks today).
> > >
> > > Do we call those?
> >
> > Yes when you're available.
> > As you mention, the difference seems might_fault(),
> > But I'm not sure whether that would be a reason to validate to use
> > get_user() instead of unsafe_get_user() taking a increase of instruction
> > of "nop" -- uaccess_ttbr0_enable()/disable() in LSUI
> > except the reason for DEUBG purpose.
>
> I think the practical impact of those NOPs is going to be neglible, and
> not worth optimizing for unless/until we have data demonstrating
> otherwise.
>
> If we want to strictly avoid those NOPs, I think that we should do a
> more general cleanup, and e.g. have variants of user_access_begin() and
> user_access_end() that do not mess with TTBR0. I don't think we need to
> do that for this series.
>
> For now, I think that you should either:
>
> * Use get_user().
>
> * Use user_access_begin() .. user_access_end() wrapping both
>   unsafe_get_user() and the user cmpxchg.

Understood. I'll use get_user() in this series after
getting some more comments from others about the C version I sent
before your suggestion (sorry for missing your email before I sent it).

Thanks :D

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Will Deacon 2 weeks, 4 days ago
On Tue, Sep 16, 2025 at 11:02:19AM +0100, Yeoreum Yun wrote:
> Hi,
> > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > > > So I think it would be better to keep the current LLSC implementation
> > > > > in LSUI.
> > > >
> > > > I think the code would look simpler with LL/SC but you can give it a try
> > > > and post the code sample here (not in a new series).
> > >
> > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(),
> > > then you can do all the shifting/masking in C and I don't reckon it's
> > > that bad. It means we (a) get rid of exclusives, which is the whole
> > > point of this and (b) don't have to mess around with PAN.
> >
> > We get rid of PAN toggling already since FEAT_LSUI introduces
> > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
> > I think if we do a get_user() of a u64 and combine it with the futex u32
> > while taking care of CPU endianness. All in a loop. Hopefully the
> > compiler is smart enough to reduce masking/or'ing to fewer instructions.
> >
> 
> Sorry for my wrong previous email.
> 
> Here is what i intened with CAS operation:
> 
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index 1d6d9f856ac5..0aeda7ced2c0 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
>  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
>  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
> 
> +
> +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> +static __always_inline int                                                     \
> +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> +{                                                                              \
> +       int ret = 0;                                                            \
> +       u64 oval, nval, tmp;                                                    \
> +                                                                               \
> +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> +       __LSUI_PREAMBLE                                                         \
> +"      prfm    pstl1strm, %2\n"                                                \
> +"1:    ldtr    %x1, %2\n"                                                      \
> +"      mov     %x3, %x1\n"                                                     \
> +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> +"      mov     %x4, %x1\n"                                                     \
> +"2:    caslt   %x1, %x3, %2\n"                                                 \
> +"      sub     %x1, %x1, %x4\n"                                                \
> +"      cbz     %x1, 3f\n"                                                      \
> +"      mov     %w0, %w7\n"                                                     \
> +"3:\n"                                                                         \
> +"      dmb     ish\n"                                                          \
> +"4:\n"                                                                         \
> +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> +       : "memory");                                                            \

The vast majority of this can be written in C.

Will
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 3 days ago
Hi,

[...]
> > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> > +static __always_inline int                                                     \
> > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> > +{                                                                              \
> > +       int ret = 0;                                                            \
> > +       u64 oval, nval, tmp;                                                    \
> > +                                                                               \
> > +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> > +       __LSUI_PREAMBLE                                                         \
> > +"      prfm    pstl1strm, %2\n"                                                \
> > +"1:    ldtr    %x1, %2\n"                                                      \
> > +"      mov     %x3, %x1\n"                                                     \
> > +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> > +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> > +"      mov     %x4, %x1\n"                                                     \
> > +"2:    caslt   %x1, %x3, %2\n"                                                 \
> > +"      sub     %x1, %x1, %x4\n"                                                \
> > +"      cbz     %x1, 3f\n"                                                      \
> > +"      mov     %w0, %w7\n"                                                     \
> > +"3:\n"                                                                         \
> > +"      dmb     ish\n"                                                          \
> > +"4:\n"                                                                         \
> > +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> > +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> > +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> > +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> > +       : "memory");                                                            \
>
> The vast majority of this can be written in C.

Here is the C version, based on patch 6:

diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 1d6d9f856ac5..68af15ba545a 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -127,81 +127,77 @@ LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
 LSUI_FUTEX_ATOMIC_OP(set, swpt, al)

 static __always_inline int
-__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
+__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
 {
-       return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
-}
+       int ret = -EAGAIN;
+       u64 __user *uaddr_al;
+       u64 oval64, nval64, tmp;
+       static const u64 lo32_mask = GENMASK_U64(31, 0);
+
+       uaddr_al = (u64 __user *) ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
+       unsafe_get_user(oval64, uaddr_al, err_fault);
+
+       if ((u32 __user *)uaddr_al != uaddr) {
+               nval64 = ((oval64 & lo32_mask) | ((u64)newval << 32));
+               oval64 = ((oval64 & lo32_mask) | ((u64)oldval << 32));
+       } else {
+               nval64 = ((oval64 & ~lo32_mask) | newval);
+               oval64 = ((oval64 & ~lo32_mask) | oldval);
+       }

-static __always_inline int
-__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
-{
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       int ret, oldval, tmp;
+       tmp = oval64;

-       /*
-        * there are no ldteor/stteor instructions...
-        */
-       asm volatile("// __lsui_futex_atomic_eor\n"
+       asm volatile("//__lsui_cmpxchg_helper\n"
        __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"2:    stltxr  %w0, %w3, %2\n"
-"      cbz     %w0, 3f\n"
-"      sub     %w4, %w4, %w0\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w6\n"
-"3:\n"
+"1:    caslt   %x1, %x3, %2\n"
+"      sub     %x1, %x1, %x4\n"
+"      cbnz    %x1, 2f\n"
+"      mov     %w0, %w5\n"
+"2:\n"
 "      dmb     ish\n"
+"3:\n"
        _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
-       : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
-         "+r" (loops)
-       : "r" (oparg), "Ir" (-EAGAIN)
+       : "+r" (ret), "=&r" (oval64), "+Q" (*uaddr_al)
+       : "r" (nval64), "r" (tmp), "Ir" (0)
        : "memory");

        if (!ret)
                *oval = oldval;

+err_fault:
        return ret;
 }

 static __always_inline int
-__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
 {
-       int ret = 0;
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       u32 val, tmp;
+       return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
+}
+
+static __always_inline int
+__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
+{
+       int ret = -EAGAIN;
+       u32 oldval, newval;

        /*
-        * cas{al}t doesn't support word size...
+        * there are no ldteor/stteor instructions...
         */
-       asm volatile("//__lsui_futex_cmpxchg\n"
-       __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"      cbnz    %w3, 4f\n"
-"2:    stltxr  %w3, %w6, %2\n"
-"      cbz     %w3, 3f\n"
-"      sub     %w4, %w4, %w3\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w7\n"
-"3:\n"
-"      dmb     ish\n"
-"4:\n"
-       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
-       : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
-       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
-       : "memory");
+       unsafe_get_user(oldval, uaddr, err_fault);
+       newval = oldval ^ oparg;

-       if (!ret)
-               *oval = oldval;
+       ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);

+err_fault:
        return ret;
 }

+static __always_inline int
+__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+       return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
+}
+
 #define __lsui_llsc_body(op, ...)                                      \
 ({                                                                     \
        alternative_has_cap_likely(ARM64_HAS_LSUI) ?                    \

I'm not sure whether this looks good to you, but if you could share your
thoughts, that would be great.
(Note:
  when I tested the futex_atomic_eor op with 256 threads, there was not much
  difference from the former assembly version.)

Thanks!

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 3 days ago
Hi all,

> Hi,
>
> [...]
> > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> > > +static __always_inline int                                                     \
> > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> > > +{                                                                              \
> > > +       int ret = 0;                                                            \
> > > +       u64 oval, nval, tmp;                                                    \
> > > +                                                                               \
> > > +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> > > +       __LSUI_PREAMBLE                                                         \
> > > +"      prfm    pstl1strm, %2\n"                                                \
> > > +"1:    ldtr    %x1, %2\n"                                                      \
> > > +"      mov     %x3, %x1\n"                                                     \
> > > +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> > > +"      bfi     %x3, %x6, #" #start_bit ", #32\n"                               \
> > > +"      mov     %x4, %x1\n"                                                     \
> > > +"2:    caslt   %x1, %x3, %2\n"                                                 \
> > > +"      sub     %x1, %x1, %x4\n"                                                \
> > > +"      cbz     %x1, 3f\n"                                                      \
> > > +"      mov     %w0, %w7\n"                                                     \
> > > +"3:\n"                                                                         \
> > > +"      dmb     ish\n"                                                          \
> > > +"4:\n"                                                                         \
> > > +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> > > +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> > > +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> > > +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> > > +       : "memory");                                                            \
> >
> > The vast majority of this can be written in C.
>
> Here is the C version, based on patch 6:
>
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index 1d6d9f856ac5..68af15ba545a 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -127,81 +127,77 @@ LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
>  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
>
>  static __always_inline int
> -__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
> +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
>  {
> -       return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
> -}
> +       int ret = -EAGAIN;
> +       u64 __user *uaddr_al;
> +       u64 oval64, nval64, tmp;
> +       static const u64 lo32_mask = GENMASK_U64(31, 0);
> +
> +       uaddr_al = (u64 __user *) ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> +       unsafe_get_user(oval64, uaddr_al, err_fault);
> +
> +       if ((u32 __user *)uaddr_al != uaddr) {
> +               nval64 = ((oval64 & lo32_mask) | ((u64)newval << 32));
> +               oval64 = ((oval64 & lo32_mask) | ((u64)oldval << 32));
> +       } else {
> +               nval64 = ((oval64 & ~lo32_mask) | newval);
> +               oval64 = ((oval64 & ~lo32_mask) | oldval);
> +       }
>
> -static __always_inline int
> -__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> -{
> -       unsigned int loops = FUTEX_MAX_LOOPS;
> -       int ret, oldval, tmp;
> +       tmp = oval64;
>
> -       /*
> -        * there are no ldteor/stteor instructions...
> -        */
> -       asm volatile("// __lsui_futex_atomic_eor\n"
> +       asm volatile("//__lsui_cmpxchg_helper\n"
>         __LSUI_PREAMBLE
> -"      prfm    pstl1strm, %2\n"
> -"1:    ldtxr   %w1, %2\n"
> -"      eor     %w3, %w1, %w5\n"
> -"2:    stltxr  %w0, %w3, %2\n"
> -"      cbz     %w0, 3f\n"
> -"      sub     %w4, %w4, %w0\n"
> -"      cbnz    %w4, 1b\n"
> -"      mov     %w0, %w6\n"
> -"3:\n"
> +"1:    caslt   %x1, %x3, %2\n"
> +"      sub     %x1, %x1, %x4\n"
> +"      cbnz    %x1, 2f\n"
> +"      mov     %w0, %w5\n"
> +"2:\n"
>  "      dmb     ish\n"
> +"3:\n"
>         _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
> -       _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
> -       : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
> -         "+r" (loops)
> -       : "r" (oparg), "Ir" (-EAGAIN)
> +       : "+r" (ret), "=&r" (oval64), "+Q" (*uaddr_al)
> +       : "r" (nval64), "r" (tmp), "Ir" (0)
>         : "memory");
>
>         if (!ret)
>                 *oval = oldval;
>
> +err_fault:
>         return ret;
>  }
>
>  static __always_inline int
> -__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
>  {
> -       int ret = 0;
> -       unsigned int loops = FUTEX_MAX_LOOPS;
> -       u32 val, tmp;
> +       return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
> +}
> +
> +static __always_inline int
> +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
> +{
> +       int ret = -EAGAIN;
> +       u32 oldval, newval;
>
>         /*
> -        * cas{al}t doesn't support word size...
> +        * there are no ldteor/stteor instructions...
>          */
> -       asm volatile("//__lsui_futex_cmpxchg\n"
> -       __LSUI_PREAMBLE
> -"      prfm    pstl1strm, %2\n"
> -"1:    ldtxr   %w1, %2\n"
> -"      eor     %w3, %w1, %w5\n"
> -"      cbnz    %w3, 4f\n"
> -"2:    stltxr  %w3, %w6, %2\n"
> -"      cbz     %w3, 3f\n"
> -"      sub     %w4, %w4, %w3\n"
> -"      cbnz    %w4, 1b\n"
> -"      mov     %w0, %w7\n"
> -"3:\n"
> -"      dmb     ish\n"
> -"4:\n"
> -       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
> -       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
> -       : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
> -       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
> -       : "memory");
> +       unsafe_get_user(oldval, uaddr, err_fault);
> +       newval = oldval ^ oparg;
>
> -       if (!ret)
> -               *oval = oldval;
> +       ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
>
> +err_fault:
>         return ret;
>  }
>
> +static __always_inline int
> +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +       return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
> +}
> +
>  #define __lsui_llsc_body(op, ...)                                      \
>  ({                                                                     \
>         alternative_has_cap_likely(ARM64_HAS_LSUI) ?                    \
>
> I'm not sure whether this looks good to you, but if you could share your
> thoughts, that would be great.
> (Note:
>   when I tested the futex_atomic_eor op with 256 threads, there was not much
>   difference from the former assembly version.)
>

It looks like the discussion is converging on a C version of cmpxchg.
I'll respin the cmpxchg as a C version, adding the missing endianness support.
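
As a rough illustration of the endianness part (a sketch only, with a made-up
helper name, not the respin itself), the futex word's position inside the
aligned u64 could be derived like this:

static inline unsigned int futex_u64_shift(u32 __user *uaddr)
{
	bool upper_word = ((unsigned long)uaddr & sizeof(u32)) != 0;

	/*
	 * A 64-bit load puts the lower-addressed 32-bit half in the low
	 * register bits on little-endian but in the high bits on
	 * big-endian, so the bfi/mask offset has to flip accordingly.
	 */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		return upper_word ? 0 : 32;

	return upper_word ? 32 : 0;
}

oldval/newval would then be inserted at futex_u64_shift(uaddr) instead of
picking 0 or 32 from the address alone.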

Thanks.

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 4 days ago
Hi,

> On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > > So I think it would be better to keep the current LLSC implementation
> > > > in LSUI.
> > >
> > > I think the code would look simpler with LL/SC but you can give it a try
> > > and post the code sample here (not in a new series).
> >
> > If you stick the cas*t instruction in its own helper, say cmpxchg_user(),
> > then you can do all the shifting/masking in C and I don't reckon it's
> > that bad. It means we (a) get rid of exclusives, which is the whole
> > point of this and (b) don't have to mess around with PAN.
>
> We get rid of PAN toggling already since FEAT_LSUI introduces
> LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
> I think if we do a get_user() of a u64 and combine it with the futex u32
> while taking care of CPU endianness. All in a loop. Hopefully the
> compiler is smart enough to reduce masking/or'ing to fewer instructions.
>

Hmm, I think the shifting/masking can be replaced by a single bfi
instruction like:

diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 1d6d9f856ac5..30da0006c0c8 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -126,6 +126,59 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
 LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
 LSUI_FUTEX_ATOMIC_OP(set, swpt, al)

+
+#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
+static __always_inline int                                                     \
+__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
+{                                                                              \
+       int ret = 0;                                                            \
+       u64 oval, nval, tmp;                                                    \
+                                                                               \
+       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
+       __LSUI_PREAMBLE                                                         \
+"      prfm    pstl1strm, %2\n"                                                \
+"1:    ldtr    %x1, %2\n"                                                      \
+"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
+"      bfi     %x1, %x6, #" #start_bit ", #32\n"                               \
+"      mov     %x4, %x5\n"                                                     \
+"2:    caslt   %x5, %x6, %2\n"                                                 \
+"      sub     %x4, %x4, %x5\n"                                                \
+"      cbz     %x4, 3f\n"                                                      \
+"      mov     %w0, %w7\n"                                                     \
+"3:\n"                                                                         \
+"      dmb     ish\n"                                                          \
+"4:\n"                                                                         \
+       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
+       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
+       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
+       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
+       : "memory");                                                            \
+                                                                               \
+       return ret;                                                             \
+}
+
+LSUI_CMPXCHG_HELPER(lo, 0)
+LSUI_CMPXCHG_HELPER(hi, 32)
+
+static __always_inline int
+__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+       int ret;
+       unsigned long uaddr_al;
+
+       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
+
+       if (uaddr_al != (unsigned long)uaddr)
+               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
+       else
+               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
+
+       if (!ret)
+               *oval = oldval;
+
+       return ret;
+}
+
 static __always_inline int
 __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
 {
@@ -135,71 +188,25 @@ __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
 static __always_inline int
 __lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
 {
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       int ret, oldval, tmp;
+       int ret = -EAGAIN;
+       u32 oldval, newval;

        /*
         * there are no ldteor/stteor instructions...
         */
-       asm volatile("// __lsui_futex_atomic_eor\n"
-       __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"2:    stltxr  %w0, %w3, %2\n"
-"      cbz     %w0, 3f\n"
-"      sub     %w4, %w4, %w0\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w6\n"
-"3:\n"
-"      dmb     ish\n"
-       _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
-       : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
-         "+r" (loops)
-       : "r" (oparg), "Ir" (-EAGAIN)
-       : "memory");
+       unsafe_get_user(oldval, uaddr, err_fault);
+       newval = oldval ^ oparg;

-       if (!ret)
-               *oval = oldval;
+       ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);

+err_fault:
        return ret;
 }

 static __always_inline int
 __lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
 {
-       int ret = 0;
-       unsigned int loops = FUTEX_MAX_LOOPS;
-       u32 val, tmp;
-
-       /*
-        * cas{al}t doesn't support word size...
-        */
-       asm volatile("//__lsui_futex_cmpxchg\n"
-       __LSUI_PREAMBLE
-"      prfm    pstl1strm, %2\n"
-"1:    ldtxr   %w1, %2\n"
-"      eor     %w3, %w1, %w5\n"
-"      cbnz    %w3, 4f\n"
-"2:    stltxr  %w3, %w6, %2\n"
-"      cbz     %w3, 3f\n"
-"      sub     %w4, %w4, %w3\n"
-"      cbnz    %w4, 1b\n"
-"      mov     %w0, %w7\n"
-"3:\n"
-"      dmb     ish\n"
-"4:\n"
-       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
-       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
-       : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
-       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
-       : "memory");
-
-       if (!ret)
-               *oval = oldval;
-
-       return ret;
+       return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
 }

 #define __lsui_llsc_body(op, ...)                                      \


This is based on patch #6.

Am I missing something?

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 4 days ago
Sorry, ignore this. I've sent the wrong version :(
I'll send it again.

> Hi,
>
> > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > > > So I think it would be better to keep the current LLSC implementation
> > > > > in LSUI.
> > > >
> > > > I think the code would look simpler with LL/SC but you can give it a try
> > > > and post the code sample here (not in a new series).
> > >
> > > If you stick the cas*t instruction in its own helper, say cmpxchg_user(),
> > > then you can do all the shifting/masking in C and I don't reckon it's
> > > that bad. It means we (a) get rid of exclusives, which is the whole
> > > point of this and (b) don't have to mess around with PAN.
> >
> > We get rid of PAN toggling already since FEAT_LSUI introduces
> > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
> > I think if we do a get_user() of a u64 and combine it with the futex u32
> > while taking care of CPU endianness. All in a loop. Hopefully the
> > compiler is smart enough to reduce masking/or'ing to fewer instructions.
> >
>
> Hmm, I think the shifting/masking can be replaced by a single bfi
> instruction like:
>
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index 1d6d9f856ac5..30da0006c0c8 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -126,6 +126,59 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al)
>  LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al)
>  LSUI_FUTEX_ATOMIC_OP(set, swpt, al)
>
> +
> +#define LSUI_CMPXCHG_HELPER(suffix, start_bit)                                 \
> +static __always_inline int                                                     \
> +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval)      \
> +{                                                                              \
> +       int ret = 0;                                                            \
> +       u64 oval, nval, tmp;                                                    \
> +                                                                               \
> +       asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n"                    \
> +       __LSUI_PREAMBLE                                                         \
> +"      prfm    pstl1strm, %2\n"                                                \
> +"1:    ldtr    %x1, %2\n"                                                      \
> +"      bfi     %x1, %x5, #" #start_bit ", #32\n"                               \
> +"      bfi     %x1, %x6, #" #start_bit ", #32\n"                               \
> +"      mov     %x4, %x5\n"                                                     \
> +"2:    caslt   %x5, %x6, %2\n"                                                 \
> +"      sub     %x4, %x4, %x5\n"                                                \
> +"      cbz     %x4, 3f\n"                                                      \
> +"      mov     %w0, %w7\n"                                                     \
> +"3:\n"                                                                         \
> +"      dmb     ish\n"                                                          \
> +"4:\n"                                                                         \
> +       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)                                   \
> +       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)                                   \
> +       : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp)    \
> +       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)                            \
> +       : "memory");                                                            \
> +                                                                               \
> +       return ret;                                                             \
> +}
> +
> +LSUI_CMPXCHG_HELPER(lo, 0)
> +LSUI_CMPXCHG_HELPER(hi, 32)
> +
> +static __always_inline int
> +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +       int ret;
> +       unsigned long uaddr_al;
> +
> +       uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64));
> +
> +       if (uaddr_al != (unsigned long)uaddr)
> +               ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval);
> +       else
> +               ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval);
> +
> +       if (!ret)
> +               *oval = oldval;
> +
> +       return ret;
> +}
> +
>  static __always_inline int
>  __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
>  {
> @@ -135,71 +188,25 @@ __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
>  static __always_inline int
>  __lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
>  {
> -       unsigned int loops = FUTEX_MAX_LOOPS;
> -       int ret, oldval, tmp;
> +       int ret = -EAGAIN;
> +       u32 oldval, newval;
>
>         /*
>          * there are no ldteor/stteor instructions...
>          */
> -       asm volatile("// __lsui_futex_atomic_eor\n"
> -       __LSUI_PREAMBLE
> -"      prfm    pstl1strm, %2\n"
> -"1:    ldtxr   %w1, %2\n"
> -"      eor     %w3, %w1, %w5\n"
> -"2:    stltxr  %w0, %w3, %2\n"
> -"      cbz     %w0, 3f\n"
> -"      sub     %w4, %w4, %w0\n"
> -"      cbnz    %w4, 1b\n"
> -"      mov     %w0, %w6\n"
> -"3:\n"
> -"      dmb     ish\n"
> -       _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
> -       _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
> -       : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
> -         "+r" (loops)
> -       : "r" (oparg), "Ir" (-EAGAIN)
> -       : "memory");
> +       unsafe_get_user(oldval, uaddr, err_fault);
> +       newval = oldval ^ oparg;
>
> -       if (!ret)
> -               *oval = oldval;
> +       ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
>
> +err_fault:
>         return ret;
>  }
>
>  static __always_inline int
>  __lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
>  {
> -       int ret = 0;
> -       unsigned int loops = FUTEX_MAX_LOOPS;
> -       u32 val, tmp;
> -
> -       /*
> -        * cas{al}t doesn't support word size...
> -        */
> -       asm volatile("//__lsui_futex_cmpxchg\n"
> -       __LSUI_PREAMBLE
> -"      prfm    pstl1strm, %2\n"
> -"1:    ldtxr   %w1, %2\n"
> -"      eor     %w3, %w1, %w5\n"
> -"      cbnz    %w3, 4f\n"
> -"2:    stltxr  %w3, %w6, %2\n"
> -"      cbz     %w3, 3f\n"
> -"      sub     %w4, %w4, %w3\n"
> -"      cbnz    %w4, 1b\n"
> -"      mov     %w0, %w7\n"
> -"3:\n"
> -"      dmb     ish\n"
> -"4:\n"
> -       _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
> -       _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
> -       : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
> -       : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
> -       : "memory");
> -
> -       if (!ret)
> -               *oval = oldval;
> -
> -       return ret;
> +       return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval);
>  }
>
>  #define __lsui_llsc_body(op, ...)                                      \
>
>
> This is based on patch #6.
>
> Am I missing something?
>
> --
> Sincerely,
> Yeoreum Yun
>

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 3 weeks ago
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> +#define FUTEX_ATOMIC_OP(op)						\
> +static __always_inline int						\
> +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
> +{									\
> +	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
> +}
> +
> +FUTEX_ATOMIC_OP(add)
> +FUTEX_ATOMIC_OP(or)
> +FUTEX_ATOMIC_OP(and)
> +FUTEX_ATOMIC_OP(eor)
> +FUTEX_ATOMIC_OP(set)
> +
> +static __always_inline int
> +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +	return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
> +}

The patch looks fine, and my impression is that the __futex_* functions will
be used in patch 6 to dispatch between lsui and llsc. But you got them
the other way around. I'll comment there. For this patch:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 2 weeks, 5 days ago
> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > +#define FUTEX_ATOMIC_OP(op)						\
> > +static __always_inline int						\
> > +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
> > +{									\
> > +	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
> > +}
> > +
> > +FUTEX_ATOMIC_OP(add)
> > +FUTEX_ATOMIC_OP(or)
> > +FUTEX_ATOMIC_OP(and)
> > +FUTEX_ATOMIC_OP(eor)
> > +FUTEX_ATOMIC_OP(set)
> > +
> > +static __always_inline int
> > +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > +{
> > +	return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
> > +}
>
> The patch looks fine, and my impression is that the __futex_* functions will
> be used in patch 6 to dispatch between lsui and llsc. But you got them
> the other way around. I'll comment there. For this patch:
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!

--
Sincerely,
Yeoreum Yun
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Catalin Marinas 3 weeks ago
On Fri, Sep 12, 2025 at 05:44:15PM +0100, Catalin Marinas wrote:
> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > +#define FUTEX_ATOMIC_OP(op)						\
> > +static __always_inline int						\
> > +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
> > +{									\
> > +	return __llsc_futex_atomic_##op(oparg, uaddr, oval);		\
> > +}
> > +
> > +FUTEX_ATOMIC_OP(add)
> > +FUTEX_ATOMIC_OP(or)
> > +FUTEX_ATOMIC_OP(and)
> > +FUTEX_ATOMIC_OP(eor)
> > +FUTEX_ATOMIC_OP(set)
> > +
> > +static __always_inline int
> > +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> > +{
> > +	return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
> > +}
> 
> The patch looks fine, and my impression is that the __futex_* functions will
> be used in patch 6 to dispatch between lsui and llsc. But you got them
> the other way around. I'll comment there.

Ah, ignore me, I misread patch 6. It looks good.

> For this patch:
> 
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Will Deacon 3 weeks, 1 day ago
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> Refactor futex atomic operations using ll/sc method with
> clearing PSTATE.PAN to prepare to apply FEAT_LSUI on them.
> 
> Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
> ---
>  arch/arm64/include/asm/futex.h | 132 +++++++++++++++++++++------------
>  1 file changed, 84 insertions(+), 48 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index bc06691d2062..ab7003cb4724 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -7,17 +7,21 @@
> 
>  #include <linux/futex.h>
>  #include <linux/uaccess.h>
> +#include <linux/stringify.h>
> 
>  #include <asm/errno.h>
> 
> -#define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
> +#define LLSC_MAX_LOOPS	128 /* What's the largest number you can think of? */
> 
> -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)		\
> -do {									\
> -	unsigned int loops = FUTEX_MAX_LOOPS;				\
> +#define LLSC_FUTEX_ATOMIC_OP(op, insn)					\
> +static __always_inline int						\
> +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> +{									\
> +	unsigned int loops = LLSC_MAX_LOOPS;				\
> +	int ret, oldval, tmp;						\
>  									\
>  	uaccess_enable_privileged();					\
> -	asm volatile(							\
> +	asm volatile("// __llsc_futex_atomic_" #op "\n"			\
>  "	prfm	pstl1strm, %2\n"					\
>  "1:	ldxr	%w1, %2\n"						\
>  	insn "\n"							\
> @@ -35,45 +39,103 @@ do {									\
>  	: "r" (oparg), "Ir" (-EAGAIN)					\
>  	: "memory");							\
>  	uaccess_disable_privileged();					\
> -} while (0)
> +									\
> +	if (!ret)							\
> +		*oval = oldval;						\

Why push the store to '*oval' down into here?

Will
Re: [PATCH RESEND v7 4/6] arm64: futex: refactor futex atomic operation
Posted by Yeoreum Yun 3 weeks, 1 day ago
Hi Will,

[...]

> > -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)		\
> > -do {									\
> > -	unsigned int loops = FUTEX_MAX_LOOPS;				\
> > +#define LLSC_FUTEX_ATOMIC_OP(op, insn)					\
> > +static __always_inline int						\
> > +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
> > +{									\
> > +	unsigned int loops = LLSC_MAX_LOOPS;				\
> > +	int ret, oldval, tmp;						\
> >  									\
> >  	uaccess_enable_privileged();					\
> > -	asm volatile(							\
> > +	asm volatile("// __llsc_futex_atomic_" #op "\n"			\
> >  "	prfm	pstl1strm, %2\n"					\
> >  "1:	ldxr	%w1, %2\n"						\
> >  	insn "\n"							\
> > @@ -35,45 +39,103 @@ do {									\
> >  	: "r" (oparg), "Ir" (-EAGAIN)					\
> >  	: "memory");							\
> >  	uaccess_disable_privileged();					\
> > -} while (0)
> > +									\
> > +	if (!ret)							\
> > +		*oval = oldval;						\
>
> Why push the store to '*oval' down into here?

Since __llsc_futex_atomic_##op() is declared as an inline function,
I think it is better for readability to pass oval down from
arch_futex_atomic_op_inuser() as it is and let each helper store the
result on success.
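
e.g. roughly (a sketch of the shape only, not the exact hunk in this patch):

static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
{
	u32 __user *uaddr;

	if (!access_ok(_uaddr, sizeof(u32)))
		return -EFAULT;

	uaddr = __uaccess_mask_ptr(_uaddr);

	/* no local oldval/tmp here; each helper fills *oval on success */
	switch (op) {
	case FUTEX_OP_SET:
		return __futex_atomic_set(oparg, uaddr, oval);
	case FUTEX_OP_ADD:
		return __futex_atomic_add(oparg, uaddr, oval);
	case FUTEX_OP_OR:
		return __futex_atomic_or(oparg, uaddr, oval);
	case FUTEX_OP_ANDN:
		return __futex_atomic_and(~oparg, uaddr, oval);
	case FUTEX_OP_XOR:
		return __futex_atomic_eor(oparg, uaddr, oval);
	default:
		return -ENOSYS;
	}
}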

Is it awful?

Thanks.

--
Sincerely,
Yeoreum Yun