Refactor the futex atomic operations into per-operation LL/SC helpers
that clear PSTATE.PAN, in preparation for applying FEAT_LSUI to them.
Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
---
arch/arm64/include/asm/futex.h | 132 +++++++++++++++++++++------------
1 file changed, 84 insertions(+), 48 deletions(-)
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index bc06691d2062..ab7003cb4724 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -7,17 +7,21 @@
#include <linux/futex.h>
#include <linux/uaccess.h>
+#include <linux/stringify.h>
#include <asm/errno.h>
-#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
+#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */
-#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
-do { \
- unsigned int loops = FUTEX_MAX_LOOPS; \
+#define LLSC_FUTEX_ATOMIC_OP(op, insn) \
+static __always_inline int \
+__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
+{ \
+ unsigned int loops = LLSC_MAX_LOOPS; \
+ int ret, oldval, tmp; \
\
uaccess_enable_privileged(); \
- asm volatile( \
+ asm volatile("// __llsc_futex_atomic_" #op "\n" \
" prfm pstl1strm, %2\n" \
"1: ldxr %w1, %2\n" \
insn "\n" \
@@ -35,45 +39,103 @@ do { \
: "r" (oparg), "Ir" (-EAGAIN) \
: "memory"); \
uaccess_disable_privileged(); \
-} while (0)
+ \
+ if (!ret) \
+ *oval = oldval; \
+ \
+ return ret; \
+}
+
+LLSC_FUTEX_ATOMIC_OP(add, "add %w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(or, "orr %w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(and, "and %w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(eor, "eor %w3, %w1, %w5")
+LLSC_FUTEX_ATOMIC_OP(set, "mov %w3, %w5")
+
+static __always_inline int
+__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ int ret = 0;
+ unsigned int loops = LLSC_MAX_LOOPS;
+ u32 val, tmp;
+
+ uaccess_enable_privileged();
+ asm volatile("//__llsc_futex_cmpxchg\n"
+" prfm pstl1strm, %2\n"
+"1: ldxr %w1, %2\n"
+" eor %w3, %w1, %w5\n"
+" cbnz %w3, 4f\n"
+"2: stlxr %w3, %w6, %2\n"
+" cbz %w3, 3f\n"
+" sub %w4, %w4, %w3\n"
+" cbnz %w4, 1b\n"
+" mov %w0, %w7\n"
+"3:\n"
+" dmb ish\n"
+"4:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
+ _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
+ : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
+ : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
+ : "memory");
+ uaccess_disable_privileged();
+
+ if (!ret)
+ *oval = val;
+
+ return ret;
+}
+
+#define FUTEX_ATOMIC_OP(op) \
+static __always_inline int \
+__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
+{ \
+ return __llsc_futex_atomic_##op(oparg, uaddr, oval); \
+}
+
+FUTEX_ATOMIC_OP(add)
+FUTEX_ATOMIC_OP(or)
+FUTEX_ATOMIC_OP(and)
+FUTEX_ATOMIC_OP(eor)
+FUTEX_ATOMIC_OP(set)
+
+static __always_inline int
+__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
+}
static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
{
- int oldval = 0, ret, tmp;
- u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
+ int ret;
+ u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
+ uaddr = __uaccess_mask_ptr(_uaddr);
+
switch (op) {
case FUTEX_OP_SET:
- __futex_atomic_op("mov %w3, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_set(oparg, uaddr, oval);
break;
case FUTEX_OP_ADD:
- __futex_atomic_op("add %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_add(oparg, uaddr, oval);
break;
case FUTEX_OP_OR:
- __futex_atomic_op("orr %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_or(oparg, uaddr, oval);
break;
case FUTEX_OP_ANDN:
- __futex_atomic_op("and %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, ~oparg);
+ ret = __futex_atomic_and(~oparg, uaddr, oval);
break;
case FUTEX_OP_XOR:
- __futex_atomic_op("eor %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_eor(oparg, uaddr, oval);
break;
default:
ret = -ENOSYS;
}
- if (!ret)
- *oval = oldval;
-
return ret;
}
@@ -81,40 +143,14 @@ static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
u32 oldval, u32 newval)
{
- int ret = 0;
- unsigned int loops = FUTEX_MAX_LOOPS;
- u32 val, tmp;
u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
- uaccess_enable_privileged();
- asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-" prfm pstl1strm, %2\n"
-"1: ldxr %w1, %2\n"
-" sub %w3, %w1, %w5\n"
-" cbnz %w3, 4f\n"
-"2: stlxr %w3, %w6, %2\n"
-" cbz %w3, 3f\n"
-" sub %w4, %w4, %w3\n"
-" cbnz %w4, 1b\n"
-" mov %w0, %w7\n"
-"3:\n"
-" dmb ish\n"
-"4:\n"
- _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
- _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
- : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
- : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
- : "memory");
- uaccess_disable_privileged();
-
- if (!ret)
- *uval = val;
- return ret;
+ return __futex_cmpxchg(uaddr, oldval, newval, uval);
}
#endif /* __ASM_FUTEX_H */
--
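
A note for reading the asm in this patch and in the replies below: the
operand numbering follows the constraint list of the old
__futex_atomic_op(), i.e. %w0 is the store-exclusive status / return
value, %w1 the loaded old value, %2 the user word, %w3 the computed
temporary, %w4 the loop counter, %w5 oparg and %w6 the -EAGAIN constant.
As an illustration, LLSC_FUTEX_ATOMIC_OP(add, "add %w3, %w1, %w5")
expands to roughly the following; the lines between the inserted
instruction and the constraint list are not visible in the hunk above
and are reproduced here from the pre-series helper, so treat the details
as approximate:

static __always_inline int
__llsc_futex_atomic_add(int oparg, u32 __user *uaddr, int *oval)
{
	unsigned int loops = LLSC_MAX_LOOPS;
	int ret, oldval, tmp;

	uaccess_enable_privileged();
	asm volatile("// __llsc_futex_atomic_add\n"
"	prfm	pstl1strm, %2\n"
"1:	ldxr	%w1, %2\n"		/* %w1: oldval loaded from *uaddr   */
"	add	%w3, %w1, %w5\n"	/* %w3: tmp = oldval + oparg (%w5)  */
"2:	stlxr	%w0, %w3, %2\n"		/* %w0: store-exclusive status      */
"	cbz	%w0, 3f\n"		/* stored -> done, ret = 0          */
"	sub	%w4, %w4, %w0\n"	/* %w4: loops--                     */
"	cbnz	%w4, 1b\n"		/* retry until LLSC_MAX_LOOPS spent */
"	mov	%w0, %w6\n"		/* ret = -EAGAIN (%w6)              */
"3:\n"
"	dmb	ish\n"
	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)
	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)
	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),
	  "+r" (loops)
	: "r" (oparg), "Ir" (-EAGAIN)
	: "memory");
	uaccess_disable_privileged();

	if (!ret)
		*oval = oldval;

	return ret;
}
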
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> index bc06691d2062..ab7003cb4724 100644
> --- a/arch/arm64/include/asm/futex.h
> +++ b/arch/arm64/include/asm/futex.h
> @@ -7,17 +7,21 @@
>
> #include <linux/futex.h>
> #include <linux/uaccess.h>
> +#include <linux/stringify.h>
>
> #include <asm/errno.h>
>
> -#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
> +#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */

I just noticed - you might as well leave the name as is here, especially
if in patch 6 you align down address and use CAS on a 64-bit value as
per Will's comment (and it's no longer LLSC). I think renaming this is
unnecessary.

--
Catalin
Hi Catalin,

> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
> > index bc06691d2062..ab7003cb4724 100644
> > --- a/arch/arm64/include/asm/futex.h
> > +++ b/arch/arm64/include/asm/futex.h
> > @@ -7,17 +7,21 @@
> >
> > #include <linux/futex.h>
> > #include <linux/uaccess.h>
> > +#include <linux/stringify.h>
> >
> > #include <asm/errno.h>
> >
> > -#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
> > +#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */
>
> I just noticed - you might as well leave the name as is here, especially
> if in patch 6 you align down address and use CAS on a 64-bit value as
> per Will's comment (and it's no longer LLSC). I think renaming this is
> unnecessary.

Okay. I'll restore to use origin name.
But I think LSUI wouldn't be used with CAS according to patch 6's
comments from you and additionally i think
chaning the CAS would make a failure because of
change of unrelated field. i.e)

struct user_structure{
	uint32 futex;
	uint32 some_value;
};

In this case, the change of some_value from user side could make a
failure of futex atomic operation.

So I think it would be better to keep the current LLSC implementation
in LSUI.

Thanks.

--
Sincerely,
Yeoreum Yun
On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote: > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > > index bc06691d2062..ab7003cb4724 100644 > > > --- a/arch/arm64/include/asm/futex.h > > > +++ b/arch/arm64/include/asm/futex.h > > > @@ -7,17 +7,21 @@ > > > > > > #include <linux/futex.h> > > > #include <linux/uaccess.h> > > > +#include <linux/stringify.h> > > > > > > #include <asm/errno.h> > > > > > > -#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > +#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > > I just noticed - you might as well leave the name as is here, especially > > if in patch 6 you align down address and use CAS on a 64-bit value as > > per Will's comment (and it's no longer LLSC). I think renaming this is > > unnecessary. > > Okay. I'll restore to use origin name. > But I think LSUI wouldn't be used with CAS according to patch 6's > comments from you and additionally i think > chaning the CAS would make a failure because of > change of unrelated field. i.e) > > struct user_structure{ > uint32 futex; > uint32 some_value; > }; > > In this case, the change of some_value from user side could make a > failure of futex atomic operation. Yes but the loop would read 'some_value' again, fold in 'futex' and retry. It would eventually succeed or fail after 128 iterations if the user keeps changing that location. Note that's also the case with LL/SC, the exclusive monitor would be cleared by some store in the same cache line (well, depending on the hardware implementation) and the STXR fail. From this perspective, CAS has better chance of succeeding. > So I think it would be better to keep the current LLSC implementation > in LSUI. I think the code would look simpler with LL/SC but you can give it a try and post the code sample here (not in a new series). BTW, is there a test suite for all the futex operations? The cover letter did not mention any. -- Catalin
Hi Catalin, > > > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote: > > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > > > index bc06691d2062..ab7003cb4724 100644 > > > > --- a/arch/arm64/include/asm/futex.h > > > > +++ b/arch/arm64/include/asm/futex.h > > > > @@ -7,17 +7,21 @@ > > > > > > > > #include <linux/futex.h> > > > > #include <linux/uaccess.h> > > > > +#include <linux/stringify.h> > > > > > > > > #include <asm/errno.h> > > > > > > > > -#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > > +#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > > > > I just noticed - you might as well leave the name as is here, especially > > > if in patch 6 you align down address and use CAS on a 64-bit value as > > > per Will's comment (and it's no longer LLSC). I think renaming this is > > > unnecessary. > > > > Okay. I'll restore to use origin name. > > But I think LSUI wouldn't be used with CAS according to patch 6's > > comments from you and additionally i think > > chaning the CAS would make a failure because of > > change of unrelated field. i.e) > > > > struct user_structure{ > > uint32 futex; > > uint32 some_value; > > }; > > > > In this case, the change of some_value from user side could make a > > failure of futex atomic operation. > > Yes but the loop would read 'some_value' again, fold in 'futex' and > retry. It would eventually succeed or fail after 128 iterations if the > user keeps changing that location. Note that's also the case with LL/SC, > the exclusive monitor would be cleared by some store in the same cache > line (well, depending on the hardware implementation) and the STXR fail. > From this perspective, CAS has better chance of succeeding. Oh. I see Thanks for insight ;) > > > So I think it would be better to keep the current LLSC implementation > > in LSUI. > > I think the code would look simpler with LL/SC but you can give it a try > and post the code sample here (not in a new series). Okay. I'll try. > > BTW, is there a test suite for all the futex operations? The cover > letter did not mention any. with selftest's futex testcase, I've tested. But Since there is no test for each operation even in LTP, I tested it with additional test from me. > > -- > Catalin -- Sincerely, Yeoreum Yun
On Mon, Sep 15, 2025 at 11:34:07PM +0100, Yeoreum Yun wrote:
> with selftest's futex testcase, I've tested.
> But Since there is no test for each operation even in LTP,
> I tested it with additional test from me.

As a separate series, you may improve the futex kselftest to cover all
the cases.

--
Catalin
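
For whoever picks that up, a minimal userspace sketch (hypothetical, not
an existing kselftest) that drives every FUTEX_OP_* encoding through
FUTEX_WAKE_OP, which is the futex command that ends up in
arch_futex_atomic_op_inuser():

#define _GNU_SOURCE
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* val = val2 = 1: wake at most one waiter on each word (none exist here) */
static long futex_wake_op(uint32_t *uaddr, uint32_t *uaddr2, uint32_t encoded_op)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_OP, 1, 1, uaddr2, encoded_op);
}

int main(void)
{
	static const int ops[] = {
		FUTEX_OP_SET, FUTEX_OP_ADD, FUTEX_OP_OR,
		FUTEX_OP_ANDN, FUTEX_OP_XOR,
	};
	uint32_t f1 = 0, f2;

	for (unsigned int i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
		f2 = 0xf0f0f0f0;
		/*
		 * Atomically applies ops[i] with oparg 0x0f to f2; the CMP
		 * part only decides whether waiters on f2 get woken, which
		 * does not matter here.
		 */
		if (futex_wake_op(&f1, &f2,
				  FUTEX_OP(ops[i], 0x0f, FUTEX_OP_CMP_GE, 0)) < 0) {
			perror("FUTEX_WAKE_OP");
			return 1;
		}
		printf("op %d: 0xf0f0f0f0 -> %#x\n", ops[i], f2);
	}

	return 0;
}

Turning this into a proper test would mean comparing the resulting value
of f2 against the expected result for each op instead of just printing it.
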
On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote: > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > > On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote: > > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > > > index bc06691d2062..ab7003cb4724 100644 > > > > --- a/arch/arm64/include/asm/futex.h > > > > +++ b/arch/arm64/include/asm/futex.h > > > > @@ -7,17 +7,21 @@ > > > > > > > > #include <linux/futex.h> > > > > #include <linux/uaccess.h> > > > > +#include <linux/stringify.h> > > > > > > > > #include <asm/errno.h> > > > > > > > > -#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > > +#define LLSC_MAX_LOOPS 128 /* What's the largest number you can think of? */ > > > > > > I just noticed - you might as well leave the name as is here, especially > > > if in patch 6 you align down address and use CAS on a 64-bit value as > > > per Will's comment (and it's no longer LLSC). I think renaming this is > > > unnecessary. > > > > Okay. I'll restore to use origin name. > > But I think LSUI wouldn't be used with CAS according to patch 6's > > comments from you and additionally i think > > chaning the CAS would make a failure because of > > change of unrelated field. i.e) > > > > struct user_structure{ > > uint32 futex; > > uint32 some_value; > > }; > > > > In this case, the change of some_value from user side could make a > > failure of futex atomic operation. > > Yes but the loop would read 'some_value' again, fold in 'futex' and > retry. It would eventually succeed or fail after 128 iterations if the > user keeps changing that location. Note that's also the case with LL/SC, > the exclusive monitor would be cleared by some store in the same cache > line (well, depending on the hardware implementation) and the STXR fail. > From this perspective, CAS has better chance of succeeding. > > > So I think it would be better to keep the current LLSC implementation > > in LSUI. > > I think the code would look simpler with LL/SC but you can give it a try > and post the code sample here (not in a new series). If you stick the cas*t instruction in its own helper say, cmpxchg_user(), then you can do all the shifting/masking in C and I don't reckon it's that bad. It means we (a) get rid of exclusives, which is the whole point of this and (b) don't have to mess around with PAN. > BTW, is there a test suite for all the futex operations? The cover > letter did not mention any. I was thinking that too. I'm sure I remember a 'futextest' kicking around when we did the arm64 port but nowadays there's something in tools/testing/selftests/futex/ which might be better. Will
On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote:
> On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote:
> > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote:
> > > So I think it would be better to keep the current LLSC implementation
> > > in LSUI.
> >
> > I think the code would look simpler with LL/SC but you can give it a try
> > and post the code sample here (not in a new series).
>
> If you stick the cas*t instruction in its own helper say, cmpxchg_user(),
> then you can do all the shifting/masking in C and I don't reckon it's
> that bad. It means we (a) get rid of exclusives, which is the whole
> point of this and (b) don't have to mess around with PAN.

We get rid of PAN toggling already since FEAT_LSUI introduces
LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier
I think if we do a get_user() of a u64 and combine it with the futex u32
while taking care of CPU endianness. All in a loop. Hopefully the
compiler is smart enough to reduce masking/or'ing to fewer instructions.

--
Catalin
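
For concreteness, the shape Catalin describes might look roughly like
the sketch below; cas64_user() is a hypothetical stand-in for a
caslt-based helper that CASes the user u64, returns 0 unless it faults,
and leaves the observed memory value in *old (FUTEX_OP_XOR is used as
the example op, since it has no LSUI instruction):

static int futex_atomic_eor_cas64(u32 __user *uaddr, u32 oparg, int *oval)
{
	u64 __user *uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64));
	/* bit position of the futex word inside the aligned 64-bit load */
	unsigned int shift = ((unsigned long)uaddr & 4) * 8;
	unsigned int loops = FUTEX_MAX_LOOPS;
	u64 old64, seen64, new64, mask;
	u32 oldval;

	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		shift ^= 32;
	mask = (u64)0xffffffff << shift;

	do {
		if (get_user(old64, uaddr64))
			return -EFAULT;

		oldval = (u32)(old64 >> shift);
		new64 = (old64 & ~mask) | ((u64)(oldval ^ oparg) << shift);

		seen64 = old64;
		if (cas64_user(uaddr64, &seen64, new64))	/* faulted */
			return -EFAULT;
		if (seen64 == old64) {				/* CAS took effect */
			*oval = oldval;
			return 0;
		}
		/* the futex word or its neighbour changed; re-read and retry */
	} while (--loops);

	return -EAGAIN;
}
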
Hi, > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote: > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote: > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > > > So I think it would be better to keep the current LLSC implementation > > > > in LSUI. > > > > > > I think the code would look simpler with LL/SC but you can give it a try > > > and post the code sample here (not in a new series). > > > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(), > > then you can do all the shifting/masking in C and I don't reckon it's > > that bad. It means we (a) get rid of exclusives, which is the whole > > point of this and (b) don't have to mess around with PAN. > > We get rid of PAN toggling already since FEAT_LSUI introduces > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier > I think if we do a get_user() of a u64 and combine it with the futex u32 > while taking care of CPU endianness. All in a loop. Hopefully the > compiler is smart enough to reduce masking/or'ing to fewer instructions. > Sorry for my wrong previous email. Here is what i intened with CAS operation: diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 1d6d9f856ac5..0aeda7ced2c0 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) LSUI_FUTEX_ATOMIC_OP(set, swpt, al) + +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ +static __always_inline int \ +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ +{ \ + int ret = 0; \ + u64 oval, nval, tmp; \ + \ + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ + __LSUI_PREAMBLE \ +" prfm pstl1strm, %2\n" \ +"1: ldtr %x1, %2\n" \ +" mov %x3, %x1\n" \ +" bfi %x1, %x5, #" #start_bit ", #32\n" \ +" bfi %x3, %x6, #" #start_bit ", #32\n" \ +" mov %x4, %x1\n" \ +"2: caslt %x1, %x3, %2\n" \ +" sub %x1, %x1, %x4\n" \ +" cbz %x1, 3f\n" \ +" mov %w0, %w7\n" \ +"3:\n" \ +" dmb ish\n" \ +"4:\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ + : "memory"); \ + \ + return ret; \ +} + +LSUI_CMPXCHG_HELPER(lo, 0) +LSUI_CMPXCHG_HELPER(hi, 32) + +static __always_inline int +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + int ret; + unsigned long uaddr_al; + + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); + + if (uaddr_al != (unsigned long)uaddr) + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); + else + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); + + if (!ret) + *oval = oldval; + + return ret; +} + static __always_inline int __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) { @@ -135,71 +189,25 @@ __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) static __always_inline int __lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) { - unsigned int loops = FUTEX_MAX_LOOPS; - int ret, oldval, tmp; + int ret = -EAGAIN; + u32 oldval, newval; /* * there are no ldteor/stteor instructions... 
*/ - asm volatile("// __lsui_futex_atomic_eor\n" - __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -"2: stltxr %w0, %w3, %2\n" -" cbz %w0, 3f\n" -" sub %w4, %w4, %w0\n" -" cbnz %w4, 1b\n" -" mov %w0, %w6\n" -"3:\n" -" dmb ish\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), - "+r" (loops) - : "r" (oparg), "Ir" (-EAGAIN) - : "memory"); + unsafe_get_user(oldval, uaddr, err_fault); + newval = oldval ^ oparg; - if (!ret) - *oval = oldval; + ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); +err_fault: return ret; } static __always_inline int __lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) { - int ret = 0; - unsigned int loops = FUTEX_MAX_LOOPS; - u32 val, tmp; - - /* - * cas{al}t doesn't support word size... - */ - asm volatile("//__lsui_futex_cmpxchg\n" - __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -" cbnz %w3, 4f\n" -"2: stltxr %w3, %w6, %2\n" -" cbz %w3, 3f\n" -" sub %w4, %w4, %w3\n" -" cbnz %w4, 1b\n" -" mov %w0, %w7\n" -"3:\n" -" dmb ish\n" -"4:\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) - : "memory"); - - if (!ret) - *oval = oldval; - - return ret; + return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); } #define __lsui_llsc_body(op, ...) \ (END) Am I missing something? Thanks! -- Sincerely, Yeoreum Yun
On Tue, Sep 16, 2025 at 11:02:19AM +0100, Yeoreum Yun wrote: > Hi, > > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote: > > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote: > > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > > > > So I think it would be better to keep the current LLSC implementation > > > > > in LSUI. > > > > > > > > I think the code would look simpler with LL/SC but you can give it a try > > > > and post the code sample here (not in a new series). > > > > > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(), > > > then you can do all the shifting/masking in C and I don't reckon it's > > > that bad. It means we (a) get rid of exclusives, which is the whole > > > point of this and (b) don't have to mess around with PAN. > > > > We get rid of PAN toggling already since FEAT_LSUI introduces > > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier > > I think if we do a get_user() of a u64 and combine it with the futex u32 > > while taking care of CPU endianness. All in a loop. Hopefully the > > compiler is smart enough to reduce masking/or'ing to fewer instructions. > > > > Sorry for my wrong previous email. > > Here is what i intened with CAS operation: > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > index 1d6d9f856ac5..0aeda7ced2c0 100644 > --- a/arch/arm64/include/asm/futex.h > +++ b/arch/arm64/include/asm/futex.h > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) > LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > + > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > +static __always_inline int \ > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > +{ \ > + int ret = 0; \ > + u64 oval, nval, tmp; \ > + \ > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > + __LSUI_PREAMBLE \ > +" prfm pstl1strm, %2\n" \ > +"1: ldtr %x1, %2\n" \ > +" mov %x3, %x1\n" \ > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > +" mov %x4, %x1\n" \ > +"2: caslt %x1, %x3, %2\n" \ > +" sub %x1, %x1, %x4\n" \ > +" cbz %x1, 3f\n" \ > +" mov %w0, %w7\n" \ > +"3:\n" \ > +" dmb ish\n" \ > +"4:\n" \ > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > + : "memory"); \ > + \ > + return ret; \ > +} > + > +LSUI_CMPXCHG_HELPER(lo, 0) > +LSUI_CMPXCHG_HELPER(hi, 32) > + > +static __always_inline int > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > +{ > + int ret; > + unsigned long uaddr_al; > + > + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); > + > + if (uaddr_al != (unsigned long)uaddr) > + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); > + else > + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); > + > + if (!ret) > + *oval = oldval; > + > + return ret; > +} I think Will expects that you do more of this in C, e.g. have a basic user cmpxchg on a 64-bit type, e.g. /* * NOTE: *oldp is NOT updated if a fault is taken. 
*/ static __always_inline int user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new) { int err = 0; asm volatile( __LSUI_PREAMBLE "1: caslt %x[old], %x[new], %[addr]\n" "2:\n" _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) : [addr] "+Q" (addr), [old] "+r" (*oldp) : [new] "r" (new) : "memory" ); return err; } That should be the *only* assembly you need to implement. Atop that, have a wrapper that uses get_user() and that helper above to implement the 32-bit user cmpxchg, with all the bit manipulation in C: /* * NOTE: *oldp is NOT updated if a fault is taken. */ static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new) { u64 __user *addr64 = PTR_ALIGN_DOWN(addr, sizeof(u64)); u64 old64, new64; int ret; if (get_user(old64, uaddr64)) return -EFAULT; /* * TODO: merge '*oldp' into 'old64', and 'new' into 'new64', * taking into account endianness and alignment */ ret = user_cmpxchg64_release(uaddr64, &old64, new64); if (ret) return ret; /* * TODO: extract '*oldp' from 'old64', taking into account * endianness and alignment. */ return 0; } Then use that 32-bit user cmpxchg for any cases which need it. As the compiler will have visiblity of all of the bit manipulation, it will hopefully be able to fold that together appropriately. Mark.
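
One way the two TODO blocks could be filled in, as a hedged sketch that
reuses the names from the skeleton above (with 'addr64'/'uaddr64'
unified and an explicit cast added):

/*
 * NOTE: *oldp is NOT updated if a fault is taken.
 */
static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new)
{
	u64 __user *addr64 = (u64 __user *)PTR_ALIGN_DOWN(addr, sizeof(u64));
	/* bit offset of the futex word within the aligned 64-bit value */
	unsigned int shift = ((unsigned long)addr & 4) * 8;
	u64 mask, old64, new64;
	int ret;

	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		shift ^= 32;
	mask = (u64)0xffffffff << shift;

	if (get_user(old64, addr64))
		return -EFAULT;

	/* merge 'new' and '*oldp' into the 64-bit new/expected values */
	new64 = (old64 & ~mask) | ((u64)new << shift);
	old64 = (old64 & ~mask) | ((u64)*oldp << shift);

	ret = user_cmpxchg64_release(addr64, &old64, new64);
	if (ret)
		return ret;

	/* extract the observed 32-bit value for the caller to compare */
	*oldp = (u32)(old64 >> shift);

	return 0;
}

One caveat, echoing Catalin's earlier point about the neighbouring word:
if only the other 32 bits changed between the get_user() and the CAS,
the CAS fails yet the extracted *oldp still equals the caller's expected
value, so a caller looping on this helper cannot treat "returned old ==
expected" alone as success; it has to retry on any 64-bit mismatch.
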
Hi Mark, [...] > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > index 1d6d9f856ac5..0aeda7ced2c0 100644 > > --- a/arch/arm64/include/asm/futex.h > > +++ b/arch/arm64/include/asm/futex.h > > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) > > LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > > > + > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > > +static __always_inline int \ > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > > +{ \ > > + int ret = 0; \ > > + u64 oval, nval, tmp; \ > > + \ > > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > > + __LSUI_PREAMBLE \ > > +" prfm pstl1strm, %2\n" \ > > +"1: ldtr %x1, %2\n" \ > > +" mov %x3, %x1\n" \ > > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > > +" mov %x4, %x1\n" \ > > +"2: caslt %x1, %x3, %2\n" \ > > +" sub %x1, %x1, %x4\n" \ > > +" cbz %x1, 3f\n" \ > > +" mov %w0, %w7\n" \ > > +"3:\n" \ > > +" dmb ish\n" \ > > +"4:\n" \ > > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > > + : "memory"); \ > > + \ > > + return ret; \ > > +} > > + > > +LSUI_CMPXCHG_HELPER(lo, 0) > > +LSUI_CMPXCHG_HELPER(hi, 32) > > + > > +static __always_inline int > > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > > +{ > > + int ret; > > + unsigned long uaddr_al; > > + > > + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); > > + > > + if (uaddr_al != (unsigned long)uaddr) > > + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); > > + else > > + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); > > + > > + if (!ret) > > + *oval = oldval; > > + > > + return ret; > > +} > > I think Will expects that you do more of this in C, e.g. have a basic > user cmpxchg on a 64-bit type, e.g. > > /* > * NOTE: *oldp is NOT updated if a fault is taken. > */ > static __always_inline int > user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new) > { > int err = 0; > > asm volatile( > __LSUI_PREAMBLE > "1: caslt %x[old], %x[new], %[addr]\n" > "2:\n" > _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) > : [addr] "+Q" (addr), > [old] "+r" (*oldp) > : [new] "r" (new) > : "memory" > ); > > return err; > } > > That should be the *only* assembly you need to implement. > > Atop that, have a wrapper that uses get_user() and that helper above to > implement the 32-bit user cmpxchg, with all the bit manipulation in C: Thanks for your suggestion. But small question. I think it's enough to use usafe_get_user() instead of get_user() in here since when FEAT_LSUI enabled, it doeesn't need to call uaccess_ttbr0_enable()/disable(). or Am I missing something? > > /* > * NOTE: *oldp is NOT updated if a fault is taken. > */ > static int user_cmpxchg32_release(u32 __user *addr, u32 *oldp, u32 new) > { > u64 __user *addr64 = PTR_ALIGN_DOWN(addr, sizeof(u64)); > u64 old64, new64; > int ret; > > if (get_user(old64, uaddr64)) > return -EFAULT; > > /* > * TODO: merge '*oldp' into 'old64', and 'new' into 'new64', > * taking into account endianness and alignment > */ > > ret = user_cmpxchg64_release(uaddr64, &old64, new64); > if (ret) > return ret; > > /* > * TODO: extract '*oldp' from 'old64', taking into account > * endianness and alignment. 
> */ I think extraction doesn't need since futext_cmpxchg op passes the "oldval" argument which fetched and passed already. So when it success, it seems enough to return the passed arguments. But, I'm not sure which is better whether dividing the cmpxchg64 and cmpxchge32 interface or make one helper -- user_cmpxchg_release (__lsui_cmpxchg_helper). [...] Thanks. -- Sincerely, Yeoreum Yun
On Tue, Sep 16, 2025 at 02:27:37PM +0100, Yeoreum Yun wrote: > Hi Mark, > > [...] > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > > index 1d6d9f856ac5..0aeda7ced2c0 100644 > > > --- a/arch/arm64/include/asm/futex.h > > > +++ b/arch/arm64/include/asm/futex.h > > > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) > > > LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > > > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > > > > > + > > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > > > +static __always_inline int \ > > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > > > +{ \ > > > + int ret = 0; \ > > > + u64 oval, nval, tmp; \ > > > + \ > > > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > > > + __LSUI_PREAMBLE \ > > > +" prfm pstl1strm, %2\n" \ > > > +"1: ldtr %x1, %2\n" \ > > > +" mov %x3, %x1\n" \ > > > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > > > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > > > +" mov %x4, %x1\n" \ > > > +"2: caslt %x1, %x3, %2\n" \ > > > +" sub %x1, %x1, %x4\n" \ > > > +" cbz %x1, 3f\n" \ > > > +" mov %w0, %w7\n" \ > > > +"3:\n" \ > > > +" dmb ish\n" \ > > > +"4:\n" \ > > > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > > > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > > > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > > > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > > > + : "memory"); \ > > > + \ > > > + return ret; \ > > > +} > > > + > > > +LSUI_CMPXCHG_HELPER(lo, 0) > > > +LSUI_CMPXCHG_HELPER(hi, 32) > > > + > > > +static __always_inline int > > > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > > > +{ > > > + int ret; > > > + unsigned long uaddr_al; > > > + > > > + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); > > > + > > > + if (uaddr_al != (unsigned long)uaddr) > > > + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); > > > + else > > > + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); > > > + > > > + if (!ret) > > > + *oval = oldval; > > > + > > > + return ret; > > > +} > > > > I think Will expects that you do more of this in C, e.g. have a basic > > user cmpxchg on a 64-bit type, e.g. > > > > /* > > * NOTE: *oldp is NOT updated if a fault is taken. > > */ > > static __always_inline int > > user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new) > > { > > int err = 0; > > > > asm volatile( > > __LSUI_PREAMBLE > > "1: caslt %x[old], %x[new], %[addr]\n" > > "2:\n" > > _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) > > : [addr] "+Q" (addr), > > [old] "+r" (*oldp) > > : [new] "r" (new) > > : "memory" > > ); > > > > return err; > > } > > > > That should be the *only* assembly you need to implement. > > > > Atop that, have a wrapper that uses get_user() and that helper above to > > implement the 32-bit user cmpxchg, with all the bit manipulation in C: > > Thanks for your suggestion. But small question. > I think it's enough to use usafe_get_user() instead of get_user() in here > since when FEAT_LSUI enabled, it doeesn't need to call > uaccess_ttbr0_enable()/disable(). Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable() specifically, API-wise unsafe_get_user() is only supposed to be called between user_access_begin() and user_access_end(), and there's some stuff we probably want to add there (e.g. might_fault(), which unsafe_get_user() lacks today). Do we call those? Mark.
Hi Mark, [...] > > > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > > > > index 1d6d9f856ac5..0aeda7ced2c0 100644 > > > > --- a/arch/arm64/include/asm/futex.h > > > > +++ b/arch/arm64/include/asm/futex.h > > > > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) > > > > LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > > > > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > > > > > > > + > > > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > > > > +static __always_inline int \ > > > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > > > > +{ \ > > > > + int ret = 0; \ > > > > + u64 oval, nval, tmp; \ > > > > + \ > > > > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > > > > + __LSUI_PREAMBLE \ > > > > +" prfm pstl1strm, %2\n" \ > > > > +"1: ldtr %x1, %2\n" \ > > > > +" mov %x3, %x1\n" \ > > > > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > > > > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > > > > +" mov %x4, %x1\n" \ > > > > +"2: caslt %x1, %x3, %2\n" \ > > > > +" sub %x1, %x1, %x4\n" \ > > > > +" cbz %x1, 3f\n" \ > > > > +" mov %w0, %w7\n" \ > > > > +"3:\n" \ > > > > +" dmb ish\n" \ > > > > +"4:\n" \ > > > > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > > > > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > > > > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > > > > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > > > > + : "memory"); \ > > > > + \ > > > > + return ret; \ > > > > +} > > > > + > > > > +LSUI_CMPXCHG_HELPER(lo, 0) > > > > +LSUI_CMPXCHG_HELPER(hi, 32) > > > > + > > > > +static __always_inline int > > > > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > > > > +{ > > > > + int ret; > > > > + unsigned long uaddr_al; > > > > + > > > > + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); > > > > + > > > > + if (uaddr_al != (unsigned long)uaddr) > > > > + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); > > > > + else > > > > + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); > > > > + > > > > + if (!ret) > > > > + *oval = oldval; > > > > + > > > > + return ret; > > > > +} > > > > > > I think Will expects that you do more of this in C, e.g. have a basic > > > user cmpxchg on a 64-bit type, e.g. > > > > > > /* > > > * NOTE: *oldp is NOT updated if a fault is taken. > > > */ > > > static __always_inline int > > > user_cmpxchg64_release(u64 __usr *addr, u64 *oldp, u64 new) > > > { > > > int err = 0; > > > > > > asm volatile( > > > __LSUI_PREAMBLE > > > "1: caslt %x[old], %x[new], %[addr]\n" > > > "2:\n" > > > _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) > > > : [addr] "+Q" (addr), > > > [old] "+r" (*oldp) > > > : [new] "r" (new) > > > : "memory" > > > ); > > > > > > return err; > > > } > > > > > > That should be the *only* assembly you need to implement. > > > > > > Atop that, have a wrapper that uses get_user() and that helper above to > > > implement the 32-bit user cmpxchg, with all the bit manipulation in C: > > > > Thanks for your suggestion. But small question. > > I think it's enough to use usafe_get_user() instead of get_user() in here > > since when FEAT_LSUI enabled, it doeesn't need to call > > uaccess_ttbr0_enable()/disable(). > > Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable() > specifically, API-wise unsafe_get_user() is only supposed to be called > between user_access_begin() and user_access_end(), and there's some > stuff we probably want to add there (e.g. 
might_fault(), which > unsafe_get_user() lacks today). > > Do we call those? Yes when you're available. As you mention, the difference seems might_fault(), But I'm not sure whether that would be a reason to validate to use get_user() instead of unsafe_get_user() taking a increase of instruction of "nop" -- uaccess_ttbr0_enable()/disable() in LSUI except the reason for DEUBG purpose. -- Sincerely, Yeoreum Yun
On Tue, Sep 16, 2025 at 02:58:16PM +0100, Yeoreum Yun wrote:
> Hi Mark,
>
> [...]
> > > I think it's enough to use usafe_get_user() instead of get_user() in here
> > > since when FEAT_LSUI enabled, it doeesn't need to call
> > > uaccess_ttbr0_enable()/disable().
> >
> > Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable()
> > specifically, API-wise unsafe_get_user() is only supposed to be called
> > between user_access_begin() and user_access_end(), and there's some
> > stuff we probably want to add there (e.g. might_fault(), which
> > unsafe_get_user() lacks today).
> >
> > Do we call those?
>
> Yes when you're available.
> As you mention, the difference seems might_fault(),
> But I'm not sure whether that would be a reason to validate to use
> get_user() instead of unsafe_get_user() taking a increase of instruction
> of "nop" -- uaccess_ttbr0_enable()/disable() in LSUI
> except the reason for DEUBG purpose.

I think the practical impact of those NOPs is going to be neglible, and
not worth optimizing for unless/until we have data demonstrating
otherwise.

If we want to strictly avoid those NOPs, I think that we should do a
more general cleanup, and e.g. have variants of user_access_begin() and
user_access_end() that do not mess with TTBR0. I don't think we need to
do that for this series.

For now, I think that you should either:

* Use get_user().

* Use user_access_begin() .. user_access_end() wrapping both
  unsafe_get_user() and the user cmpxchg.

Thanks,
Mark.
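
The second option would look roughly like the sketch below (hedged;
lsui_cmpxchg_user() is a placeholder for the caslt-based 32-bit user
cmpxchg helper, which would itself have to be safe to call with the
uaccess window open):

static __always_inline int
__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
{
	u32 oldval, newval;
	int ret;

	if (!user_access_begin(uaddr, sizeof(u32)))
		return -EFAULT;

	unsafe_get_user(oldval, uaddr, Efault);
	newval = oldval ^ oparg;

	/* placeholder for the caslt-based 32-bit user cmpxchg */
	ret = lsui_cmpxchg_user(uaddr, &oldval, newval);
	user_access_end();

	if (!ret)
		*oval = oldval;
	return ret;

Efault:
	user_access_end();
	return -EFAULT;
}

As with the other sketches in this thread, a real implementation would
retry (up to FUTEX_MAX_LOOPS) when the cmpxchg observes a value other
than the one read by unsafe_get_user().
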
On Tue, Sep 16, 2025 at 03:07:09PM +0100, Mark Rutland wrote: > On Tue, Sep 16, 2025 at 02:58:16PM +0100, Yeoreum Yun wrote: > > Hi Mark, > > > > [...] > > > > I think it's enough to use usafe_get_user() instead of get_user() in here > > > > since when FEAT_LSUI enabled, it doeesn't need to call > > > > uaccess_ttbr0_enable()/disable(). > > > > > > Regardless of uaccess_ttbr0_enable() and uaccess_ttbr0_disable() > > > specifically, API-wise unsafe_get_user() is only supposed to be called > > > between user_access_begin() and user_access_end(), and there's some > > > stuff we probably want to add there (e.g. might_fault(), which > > > unsafe_get_user() lacks today). > > > > > > Do we call those? > > > > Yes when you're available. > > As you mention, the difference seems might_fault(), > > But I'm not sure whether that would be a reason to validate to use > > get_user() instead of unsafe_get_user() taking a increase of instruction > > of "nop" -- uaccess_ttbr0_enable()/disable() in LSUI > > except the reason for DEUBG purpose. > > I think the practical impact of those NOPs is going to be neglible, and > not worth optimizing for unless/until we have data demonstrating > otherwise. > > If we want to strictly avoid those NOPs, I think that we should do a > more general cleanup, and e.g. have variants of user_access_begin() and > user_access_end() that do not mess with TTBR0. I don't think we need to > do that for this series. > > For now, I think that you should either: > > * Use get_user(). > > * Use user_access_begin() .. user_access_end() wrapping both > unsafe_get_user() and the user cmpxchg. Understood. I'll use get_user() in this series after getting some more comments from other about C version which I sent before your suggestion (Sorry to miss your email before I sent). Thanks :D -- Sincerely, Yeoreum Yun
On Tue, Sep 16, 2025 at 11:02:19AM +0100, Yeoreum Yun wrote: > Hi, > > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote: > > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote: > > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > > > > So I think it would be better to keep the current LLSC implementation > > > > > in LSUI. > > > > > > > > I think the code would look simpler with LL/SC but you can give it a try > > > > and post the code sample here (not in a new series). > > > > > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(), > > > then you can do all the shifting/masking in C and I don't reckon it's > > > that bad. It means we (a) get rid of exclusives, which is the whole > > > point of this and (b) don't have to mess around with PAN. > > > > We get rid of PAN toggling already since FEAT_LSUI introduces > > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier > > I think if we do a get_user() of a u64 and combine it with the futex u32 > > while taking care of CPU endianness. All in a loop. Hopefully the > > compiler is smart enough to reduce masking/or'ing to fewer instructions. > > > > Sorry for my wrong previous email. > > Here is what i intened with CAS operation: > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > index 1d6d9f856ac5..0aeda7ced2c0 100644 > --- a/arch/arm64/include/asm/futex.h > +++ b/arch/arm64/include/asm/futex.h > @@ -126,6 +126,60 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) > LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > + > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > +static __always_inline int \ > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > +{ \ > + int ret = 0; \ > + u64 oval, nval, tmp; \ > + \ > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > + __LSUI_PREAMBLE \ > +" prfm pstl1strm, %2\n" \ > +"1: ldtr %x1, %2\n" \ > +" mov %x3, %x1\n" \ > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > +" mov %x4, %x1\n" \ > +"2: caslt %x1, %x3, %2\n" \ > +" sub %x1, %x1, %x4\n" \ > +" cbz %x1, 3f\n" \ > +" mov %w0, %w7\n" \ > +"3:\n" \ > +" dmb ish\n" \ > +"4:\n" \ > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > + : "memory"); \ The vast majority of this can be written in C. Will
Hi, [...] > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > > +static __always_inline int \ > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > > +{ \ > > + int ret = 0; \ > > + u64 oval, nval, tmp; \ > > + \ > > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > > + __LSUI_PREAMBLE \ > > +" prfm pstl1strm, %2\n" \ > > +"1: ldtr %x1, %2\n" \ > > +" mov %x3, %x1\n" \ > > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > > +" mov %x4, %x1\n" \ > > +"2: caslt %x1, %x3, %2\n" \ > > +" sub %x1, %x1, %x4\n" \ > > +" cbz %x1, 3f\n" \ > > +" mov %w0, %w7\n" \ > > +"3:\n" \ > > +" dmb ish\n" \ > > +"4:\n" \ > > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > > + : "memory"); \ > > The vast majority of this can be written in C. Here is the version with C base on patch 6: diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 1d6d9f856ac5..68af15ba545a 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -127,81 +127,77 @@ LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) LSUI_FUTEX_ATOMIC_OP(set, swpt, al) static __always_inline int -__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) { - return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); -} + int ret = -EAGAIN; + u64 __user *uaddr_al; + u64 oval64, nval64, tmp; + static const u64 lo32_mask = GENMASK_U64(31, 0); + + uaddr_al = (u64 __user *) ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); + unsafe_get_user(oval64, uaddr_al, err_fault); + + if ((u32 __user *)uaddr_al != uaddr) { + nval64 = ((oval64 & lo32_mask) | ((u64)newval << 32)); + oval64 = ((oval64 & lo32_mask) | ((u64)oldval << 32)); + } else { + nval64 = ((oval64 & ~lo32_mask) | newval); + oval64 = ((oval64 & ~lo32_mask) | oldval); + } -static __always_inline int -__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) -{ - unsigned int loops = FUTEX_MAX_LOOPS; - int ret, oldval, tmp; + tmp = oval64; - /* - * there are no ldteor/stteor instructions... 
- */ - asm volatile("// __lsui_futex_atomic_eor\n" + asm volatile("//__lsui_cmpxchg_helper\n" __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -"2: stltxr %w0, %w3, %2\n" -" cbz %w0, 3f\n" -" sub %w4, %w4, %w0\n" -" cbnz %w4, 1b\n" -" mov %w0, %w6\n" -"3:\n" +"1: caslt %x1, %x3, %2\n" +" sub %x1, %x1, %x4\n" +" cbnz %x1, 2f\n" +" mov %w0, %w5\n" +"2:\n" " dmb ish\n" +"3:\n" _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), - "+r" (loops) - : "r" (oparg), "Ir" (-EAGAIN) + : "+r" (ret), "=&r" (oval64), "+Q" (*uaddr_al) + : "r" (nval64), "r" (tmp), "Ir" (0) : "memory"); if (!ret) *oval = oldval; +err_fault: return ret; } static __always_inline int -__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) { - int ret = 0; - unsigned int loops = FUTEX_MAX_LOOPS; - u32 val, tmp; + return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); +} + +static __always_inline int +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) +{ +{ + int ret = -EAGAIN; + u32 oldval, newval; /* - * cas{al}t doesn't support word size... + * there are no ldteor/stteor instructions... */ - asm volatile("//__lsui_futex_cmpxchg\n" - __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -" cbnz %w3, 4f\n" -"2: stltxr %w3, %w6, %2\n" -" cbz %w3, 3f\n" -" sub %w4, %w4, %w3\n" -" cbnz %w4, 1b\n" -" mov %w0, %w7\n" -"3:\n" -" dmb ish\n" -"4:\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) - : "memory"); + unsafe_get_user(oldval, uaddr, err_fault); + newval = oldval ^ oparg; - if (!ret) - *oval = oldval; + ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); +err_fault: return ret; } +static __always_inline int +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); +} + #define __lsui_llsc_body(op, ...) \ ({ \ alternative_has_cap_likely(ARM64_HAS_LSUI) ? \ (END) I'm not sure this is good for you. But If you share your thought, That's would be greatful. (Note: When I test with 256 threads for futex_atomic_eor op, there is not much difference with former assembly version) Thanks! -- Sincerely, Yeoreum Yun
Hi all, > Hi, > > [...] > > > +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ > > > +static __always_inline int \ > > > +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ > > > +{ \ > > > + int ret = 0; \ > > > + u64 oval, nval, tmp; \ > > > + \ > > > + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ > > > + __LSUI_PREAMBLE \ > > > +" prfm pstl1strm, %2\n" \ > > > +"1: ldtr %x1, %2\n" \ > > > +" mov %x3, %x1\n" \ > > > +" bfi %x1, %x5, #" #start_bit ", #32\n" \ > > > +" bfi %x3, %x6, #" #start_bit ", #32\n" \ > > > +" mov %x4, %x1\n" \ > > > +"2: caslt %x1, %x3, %2\n" \ > > > +" sub %x1, %x1, %x4\n" \ > > > +" cbz %x1, 3f\n" \ > > > +" mov %w0, %w7\n" \ > > > +"3:\n" \ > > > +" dmb ish\n" \ > > > +"4:\n" \ > > > + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ > > > + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ > > > + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ > > > + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ > > > + : "memory"); \ > > > > The vast majority of this can be written in C. > > Here is the version with C base on patch 6: > > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h > index 1d6d9f856ac5..68af15ba545a 100644 > --- a/arch/arm64/include/asm/futex.h > +++ b/arch/arm64/include/asm/futex.h > @@ -127,81 +127,77 @@ LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) > LSUI_FUTEX_ATOMIC_OP(set, swpt, al) > > static __always_inline int > -__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) > +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > { > - return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); > -} > + int ret = -EAGAIN; > + u64 __user *uaddr_al; > + u64 oval64, nval64, tmp; > + static const u64 lo32_mask = GENMASK_U64(31, 0); > + > + uaddr_al = (u64 __user *) ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); > + unsafe_get_user(oval64, uaddr_al, err_fault); > + > + if ((u32 __user *)uaddr_al != uaddr) { > + nval64 = ((oval64 & lo32_mask) | ((u64)newval << 32)); > + oval64 = ((oval64 & lo32_mask) | ((u64)oldval << 32)); > + } else { > + nval64 = ((oval64 & ~lo32_mask) | newval); > + oval64 = ((oval64 & ~lo32_mask) | oldval); > + } > > -static __always_inline int > -__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) > -{ > - unsigned int loops = FUTEX_MAX_LOOPS; > - int ret, oldval, tmp; > + tmp = oval64; > > - /* > - * there are no ldteor/stteor instructions... 
> - */ > - asm volatile("// __lsui_futex_atomic_eor\n" > + asm volatile("//__lsui_cmpxchg_helper\n" > __LSUI_PREAMBLE > -" prfm pstl1strm, %2\n" > -"1: ldtxr %w1, %2\n" > -" eor %w3, %w1, %w5\n" > -"2: stltxr %w0, %w3, %2\n" > -" cbz %w0, 3f\n" > -" sub %w4, %w4, %w0\n" > -" cbnz %w4, 1b\n" > -" mov %w0, %w6\n" > -"3:\n" > +"1: caslt %x1, %x3, %2\n" > +" sub %x1, %x1, %x4\n" > +" cbnz %x1, 2f\n" > +" mov %w0, %w5\n" > +"2:\n" > " dmb ish\n" > +"3:\n" > _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) > - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) > - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), > - "+r" (loops) > - : "r" (oparg), "Ir" (-EAGAIN) > + : "+r" (ret), "=&r" (oval64), "+Q" (*uaddr_al) > + : "r" (nval64), "r" (tmp), "Ir" (0) > : "memory"); > > if (!ret) > *oval = oldval; > > +err_fault: > return ret; > } > > static __always_inline int > -__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) > { > - int ret = 0; > - unsigned int loops = FUTEX_MAX_LOOPS; > - u32 val, tmp; > + return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); > +} > + > +static __always_inline int > +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) > +{ > +{ > + int ret = -EAGAIN; > + u32 oldval, newval; > > /* > - * cas{al}t doesn't support word size... > + * there are no ldteor/stteor instructions... > */ > - asm volatile("//__lsui_futex_cmpxchg\n" > - __LSUI_PREAMBLE > -" prfm pstl1strm, %2\n" > -"1: ldtxr %w1, %2\n" > -" eor %w3, %w1, %w5\n" > -" cbnz %w3, 4f\n" > -"2: stltxr %w3, %w6, %2\n" > -" cbz %w3, 3f\n" > -" sub %w4, %w4, %w3\n" > -" cbnz %w4, 1b\n" > -" mov %w0, %w7\n" > -"3:\n" > -" dmb ish\n" > -"4:\n" > - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) > - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) > - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) > - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) > - : "memory"); > + unsafe_get_user(oldval, uaddr, err_fault); > + newval = oldval ^ oparg; > > - if (!ret) > - *oval = oldval; > + ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); > > +err_fault: > return ret; > } > > +static __always_inline int > +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) > +{ > + return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); > +} > + > #define __lsui_llsc_body(op, ...) \ > ({ \ > alternative_has_cap_likely(ARM64_HAS_LSUI) ? \ > (END) > > I'm not sure this is good for you. > But If you share your thought, That's would be greatful. > (Note: > When I test with 256 threads for futex_atomic_eor op, there is not much > difference with former assembly version) > Might be all discussion seems to be going to made cmpxchg with C version. I'll respin cmpxchg with C version with the missing endianess support. Thanks. -- Sincerely, Yeoreum Yun
Hi, > On Mon, Sep 15, 2025 at 09:35:55PM +0100, Will Deacon wrote: > > On Mon, Sep 15, 2025 at 08:40:33PM +0100, Catalin Marinas wrote: > > > On Mon, Sep 15, 2025 at 11:32:39AM +0100, Yeoreum Yun wrote: > > > > So I think it would be better to keep the current LLSC implementation > > > > in LSUI. > > > > > > I think the code would look simpler with LL/SC but you can give it a try > > > and post the code sample here (not in a new series). > > > > If you stick the cas*t instruction in its own helper say, cmpxchg_user(), > > then you can do all the shifting/masking in C and I don't reckon it's > > that bad. It means we (a) get rid of exclusives, which is the whole > > point of this and (b) don't have to mess around with PAN. > > We get rid of PAN toggling already since FEAT_LSUI introduces > LDTXR/STTXR. But, I'm all for CAS if it doesn't look too bad. Easier > I think if we do a get_user() of a u64 and combine it with the futex u32 > while taking care of CPU endianness. All in a loop. Hopefully the > compiler is smart enough to reduce masking/or'ing to fewer instructions. > Hmm, I think sure shifting/masking can be replace by single bfi instruction like: diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 1d6d9f856ac5..30da0006c0c8 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -126,6 +126,59 @@ LSUI_FUTEX_ATOMIC_OP(or, ldtset, al) LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr, al) LSUI_FUTEX_ATOMIC_OP(set, swpt, al) + +#define LSUI_CMPXCHG_HELPER(suffix, start_bit) \ +static __always_inline int \ +__lsui_cmpxchg_helper_##suffix(u64 __user *uaddr, u32 oldval, u32 newval) \ +{ \ + int ret = 0; \ + u64 oval, nval, tmp; \ + \ + asm volatile("//__lsui_cmpxchg_helper_" #suffix "\n" \ + __LSUI_PREAMBLE \ +" prfm pstl1strm, %2\n" \ +"1: ldtr %x1, %2\n" \ +" bfi %x1, %x5, #" #start_bit ", #32\n" \ +" bfi %x1, %x6, #" #start_bit ", #32\n" \ +" mov %x4, %x5\n" \ +"2: caslt %x5, %x6, %2\n" \ +" sub %x4, %x4, %x5\n" \ +" cbz %x4, 3f\n" \ +" mov %w0, %w7\n" \ +"3:\n" \ +" dmb ish\n" \ +"4:\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) \ + : "+r" (ret), "=&r" (oval), "+Q" (*uaddr), "=&r" (nval), "=&r" (tmp) \ + : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) \ + : "memory"); \ + \ + return ret; \ +} + +LSUI_CMPXCHG_HELPER(lo, 0) +LSUI_CMPXCHG_HELPER(hi, 32) + +static __always_inline int +__lsui_cmpxchg_helper(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + int ret; + unsigned long uaddr_al; + + uaddr_al = ALIGN_DOWN((unsigned long)uaddr, sizeof(u64)); + + if (uaddr_al != (unsigned long)uaddr) + ret = __lsui_cmpxchg_helper_hi((u64 __user *)uaddr_al, oldval, newval); + else + ret = __lsui_cmpxchg_helper_lo((u64 __user *)uaddr_al, oldval, newval); + + if (!ret) + *oval = oldval; + + return ret; +} + static __always_inline int __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) { @@ -135,71 +188,25 @@ __lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) static __always_inline int __lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) { - unsigned int loops = FUTEX_MAX_LOOPS; - int ret, oldval, tmp; + int ret = -EAGAIN; + u32 oldval, newval; /* * there are no ldteor/stteor instructions... 
*/ - asm volatile("// __lsui_futex_atomic_eor\n" - __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -"2: stltxr %w0, %w3, %2\n" -" cbz %w0, 3f\n" -" sub %w4, %w4, %w0\n" -" cbnz %w4, 1b\n" -" mov %w0, %w6\n" -"3:\n" -" dmb ish\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), - "+r" (loops) - : "r" (oparg), "Ir" (-EAGAIN) - : "memory"); + unsafe_get_user(oldval, uaddr, err_fault); + newval = oldval ^ oparg; - if (!ret) - *oval = oldval; + ret = __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); +err_fault: return ret; } static __always_inline int __lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) { - int ret = 0; - unsigned int loops = FUTEX_MAX_LOOPS; - u32 val, tmp; - - /* - * cas{al}t doesn't support word size... - */ - asm volatile("//__lsui_futex_cmpxchg\n" - __LSUI_PREAMBLE -" prfm pstl1strm, %2\n" -"1: ldtxr %w1, %2\n" -" eor %w3, %w1, %w5\n" -" cbnz %w3, 4f\n" -"2: stltxr %w3, %w6, %2\n" -" cbz %w3, 3f\n" -" sub %w4, %w4, %w3\n" -" cbnz %w4, 1b\n" -" mov %w0, %w7\n" -"3:\n" -" dmb ish\n" -"4:\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) - : "memory"); - - if (!ret) - *oval = oldval; - - return ret; + return __lsui_cmpxchg_helper(uaddr, oldval, newval, oval); } #define __lsui_llsc_body(op, ...) \ This is based on the patch #6. Am I missing something? -- Sincerely, Yeoreum Yun
Sorry, ignore this. I sent the wrong thing :( I'll send it again.

> Hi,
>
> [...]
>
> Am I missing something?

--
Sincerely,
Yeoreum Yun
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> +#define FUTEX_ATOMIC_OP(op)                                            \
> +static __always_inline int                                             \
> +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)           \
> +{                                                                      \
> +       return __llsc_futex_atomic_##op(oparg, uaddr, oval);            \
> +}
> +
> +FUTEX_ATOMIC_OP(add)
> +FUTEX_ATOMIC_OP(or)
> +FUTEX_ATOMIC_OP(and)
> +FUTEX_ATOMIC_OP(eor)
> +FUTEX_ATOMIC_OP(set)
> +
> +static __always_inline int
> +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
> +{
> +       return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval);
> +}

The patch looks fine, and my impression is that the __futex_* functions
will be used in patch 6 to dispatch between lsui and llsc, but you got
them the other way around. I'll comment there. For this patch:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> [...]
>
> The patch looks fine, and my impression is that the __futex_* functions
> will be used in patch 6 to dispatch between lsui and llsc, but you got
> them the other way around. I'll comment there. For this patch:
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!

--
Sincerely,
Yeoreum Yun
On Fri, Sep 12, 2025 at 05:44:15PM +0100, Catalin Marinas wrote:
> On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> [...]
>
> The patch looks fine, and my impression is that the __futex_* functions
> will be used in patch 6 to dispatch between lsui and llsc, but you got
> them the other way around. I'll comment there.

Ah, ignore me, I misread patch 6. It looks good.

> For this patch:
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
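For reference, the lsui/llsc dispatch being discussed lives in patch 6, which is not quoted in
this thread. It follows the same shape as the existing __lse_ll_sc_body()/system_uses_lse_atomics()
pattern in the arm64 atomics. A minimal sketch, assuming a cpucap spelled ARM64_HAS_LSUI here;
the real capability name and plumbing come from that patch and may differ:

#define __lsui_llsc_body(op, ...)                                       \
({                                                                      \
        alternative_has_cap_likely(ARM64_HAS_LSUI) ?                    \
                __lsui_##op(__VA_ARGS__) :                              \
                __llsc_##op(__VA_ARGS__);                               \
})

/* e.g. the wrappers reviewed above would then dispatch like this */
static __always_inline int
__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
{
        return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval);
}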
On Sat, Aug 16, 2025 at 04:19:27PM +0100, Yeoreum Yun wrote:
> Refactor futex atomic operations using ll/sc method with
> clearing PSTATE.PAN to prepare to apply FEAT_LSUI on them.
>
> Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>

[...]

> -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)       \
> -do {                                                                  \
> -       unsigned int loops = FUTEX_MAX_LOOPS;                          \
> +#define LLSC_FUTEX_ATOMIC_OP(op, insn)                                 \
> +static __always_inline int                                             \
> +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)      \
> +{                                                                      \
> +       unsigned int loops = LLSC_MAX_LOOPS;                            \
> +       int ret, oldval, tmp;                                           \
>                                                                         \
>         uaccess_enable_privileged();                                    \
> -       asm volatile(                                                   \
> +       asm volatile("// __llsc_futex_atomic_" #op "\n"                 \
> "       prfm    pstl1strm, %2\n"                                        \
> "1:     ldxr    %w1, %2\n"                                              \
>         insn "\n"                                                       \
> @@ -35,45 +39,103 @@ do {                                               \
>         : "r" (oparg), "Ir" (-EAGAIN)                                   \
>         : "memory");                                                    \
>         uaccess_disable_privileged();                                   \
> -} while (0)
> +                                                                       \
> +       if (!ret)                                                       \
> +               *oval = oldval;                                         \

Why push the store to '*oval' down into here?

Will
Hi Will,

[...]

> > -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)     \
> > +#define LLSC_FUTEX_ATOMIC_OP(op, insn)                               \
> > +static __always_inline int                                           \
> > +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)    \
> > [...]
> >         uaccess_disable_privileged();                                 \
> > -} while (0)
> > +                                                                     \
> > +       if (!ret)                                                     \
> > +               *oval = oldval;                                       \
>
> Why push the store to '*oval' down into here?

Since __llsc_futex_atomic_##op() is declared as an __always_inline
function, I think it is better to pass oval down from
arch_futex_atomic_op_inuser() as it is, for readability. Is it awful?

Thanks.

--
Sincerely,
Yeoreum Yun
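In other words, the two shapes being weighed are roughly the following (hypothetical names,
not code from the series). Since the helpers are __always_inline, both should compile to much
the same thing, so the question is purely one of readability:

/* (a) as posted: each inlined helper performs the store to *oval itself */
static __always_inline int
futex_add_helper_store(int oparg, u32 __user *uaddr, int *oval)
{
        int ret, oldval;

        ret = do_llsc_add(oparg, uaddr, &oldval);       /* hypothetical LL/SC body */
        if (!ret)
                *oval = oldval;
        return ret;
}

/*
 * (b) as the question implies: helpers only report oldval, and a single
 * "if (!ret) *oval = oldval;" stays in the caller's switch statement.
 */
static __always_inline int
futex_add_caller_store(int oparg, u32 __user *uaddr, int *oldval)
{
        return do_llsc_add(oparg, uaddr, oldval);
}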