[PATCH] futex: improve user space accesses

Linus Torvalds posted 1 patch 1 year, 2 months ago
arch/x86/include/asm/futex.h |  8 +++--
kernel/futex/core.c          | 22 --------------
kernel/futex/futex.h         | 59 ++++++++++++++++++++++++++++++++++--
3 files changed, 63 insertions(+), 26 deletions(-)
[PATCH] futex: improve user space accesses
Posted by Linus Torvalds 1 year, 2 months ago
Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9%
improvement" report for his patch that changed __get_user() to use
pointer masking instead of the explicit speculation barrier.  However,
that patch doesn't actually work in the general case, because some (very
bad) architecture-specific code actually depends on __get_user() also
working on kernel addresses.

A profile showed that the offending __get_user() was the futex code,
which really should be fixed up to not use that horrid legacy case.
Rewrite futex_get_value_locked() to use the modern user acccess helpers,
and inline it so that the compiler not only avoids the function call for
a few instructions, but can do CSE on the address masking.

It also turns out the x86 futex functions have unnecessary barriers in
other places, so let's fix those up too.

Link: https://lore.kernel.org/all/20241115230653.hfvzyf3aqqntgp63@jpoimboe/
Reported-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/futex.h |  8 +++--
 kernel/futex/core.c          | 22 --------------
 kernel/futex/futex.h         | 59 ++++++++++++++++++++++++++++++++++--
 3 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 99d345b686fa..6e2458088800 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -48,7 +48,9 @@ do {								\
 static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 		u32 __user *uaddr)
 {
-	if (!user_access_begin(uaddr, sizeof(u32)))
+	if (can_do_masked_user_access())
+		uaddr = masked_user_access_begin(uaddr);
+	else if (!user_access_begin(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	switch (op) {
@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 {
 	int ret = 0;
 
-	if (!user_access_begin(uaddr, sizeof(u32)))
+	if (can_do_masked_user_access())
+		uaddr = masked_user_access_begin(uaddr);
+	else if (!user_access_begin(uaddr, sizeof(u32)))
 		return -EFAULT;
 	asm volatile("\n"
 		"1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 326bfe6549d7..9833fdb63e39 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *
 	return NULL;
 }
 
-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
-{
-	int ret;
-
-	pagefault_disable();
-	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
-	pagefault_enable();
-
-	return ret;
-}
-
-int futex_get_value_locked(u32 *dest, u32 __user *from)
-{
-	int ret;
-
-	pagefault_disable();
-	ret = __get_user(*dest, from);
-	pagefault_enable();
-
-	return ret ? -EFAULT : 0;
-}
-
 /**
  * wait_for_owner_exiting - Block until the owner has exited
  * @ret: owner's current futex lock status
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 8b195d06f4e8..323572014b32 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -6,6 +6,7 @@
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
 #include <linux/compat.h>
+#include <linux/uaccess.h>
 
 #ifdef CONFIG_PREEMPT_RT
 #include <linux/rcuwait.h>
@@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct futex_q *q);
 extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
 
 extern int fault_in_user_writeable(u32 __user *uaddr);
-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
-extern int futex_get_value_locked(u32 *dest, u32 __user *from);
 extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
 
+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+	pagefault_enable();
+
+	return ret;
+}
+
+/*
+ * This does a plain atomic user space read, and the user pointer has
+ * already been verified earlier by get_futex_key() to be both aligned
+ * and actually in user space, just like futex_atomic_cmpxchg_inatomic().
+ *
+ * We still want to avoid any speculation, and while __get_user() is
+ * the traditional model for this, it's actually slower then doing
+ * this manually these days.
+ *
+ * We could just have a per-architecture special function for it,
+ * the same way we do futex_atomic_cmpxchg_inatomic(), but rather
+ * than force everybody to do that, write it out long-hand using
+ * the low-level user-access infrastructure.
+ *
+ * This looks a bit overkill, but generally just results in a couple
+ * of instructions.
+ */
+static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
+{
+	u32 val;
+
+	if (can_do_masked_user_access())
+		from = masked_user_access_begin(from);
+	else if (!user_read_access_begin(from, sizeof(*from)))
+		return -EFAULT;
+	unsafe_get_user(val, from, Efault);
+	user_access_end();
+	*dest = val;
+	return 0;
+Efault:
+	user_access_end();
+	return -EFAULT;
+}
+
+static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = futex_read_inatomic(dest, from);
+	pagefault_enable();
+
+	return ret;
+}
+
 extern void __futex_unqueue(struct futex_q *q);
 extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
 extern int futex_unqueue(struct futex_q *q);
-- 
2.47.0.293.g502c15559b
Re: [PATCH] futex: improve user space accesses
Posted by Andreas Schwab 1 year, 2 months ago
On Nov 22 2024, Linus Torvalds wrote:

> Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9%
> improvement" report for his patch that changed __get_user() to use
> pointer masking instead of the explicit speculation barrier.  However,
> that patch doesn't actually work in the general case, because some (very
> bad) architecture-specific code actually depends on __get_user() also
> working on kernel addresses.
>
> A profile showed that the offending __get_user() was the futex code,
> which really should be fixed up to not use that horrid legacy case.
> Rewrite futex_get_value_locked() to use the modern user acccess helpers,
> and inline it so that the compiler not only avoids the function call for
> a few instructions, but can do CSE on the address masking.

This breaks userspace on ppc32.  As soon as /init in the initrd is
started the kernel hangs (without any messages).

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."
Re: [PATCH] futex: improve user space accesses
Posted by Linus Torvalds 1 year, 2 months ago
On Sun, 8 Dec 2024 at 14:54, Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> This breaks userspace on ppc32.  As soon as /init in the initrd is
> started the kernel hangs (without any messages).

Funky, funky. Most of the diff is the code movement (and some small
x86-specific stuff), so for ppc, the only part that should be relevant
is the futex_get_value_locked().

And since ppc doesn't do the masked user access thing, so it
*literally* boils down to just that

        if (!user_read_access_begin(from, sizeof(*from)))
                return -EFAULT;
        unsafe_get_user(val, from, Efault);
        user_access_end();

path.

Ahh... And now that I write that out, the bug is obvious: it should be using

        user_read_access_end();

to match up with the user_read_access_begin().

And yeah, ppc is the only platform that has that
"read-vs-write-vs-both" thing, so this bug is not visible anywhere
else.

IOW, does this one-liner fix it for you?

  --- a/kernel/futex/futex.h
  +++ b/kernel/futex/futex.h
  @@ -265,7 +265,7 @@
        else if (!user_read_access_begin(from, sizeof(*from)))
                return -EFAULT;
        unsafe_get_user(val, from, Efault);
  -     user_access_end();
  +     user_read_access_end();
        *dest = val;
        return 0;
   Efault:

I bet it does, but I'll wait for confirmation before actually
committing that fix.

Thanks,

                Linus
Re: [PATCH] futex: improve user space accesses
Posted by Andreas Schwab 1 year, 1 month ago
On Dez 08 2024, Linus Torvalds wrote:

> IOW, does this one-liner fix it for you?
>
>   --- a/kernel/futex/futex.h
>   +++ b/kernel/futex/futex.h
>   @@ -265,7 +265,7 @@
>         else if (!user_read_access_begin(from, sizeof(*from)))
>                 return -EFAULT;
>         unsafe_get_user(val, from, Efault);
>   -     user_access_end();
>   +     user_read_access_end();
>         *dest = val;
>         return 0;
>    Efault:

Thanks, I can confirm that this fixed the crash (changing both arms as
pointed out by Christophe).

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."
Re: [PATCH] futex: improve user space accesses
Posted by Christophe Leroy 1 year, 2 months ago

Le 09/12/2024 à 01:32, Linus Torvalds a écrit :
> On Sun, 8 Dec 2024 at 14:54, Andreas Schwab <schwab@linux-m68k.org> wrote:
>>
>> This breaks userspace on ppc32.  As soon as /init in the initrd is
>> started the kernel hangs (without any messages).
> 
> Funky, funky. Most of the diff is the code movement (and some small
> x86-specific stuff), so for ppc, the only part that should be relevant
> is the futex_get_value_locked().
> 
> And since ppc doesn't do the masked user access thing, so it
> *literally* boils down to just that
> 
>          if (!user_read_access_begin(from, sizeof(*from)))
>                  return -EFAULT;
>          unsafe_get_user(val, from, Efault);
>          user_access_end();
> 
> path.
> 
> Ahh... And now that I write that out, the bug is obvious: it should be using
> 
>          user_read_access_end();
> 
> to match up with the user_read_access_begin().

Yes indeed, especially on book3s/32, which is only able to write-protect 
user accesses. On that platform user_read_access_...() are no-ops.

user_access_end() and user_write_access_end() are similar, and rely on a 
thread var stored by user_access_begin(). When calling that 
user_access_end() without prior call to user_access_begin(), that var 
has value ~0 instead of the address of the user segment being accessed, 
and ~0 is a kernel address so user_access_end() applies some user 
segment flags to a kernel segment which most likely leads to a complete 
mess allthough I'm not able to trigger the hang with QEMU.

> 
> And yeah, ppc is the only platform that has that
> "read-vs-write-vs-both" thing, so this bug is not visible anywhere
> else.
> 
> IOW, does this one-liner fix it for you?
> 
>    --- a/kernel/futex/futex.h
>    +++ b/kernel/futex/futex.h
>    @@ -265,7 +265,7 @@
>          else if (!user_read_access_begin(from, sizeof(*from)))
>                  return -EFAULT;
>          unsafe_get_user(val, from, Efault);
>    -     user_access_end();
>    +     user_read_access_end();
>          *dest = val;
>          return 0;
>     Efault:
> 
> I bet it does, but I'll wait for confirmation before actually
> committing that fix.
> 

You'll need the same change in the Efault leg.

Christophe
Re: [PATCH] futex: improve user space accesses
Posted by Josh Poimboeuf 1 year, 2 months ago
On Fri, Nov 22, 2024 at 11:33:05AM -0800, Linus Torvalds wrote:
> Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9%
> improvement" report for his patch that changed __get_user() to use
> pointer masking instead of the explicit speculation barrier.  However,
> that patch doesn't actually work in the general case, because some (very
> bad) architecture-specific code actually depends on __get_user() also
> working on kernel addresses.
> 
> A profile showed that the offending __get_user() was the futex code,
> which really should be fixed up to not use that horrid legacy case.
> Rewrite futex_get_value_locked() to use the modern user acccess helpers,
> and inline it so that the compiler not only avoids the function call for
> a few instructions, but can do CSE on the address masking.
> 
> It also turns out the x86 futex functions have unnecessary barriers in
> other places, so let's fix those up too.
> 
> Link: https://lore.kernel.org/all/20241115230653.hfvzyf3aqqntgp63@jpoimboe/
> Reported-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

I didn't get a chance to try to recreate the original benchmark, but
this looks obviously correct.

Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>

-- 
Josh