The futex hashmap is system wide and shared by random tasks. Each
futex is hashed into a slot based on its address and VMA. Due to
randomized VMAs the same logical lock (pointer) can end up in a
different hash bucket on each invocation of the application. This in
turn means that different applications may share a hash bucket on each
invocation and it is not always clear which applications will be
involved. This can result in high latencies when acquiring the
futex_hash_bucket::lock, especially if the lock owner is limited to a
CPU and cannot be effectively PI boosted.
Introduce a task local hash map. The hashmap can be allocated via

	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)

The `0' argument requests the default of 4 slots; a higher number of
slots can be specified if desired. The current upper limit is 16.

The hashmap can be shared with other threads within an application via

	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SHARE);

Once the shared hashmap has been allocated, every thread that wants to
use it must enable it via this prctl().
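For illustration, a minimal userspace sketch of the intended usage
(hypothetical example code, not part of the patch; it relies only on
the constants added to <linux/prctl.h> below and reports errors via
perror()):

	#include <sys/prctl.h>	/* pulls in the patched linux/prctl.h */
	#include <pthread.h>
	#include <stdio.h>

	static void *worker(void *arg)
	{
		/* Every additional thread opts in to the shared hashmap. */
		if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SHARE, 0, 0, 0))
			perror("PR_FUTEX_HASH_SHARE");
		/* ... use futexes / pthread primitives as usual ... */
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		/* Allocate a private hash with 8 slots; 0 would pick the
		 * default of 4 slots. */
		if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 8, 0, 0))
			perror("PR_FUTEX_HASH_ALLOCATE");

		pthread_create(&t, NULL, worker, NULL);
		pthread_join(&t, NULL);
		return 0;
	}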
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/futex.h | 8 +++
include/linux/sched.h | 2 +
include/uapi/linux/prctl.h | 5 ++
kernel/futex/core.c | 125 +++++++++++++++++++++++++++++++++++++
kernel/sys.c | 4 ++
5 files changed, 144 insertions(+)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c..e92cbea336e8e 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk)
tsk->pi_state_cache = NULL;
tsk->futex_state = FUTEX_STATE_OK;
mutex_init(&tsk->futex_exit_mutex);
+ rcu_assign_pointer(tsk->futex_hash_table, NULL);
}
void futex_exit_recursive(struct task_struct *tsk);
@@ -77,6 +78,8 @@ void futex_exec_release(struct task_struct *tsk);
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
#else
static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { }
@@ -88,6 +91,11 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
{
return -EINVAL;
}
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
#endif
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ade6417609002..8854c6029a9b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -58,6 +58,7 @@ struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
+struct futex_hash_table;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
@@ -1281,6 +1282,7 @@ struct task_struct {
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
+ struct futex_hash_table *futex_hash_table;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879b..2475b128ba85d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,9 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+/* FUTEX hash management */
+#define PR_FUTEX_HASH 74
+# define PR_FUTEX_HASH_ALLOCATE 1
+# define PR_FUTEX_HASH_SHARE 2
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index de6d7f71961eb..7c97fc96f84a3 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -39,6 +39,7 @@
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -55,6 +56,12 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
+struct futex_hash_table {
+ unsigned int slots;
+ int users;
+ spinlock_t lock;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -1040,6 +1047,9 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
static void futex_cleanup(struct task_struct *tsk)
{
+ struct futex_hash_table *fht;
+ bool need_free = false;
+
if (unlikely(tsk->robust_list)) {
exit_robust_list(tsk);
tsk->robust_list = NULL;
@@ -1054,6 +1064,23 @@ static void futex_cleanup(struct task_struct *tsk)
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
+
+ rcu_read_lock();
+ fht = rcu_dereference(current->futex_hash_table);
+ if (fht) {
+
+ spin_lock(&fht->lock);
+ fht->users--;
+ WARN_ON_ONCE(fht->users < 0);
+ if (fht->users == 0)
+ need_free = true;
+ spin_unlock(&fht->lock);
+ rcu_assign_pointer(current->futex_hash_table, NULL);
+ }
+ rcu_read_unlock();
+
+ if (need_free)
+ kfree_rcu_mightsleep(fht);
}
/**
@@ -1153,6 +1180,104 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
spin_lock_init(&fhb->lock);
}
+static int futex_hash_allocate(unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ unsigned int hash_slots = arg3;
+ struct futex_hash_table *fht;
+ size_t struct_size;
+ int i;
+
+ if (hash_slots == 0)
+ hash_slots = 4;
+ if (hash_slots < 2)
+ hash_slots = 2;
+ if (hash_slots > 16)
+ hash_slots = 16;
+ if (!is_power_of_2(hash_slots))
+ hash_slots = rounddown_pow_of_two(hash_slots);
+
+ if (current->futex_hash_table)
+ return -EALREADY;
+
+ struct_size = hash_slots * sizeof(struct futex_hash_bucket);
+ struct_size += sizeof(struct futex_hash_table);
+ fht = kmalloc(struct_size, GFP_KERNEL);
+ if (!fht)
+ return -ENOMEM;
+
+ fht->slots = hash_slots;
+ fht->users = 1;
+ spin_lock_init(&fht->lock);
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fht->queues[i]);
+
+ rcu_assign_pointer(current->futex_hash_table, fht);
+ return 0;
+}
+
+static int futex_hash_share(unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ struct futex_hash_table *fht;
+ struct task_struct *task;
+ pid_t task_pid;
+ int ret;
+
+ rcu_read_lock();
+ /* XXX maybe auto attach on fork() */
+ task_pid = task_tgid_vnr(current);
+ task = find_task_by_vpid(task_pid);
+ if (!task) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ fht = rcu_dereference(task->futex_hash_table);
+ if (!fht) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ spin_lock(&fht->lock);
+ if (fht->users <= 0) {
+ ret = -EINVAL;
+ goto unlock_out;
+ }
+ fht->users++;
+
+ rcu_assign_pointer(current->futex_hash_table, fht);
+ ret = 0;
+
+unlock_out:
+ spin_unlock(&fht->lock);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_ALLOCATE:
+ ret = futex_hash_allocate(arg3, arg4, arg5);
+ break;
+
+ case PR_FUTEX_HASH_SHARE:
+ ret = futex_hash_share(arg3, arg4, arg5);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
unsigned int futex_shift;
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda81..0dcbb8ce9f19d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2784,6 +2785,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_RISCV_SET_ICACHE_FLUSH_CTX:
error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
--
2.45.2
On Sun, Oct 27, 2024 at 12:34:51AM +0200, Sebastian Andrzej Siewior wrote:

> Introduce a task local hash map. The hashmap can be allocated via
>	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)

Per process, per task is useless and will make things malfunction.

Things missing in this patch are CLONE_THREAD / CLONE_VM, and create
must be absolutely forbidden once mm_users != 1.
On 2024-10-28 11:16:33 [+0100], Peter Zijlstra wrote:
> On Sun, Oct 27, 2024 at 12:34:51AM +0200, Sebastian Andrzej Siewior wrote:
>
> > Introduce a task local hash map. The hashmap can be allocated via
> >	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)
>
> Per process, per task is useless and will make things malfunction.
>
> Things missing in this patch are CLONE_THREAD / CLONE_VM, and create
> must be absolutely forbidden once mm_users != 1.

I moved this to struct signal_struct and limited it for now to the
group leader.

Sebastian
On Mon, Oct 28, 2024 at 11:24:08AM +0100, Sebastian Andrzej Siewior wrote:
> On 2024-10-28 11:16:33 [+0100], Peter Zijlstra wrote:
> > On Sun, Oct 27, 2024 at 12:34:51AM +0200, Sebastian Andrzej Siewior wrote:
> >
> > > Introduce a task local hash map. The hashmap can be allocated via
> > >	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)
> >
> > Per process, per task is useless and will make things malfunction.
> >
> > Things missing in this patch are CLONE_THREAD / CLONE_VM, and create
> > must be absolutely forbidden once mm_users != 1.
>
> I moved this to struct signal_struct and limited it for now to the
> group leader.

That works I suppose.

'process' is a really dodgy thing in Linux anyway :/

So CLONE_THREAD implies CLONE_SIGHAND, and CLONE_SIGHAND in turn implies
CLONE_VM -- however, you can do CLONE_VM without either SIGHAND or
THREAD, (or SIGHAND|VM without THREAD).

And it all quickly becomes a real mess.

'Sane' userspace doesn't play such games, and insane userspace gets to
keep the pieces I suppose.
On Mon, Oct 28, 2024 at 11:46:39AM +0100, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 11:24:08AM +0100, Sebastian Andrzej Siewior wrote:
> > On 2024-10-28 11:16:33 [+0100], Peter Zijlstra wrote:
> > > On Sun, Oct 27, 2024 at 12:34:51AM +0200, Sebastian Andrzej Siewior wrote:
> > > >
> > > > Introduce a task local hash map. The hashmap can be allocated via
> > > >	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)
> > >
> > > Per process, per task is useless and will make things malfunction.
> > >
> > > Things missing in this patch are CLONE_THREAD / CLONE_VM, and create
> > > must be absolutely forbidden once mm_users != 1.
> >
> > I moved this to struct signal_struct and limited it for now to the
> > group leader.
>
> That works I suppose.
>
> 'process' is a really dodgy thing in Linux anyway :/
>
> So CLONE_THREAD implies CLONE_SIGHAND, and CLONE_SIGHAND in turn implies
> CLONE_VM -- however, you can do CLONE_VM without either SIGHAND or
> THREAD, (or SIGHAND|VM without THREAD).
>
> And it all quickly becomes a real mess.
>
> 'Sane' userspace doesn't play such games, and insane userspace gets to
> keep the pieces I suppose.

Bah, I now remember there used to be (or maybe there still are, who
knows) a JVM that used a 'naked' CLONE_VM. And JVMs are also known to
use futex.

That would suggest putting the hash in mm_struct *or* make sure to
disallow / warn about these private hash when !CLONE_THREAD.
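To make the clone() flag rules above concrete, here is a hypothetical
userspace sketch of the two variants under discussion (illustrative
only; the flag semantics are as documented in clone(2), everything else
is made up for the example):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	#define STACK_SZ	(256 * 1024)

	static int child_fn(void *arg)
	{
		return 0;
	}

	int main(void)
	{
		char *st1 = malloc(STACK_SZ), *st2 = malloc(STACK_SZ);
		pid_t pid;

		/* A regular thread: CLONE_THREAD requires CLONE_SIGHAND,
		 * which requires CLONE_VM, so both the mm and the
		 * signal_struct are shared. A hash hung off signal_struct
		 * is visible to this child. */
		clone(child_fn, st1 + STACK_SZ,
		      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
		      CLONE_THREAD, NULL);

		/* A 'naked' CLONE_VM child: same address space (and thus
		 * the same futexes) but its own signal_struct -- a hash
		 * hung off signal_struct would not be shared with it,
		 * which is the argument for mm_struct. */
		pid = clone(child_fn, st2 + STACK_SZ, CLONE_VM | SIGCHLD,
			    NULL);
		if (pid > 0)
			waitpid(pid, NULL, 0);
		return 0;
	}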
On Sun, Oct 27 2024 at 00:34, Sebastian Andrzej Siewior wrote:
> static void futex_cleanup(struct task_struct *tsk)
> {
> + struct futex_hash_table *fht;
> + bool need_free = false;
> +
> if (unlikely(tsk->robust_list)) {
> exit_robust_list(tsk);
> tsk->robust_list = NULL;
> @@ -1054,6 +1064,23 @@ static void futex_cleanup(struct task_struct *tsk)
>
> if (unlikely(!list_empty(&tsk->pi_state_list)))
> exit_pi_state_list(tsk);
> +
> + rcu_read_lock();
> + fht = rcu_dereference(current->futex_hash_table);
> + if (fht) {
> +
> + spin_lock(&fht->lock);
I don't think you need a spin lock for this.
> + fht->users--;
> + WARN_ON_ONCE(fht->users < 0);
> + if (fht->users == 0)
> + need_free = true;
> + spin_unlock(&fht->lock);
> + rcu_assign_pointer(current->futex_hash_table, NULL);
> + }
> + rcu_read_unlock();
scoped_guard (rcu) {
fht = rcu_dereference(current->futex_hash_table);
if (fht && rcuref_put_rcusafe(&fht->refcnt))
rcu_assign_pointer(current->futex_hash_table, NULL);
else
fht = NULL;
}
kfree_rcu_mightsleep(fht);
Hmm? But see below
> +static int futex_hash_allocate(unsigned long arg3, unsigned long arg4,
> + unsigned long arg5)
> +{
> + unsigned int hash_slots = arg3;
> + struct futex_hash_table *fht;
> + size_t struct_size;
> + int i;
> +
> + if (hash_slots == 0)
> + hash_slots = 4;
> + if (hash_slots < 2)
> + hash_slots = 2;
> + if (hash_slots > 16)
> + hash_slots = 16;
> + if (!is_power_of_2(hash_slots))
> + hash_slots = rounddown_pow_of_two(hash_slots);
> +
> + if (current->futex_hash_table)
> + return -EALREADY;
You also need to check whether a hash table exists already in the
process. The table must be process shared to make sense. So it should
live in current->signal, which is a valid pointer in the context of
current.
So this needs some more thoughts especially vs. automatic creation and
sharing.
The first question is whether the hash table might have been already
created when the application reaches main(). If so then this call will
fail.
To make this work correctly, this needs proper serialization as well.
Assume a situation where the application does not allocate a table
upfront and the first futex use happens late when threads are running
already.
CPU0 CPU1
T1 T2
sys_futex() sys_futex()
create_hash() create_hash()
So T1 and T2 create their local hash and the subsequent usage will fail
because they operate on different hashes. You have the same problem
vs. your allocation scheme when two threads do prctl(ALLOC). We really
want to make this as simple as possible.
sys_futex()
...
if (private)
hash = private_hash(current);
else
hash = &global_hash;
private_hash()
hash = READ_ONCE(current->signal->futex_hash);
if (hash)
return hash;
guard(spinlock_irq)(&task->sighand->siglock);
hash = current->signal->futex_hash;
if (hash)
return hash;
hash = alloc_hash();
WRITE_ONCE(current->signal->futex_hash, hash);
return hash;
alloc_hash()
if (!local_hash_enabled)
return &global_hash;
hash = alloc();
return hash ?: &global_hash;
futex_cleanup_hash()
scoped_guard(spinlock_irq, &current->sighand->siglock) {
hash = current->signal->futex_hash;
if (!hash || current->signal->nr_threads > 1)
return;
current->signal->futex_hash = NULL;
if (hash == &global_hash)
return;
}
kfree_rcu_mightsleep(hash);
Or something like that.
Thanks,
tglx
On 2024-10-27 13:19:54 [+0100], Thomas Gleixner wrote:
> > +	if (current->futex_hash_table)
> > +		return -EALREADY;
>
> You also need to check whether a hash table exists already in the
> process. The table must be process shared to make sense. So it should
> live in current->signal, which is a valid pointer in the context of
> current.
>
> So this needs some more thoughts especially vs. automatic creation and
> sharing.
>
> The first question is whether the hash table might have been already
> created when the application reaches main(). If so then this call will
> fail.
>
> To make this work correctly, this needs proper serialization as well.
>
> Assume a situation where the application does not allocate a table
> upfront and the first futex use happens late when threads are running
> already.
>
>        CPU0            CPU1
>        T1              T2
>        sys_futex()     sys_futex()
>        create_hash()   create_hash()
>
> So T1 and T2 create their local hash and the subsequent usage will fail
> because they operate on different hashes. You have the same problem
> vs. your allocation scheme when two threads do prctl(ALLOC). We really
> want to make this as simple as possible.

So I moved this to struct signal_struct and limited allocation to the
group leader.

You want automated creation of this? For everyone or with a hint? This
is 64 bytes per slot due to the cache alignment but even without this
the struct takes 56 bytes on PREEMPT_RT and 24 bytes on non-RT. So the
four slots are 256 bytes. Assuming 2.5k tasks it takes 625 KiB. Maybe
not that much.

Let me post v2 with the signal_struct and then think about auto ON.

> Thanks,
>
>         tglx

Sebastian
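For reference, the arithmetic behind Sebastian's 625 KiB estimate (a
back-of-the-envelope calculation spelled out here, not part of the
mail):

	4 slots/process * 64 bytes/slot     =     256 bytes/process
	2500 processes  * 256 bytes/process = 640000 bytes = 625 KiB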
On Mon, Oct 28 2024 at 11:30, Sebastian Andrzej Siewior wrote:
> On 2024-10-27 13:19:54 [+0100], Thomas Gleixner wrote:
>> So T1 and T2 create their local hash and the subsequent usage will fail
>> because they operate on different hashes. You have the same problem
>> vs. your allocation scheme when two threads do prctl(ALLOC). We really
>> want to make this as simple as possible.
>
> So I moved this to struct signal_struct and limited allocation to the
> group leader.
>
> You want automated creation of this? For everyone or with a hint? This
> is 64 bytes per slot due to the cache alignment but even without this
> the struct takes 56 bytes on PREEMPT_RT and 24 bytes on non-RT. So the
> four slots are 256 bytes. Assuming 2.5k tasks it takes 625 KiB. Maybe
> not that much.
>
> Let me post v2 with the signal_struct and then think about auto ON.
It only affects actual futex users. A lot of executables never use
them. For ease of testing, can you please make this automatic so there
is no need to modify a test case?
Aside of that for RT we really want it automatically enabled and as
Linus suggested back then probably for NUMA too.
Stick a trace point or a debugfs counter into the allocation so you can
observe how many of those are actually allocated and used concurrently.
Thanks,
tglx
On Mon, Oct 28, 2024 at 11:58:18AM +0100, Thomas Gleixner wrote:
> On Mon, Oct 28 2024 at 11:30, Sebastian Andrzej Siewior wrote:
> > On 2024-10-27 13:19:54 [+0100], Thomas Gleixner wrote:
> >> So T1 and T2 create their local hash and the subsequent usage will fail
> >> because they operate on different hashes. You have the same problem
> >> vs. your allocation scheme when two threads do prctl(ALLOC). We really
> >> want to make this as simple as possible.
> >
> > So I moved this to struct signal_struct and limited allocation to the
> > group leader.
> >
> > You want automated creation of this? For everyone or with a hint? This
> > is 64 bytes per slot due to the cache alignment but even without this
> > the struct takes 56 bytes on PREEMPT_RT and 24 bytes on non-RT. So the
> > four slots are 256 bytes. Assuming 2.5k tasks it takes 625 KiB. Maybe
> > not that much.
> >
> > Let me post v2 with the signal_struct and then think about auto ON.
>
> It only affects actual futex users. A lot of executables never use
> them. For ease of testing, can you please make this automatic so there
> is no need to modify a test case?
>
> Aside of that for RT we really want it automatically enabled and as
> Linus suggested back then probably for NUMA too.
>
> Stick a trace point or a debugfs counter into the allocation so you can
> observe how many of those are actually allocated and used concurrently.

Ideally it would re-hash and auto-scale to something like 4*nr_threads,
but I realize that's probably a pain in the arse to make happen.
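A hypothetical sizing rule along those lines (an illustrative sketch
only -- the function name, the 4x factor and the upper bound are made
up here, not taken from any posted patch):

	/*
	 * Scale the private hash with the thread count, round up to a
	 * power of two so the lookup stays a simple mask, and clamp to
	 * sane bounds (the cap of 65536 is arbitrary for illustration).
	 */
	static unsigned int futex_hash_slots(unsigned int nr_threads)
	{
		unsigned int slots = roundup_pow_of_two(4 * nr_threads);

		return clamp(slots, 4U, 65536U);
	}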
On Mon, Oct 28 2024 at 12:00, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 11:58:18AM +0100, Thomas Gleixner wrote:
>> > Let me post v2 with the signal_struct and then think about auto ON.
>>
>> It only affects actual futex users. A lot of executables never use
>> them. For ease of testing, can you please make this automatic so there
>> is no need to modify a test case?
>>
>> Aside of that for RT we really want it automatically enabled and as
>> Linus suggested back then probably for NUMA too.
>>
>> Stick a trace point or a debugfs counter into the allocation so you can
>> observe how many of those are actually allocated and used concurrently.
>
> Ideally it would re-hash and auto-scale to something like 4*nr_threads,
> but I realize that's probably a pain in the arse to make happen.
That's what we did with the original series, but with this model it's
daft. What we maybe could do there is:
private_hash()
scoped_guard(rcu) {
hash = rcu_dereference(current->signal->futex_hash);
if (hash && rcuref_get(&hash->ref))
return hash;
}
guard(spinlock_irq)(&task->sighand->siglock);
hash = current->signal->futex_hash;
if (hash && rcuref_get(&hash->ref))
return hash;
// Let alloc scale according to signal->nr_threads
// alloc acquires a reference count
....
And on fork do the following:
scoped_guard(spinlock_irq, &task->sighand->siglock) {
hash = current->signal->futex_hash;
if (!hash || hash_size_ok())
return hash;
// Drop the initial reference, which forces the last
// user and subsequent new users into the respective
// slow paths, where they get stuck on sighand lock.
if (!rcuref_put(&hash->ref))
return;
// rcuref_put() dropped the last reference
old_hash = realloc_hash(hash);
hash = current->signal->futex_hash;
}
kfree_rcu(old_hash);
return hash;
A similar logic is required when putting the last reference
futex_hash_put()
{
if (!rcuref_put(&hash->ref))
return;
scoped_guard(spinlock_irq, &task->sighand->siglock) {
// Fork might have raced with this
if (hash != current->signal->futex_hash)
return;
old_hash = realloc_hash(hash);
}
kfree_rcu(old_hash);
}
realloc_hash(old_hash)
{
new_hash = alloc();
if (!new_hash) {
// Make the old hash alive again
rcuref_init(&old_hash->ref);
return NULL;
}
rehash(old_hash, new_hash);
rcu_assign_pointer(current->signal->futex_hash, new_hash);
return old_hash;
}
Or something like that. On the usage site this needs
// Takes a reference count on the hash
hb = futex_hash(key);
lock(hb);
queue();
unlock(hb);
futex_hash_put(hb);
which means, after the put @hb is no longer valid as the rehashing
might happen right in the put or afterwards. That needs some auditing of
the usage sites, but that should work. Whether it's worth it is a
different question.
Thanks,
tglx
On Mon, Oct 28, 2024 at 01:02:34PM +0100, Thomas Gleixner wrote:
> That's what we did with the original series, but with this model it's
> daft. What we maybe could do there is:
Not sure what's daft -- a single JVM running on 400+ CPUs with 4
hashbuckets sounds awesome.
>
> private_hash()
> scoped_guard(rcu) {
> hash = rcu_dereference(current->signal->futex_hash);
So I really do think mm_struct is a better place for this than signal
struct -- CLONE_SIGHAND is not mandatory when CLONE_VM.
I've long forgotten which JVM used the naked CLONE_VM, but there is some
creative code out there.
And futexes fundamentally live in the memory address space.
> if (hash && rcuref_get(&hash->ref))
> return hash;
> }
>
> guard(spinlock_irq)(&task->sighand->siglock);
> hash = current->signal->futex_hash;
> if (hash && rcuref_get(&hash->ref))
> return hash;
> // Let alloc scale according to signal->nr_threads
mm->mm_users
> // alloc acquires a reference count
> ....
It might make sense to have a prctl() setting that inhibits the hash
allocation entirely, reverting back to the global hash tables.
> And on fork do the following:
>
> scoped_guard(spinlock_irq, &task->sighand->siglock) {
> hash = current->signal->futex_hash;
> if (!hash || hash_size_ok())
> return hash;
>
> // Drop the initial reference, which forces the last
> // user and subsequent new users into the respective
> // slow paths, where they get stuck on sighand lock.
> if (!rcuref_put(&hash->ref))
> return;
>
> // rcuref_put() dropped the last reference
> old_hash = realloc_hash(hash);
> hash = current->signal->futex_hash;
> }
> kfree_rcu(old_hash);
> return hash;
>
> A similar logic is required when putting the last reference
>
> futex_hash_put()
> {
> if (!rcuref_put(&hash->ref))
> return;
>
> scoped_guard(spinlock_irq, &task->sighand->siglock) {
> // Fork might have raced with this
> if (hash != current->signal->futex_hash)
> return;
> old_hash = realloc_hash(hash);
> }
> kfree_rcu(old_hash);
> }
I'm not sure having that rehash under siglock is a fine idea. It's
convenient, no doubt, but urgh, could get expensive.
Another scheme would be to have 2 concurrent hash-tables for a little
while.
On Wed, Oct 30 2024 at 22:08, Peter Zijlstra wrote:
> On Mon, Oct 28, 2024 at 01:02:34PM +0100, Thomas Gleixner wrote:
>
>> That's what we did with the original series, but with this model it's
>> daft. What we maybe could do there is:
>
> Not sure what's daft -- a single JVM running on 400+ CPUs with 4
> hashbuckets sounds awesome.
Hahaha. You're right and I discussed this with Sebastian earlier
today. He's going to do some research on scaling of the hash vs. number
of threads (manually for now).
Vs. daft: The original 'register first' model was trivial in that
regard. "Automagic" requires way more to get it right.
The global hash size is next_power_of_2(256 * num_possible CPUs). So
with 256 CPUs this ends up with 64k hash buckets (4M of memory).
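The arithmetic behind those figures, assuming the 64 bytes per bucket
mentioned earlier in the thread:

	next_power_of_2(256 * 256 CPUs)  = 65536 buckets (64k)
	65536 buckets * 64 bytes/bucket  = 4 MiB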
As one thread of a multithreaded application can only have one futex
queued at a time, it's reasonable to size that according to the number
of threads.
But from my memory of investigating JVM and database muck there can be a
gazillion of futexes in the process. So depending on the scaling factor
(number of buckets per thread) the resulting hash distribution might be
truly bad and cause a vast amount of hash collisions if the hash size is
too small. Even if the bucket lock is not the problem, then the list
walk will show up as a cache miss sh*tshow.
Let's see what Sebastian's experiments will unearth.
>> private_hash()
>> scoped_guard(rcu) {
>> hash = rcu_dereference(current->signal->futex_hash);
>
> So I really do think mm_struct is a better place for this than signal
> struct -- CLONE_SIGHAND is not mandatory when CLONE_VM.
>
> I've long forgotten which JVM used the naked CLONE_VM, but there is some
> creative code out there.
>
> And futexes fundamentally live in the memory address space.
Agreed.
>> if (hash && rcuref_get(&hash->ref))
>> return hash;
>> }
>>
>> guard(spinlock_irq)(&task->sighand->siglock);
>> hash = current->signal->futex_hash;
>> if (hash && rcuref_get(&hash->ref))
>> return hash;
>> // Let alloc scale according to signal->nr_threads
>
> mm->mm_users
>
>> // alloc acquires a reference count
>> ....
>
> It might make sense to have a prctl() setting that inhibits the hash
> allocation entirely, reverting back to the global hash tables.
Right. It might even be the default to begin with. See below.
> I'm not sure having that rehash under siglock is a fine idea. It's
> convenient, no doubt, but urgh, could get expensive.
If we move it to mm, then it will be a separate lock anyway, a mutex.
> Another scheme would be to have 2 concurrent hash-tables for a little
> while.
Thought about that before and discarded it as too complex, but now that
you mentioned it again, let me explain why and you can tell me that I'm
failing to see the obvious.
As that switch over needs to be lockless on the usage site, this opens
another can of worms.
struct mm_futex {
struct futex_hash __rcu *cur;
struct futex_hash __rcu *prev;
mutex_t mutex;
....
};
The reallocation part becomes obviously trivial
mutex_lock(&mm->futex.mutex);
old = mm->futex.cur;
new = alloc(...);
rcu_assign_pointer(mm->futex.prev, old);
rcu_assign_pointer(mm->futex.cur, new);
if (rcuref_put(&old->ref)) {
// old is invalid now
rcu_assign_pointer(mm->futex.prev, NULL);
kfree_rcu(old);
}
mutex_unlock(&mm->futex.mutex);
So enqueue will always use mm->futex.cur which it sees when getting a
refcount. If the get() fails it has to retry. Trivial enough.
wake/requeue gets more interesting because it has to check the old hash
first (if it exists) and move the entries in the hash bucket over,
otherwise we end up with dealing with that nonsense in the actual
wait/requeue guts. That needs to be done at _every_ operation ...
That might be actually doable (or not), but it requires a non-trivial
amount of complexity especially vs. the hb->waiters optimization.
If that's handled, then it falls flat when a futex is used as a wait
queue. There might be a thread sitting on it waiting for an event which
never happens, which means the old hash never gets removed. Then the
process gets another pile of threads and can't expand the hash because
there are already two instances.
Remember: Futex code consists of 5% functionality and 95% corner case
handling already. No need to up this to 98+% :)
I really want to start simple and let the process' futex usage come to a
grinding halt when the resizing and rehashing takes place.
Resizing won't happen every other millisecond especially as the hash
size has to double every time. And we'll never downsize.
Any smart application will resize on start or the admin will tweak the
hash size or disable it with whatever magic mechanism we will provide.
That said for the sake of least surprise, this might actually require to
be opt-in to begin with. At least at the system level it has to be
disabled by default (unless configured otherwise via config or command
line).
I really want to know who thought that futexes are a good idea to begin
with. Once we figured that out I need to find those who added all the
other complexity on top of that bad idea :)
Thanks,
tglx
On Thu, Oct 31, 2024 at 12:14:16AM +0100, Thomas Gleixner wrote:
> If that's handled, then it falls flat when a futex is used as a wait
> queue. There might be a thread sitting on it waiting for an event which
> never happens, which means the old hash never gets removed. Then the
> process gets another pile of threads and can't expand the hash because
> there are already two instances.

Damn... I mean it means a userspace thread is basically stuck forever,
but that's not our problem. Yes this is annoying.

The whole requeue on lookup and hb->waiters thing looks simple enough,
but this is a bit annoying. I can probably make it work though,
however..

> I really want to start simple and let the process' futex usage come to
> a grinding halt when the resizing and rehashing takes place.

Fair enough. Lets do the simple thing first.

> I really want to know who thought that futexes are a good idea to begin
> with. Once we figured that out I need to find those who added all the
> other complexity on top of that bad idea :)

If memory serves me, it was some tall German dude that did most of that.