[v8] futex: Add support task local hash maps.

[PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Sebastian Andrzej Siewior 1 year ago

The futex hashmap is system wide and shared by random tasks. Each slot
is hashed based on its address and VMA. Due to randomized VMAs (and
memory allocations) the same logical lock (pointer) can end up in a
different hash bucket on each invocation of the application. This in
turn means that different applications may share a hash bucket on the
first invocation but not on the second an it is not always clear which
applications will be involved. This can result in high latency's to
acquire the futex_hash_bucket::lock especially if the lock owner is
limited to a CPU and not be effectively PI boosted.

Introduce a task local hash map. The hashmap can be allocated via
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 0)

The `0' argument allocates a default number of 16 slots, a higher number
can be specified if desired. The current upper limit is 131072.
The allocated hashmap is used by all threads within a process.
A thread can check if the private map has been allocated via
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);

Which return the current number of slots.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/futex.h      |  20 ++++++++
 include/linux/mm_types.h   |   6 ++-
 include/uapi/linux/prctl.h |   5 ++
 kernel/fork.c              |   2 +
 kernel/futex/core.c        | 101 +++++++++++++++++++++++++++++++++++--
 kernel/sys.c               |   4 ++
 6 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c..943828db52234 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -77,6 +77,15 @@ void futex_exec_release(struct task_struct *tsk);
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3);
+int futex_hash_allocate_default(void);
+void futex_hash_free(struct mm_struct *mm);
+
+static inline void futex_mm_init(struct mm_struct *mm)
+{
+	mm->futex_hash_bucket = NULL;
+}
+
 #else
 static inline void futex_init_task(struct task_struct *tsk) { }
 static inline void futex_exit_recursive(struct task_struct *tsk) { }
@@ -88,6 +97,17 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
 {
 	return -EINVAL;
 }
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
+{
+	return -EINVAL;
+}
+static inline int futex_hash_allocate_default(void)
+{
+	return 0;
+}
+static inline void futex_hash_free(struct mm_struct *mm) { }
+static inline void futex_mm_init(struct mm_struct *mm) { }
+
 #endif
 
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b27db7f94963..c20f2310d78ca 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -30,6 +30,7 @@
 #define INIT_PASID	0
 
 struct address_space;
+struct futex_hash_bucket;
 struct mem_cgroup;
 
 /*
@@ -936,7 +937,10 @@ struct mm_struct {
 		 */
 		seqcount_t mm_lock_seq;
 #endif
-
+#ifdef CONFIG_FUTEX
+		unsigned int			futex_hash_mask;
+		struct futex_hash_bucket	*futex_hash_bucket;
+#endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
 		unsigned long hiwater_vm;  /* High-water virtual memory usage */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 5c6080680cb27..55b843644c51a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,9 @@ struct prctl_mm_map {
  */
 #define PR_LOCK_SHADOW_STACK_STATUS      76
 
+/* FUTEX hash management */
+#define PR_FUTEX_HASH			77
+# define PR_FUTEX_HASH_SET_SLOTS	1
+# define PR_FUTEX_HASH_GET_SLOTS	2
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 735405a9c5f32..80ac156adebbf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1287,6 +1287,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	RCU_INIT_POINTER(mm->exe_file, NULL);
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
+	futex_mm_init(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -1364,6 +1365,7 @@ static inline void __mmput(struct mm_struct *mm)
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
 	lru_gen_del_mm(mm);
+	futex_hash_free(mm);
 	mmdrop(mm);
 }
 
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d1d3c7b358b23..26328d8072fee 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -39,6 +39,7 @@
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
+#include <linux/prctl.h>
 
 #include "futex.h"
 #include "../locking/rtmutex_common.h"
@@ -107,18 +108,40 @@ late_initcall(fail_futex_debugfs);
 
 #endif /* CONFIG_FAIL_FUTEX */
 
+static inline bool futex_key_is_private(union futex_key *key)
+{
+	/*
+	 * Relies on get_futex_key() to set either bit for shared
+	 * futexes -- see comment with union futex_key.
+	 */
+	return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
 /**
- * futex_hash - Return the hash bucket in the global hash
+ * futex_hash - Return the hash bucket in the global or local hash
  * @key:	Pointer to the futex key for which the hash is calculated
  *
  * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
+ * corresponding hash bucket in the global hash. If the FUTEX is private and
+ * a local hash table is privated then this one is used.
  */
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
-	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
-			  key->both.offset);
+	struct futex_hash_bucket *fhb;
+	u32 hash;
 
+	fhb = current->mm->futex_hash_bucket;
+	if (fhb && futex_key_is_private(key)) {
+		u32 hash_mask = current->mm->futex_hash_mask;
+
+		hash = jhash2((u32 *)key,
+			      offsetof(typeof(*key), both.offset) / 4,
+			      key->both.offset);
+		return &fhb[hash & hash_mask];
+	}
+	hash = jhash2((u32 *)key,
+		      offsetof(typeof(*key), both.offset) / 4,
+		      key->both.offset);
 	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
@@ -1131,6 +1154,76 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
 	spin_lock_init(&fhb->lock);
 }
 
+void futex_hash_free(struct mm_struct *mm)
+{
+	kvfree(mm->futex_hash_bucket);
+}
+
+static int futex_hash_allocate(unsigned int hash_slots)
+{
+	struct futex_hash_bucket *fhb;
+	int i;
+
+	if (current->mm->futex_hash_bucket)
+		return -EALREADY;
+
+	if (!thread_group_leader(current))
+		return -EINVAL;
+
+	if (hash_slots == 0)
+		hash_slots = 16;
+	if (hash_slots < 2)
+		hash_slots = 2;
+	if (hash_slots > 131072)
+		hash_slots = 131072;
+	if (!is_power_of_2(hash_slots))
+		hash_slots = rounddown_pow_of_two(hash_slots);
+
+	fhb = kvmalloc_array(hash_slots, sizeof(struct futex_hash_bucket), GFP_KERNEL_ACCOUNT);
+	if (!fhb)
+		return -ENOMEM;
+
+	current->mm->futex_hash_mask = hash_slots - 1;
+
+	for (i = 0; i < hash_slots; i++)
+		futex_hash_bucket_init(&fhb[i]);
+
+	current->mm->futex_hash_bucket = fhb;
+	return 0;
+}
+
+int futex_hash_allocate_default(void)
+{
+	return futex_hash_allocate(0);
+}
+
+static int futex_hash_get_slots(void)
+{
+	if (current->mm->futex_hash_bucket)
+		return current->mm->futex_hash_mask + 1;
+	return 0;
+}
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
+{
+	int ret;
+
+	switch (arg2) {
+	case PR_FUTEX_HASH_SET_SLOTS:
+		ret = futex_hash_allocate(arg3);
+		break;
+
+	case PR_FUTEX_HASH_GET_SLOTS:
+		ret = futex_hash_get_slots();
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
 static int __init futex_init(void)
 {
 	unsigned int futex_shift;
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff8703af..e509ad9795103 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
 #include <linux/user_namespace.h>
 #include <linux/time_namespace.h>
 #include <linux/binfmts.h>
+#include <linux/futex.h>
 
 #include <linux/sched.h>
 #include <linux/sched/autogroup.h>
@@ -2811,6 +2812,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_lock_shadow_stack_status(me, arg2);
 		break;
+	case PR_FUTEX_HASH:
+		error = futex_hash_prctl(arg2, arg3);
+		break;
 	default:
 		trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
 		error = -EINVAL;
-- 
2.47.2

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 02:59:23PM +0100, Sebastian Andrzej Siewior wrote:

>  struct futex_hash_bucket *futex_hash(union futex_key *key)
>  {
> -	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
> -			  key->both.offset);
> +	struct futex_hash_bucket *fhb;
> +	u32 hash;
>  
> +	fhb = current->mm->futex_hash_bucket;
> +	if (fhb && futex_key_is_private(key)) {
> +		u32 hash_mask = current->mm->futex_hash_mask;
> +
> +		hash = jhash2((u32 *)key,
> +			      offsetof(typeof(*key), both.offset) / 4,
> +			      key->both.offset);
> +		return &fhb[hash & hash_mask];
> +	}
> +	hash = jhash2((u32 *)key,
> +		      offsetof(typeof(*key), both.offset) / 4,
> +		      key->both.offset);
>  	return &futex_queues[hash & (futex_hashsize - 1)];
>  }

Having read this later patch, I would still prefer this to look like:

struct futex_hash_bucker *futex_hash(union futex_key *key)
{
	struct futex_hash_bucket *fhb = current->mm->futex_hash_bucket;
	u32 hash;

	if (futex_key_is_private) {
		// shorter jhash invocation;
	} else {
		hash = jhash2((u32 *)key,
				offsetof(typeof(*key), both.offset) / 4,
				key->both.offset);
	}

	if (fhb)
		return fhb[hash & current->mm->futex_hash_mask];

	return &futex_queues[hash & (futex_hashsize - 1)];
}

Because the shorter hash invocation only cares about being private, not
about having fhb.

Notably, I'm also still thikning about all that pending FUTEX2_NUMA
stuff.

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Sebastian Andrzej Siewior 1 year ago

On 2025-02-03 15:41:01 [+0100], Peter Zijlstra wrote:
> Notably, I'm also still thikning about all that pending FUTEX2_NUMA
> stuff.

With local-hash there is still room for FUTEX2_NUMA?

Sebastian

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 04:52:51PM +0100, Sebastian Andrzej Siewior wrote:
> On 2025-02-03 15:41:01 [+0100], Peter Zijlstra wrote:
> > Notably, I'm also still thikning about all that pending FUTEX2_NUMA
> > stuff.
> 
> With local-hash there is still room for FUTEX2_NUMA?

Yes, single process can span multiple nodes, and then you still get the
cross node contention issues.

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Thomas Gleixner 1 year ago

On Tue, Feb 04 2025 at 09:41, Peter Zijlstra wrote:
> On Mon, Feb 03, 2025 at 04:52:51PM +0100, Sebastian Andrzej Siewior wrote:
>> On 2025-02-03 15:41:01 [+0100], Peter Zijlstra wrote:
>> > Notably, I'm also still thikning about all that pending FUTEX2_NUMA
>> > stuff.
>> 
>> With local-hash there is still room for FUTEX2_NUMA?
>
> Yes, single process can span multiple nodes, and then you still get the
> cross node contention issues.

This is strictly about process private futexes, while FUTEX2_NUMA covers
both shared and private futexes and helps to avoid the hash collision
and resulting contention problems which are caused by the current global
hash.

With the private futex seperation you obviously get the cross node
memory access problems, but not the general contention issues which
FUTEX2_NUMA tries to address.

Thanks,

        tglx

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 03:41:02PM +0100, Peter Zijlstra wrote:
> Because the shorter hash invocation only cares about being private, not
> about having fhb.

While driving to pick up kid from high school, I realized I was full of
crap. Short hash is very much specific to local thing.

Carry on.

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 02:59:23PM +0100, Sebastian Andrzej Siewior wrote:
> +static int futex_hash_allocate(unsigned int hash_slots)
> +{
> +	struct futex_hash_bucket *fhb;
> +	int i;
> +
> +	if (current->mm->futex_hash_bucket)
> +		return -EALREADY;
> +
> +	if (!thread_group_leader(current))
> +		return -EINVAL;
> +
> +	if (hash_slots == 0)
> +		hash_slots = 16;
> +	if (hash_slots < 2)
> +		hash_slots = 2;
> +	if (hash_slots > 131072)
> +		hash_slots = 131072;
> +	if (!is_power_of_2(hash_slots))
> +		hash_slots = rounddown_pow_of_two(hash_slots);

Why not require the argument is 2^n and fail otherwise?

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 02:59:23PM +0100, Sebastian Andrzej Siewior wrote:
> The futex hashmap is system wide and shared by random tasks. Each slot
> is hashed based on its address and VMA. Due to randomized VMAs (and
> memory allocations) the same logical lock (pointer) can end up in a
> different hash bucket on each invocation of the application. This in
> turn means that different applications may share a hash bucket on the
> first invocation but not on the second an it is not always clear which
> applications will be involved. This can result in high latency's to
> acquire the futex_hash_bucket::lock especially if the lock owner is
> limited to a CPU and not be effectively PI boosted.
> 
> Introduce a task local hash map. The hashmap can be allocated via
> 	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 0)
> 
> The `0' argument allocates a default number of 16 slots, a higher number
> can be specified if desired. The current upper limit is 131072.

Hmm, I would expect 0 to disable the local thing.

Now, I realize this is somewhat tricky, since there might be futexes
inside it. But mapping 0 to some default value seems.. well, odd.

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Sebastian Andrzej Siewior 1 year ago

On 2025-02-03 15:27:43 [+0100], Peter Zijlstra wrote:
> On Mon, Feb 03, 2025 at 02:59:23PM +0100, Sebastian Andrzej Siewior wrote:
> > The futex hashmap is system wide and shared by random tasks. Each slot
> > is hashed based on its address and VMA. Due to randomized VMAs (and
> > memory allocations) the same logical lock (pointer) can end up in a
> > different hash bucket on each invocation of the application. This in
> > turn means that different applications may share a hash bucket on the
> > first invocation but not on the second an it is not always clear which
> > applications will be involved. This can result in high latency's to
> > acquire the futex_hash_bucket::lock especially if the lock owner is
> > limited to a CPU and not be effectively PI boosted.
> > 
> > Introduce a task local hash map. The hashmap can be allocated via
> > 	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 0)
> > 
> > The `0' argument allocates a default number of 16 slots, a higher number
> > can be specified if desired. The current upper limit is 131072.
> 
> Hmm, I would expect 0 to disable the local thing.
> 
> Now, I realize this is somewhat tricky, since there might be futexes
> inside it. But mapping 0 to some default value seems.. well, odd.

Looking at this from the top of the series:
- The default value (currently 16) supposed to be something sane so the
  user does not have to worry about it. We could increase it if needed.

- Currently each thread will alter the limit if needed based on the
  formula. So even if you set it to 2 or 16, once you have 16 threads
  you will have 64 buckets.

- The number of buckets can only be increased, not shrunk. There is no
  technical requirement for it other than "avoid a race where you use a
  lower number if a lot of threads a fired at once".

- It is not expected to go back to the global hash.

- The upper limit is the same limit as for the global hash.

All the things I mentioned are open for debate:

- We could force the user to specify a power-of-number for the buckets
  (due to current implementation) and not round.

- We could allow to increase and shrink the number of buckets. Allowing
  it to shrink by accident if many threads are fired at once.

- We could say the user knows best and disable the heuristic once a size
  has been requested.

- We could use 0 to disable the local hash and use the global one
  instead if this is a requirement. Doing this before a thread is
  created is preferred. Later might get tricky.

Sebastian

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Mon, Feb 03, 2025 at 04:51:14PM +0100, Sebastian Andrzej Siewior wrote:

> All the things I mentioned are open for debate:

So how about we add sysctl_futex_private; when set to 0 it will disable
the private hash for every new task, unless task explicitly calls
PR_FUTEX_HASH_SET. When non-zero, it is the base for the auto-scaling.

(then BASE_SMALL would have this default to 0, otherwise 4 as per this
patch set)

I would let PR_FUTEX_HASH_SET override and disable auto-scaling. There
is nothing more annoying than a computer that thinks it knows better.
User asked for N, user gets N etc.

If user sets 0, fall back to global hash.


Anyway, none of this solves anything when a process has both an active
RT part and an active !RT part (which isn't uncommon AFAICT).

Then the RT bits will still get interference from the !RT bits. Do we
want to complicate things and consider that?

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Sebastian Andrzej Siewior 1 year ago

On 2025-02-04 11:34:47 [+0100], Peter Zijlstra wrote:
> On Mon, Feb 03, 2025 at 04:51:14PM +0100, Sebastian Andrzej Siewior wrote:
> 
> > All the things I mentioned are open for debate:
> 
> So how about we add sysctl_futex_private; when set to 0 it will disable
> the private hash for every new task, unless task explicitly calls
> PR_FUTEX_HASH_SET. When non-zero, it is the base for the auto-scaling.
>
> (then BASE_SMALL would have this default to 0, otherwise 4 as per this
> patch set)
> 
> I would let PR_FUTEX_HASH_SET override and disable auto-scaling. There
> is nothing more annoying than a computer that thinks it knows better.
> User asked for N, user gets N etc.
> 
> If user sets 0, fall back to global hash.

Looks sane. I started with PR_FUTEX_HASH_SET to avoid the rehash at
runtime. Now that I have this working PR_FUTEX_HASH_SET does not look
needed. But okay. Having a default enable or disable with the sysctl and
a PR_FUTEX_HASH_SET which sets a requested value and disabling
auto-scale looks okay.
Assuming the user knows best. Even that it is hard to get right given
random pointer and so on.

> Anyway, none of this solves anything when a process has both an active
> RT part and an active !RT part (which isn't uncommon AFAICT).
> 
> Then the RT bits will still get interference from the !RT bits. Do we
> want to complicate things and consider that?

I don't think so. The active and inactive are common but it is still the
same process so you can expect it. The ugly part is when it is an
entirely different task and it is random which one it is.

Sebastian

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Juri Lelli 1 year ago

Hi,

On 05/02/25 09:39, Sebastian Andrzej Siewior wrote:
> On 2025-02-04 11:34:47 [+0100], Peter Zijlstra wrote:

...

> > Anyway, none of this solves anything when a process has both an active
> > RT part and an active !RT part (which isn't uncommon AFAICT).
> > 
> > Then the RT bits will still get interference from the !RT bits. Do we
> > want to complicate things and consider that?
> 
> I don't think so. The active and inactive are common but it is still the
> same process so you can expect it. The ugly part is when it is an
> entirely different task and it is random which one it is.

Not entirely sure we are thinking about the same situation, but it looks
like we have cases of RT tasks that are affected by the underlying issue
this set is about because they make use of libraries. So, in this case
we have a cross-process (RT/!RT) situation that I am not sure we can
address sanely. What do you think?

Thanks,
Juri

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Sebastian Andrzej Siewior 1 year ago

On 2025-02-07 10:41:02 [+0100], Juri Lelli wrote:
> Hi,
> 
> On 05/02/25 09:39, Sebastian Andrzej Siewior wrote:
> > On 2025-02-04 11:34:47 [+0100], Peter Zijlstra wrote:
> 
> ...
> 
> > > Anyway, none of this solves anything when a process has both an active
> > > RT part and an active !RT part (which isn't uncommon AFAICT).
> > > 
> > > Then the RT bits will still get interference from the !RT bits. Do we
> > > want to complicate things and consider that?
> > 
> > I don't think so. The active and inactive are common but it is still the
> > same process so you can expect it. The ugly part is when it is an
> > entirely different task and it is random which one it is.
> 
> Not entirely sure we are thinking about the same situation, but it looks
> like we have cases of RT tasks that are affected by the underlying issue
> this set is about because they make use of libraries. So, in this case
> we have a cross-process (RT/!RT) situation that I am not sure we can
> address sanely. What do you think?

I wouldn't advice to use "unknown" code in a RT application and even
threads. Audit libs before using them and not just collect them.
A lock without PI in your RT thread is not good. A lot of locks, not
knowing the "locked" time, also not good. Things that work most of the
time due to the fastpath and only break from time to time.
Also, a thread does fork() once during start up and things may continue
to be good but may catch up eventually.

> Thanks,
> Juri

Sebastian

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Peter Zijlstra 1 year ago

On Fri, Feb 07, 2025 at 12:00:50PM +0100, Sebastian Andrzej Siewior wrote:
> On 2025-02-07 10:41:02 [+0100], Juri Lelli wrote:
> > Hi,
> > 
> > On 05/02/25 09:39, Sebastian Andrzej Siewior wrote:
> > > On 2025-02-04 11:34:47 [+0100], Peter Zijlstra wrote:
> > 
> > ...
> > 
> > > > Anyway, none of this solves anything when a process has both an active
> > > > RT part and an active !RT part (which isn't uncommon AFAICT).
> > > > 
> > > > Then the RT bits will still get interference from the !RT bits. Do we
> > > > want to complicate things and consider that?
> > > 
> > > I don't think so. The active and inactive are common but it is still the
> > > same process so you can expect it. The ugly part is when it is an
> > > entirely different task and it is random which one it is.
> > 
> > Not entirely sure we are thinking about the same situation, but it looks
> > like we have cases of RT tasks that are affected by the underlying issue
> > this set is about because they make use of libraries. So, in this case
> > we have a cross-process (RT/!RT) situation that I am not sure we can
> > address sanely. What do you think?
> 
> I wouldn't advice to use "unknown" code in a RT application and even
> threads. Audit libs before using them and not just collect them.
> A lock without PI in your RT thread is not good. A lot of locks, not
> knowing the "locked" time, also not good. Things that work most of the
> time due to the fastpath and only break from time to time.
> Also, a thread does fork() once during start up and things may continue
> to be good but may catch up eventually.

Anyway, supposing people want to tinker, it should be possible to split
the local hash based on if the address is mlocked or not.

This gives some obvious issues where people do futex_wait(), mlock() and
then expect futex_wake() to still work, but that should be rare. You
typically mlock before you start using the memory.

As always, mlockall is bad, do not use, like ever. But especially in
mixed mode RT programs.

Re: [PATCH v8 03/15] futex: Add basic infrastructure for local task local hash.

Posted by Juri Lelli 1 year ago

On 07/02/25 12:06, Peter Zijlstra wrote:
> On Fri, Feb 07, 2025 at 12:00:50PM +0100, Sebastian Andrzej Siewior wrote:
> > On 2025-02-07 10:41:02 [+0100], Juri Lelli wrote:
> > > Hi,
> > > 
> > > On 05/02/25 09:39, Sebastian Andrzej Siewior wrote:
> > > > On 2025-02-04 11:34:47 [+0100], Peter Zijlstra wrote:
> > > 
> > > ...
> > > 
> > > > > Anyway, none of this solves anything when a process has both an active
> > > > > RT part and an active !RT part (which isn't uncommon AFAICT).
> > > > > 
> > > > > Then the RT bits will still get interference from the !RT bits. Do we
> > > > > want to complicate things and consider that?
> > > > 
> > > > I don't think so. The active and inactive are common but it is still the
> > > > same process so you can expect it. The ugly part is when it is an
> > > > entirely different task and it is random which one it is.
> > > 
> > > Not entirely sure we are thinking about the same situation, but it looks
> > > like we have cases of RT tasks that are affected by the underlying issue
> > > this set is about because they make use of libraries. So, in this case
> > > we have a cross-process (RT/!RT) situation that I am not sure we can
> > > address sanely. What do you think?
> > 
> > I wouldn't advice to use "unknown" code in a RT application and even
> > threads. Audit libs before using them and not just collect them.

Indeed, that is the message. But, you know, existing/legacy applications
or applications "repurposed" for RT. :/

> > A lock without PI in your RT thread is not good. A lot of locks, not
> > knowing the "locked" time, also not good. Things that work most of the
> > time due to the fastpath and only break from time to time.
> > Also, a thread does fork() once during start up and things may continue
> > to be good but may catch up eventually.
> 
> Anyway, supposing people want to tinker, it should be possible to split
> the local hash based on if the address is mlocked or not.

So, fully agree that we don't want to implement changes just to target
current broken usages, but wondered if there are saner situations (Peter's
point above?) where the current idea might be further extended to.

> This gives some obvious issues where people do futex_wait(), mlock() and
> then expect futex_wake() to still work, but that should be rare. You
> typically mlock before you start using the memory.
> 
> As always, mlockall is bad, do not use, like ever. But especially in
> mixed mode RT programs.
>