[PATCH v12 14/21] futex: Allow to resize the private local hash

Sebastian Andrzej Siewior posted 21 patches 8 months ago
[PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 8 months ago
The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS
operation can now be invoked at runtime and resize an already existing
internal private futex_hash_bucket to another size.

The reallocation is based on an idea by Thomas Gleixner: The initial
allocation of struct futex_private_hash sets the reference count
to one. Every user acquires a reference on the local hash before using
it and drops it after it enqueued itself on the hash bucket. There is no
reference held while the task is scheduled out while waiting for the
wake up.
The resize process allocates a new struct futex_private_hash and drops
the initial reference. Synchronized with mm_struct::futex_hash_lock it
is checked if the reference counter for the currently used
mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
on the current private hash are requeued on the new private hash and the
new private hash is set to mm_struct::futex_phash. Otherwise the newly
allocated private hash is saved as mm_struct::futex_phash_new and the
rehashing and reassigning is delayed to the futex_hash() caller once the
reference counter is marked DEAD.
The replacement is not performed at rcuref_put() time because certain
callers, such as futex_wait_queue(), drop their reference after changing
the task state. This change will be destroyed once the futex_hash_lock
is acquired.

The user can change the number slots with PR_FUTEX_HASH_SET_SLOTS
multiple times. An increase and decrease is allowed and request blocks
until the assignment is done.

The private hash allocated at thread creation is changed from 16 to
  16 <= 4 * number_of_threads <= global_hash_size
where number_of_threads can not exceed the number of online CPUs. Should
the user PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled.

[peterz: reorganize the code to avoid state tracking and simplify new
object handling, block the user until changes are in effect, allow
increase and decrease of the hash].

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/futex.h    |   3 +-
 include/linux/mm_types.h |   4 +-
 kernel/futex/core.c      | 290 ++++++++++++++++++++++++++++++++++++---
 kernel/futex/requeue.c   |   5 +
 4 files changed, 281 insertions(+), 21 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1d3f7555825ec..40bc778b2bb45 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm);
 
 static inline void futex_mm_init(struct mm_struct *mm)
 {
-	mm->futex_phash =  NULL;
+	rcu_assign_pointer(mm->futex_phash, NULL);
+	mutex_init(&mm->futex_hash_lock);
 }
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a4b5661e41770..32ba5126e2214 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1033,7 +1033,9 @@ struct mm_struct {
 		seqcount_t mm_lock_seq;
 #endif
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
-		struct futex_private_hash	*futex_phash;
+		struct mutex			futex_hash_lock;
+		struct futex_private_hash	__rcu *futex_phash;
+		struct futex_private_hash	*futex_phash_new;
 #endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 53b3a00a92539..9e7dad52abea8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -40,6 +40,7 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
 #include <linux/prctl.h>
+#include <linux/rcuref.h>
 
 #include "futex.h"
 #include "../locking/rtmutex_common.h"
@@ -57,7 +58,9 @@ static struct {
 #define futex_hashmask (__futex_data.hashmask)
 
 struct futex_private_hash {
+	rcuref_t	users;
 	unsigned int	hash_mask;
+	struct rcu_head	rcu;
 	void		*mm;
 	bool		custom;
 	struct futex_hash_bucket queues[];
@@ -129,11 +132,14 @@ static inline bool futex_key_is_private(union futex_key *key)
 
 bool futex_private_hash_get(struct futex_private_hash *fph)
 {
-	return false;
+	return rcuref_get(&fph->users);
 }
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
+	/* Ignore return value, last put is verified via rcuref_is_dead() */
+	if (rcuref_put(&fph->users))
+		wake_up_var(fph->mm);
 }
 
 /**
@@ -143,8 +149,23 @@ void futex_private_hash_put(struct futex_private_hash *fph)
  * Obtain an additional reference for the already obtained hash bucket. The
  * caller must already own an reference.
  */
-void futex_hash_get(struct futex_hash_bucket *hb) { }
-void futex_hash_put(struct futex_hash_bucket *hb) { }
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+	struct futex_private_hash *fph = hb->priv;
+
+	if (!fph)
+		return;
+	WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+	struct futex_private_hash *fph = hb->priv;
+
+	if (!fph)
+		return;
+	futex_private_hash_put(fph);
+}
 
 static struct futex_hash_bucket *
 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
@@ -155,7 +176,7 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 		return NULL;
 
 	if (!fph)
-		fph = key->private.mm->futex_phash;
+		fph = rcu_dereference(key->private.mm->futex_phash);
 	if (!fph || !fph->hash_mask)
 		return NULL;
 
@@ -165,21 +186,119 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 	return &fph->queues[hash & fph->hash_mask];
 }
 
+static void futex_rehash_private(struct futex_private_hash *old,
+				 struct futex_private_hash *new)
+{
+	struct futex_hash_bucket *hb_old, *hb_new;
+	unsigned int slots = old->hash_mask + 1;
+	unsigned int i;
+
+	for (i = 0; i < slots; i++) {
+		struct futex_q *this, *tmp;
+
+		hb_old = &old->queues[i];
+
+		spin_lock(&hb_old->lock);
+		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+			plist_del(&this->list, &hb_old->chain);
+			futex_hb_waiters_dec(hb_old);
+
+			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+			hb_new = __futex_hash(&this->key, new);
+			futex_hb_waiters_inc(hb_new);
+			/*
+			 * The new pointer isn't published yet but an already
+			 * moved user can be unqueued due to timeout or signal.
+			 */
+			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+			plist_add(&this->list, &hb_new->chain);
+			this->lock_ptr = &hb_new->lock;
+			spin_unlock(&hb_new->lock);
+		}
+		spin_unlock(&hb_old->lock);
+	}
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+			       struct futex_private_hash *new)
+{
+	struct futex_private_hash *fph;
+
+	WARN_ON_ONCE(mm->futex_phash_new);
+
+	fph = rcu_dereference_protected(mm->futex_phash,
+					lockdep_is_held(&mm->futex_hash_lock));
+	if (fph) {
+		if (!rcuref_is_dead(&fph->users)) {
+			mm->futex_phash_new = new;
+			return false;
+		}
+
+		futex_rehash_private(fph, new);
+	}
+	rcu_assign_pointer(mm->futex_phash, new);
+	kvfree_rcu(fph, rcu);
+	return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		struct futex_private_hash *fph;
+
+		fph = mm->futex_phash_new;
+		if (fph) {
+			mm->futex_phash_new = NULL;
+			__futex_pivot_hash(mm, fph);
+		}
+	}
+}
+
 struct futex_private_hash *futex_private_hash(void)
 {
 	struct mm_struct *mm = current->mm;
-	struct futex_private_hash *fph;
+	/*
+	 * Ideally we don't loop. If there is a replacement in progress
+	 * then a new private hash is already prepared and a reference can't be
+	 * obtained once the last user dropped it's.
+	 * In that case we block on mm_struct::futex_hash_lock and either have
+	 * to perform the replacement or wait while someone else is doing the
+	 * job. Eitherway, on the second iteration we acquire a reference on the
+	 * new private hash or loop again because a new replacement has been
+	 * requested.
+	 */
+again:
+	scoped_guard(rcu) {
+		struct futex_private_hash *fph;
 
-	fph = mm->futex_phash;
-	return fph;
+		fph = rcu_dereference(mm->futex_phash);
+		if (!fph)
+			return NULL;
+
+		if (rcuref_get(&fph->users))
+			return fph;
+	}
+	futex_pivot_hash(mm);
+	goto again;
 }
 
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
+	struct futex_private_hash *fph;
 	struct futex_hash_bucket *hb;
 
-	hb = __futex_hash(key, NULL);
-	return hb;
+again:
+	scoped_guard(rcu) {
+		hb = __futex_hash(key, NULL);
+		fph = hb->priv;
+
+		if (!fph || futex_private_hash_get(fph))
+			return hb;
+	}
+	futex_pivot_hash(key->private.mm);
+	goto again;
 }
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
@@ -664,6 +783,8 @@ int futex_unqueue(struct futex_q *q)
 	spinlock_t *lock_ptr;
 	int ret = 0;
 
+	/* RCU so lock_ptr is not going away during locking. */
+	guard(rcu)();
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
 	/*
@@ -1065,6 +1186,10 @@ static void exit_pi_state_list(struct task_struct *curr)
 	struct futex_pi_state *pi_state;
 	union futex_key key = FUTEX_KEY_INIT;
 
+	/*
+	 * The mutex mm_struct::futex_hash_lock might be acquired.
+	 */
+	might_sleep();
 	/*
 	 * Ensure the hash remains stable (no resize) during the while loop
 	 * below. The hb pointer is acquired under the pi_lock so we can't block
@@ -1261,7 +1386,51 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 void futex_hash_free(struct mm_struct *mm)
 {
-	kvfree(mm->futex_phash);
+	struct futex_private_hash *fph;
+
+	kvfree(mm->futex_phash_new);
+	fph = rcu_dereference_raw(mm->futex_phash);
+	if (fph) {
+		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+		kvfree(fph);
+	}
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+	struct futex_private_hash *fph;
+
+	guard(rcu)();
+
+	if (!mm->futex_phash_new)
+		return true;
+
+	fph = rcu_dereference(mm->futex_phash);
+	return rcuref_is_dead(&fph->users);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+			    struct futex_private_hash *b)
+{
+	/* user provided always wins */
+	if (!a->custom && b->custom)
+		return true;
+	if (a->custom && !b->custom)
+		return false;
+
+	/* zero-sized hash wins */
+	if (!b->hash_mask)
+		return true;
+	if (!a->hash_mask)
+		return false;
+
+	/* keep the biggest */
+	if (a->hash_mask < b->hash_mask)
+		return true;
+	if (a->hash_mask > b->hash_mask)
+		return false;
+
+	return false; /* equal */
 }
 
 static int futex_hash_allocate(unsigned int hash_slots, bool custom)
@@ -1273,16 +1442,23 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
 	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
 		return -EINVAL;
 
-	if (mm->futex_phash)
-		return -EALREADY;
-
-	if (!thread_group_empty(current))
-		return -EINVAL;
+	/*
+	 * Once we've disabled the global hash there is no way back.
+	 */
+	scoped_guard(rcu) {
+		fph = rcu_dereference(mm->futex_phash);
+		if (fph && !fph->hash_mask) {
+			if (custom)
+				return -EBUSY;
+			return 0;
+		}
+	}
 
 	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!fph)
 		return -ENOMEM;
 
+	rcuref_init(&fph->users, 1);
 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
 	fph->custom = custom;
 	fph->mm = mm;
@@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
 	for (i = 0; i < hash_slots; i++)
 		futex_hash_bucket_init(&fph->queues[i], fph);
 
-	mm->futex_phash = fph;
+	if (custom) {
+		/*
+		 * Only let prctl() wait / retry; don't unduly delay clone().
+		 */
+again:
+		wait_var_event(mm, futex_pivot_pending(mm));
+	}
+
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		struct futex_private_hash *free __free(kvfree) = NULL;
+		struct futex_private_hash *cur, *new;
+
+		cur = rcu_dereference_protected(mm->futex_phash,
+						lockdep_is_held(&mm->futex_hash_lock));
+		new = mm->futex_phash_new;
+		mm->futex_phash_new = NULL;
+
+		if (fph) {
+			if (cur && !new) {
+				/*
+				 * If we have an existing hash, but do not yet have
+				 * allocated a replacement hash, drop the initial
+				 * reference on the existing hash.
+				 */
+				futex_private_hash_put(cur);
+			}
+
+			if (new) {
+				/*
+				 * Two updates raced; throw out the lesser one.
+				 */
+				if (futex_hash_less(new, fph)) {
+					free = new;
+					new = fph;
+				} else {
+					free = fph;
+				}
+			} else {
+				new = fph;
+			}
+			fph = NULL;
+		}
+
+		if (new) {
+			/*
+			 * Will set mm->futex_phash_new on failure;
+			 * futex_private_hash_get() will try again.
+			 */
+			if (!__futex_pivot_hash(mm, new) && custom)
+				goto again;
+		}
+	}
 	return 0;
 }
 
 int futex_hash_allocate_default(void)
 {
+	unsigned int threads, buckets, current_buckets = 0;
+	struct futex_private_hash *fph;
+
 	if (!current->mm)
 		return 0;
 
-	if (current->mm->futex_phash)
+	scoped_guard(rcu) {
+		threads = min_t(unsigned int,
+				get_nr_threads(current),
+				num_online_cpus());
+
+		fph = rcu_dereference(current->mm->futex_phash);
+		if (fph) {
+			if (fph->custom)
+				return 0;
+
+			current_buckets = fph->hash_mask + 1;
+		}
+	}
+
+	/*
+	 * The default allocation will remain within
+	 *   16 <= threads * 4 <= global hash size
+	 */
+	buckets = roundup_pow_of_two(4 * threads);
+	buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+	if (current_buckets >= buckets)
 		return 0;
 
-	return futex_hash_allocate(16, false);
+	return futex_hash_allocate(buckets, false);
 }
 
 static int futex_hash_get_slots(void)
 {
 	struct futex_private_hash *fph;
 
-	fph = current->mm->futex_phash;
+	guard(rcu)();
+	fph = rcu_dereference(current->mm->futex_phash);
 	if (fph && fph->hash_mask)
 		return fph->hash_mask + 1;
 	return 0;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index b0e64fd454d96..c716a66f86929 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 		futex_hb_waiters_inc(hb2);
 		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
+		/*
+		 * hb1 and hb2 belong to the same futex_hash_bucket_private
+		 * because if we managed get a reference on hb1 then it can't be
+		 * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+		 */
 	}
 	q->key = *key2;
 }
-- 
2.49.0
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Lai, Yi 6 months, 2 weeks ago
On Wed, Apr 16, 2025 at 06:29:14PM +0200, Sebastian Andrzej Siewior wrote:
> The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
> replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS
> operation can now be invoked at runtime and resize an already existing
> internal private futex_hash_bucket to another size.
> 
> The reallocation is based on an idea by Thomas Gleixner: The initial
> allocation of struct futex_private_hash sets the reference count
> to one. Every user acquires a reference on the local hash before using
> it and drops it after it enqueued itself on the hash bucket. There is no
> reference held while the task is scheduled out while waiting for the
> wake up.
> The resize process allocates a new struct futex_private_hash and drops
> the initial reference. Synchronized with mm_struct::futex_hash_lock it
> is checked if the reference counter for the currently used
> mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
> on the current private hash are requeued on the new private hash and the
> new private hash is set to mm_struct::futex_phash. Otherwise the newly
> allocated private hash is saved as mm_struct::futex_phash_new and the
> rehashing and reassigning is delayed to the futex_hash() caller once the
> reference counter is marked DEAD.
> The replacement is not performed at rcuref_put() time because certain
> callers, such as futex_wait_queue(), drop their reference after changing
> the task state. This change will be destroyed once the futex_hash_lock
> is acquired.
> 
> The user can change the number slots with PR_FUTEX_HASH_SET_SLOTS
> multiple times. An increase and decrease is allowed and request blocks
> until the assignment is done.
> 
> The private hash allocated at thread creation is changed from 16 to
>   16 <= 4 * number_of_threads <= global_hash_size
> where number_of_threads can not exceed the number of online CPUs. Should
> the user PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled.
> 
> [peterz: reorganize the code to avoid state tracking and simplify new
> object handling, block the user until changes are in effect, allow
> increase and decrease of the hash].
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  include/linux/futex.h    |   3 +-
>  include/linux/mm_types.h |   4 +-
>  kernel/futex/core.c      | 290 ++++++++++++++++++++++++++++++++++++---
>  kernel/futex/requeue.c   |   5 +
>  4 files changed, 281 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 1d3f7555825ec..40bc778b2bb45 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm);
>  
>  static inline void futex_mm_init(struct mm_struct *mm)
>  {
> -	mm->futex_phash =  NULL;
> +	rcu_assign_pointer(mm->futex_phash, NULL);
> +	mutex_init(&mm->futex_hash_lock);
>  }
>  
>  #else /* !CONFIG_FUTEX_PRIVATE_HASH */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index a4b5661e41770..32ba5126e2214 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1033,7 +1033,9 @@ struct mm_struct {
>  		seqcount_t mm_lock_seq;
>  #endif
>  #ifdef CONFIG_FUTEX_PRIVATE_HASH
> -		struct futex_private_hash	*futex_phash;
> +		struct mutex			futex_hash_lock;
> +		struct futex_private_hash	__rcu *futex_phash;
> +		struct futex_private_hash	*futex_phash_new;
>  #endif
>  
>  		unsigned long hiwater_rss; /* High-watermark of RSS usage */
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index 53b3a00a92539..9e7dad52abea8 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -40,6 +40,7 @@
>  #include <linux/fault-inject.h>
>  #include <linux/slab.h>
>  #include <linux/prctl.h>
> +#include <linux/rcuref.h>
>  
>  #include "futex.h"
>  #include "../locking/rtmutex_common.h"
> @@ -57,7 +58,9 @@ static struct {
>  #define futex_hashmask (__futex_data.hashmask)
>  
>  struct futex_private_hash {
> +	rcuref_t	users;
>  	unsigned int	hash_mask;
> +	struct rcu_head	rcu;
>  	void		*mm;
>  	bool		custom;
>  	struct futex_hash_bucket queues[];
> @@ -129,11 +132,14 @@ static inline bool futex_key_is_private(union futex_key *key)
>  
>  bool futex_private_hash_get(struct futex_private_hash *fph)
>  {
> -	return false;
> +	return rcuref_get(&fph->users);
>  }
>  
>  void futex_private_hash_put(struct futex_private_hash *fph)
>  {
> +	/* Ignore return value, last put is verified via rcuref_is_dead() */
> +	if (rcuref_put(&fph->users))
> +		wake_up_var(fph->mm);
>  }
>  
>  /**
> @@ -143,8 +149,23 @@ void futex_private_hash_put(struct futex_private_hash *fph)
>   * Obtain an additional reference for the already obtained hash bucket. The
>   * caller must already own an reference.
>   */
> -void futex_hash_get(struct futex_hash_bucket *hb) { }
> -void futex_hash_put(struct futex_hash_bucket *hb) { }
> +void futex_hash_get(struct futex_hash_bucket *hb)
> +{
> +	struct futex_private_hash *fph = hb->priv;
> +
> +	if (!fph)
> +		return;
> +	WARN_ON_ONCE(!futex_private_hash_get(fph));
> +}
> +
> +void futex_hash_put(struct futex_hash_bucket *hb)
> +{
> +	struct futex_private_hash *fph = hb->priv;
> +
> +	if (!fph)
> +		return;
> +	futex_private_hash_put(fph);
> +}
>  
>  static struct futex_hash_bucket *
>  __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
> @@ -155,7 +176,7 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
>  		return NULL;
>  
>  	if (!fph)
> -		fph = key->private.mm->futex_phash;
> +		fph = rcu_dereference(key->private.mm->futex_phash);
>  	if (!fph || !fph->hash_mask)
>  		return NULL;
>  
> @@ -165,21 +186,119 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
>  	return &fph->queues[hash & fph->hash_mask];
>  }
>  
> +static void futex_rehash_private(struct futex_private_hash *old,
> +				 struct futex_private_hash *new)
> +{
> +	struct futex_hash_bucket *hb_old, *hb_new;
> +	unsigned int slots = old->hash_mask + 1;
> +	unsigned int i;
> +
> +	for (i = 0; i < slots; i++) {
> +		struct futex_q *this, *tmp;
> +
> +		hb_old = &old->queues[i];
> +
> +		spin_lock(&hb_old->lock);
> +		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
> +
> +			plist_del(&this->list, &hb_old->chain);
> +			futex_hb_waiters_dec(hb_old);
> +
> +			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
> +
> +			hb_new = __futex_hash(&this->key, new);
> +			futex_hb_waiters_inc(hb_new);
> +			/*
> +			 * The new pointer isn't published yet but an already
> +			 * moved user can be unqueued due to timeout or signal.
> +			 */
> +			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
> +			plist_add(&this->list, &hb_new->chain);
> +			this->lock_ptr = &hb_new->lock;
> +			spin_unlock(&hb_new->lock);
> +		}
> +		spin_unlock(&hb_old->lock);
> +	}
> +}
> +
> +static bool __futex_pivot_hash(struct mm_struct *mm,
> +			       struct futex_private_hash *new)
> +{
> +	struct futex_private_hash *fph;
> +
> +	WARN_ON_ONCE(mm->futex_phash_new);
> +
> +	fph = rcu_dereference_protected(mm->futex_phash,
> +					lockdep_is_held(&mm->futex_hash_lock));
> +	if (fph) {
> +		if (!rcuref_is_dead(&fph->users)) {
> +			mm->futex_phash_new = new;
> +			return false;
> +		}
> +
> +		futex_rehash_private(fph, new);
> +	}
> +	rcu_assign_pointer(mm->futex_phash, new);
> +	kvfree_rcu(fph, rcu);
> +	return true;
> +}
> +

Hi Sebastian Andrzej Siewior,

Greetings!

I used Syzkaller and found that there is KASAN: null-ptr-deref Read in __futex_pivot_hash in linux-next next-20250527.

After bisection and the first bad commit is:
"
bd54df5ea7ca futex: Allow to resize the private local hash
"

All detailed into can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/250531_004606___futex_pivot_hash/bzImage_fefff2755f2aa4125dce2a1edfe7e545c7c621f2
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/250531_004606___futex_pivot_hash/bzImage_fefff2755f2aa4125dce2a1edfe7e545c7c621f2

"
[  266.064649] Adding 124996k swap on ./swap-file.  Priority:0 extents:1 across:124996k
[  266.075472] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#11] SMP I
[  266.075983] KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
[  266.076337] CPU: 0 UID: 0 PID: 1168 Comm: repro Tainted: G    B D             6.15.0-next-20250527-fefff2755f2a #1
[  266.076882] Tainted: [B]=BAD_PAGE, [D]=DIE
[  266.077073] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
[  266.077594] RIP: 0010:plist_del+0xf3/0x2d0
[  266.077803] Code: 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 a6 01 00 00 49 8d 7f 08 4c 8b 73 10 48 b8 00 00 00 00 00 0
[  266.078640] RSP: 0018:ffff8880159dfc40 EFLAGS: 00010202
[  266.078886] RAX: dffffc0000000000 RBX: ffff88800f2397e8 RCX: ffffffff85ca6b25
[  266.079327] RDX: 0000000000000001 RSI: 0000000000000008 RDI: 0000000000000008
[  266.079658] RBP: ffff8880159dfc70 R08: 0000000000000001 R09: ffffed1002b3bf7d
[  266.079989] R10: 0000000000000003 R11: 000000000000000c R12: ffff88800f239800
[  266.080311] R13: ffff88800f2397f0 R14: 0000000000000000 R15: 0000000000000000
[  266.080635] FS:  00007f8c127ff640(0000) GS:ffff8880e355f000(0000) knlGS:0000000000000000
[  266.080998] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  266.081260] CR2: 00007f8c127fee38 CR3: 00000000149da003 CR4: 0000000000770ef0
[  266.081594] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  266.081919] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[  266.082248] PKRU: 55555554
[  266.082377] Call Trace:
[  266.082496]  <TASK>
[  266.082605]  __futex_pivot_hash+0x2b0/0x520
[  266.082815]  futex_hash_allocate+0xb26/0x10b0
[  266.083028]  ? __pfx_futex_hash_allocate+0x10/0x10
[  266.083261]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[  266.083508]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[  266.083756]  ? static_key_count+0x69/0x80
[  266.083948]  futex_hash_prctl+0x20c/0x650
[  266.084146]  __do_sys_prctl+0x1a0d/0x2170
[  266.084347]  ? __pfx___do_sys_prctl+0x10/0x10
[  266.084563]  __x64_sys_prctl+0xc6/0x150
[  266.084742]  ? syscall_trace_enter+0x14d/0x280
[  266.084956]  x64_sys_call+0x1a25/0x2150
[  266.085144]  do_syscall_64+0x6d/0x2e0
[  266.085324]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  266.085558] RIP: 0033:0x7f8c1283ee5d
[  266.085731] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 8
[  266.086550] RSP: 002b:00007f8c127fed48 EFLAGS: 00000246 ORIG_RAX: 000000000000009d
[  266.086895] RAX: ffffffffffffffda RBX: 00007f8c127ff640 RCX: 00007f8c1283ee5d
[  266.087219] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000000004e
[  266.087546] RBP: 00007f8c127fed60 R08: 0000000000000000 R09: 0000000000000000
[  266.087869] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8c127ff640
[  266.088191] R13: 0000000000000013 R14: 00007f8c1289f560 R15: 0000000000000000
[  266.088521]  </TASK>
[  266.088631] Modules linked in:
[  266.088810] ---[ end trace 0000000000000000 ]---
[  266.089030] RIP: 0010:__futex_pivot_hash+0x271/0x520
[  266.089265] Code: e8 84 a5 58 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 5e 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[  266.090087] RSP: 0018:ffff88801b43fc80 EFLAGS: 00010206
[  266.090332] RAX: 0007c018e000003c RBX: 003e00c7000001c9 RCX: ffffffff81799536
[  266.090660] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8880227e8888
[  266.090983] RBP: ffff88801b43fcf8 R08: 0000000000000001 R09: ffffed1003687f7d
[  266.091309] R10: 0000000000000003 R11: 6e696c6261736944 R12: ffff888014430d68
[  266.091634] R13: dffffc0000000000 R14: 003e00c7000001e1 R15: ffff888014430a80
[  266.091950] FS:  00007f8c127ff640(0000) GS:ffff8880e355f000(0000) knlGS:0000000000000000
[  266.092319] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  266.092582] CR2: 00007f8c127fee38 CR3: 00000000149da003 CR4: 0000000000770ef0
[  266.092915] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  266.093243] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[  266.093608] PKRU: 55555554
[  266.093738] note: repro[1168] exited with preempt_count 1
"

I also tried lastest linux-tag next-20250530. This issue can be reproduced. Here is the log:

"
[   50.554828] Adding 124996k swap on ./swap-file.  Priority:0 extents:1 across:124996k
[   50.563846] Oops: general protection fault, probably for non-canonical address 0xe028fc18c0000065: 0000 [#4] SMP KI
[   50.564384] KASAN: maybe wild-memory-access in range [0x014800c600000328-0x014800c60000032f]
[   50.564774] CPU: 1 UID: 0 PID: 813 Comm: repro Tainted: G    B D             6.15.0-next-20250530-kvm #3 PREEMPT(v
[   50.565314] Tainted: [B]=BAD_PAGE, [D]=DIE
[   50.565514] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
[   50.566028] RIP: 0010:__futex_pivot_hash+0x204/0x530
[   50.566278] Code: e8 f1 e6 5b 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 d1 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[   50.567119] RSP: 0018:ffff88801241fc80 EFLAGS: 00010206
[   50.567372] RAX: 00290018c0000065 RBX: 014800c600000310 RCX: ffffffff8179ecdc
[   50.567706] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88801d5d1708
[   50.568036] RBP: ffff88801241fcf8 R08: 0000000000000001 R09: ffffed1002483f7d
[   50.568364] R10: 0000000000000003 R11: 00000000bd9dfb48 R12: ffff88801429bf00
[   50.568699] R13: dffffc0000000000 R14: 014800c600000328 R15: 0000000000000001
[   50.569035] FS:  00007f183fe43640(0000) GS:ffff8880e3652000(0000) knlGS:0000000000000000
[   50.569415] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   50.569691] CR2: 00007f183fe42e38 CR3: 000000001115c005 CR4: 0000000000770ef0
[   50.570026] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   50.570349] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[   50.570684] PKRU: 55555554
[   50.570820] Call Trace:
[   50.570946]  <TASK>
[   50.571060]  futex_hash_allocate+0xb3a/0x1060
[   50.571279]  ? sigprocmask+0x24e/0x370
[   50.571470]  ? __pfx_futex_hash_allocate+0x10/0x10
[   50.571703]  ? rcu_is_watching+0x19/0xc0
[   50.571899]  ? __sanitizer_cov_trace_switch+0x58/0xa0
[   50.572152]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[   50.572416]  ? static_key_count+0x63/0x80
[   50.572608]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[   50.572870]  futex_hash_prctl+0x1fe/0x650
[   50.573069]  __do_sys_prctl+0x4a3/0x2110
[   50.573270]  ? __pfx___do_sys_prctl+0x10/0x10
[   50.573486]  ? __audit_syscall_entry+0x39f/0x500
[   50.573714]  __x64_sys_prctl+0xc6/0x150
[   50.573905]  ? syscall_trace_enter+0x14d/0x280
[   50.574120]  x64_sys_call+0x1a2f/0x1fa0
[   50.574314]  do_syscall_64+0x6d/0x2e0
[   50.574497]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   50.574736] RIP: 0033:0x7f183fc3ee5d
[   50.574911] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 8
[   50.575748] RSP: 002b:00007f183fe42d48 EFLAGS: 00000246 ORIG_RAX: 000000000000009d
[   50.576105] RAX: ffffffffffffffda RBX: 00007f183fe43640 RCX: 00007f183fc3ee5d
[   50.576434] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000000004e
[   50.576768] RBP: 00007f183fe42d60 R08: 0000000000000000 R09: 0000000000000000
[   50.577105] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f183fe43640
[   50.577444] R13: 000000000000000c R14: 00007f183fc9f560 R15: 0000000000000000
[   50.577781]  </TASK>
[   50.577887] Modules linked in:
[   50.578095] ---[ end trace 0000000000000000 ]---
[   50.578316] RIP: 0010:__futex_pivot_hash+0x204/0x530
[   50.578559] Code: e8 f1 e6 5b 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 d1 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[   50.579394] RSP: 0018:ffff888012557c80 EFLAGS: 00010206
[   50.579643] RAX: 00798018e0000056 RBX: 03cc00c700000299 RCX: ffffffff8179ecdc
[   50.579975] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8880117f8488
[   50.580303] RBP: ffff888012557cf8 R08: 0000000000000001 R09: ffffed10024aaf7d
[   50.580669] R10: 0000000000000003 R11: 6e696c6261736944 R12: ffff888012cf0000
[   50.581597] R13: dffffc0000000000 R14: 03cc00c7000002b1 R15: 0000000000000001
[   50.581937] FS:  00007f183fe43640(0000) GS:ffff8880e3652000(0000) knlGS:0000000000000000
[   50.582309] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   50.582583] CR2: 00007f183fe42e38 CR3: 000000001115c005 CR4: 0000000000770ef0
[   50.582977] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   50.583294] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[   50.583622] PKRU: 55555554
[   50.583758] note: repro[813] exited with preempt_count 1
"

Hope this cound be insightful to you.

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install 

> +static void futex_pivot_hash(struct mm_struct *mm)
> +{
> +	scoped_guard(mutex, &mm->futex_hash_lock) {
> +		struct futex_private_hash *fph;
> +
> +		fph = mm->futex_phash_new;
> +		if (fph) {
> +			mm->futex_phash_new = NULL;
> +			__futex_pivot_hash(mm, fph);
> +		}
> +	}
> +}
> +
>  struct futex_private_hash *futex_private_hash(void)
>  {
>  	struct mm_struct *mm = current->mm;
> -	struct futex_private_hash *fph;
> +	/*
> +	 * Ideally we don't loop. If there is a replacement in progress
> +	 * then a new private hash is already prepared and a reference can't be
> +	 * obtained once the last user dropped it's.
> +	 * In that case we block on mm_struct::futex_hash_lock and either have
> +	 * to perform the replacement or wait while someone else is doing the
> +	 * job. Eitherway, on the second iteration we acquire a reference on the
> +	 * new private hash or loop again because a new replacement has been
> +	 * requested.
> +	 */
> +again:
> +	scoped_guard(rcu) {
> +		struct futex_private_hash *fph;
>  
> -	fph = mm->futex_phash;
> -	return fph;
> +		fph = rcu_dereference(mm->futex_phash);
> +		if (!fph)
> +			return NULL;
> +
> +		if (rcuref_get(&fph->users))
> +			return fph;
> +	}
> +	futex_pivot_hash(mm);
> +	goto again;
>  }
>  
>  struct futex_hash_bucket *futex_hash(union futex_key *key)
>  {
> +	struct futex_private_hash *fph;
>  	struct futex_hash_bucket *hb;
>  
> -	hb = __futex_hash(key, NULL);
> -	return hb;
> +again:
> +	scoped_guard(rcu) {
> +		hb = __futex_hash(key, NULL);
> +		fph = hb->priv;
> +
> +		if (!fph || futex_private_hash_get(fph))
> +			return hb;
> +	}
> +	futex_pivot_hash(key->private.mm);
> +	goto again;
>  }
>  
>  #else /* !CONFIG_FUTEX_PRIVATE_HASH */
> @@ -664,6 +783,8 @@ int futex_unqueue(struct futex_q *q)
>  	spinlock_t *lock_ptr;
>  	int ret = 0;
>  
> +	/* RCU so lock_ptr is not going away during locking. */
> +	guard(rcu)();
>  	/* In the common case we don't take the spinlock, which is nice. */
>  retry:
>  	/*
> @@ -1065,6 +1186,10 @@ static void exit_pi_state_list(struct task_struct *curr)
>  	struct futex_pi_state *pi_state;
>  	union futex_key key = FUTEX_KEY_INIT;
>  
> +	/*
> +	 * The mutex mm_struct::futex_hash_lock might be acquired.
> +	 */
> +	might_sleep();
>  	/*
>  	 * Ensure the hash remains stable (no resize) during the while loop
>  	 * below. The hb pointer is acquired under the pi_lock so we can't block
> @@ -1261,7 +1386,51 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
>  #ifdef CONFIG_FUTEX_PRIVATE_HASH
>  void futex_hash_free(struct mm_struct *mm)
>  {
> -	kvfree(mm->futex_phash);
> +	struct futex_private_hash *fph;
> +
> +	kvfree(mm->futex_phash_new);
> +	fph = rcu_dereference_raw(mm->futex_phash);
> +	if (fph) {
> +		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
> +		kvfree(fph);
> +	}
> +}
> +
> +static bool futex_pivot_pending(struct mm_struct *mm)
> +{
> +	struct futex_private_hash *fph;
> +
> +	guard(rcu)();
> +
> +	if (!mm->futex_phash_new)
> +		return true;
> +
> +	fph = rcu_dereference(mm->futex_phash);
> +	return rcuref_is_dead(&fph->users);
> +}
> +
> +static bool futex_hash_less(struct futex_private_hash *a,
> +			    struct futex_private_hash *b)
> +{
> +	/* user provided always wins */
> +	if (!a->custom && b->custom)
> +		return true;
> +	if (a->custom && !b->custom)
> +		return false;
> +
> +	/* zero-sized hash wins */
> +	if (!b->hash_mask)
> +		return true;
> +	if (!a->hash_mask)
> +		return false;
> +
> +	/* keep the biggest */
> +	if (a->hash_mask < b->hash_mask)
> +		return true;
> +	if (a->hash_mask > b->hash_mask)
> +		return false;
> +
> +	return false; /* equal */
>  }
>  
>  static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> @@ -1273,16 +1442,23 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
>  	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
>  		return -EINVAL;
>  
> -	if (mm->futex_phash)
> -		return -EALREADY;
> -
> -	if (!thread_group_empty(current))
> -		return -EINVAL;
> +	/*
> +	 * Once we've disabled the global hash there is no way back.
> +	 */
> +	scoped_guard(rcu) {
> +		fph = rcu_dereference(mm->futex_phash);
> +		if (fph && !fph->hash_mask) {
> +			if (custom)
> +				return -EBUSY;
> +			return 0;
> +		}
> +	}
>  
>  	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
>  	if (!fph)
>  		return -ENOMEM;
>  
> +	rcuref_init(&fph->users, 1);
>  	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
>  	fph->custom = custom;
>  	fph->mm = mm;
> @@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
>  	for (i = 0; i < hash_slots; i++)
>  		futex_hash_bucket_init(&fph->queues[i], fph);
>  
> -	mm->futex_phash = fph;
> +	if (custom) {
> +		/*
> +		 * Only let prctl() wait / retry; don't unduly delay clone().
> +		 */
> +again:
> +		wait_var_event(mm, futex_pivot_pending(mm));
> +	}
> +
> +	scoped_guard(mutex, &mm->futex_hash_lock) {
> +		struct futex_private_hash *free __free(kvfree) = NULL;
> +		struct futex_private_hash *cur, *new;
> +
> +		cur = rcu_dereference_protected(mm->futex_phash,
> +						lockdep_is_held(&mm->futex_hash_lock));
> +		new = mm->futex_phash_new;
> +		mm->futex_phash_new = NULL;
> +
> +		if (fph) {
> +			if (cur && !new) {
> +				/*
> +				 * If we have an existing hash, but do not yet have
> +				 * allocated a replacement hash, drop the initial
> +				 * reference on the existing hash.
> +				 */
> +				futex_private_hash_put(cur);
> +			}
> +
> +			if (new) {
> +				/*
> +				 * Two updates raced; throw out the lesser one.
> +				 */
> +				if (futex_hash_less(new, fph)) {
> +					free = new;
> +					new = fph;
> +				} else {
> +					free = fph;
> +				}
> +			} else {
> +				new = fph;
> +			}
> +			fph = NULL;
> +		}
> +
> +		if (new) {
> +			/*
> +			 * Will set mm->futex_phash_new on failure;
> +			 * futex_private_hash_get() will try again.
> +			 */
> +			if (!__futex_pivot_hash(mm, new) && custom)
> +				goto again;
> +		}
> +	}
>  	return 0;
>  }
>  
>  int futex_hash_allocate_default(void)
>  {
> +	unsigned int threads, buckets, current_buckets = 0;
> +	struct futex_private_hash *fph;
> +
>  	if (!current->mm)
>  		return 0;
>  
> -	if (current->mm->futex_phash)
> +	scoped_guard(rcu) {
> +		threads = min_t(unsigned int,
> +				get_nr_threads(current),
> +				num_online_cpus());
> +
> +		fph = rcu_dereference(current->mm->futex_phash);
> +		if (fph) {
> +			if (fph->custom)
> +				return 0;
> +
> +			current_buckets = fph->hash_mask + 1;
> +		}
> +	}
> +
> +	/*
> +	 * The default allocation will remain within
> +	 *   16 <= threads * 4 <= global hash size
> +	 */
> +	buckets = roundup_pow_of_two(4 * threads);
> +	buckets = clamp(buckets, 16, futex_hashmask + 1);
> +
> +	if (current_buckets >= buckets)
>  		return 0;
>  
> -	return futex_hash_allocate(16, false);
> +	return futex_hash_allocate(buckets, false);
>  }
>  
>  static int futex_hash_get_slots(void)
>  {
>  	struct futex_private_hash *fph;
>  
> -	fph = current->mm->futex_phash;
> +	guard(rcu)();
> +	fph = rcu_dereference(current->mm->futex_phash);
>  	if (fph && fph->hash_mask)
>  		return fph->hash_mask + 1;
>  	return 0;
> diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
> index b0e64fd454d96..c716a66f86929 100644
> --- a/kernel/futex/requeue.c
> +++ b/kernel/futex/requeue.c
> @@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
>  		futex_hb_waiters_inc(hb2);
>  		plist_add(&q->list, &hb2->chain);
>  		q->lock_ptr = &hb2->lock;
> +		/*
> +		 * hb1 and hb2 belong to the same futex_hash_bucket_private
> +		 * because if we managed get a reference on hb1 then it can't be
> +		 * replaced. Therefore we avoid put(hb1)+get(hb2) here.
> +		 */
>  	}
>  	q->key = *key2;
>  }
> -- 
> 2.49.0
>
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months, 2 weeks ago
On 2025-06-01 15:39:47 [+0800], Lai, Yi wrote:
> Hi Sebastian Andrzej Siewior,
Hi Yi,
> Greetings!
> 
> I used Syzkaller and found that there is KASAN: null-ptr-deref Read in __futex_pivot_hash in linux-next next-20250527.
> 
> After bisection and the first bad commit is:
> "
> bd54df5ea7ca futex: Allow to resize the private local hash
> "

Thank you for the report. Next time please trim your report. There is no
need to put your report in the middle of the patch.

The following fixes it:

----------->8--------------

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 2 Jun 2025 12:11:13 +0200
Subject: [PATCH] futex: Verify under the lock if global hash is in use

Once the global hash is requested there is no way back to switch back to
the per-task private hash. This is checked at the begin of the function.

It is possible that two threads simultaneously request the global hash
and both pass the initial check and block later on the
mm::futex_hash_lock. In this case the first thread performs the switch
to the global hash. The second thread will also attempt to switch to the
global hash and while doing so, accessing the nonexisting slot 1 of the
struct futex_private_hash.
This has been reported by Yi Lai.

Verify under mm_struct::futex_phash that the global hash is not in use.

Reported-by: "Lai, Yi" <yi1.lai@linux.intel.com>
Closes: https://lore.kernel.org/all/aDwDw9Aygqo6oAx+@ly-workstation/
Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/futex/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 1cd3a646c91fd..abbd97c2fcba8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1629,6 +1629,16 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 		mm->futex_phash_new = NULL;
 
 		if (fph) {
+			if (cur && !cur->hash_mask) {
+				/*
+				 * If two threads simultaneously request the global
+				 * hash then the first one performs the switch,
+				 * the second one returns here.
+				 */
+				free = fph;
+				mm->futex_phash_new = new;
+				return -EBUSY;
+			}
 			if (cur && !new) {
 				/*
 				 * If we have an existing hash, but do not yet have
-- 
2.49.0


Sebastian
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Lai, Yi 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 01:00:27PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-01 15:39:47 [+0800], Lai, Yi wrote:
> > Hi Sebastian Andrzej Siewior,
> Hi Yi,
> > Greetings!
> > 
> > I used Syzkaller and found that there is KASAN: null-ptr-deref Read in __futex_pivot_hash in linux-next next-20250527.
> > 
> > After bisection and the first bad commit is:
> > "
> > bd54df5ea7ca futex: Allow to resize the private local hash
> > "
> 
> Thank you for the report. Next time please trim your report. There is no
> need to put your report in the middle of the patch.
> 
> The following fixes it:
>

Will trim my report next time.

After applying following patch on top of lastest linux-next, issue
cannot be reproduced. Thanks.

Regards,
Yi Lai

> ----------->8--------------
> 
> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Date: Mon, 2 Jun 2025 12:11:13 +0200
> Subject: [PATCH] futex: Verify under the lock if global hash is in use
> 
> Once the global hash is requested there is no way back to switch back to
> the per-task private hash. This is checked at the begin of the function.
> 
> It is possible that two threads simultaneously request the global hash
> and both pass the initial check and block later on the
> mm::futex_hash_lock. In this case the first thread performs the switch
> to the global hash. The second thread will also attempt to switch to the
> global hash and while doing so, accessing the nonexisting slot 1 of the
> struct futex_private_hash.
> This has been reported by Yi Lai.
> 
> Verify under mm_struct::futex_phash that the global hash is not in use.
> 
> Reported-by: "Lai, Yi" <yi1.lai@linux.intel.com>
> Closes: https://lore.kernel.org/all/aDwDw9Aygqo6oAx+@ly-workstation/
> Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash")
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  kernel/futex/core.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index 1cd3a646c91fd..abbd97c2fcba8 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -1629,6 +1629,16 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
>  		mm->futex_phash_new = NULL;
>  
>  		if (fph) {
> +			if (cur && !cur->hash_mask) {
> +				/*
> +				 * If two threads simultaneously request the global
> +				 * hash then the first one performs the switch,
> +				 * the second one returns here.
> +				 */
> +				free = fph;
> +				mm->futex_phash_new = new;
> +				return -EBUSY;
> +			}
>  			if (cur && !new) {
>  				/*
>  				 * If we have an existing hash, but do not yet have
> -- 
> 2.49.0
> 
> 
> Sebastian
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months, 2 weeks ago
On 2025-06-02 22:36:45 [+0800], Lai, Yi wrote:
> Will trim my report next time.
Thank you.

> After applying following patch on top of lastest linux-next, issue
> cannot be reproduced. Thanks.

Does this statement above count as
Tested-by: "Lai, Yi" <yi1.lai@linux.intel.com>

?

> Regards,
> Yi Lai

Sebastian
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Lai, Yi 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 04:44:22PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-02 22:36:45 [+0800], Lai, Yi wrote:
> > Will trim my report next time.
> Thank you.
> 
> > After applying following patch on top of lastest linux-next, issue
> > cannot be reproduced. Thanks.
> 
> Does this statement above count as
> Tested-by: "Lai, Yi" <yi1.lai@linux.intel.com>
> 
> ?
>

Yes. Please kindly include it.

Tested-by: "Lai, Yi" <yi1.lai@linux.intel.com>

> > Regards,
> > Yi Lai
> 
> Sebastian
[tip: locking/urgent] futex: Allow to resize the private local hash
Posted by tip-bot2 for Sebastian Andrzej Siewior 6 months, 1 week ago
The following commit has been merged into the locking/urgent branch of tip:

Commit-ID:     703b5f31aee5bda47868c09a3522a78823c1bb77
Gitweb:        https://git.kernel.org/tip/703b5f31aee5bda47868c09a3522a78823c1bb77
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Mon, 02 Jun 2025 13:00:27 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 11 Jun 2025 16:26:44 +02:00

futex: Allow to resize the private local hash

Once the global hash is requested there is no way back to switch back to
the per-task private hash. This is checked at the begin of the function.

It is possible that two threads simultaneously request the global hash
and both pass the initial check and block later on the
mm::futex_hash_lock. In this case the first thread performs the switch
to the global hash. The second thread will also attempt to switch to the
global hash and while doing so, accessing the nonexisting slot 1 of the
struct futex_private_hash.
This has been reported by Yi Lai.

Verify under mm_struct::futex_phash that the global hash is not in use.

Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash")
Closes: https://lore.kernel.org/all/aDwDw9Aygqo6oAx+@ly-workstation/
Reported-by: "Lai, Yi" <yi1.lai@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250602110027.wfqbHgzb@linutronix.de
---
 kernel/futex/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b652d2f..33b3643 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1629,6 +1629,16 @@ again:
 		mm->futex_phash_new = NULL;
 
 		if (fph) {
+			if (cur && !cur->hash_mask) {
+				/*
+				 * If two threads simultaneously request the global
+				 * hash then the first one performs the switch,
+				 * the second one returns here.
+				 */
+				free = fph;
+				mm->futex_phash_new = new;
+				return -EBUSY;
+			}
 			if (cur && !new) {
 				/*
 				 * If we have an existing hash, but do not yet have
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Wednesday 06/11 at 14:39 -0000, tip-bot2 for Sebastian Andrzej Siewior wrote:
> <snip> 
> It is possible that two threads simultaneously request the global hash
> and both pass the initial check and block later on the
> mm::futex_hash_lock. In this case the first thread performs the switch
> to the global hash. The second thread will also attempt to switch to the
> global hash and while doing so, accessing the nonexisting slot 1 of the
> struct futex_private_hash.

In case it's interesting to anyone, I'm hitting this one in real life,
one of my build machines got stuck overnight:

Jun 16 02:51:34 beethoven kernel: rcu: INFO: rcu_preempt self-detected stall on CPU
Jun 16 02:51:34 beethoven kernel: rcu:         16-....: (59997 ticks this GP) idle=eaf4/1/0x4000000000000000 softirq=14417247/14470115 fqs=21169
Jun 16 02:51:34 beethoven kernel: rcu:         (t=60000 jiffies g=21453525 q=663214 ncpus=24)
Jun 16 02:51:34 beethoven kernel: CPU: 16 UID: 1000 PID: 2028199 Comm: cargo Not tainted 6.16.0-rc1-lto-00236-g8c6bc74c7f89 #1 PREEMPT 
Jun 16 02:51:34 beethoven kernel: Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
Jun 16 02:51:34 beethoven kernel: RIP: 0010:queued_spin_lock_slowpath+0x162/0x1d0
Jun 16 02:51:34 beethoven kernel: Code: 0f 1f 84 00 00 00 00 00 f3 90 83 7a 08 00 74 f8 48 8b 32 48 85 f6 74 09 0f 0d 0e eb 0d 31 f6 eb 09 31 f6 eb 05 0f 1f 00 f3 90 <8b> 07 66 85 c0 75 f7 39 c8 75 13 41 b8 01 00 00 00 89 c8 f0 44 0f
Jun 16 02:51:34 beethoven kernel: RSP: 0018:ffffc9002fb1fc38 EFLAGS: 00000206
Jun 16 02:51:34 beethoven kernel: RAX: 0000000000447f3a RBX: ffffc9003029fdf0 RCX: 0000000000440000
Jun 16 02:51:34 beethoven kernel: RDX: ffff88901fea5100 RSI: 0000000000000000 RDI: ffff888127e7d844
Jun 16 02:51:34 beethoven kernel: RBP: ffff8883a3c07248 R08: 0000000000000000 R09: 00000000b69b409a
Jun 16 02:51:34 beethoven kernel: R10: 000000001bd29fd9 R11: 0000000069b409ab R12: ffff888127e7d844
Jun 16 02:51:34 beethoven kernel: R13: ffff888127e7d840 R14: ffffc9003029fde0 R15: ffff8883a3c07248
Jun 16 02:51:34 beethoven kernel: FS:  00007f61c23d85c0(0000) GS:ffff88909b9f6000(0000) knlGS:0000000000000000
Jun 16 02:51:34 beethoven kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 16 02:51:34 beethoven kernel: CR2: 000056407760f3e0 CR3: 0000000905f29000 CR4: 0000000000750ef0
Jun 16 02:51:34 beethoven kernel: PKRU: 55555554
Jun 16 02:51:34 beethoven kernel: Call Trace:
Jun 16 02:51:34 beethoven kernel:  <TASK>
Jun 16 02:51:34 beethoven kernel:  __futex_pivot_hash+0x1f8/0x2e0
Jun 16 02:51:34 beethoven kernel:  futex_hash+0x95/0xe0
Jun 16 02:51:34 beethoven kernel:  futex_wait_setup+0x7e/0x230
Jun 16 02:51:34 beethoven kernel:  __futex_wait+0x66/0x130
Jun 16 02:51:34 beethoven kernel:  ? __futex_wake_mark+0xc0/0xc0
Jun 16 02:51:34 beethoven kernel:  futex_wait+0xee/0x180
Jun 16 02:51:34 beethoven kernel:  ? hrtimer_setup_sleeper_on_stack+0xe0/0xe0
Jun 16 02:51:34 beethoven kernel:  do_futex+0x86/0x120
Jun 16 02:51:34 beethoven kernel:  __se_sys_futex+0x16d/0x1e0
Jun 16 02:51:34 beethoven kernel:  do_syscall_64+0x47/0x170
Jun 16 02:51:34 beethoven kernel:  entry_SYSCALL_64_after_hwframe+0x4b/0x53
Jun 16 02:51:34 beethoven kernel: RIP: 0033:0x7f61c1d18779
Jun 16 02:51:34 beethoven kernel: Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
Jun 16 02:51:34 beethoven kernel: RSP: 002b:00007ffcd3f6e3f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
Jun 16 02:51:34 beethoven kernel: RAX: ffffffffffffffda RBX: 00007f61c1d18760 RCX: 00007f61c1d18779
Jun 16 02:51:34 beethoven kernel: RDX: 00000000000000a9 RSI: 0000000000000089 RDI: 0000564077580bb0
Jun 16 02:51:34 beethoven kernel: RBP: 00007ffcd3f6e450 R08: 0000000000000000 R09: 00007ffcffffffff
Jun 16 02:51:34 beethoven kernel: R10: 00007ffcd3f6e410 R11: 0000000000000246 R12: 000000001dcd6401
Jun 16 02:51:34 beethoven kernel: R13: 00007f61c1c33fd0 R14: 0000564077580bb0 R15: 00000000000000a9
Jun 16 02:51:34 beethoven kernel:  </TASK>
<repeats forever until I wake up and kill the machine>

It seems like this is well understood already, but let me know if
there's any debug info I can send that might be useful.

Thanks,
Calvin
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-16 10:14:24 [-0700], Calvin Owens wrote:
> On Wednesday 06/11 at 14:39 -0000, tip-bot2 for Sebastian Andrzej Siewior wrote:
> > <snip> 
> > It is possible that two threads simultaneously request the global hash
> > and both pass the initial check and block later on the
> > mm::futex_hash_lock. In this case the first thread performs the switch
> > to the global hash. The second thread will also attempt to switch to the
> > global hash and while doing so, accessing the nonexisting slot 1 of the
> > struct futex_private_hash.
> 
> In case it's interesting to anyone, I'm hitting this one in real life,
> one of my build machines got stuck overnight:

The scenario described in the description is not something that happens
on its own. The bot explicitly "asked" for it. This won't happen in a
"normal" scenario where you do not explicitly ask for specific hash via
the prctl() interface.

> Jun 16 02:51:34 beethoven kernel: rcu: INFO: rcu_preempt self-detected stall on CPU
> Jun 16 02:51:34 beethoven kernel: rcu:         16-....: (59997 ticks this GP) idle=eaf4/1/0x4000000000000000 softirq=14417247/14470115 fqs=21169
> Jun 16 02:51:34 beethoven kernel: rcu:         (t=60000 jiffies g=21453525 q=663214 ncpus=24)
> Jun 16 02:51:34 beethoven kernel: CPU: 16 UID: 1000 PID: 2028199 Comm: cargo Not tainted 6.16.0-rc1-lto-00236-g8c6bc74c7f89 #1 PREEMPT 
> Jun 16 02:51:34 beethoven kernel: Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
> Jun 16 02:51:34 beethoven kernel: RIP: 0010:queued_spin_lock_slowpath+0x162/0x1d0
> Jun 16 02:51:34 beethoven kernel: Code: 0f 1f 84 00 00 00 00 00 f3 90 83 7a 08 00 74 f8 48 8b 32 48 85 f6 74 09 0f 0d 0e eb 0d 31 f6 eb 09 31 f6 eb 05 0f 1f 00 f3 90 <8b> 07 66 85 c0 75 f7 39 c8 75 13 41 b8 01 00 00 00 89 c8 f0 44 0f
…
> Jun 16 02:51:34 beethoven kernel: Call Trace:
> Jun 16 02:51:34 beethoven kernel:  <TASK>
> Jun 16 02:51:34 beethoven kernel:  __futex_pivot_hash+0x1f8/0x2e0
> Jun 16 02:51:34 beethoven kernel:  futex_hash+0x95/0xe0
> Jun 16 02:51:34 beethoven kernel:  futex_wait_setup+0x7e/0x230
> Jun 16 02:51:34 beethoven kernel:  __futex_wait+0x66/0x130
> Jun 16 02:51:34 beethoven kernel:  ? __futex_wake_mark+0xc0/0xc0
> Jun 16 02:51:34 beethoven kernel:  futex_wait+0xee/0x180
> Jun 16 02:51:34 beethoven kernel:  ? hrtimer_setup_sleeper_on_stack+0xe0/0xe0
> Jun 16 02:51:34 beethoven kernel:  do_futex+0x86/0x120
> Jun 16 02:51:34 beethoven kernel:  __se_sys_futex+0x16d/0x1e0
> Jun 16 02:51:34 beethoven kernel:  do_syscall_64+0x47/0x170
> Jun 16 02:51:34 beethoven kernel:  entry_SYSCALL_64_after_hwframe+0x4b/0x53
…
> <repeats forever until I wake up and kill the machine>
> 
> It seems like this is well understood already, but let me know if
> there's any debug info I can send that might be useful.

This is with LTO enabled.
Based on the backtrace: there was a resize request (probably because a
thread was created) and the resize was delayed because the hash was in
use. The hash was released and now this thread moves all enqueued users
from the old the hash to the new. RIP says it is a spin lock that it is
stuck on. This is either the new or the old hash bucket lock.
If this lifelocks then someone else must have it locked and not
released.
Is this the only thread stuck or is there more?
I'm puzzled here. It looks as if there was an unlock missing.

> Thanks,
> Calvin

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Tuesday 06/17 at 09:16 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-16 10:14:24 [-0700], Calvin Owens wrote:
> > On Wednesday 06/11 at 14:39 -0000, tip-bot2 for Sebastian Andrzej Siewior wrote:
> > > <snip> 
> > > It is possible that two threads simultaneously request the global hash
> > > and both pass the initial check and block later on the
> > > mm::futex_hash_lock. In this case the first thread performs the switch
> > > to the global hash. The second thread will also attempt to switch to the
> > > global hash and while doing so, accessing the nonexisting slot 1 of the
> > > struct futex_private_hash.
> > 
> > In case it's interesting to anyone, I'm hitting this one in real life,
> > one of my build machines got stuck overnight:
> 
> The scenario described in the description is not something that happens
> on its own. The bot explicitly "asked" for it. This won't happen in a
> "normal" scenario where you do not explicitly ask for specific hash via
> the prctl() interface.

Ugh, I'm sorry, I was in too much of a hurry this morning... cargo is
obviously not calling PR_FUTEX_HASH which is new in 6.16 :/

> > Jun 16 02:51:34 beethoven kernel: rcu: INFO: rcu_preempt self-detected stall on CPU
> > Jun 16 02:51:34 beethoven kernel: rcu:         16-....: (59997 ticks this GP) idle=eaf4/1/0x4000000000000000 softirq=14417247/14470115 fqs=21169
> > Jun 16 02:51:34 beethoven kernel: rcu:         (t=60000 jiffies g=21453525 q=663214 ncpus=24)
> > Jun 16 02:51:34 beethoven kernel: CPU: 16 UID: 1000 PID: 2028199 Comm: cargo Not tainted 6.16.0-rc1-lto-00236-g8c6bc74c7f89 #1 PREEMPT 
> > Jun 16 02:51:34 beethoven kernel: Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
> > Jun 16 02:51:34 beethoven kernel: RIP: 0010:queued_spin_lock_slowpath+0x162/0x1d0
> > Jun 16 02:51:34 beethoven kernel: Code: 0f 1f 84 00 00 00 00 00 f3 90 83 7a 08 00 74 f8 48 8b 32 48 85 f6 74 09 0f 0d 0e eb 0d 31 f6 eb 09 31 f6 eb 05 0f 1f 00 f3 90 <8b> 07 66 85 c0 75 f7 39 c8 75 13 41 b8 01 00 00 00 89 c8 f0 44 0f
> …
> > Jun 16 02:51:34 beethoven kernel: Call Trace:
> > Jun 16 02:51:34 beethoven kernel:  <TASK>
> > Jun 16 02:51:34 beethoven kernel:  __futex_pivot_hash+0x1f8/0x2e0
> > Jun 16 02:51:34 beethoven kernel:  futex_hash+0x95/0xe0
> > Jun 16 02:51:34 beethoven kernel:  futex_wait_setup+0x7e/0x230
> > Jun 16 02:51:34 beethoven kernel:  __futex_wait+0x66/0x130
> > Jun 16 02:51:34 beethoven kernel:  ? __futex_wake_mark+0xc0/0xc0
> > Jun 16 02:51:34 beethoven kernel:  futex_wait+0xee/0x180
> > Jun 16 02:51:34 beethoven kernel:  ? hrtimer_setup_sleeper_on_stack+0xe0/0xe0
> > Jun 16 02:51:34 beethoven kernel:  do_futex+0x86/0x120
> > Jun 16 02:51:34 beethoven kernel:  __se_sys_futex+0x16d/0x1e0
> > Jun 16 02:51:34 beethoven kernel:  do_syscall_64+0x47/0x170
> > Jun 16 02:51:34 beethoven kernel:  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> …
> > <repeats forever until I wake up and kill the machine>
> > 
> > It seems like this is well understood already, but let me know if
> > there's any debug info I can send that might be useful.
> 
> This is with LTO enabled.

Full lto with llvm-20.1.7.

> Based on the backtrace: there was a resize request (probably because a
> thread was created) and the resize was delayed because the hash was in
> use. The hash was released and now this thread moves all enqueued users
> from the old the hash to the new. RIP says it is a spin lock that it is
> stuck on. This is either the new or the old hash bucket lock.
> If this lifelocks then someone else must have it locked and not
> released.
> Is this the only thread stuck or is there more?
> I'm puzzled here. It looks as if there was an unlock missing.

Nothing showed up in the logs but the RCU stalls on CPU16, always in
queued_spin_lock_slowpath().

I'll run the build it was doing when it happened in a loop overnight and
see if I can trigger it again.

> > Thanks,
> > Calvin
> 
> Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-17 02:23:08 [-0700], Calvin Owens wrote:
> Ugh, I'm sorry, I was in too much of a hurry this morning... cargo is
> obviously not calling PR_FUTEX_HASH which is new in 6.16 :/
No worries.

> > This is with LTO enabled.
> 
> Full lto with llvm-20.1.7.
> 
…
> Nothing showed up in the logs but the RCU stalls on CPU16, always in
> queued_spin_lock_slowpath().
> 
> I'll run the build it was doing when it happened in a loop overnight and
> see if I can trigger it again.

Please check if you can reproduce it and if so if it also happens
without lto.
I have no idea why one spinlock_t remains locked. It is either locked or
some stray memory.

Oh. Lockdep adds quite some overhead but it should complain that a
spinlock_t is still locked while returning to userland.

> > > Thanks,
> > > Calvin
> > 
Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Tuesday 06/17 at 11:50 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-17 02:23:08 [-0700], Calvin Owens wrote:
> > Ugh, I'm sorry, I was in too much of a hurry this morning... cargo is
> > obviously not calling PR_FUTEX_HASH which is new in 6.16 :/
> No worries.
> 
> > > This is with LTO enabled.
> > 
> > Full lto with llvm-20.1.7.
> > 
> …
> > Nothing showed up in the logs but the RCU stalls on CPU16, always in
> > queued_spin_lock_slowpath().
> > 
> > I'll run the build it was doing when it happened in a loop overnight and
> > see if I can trigger it again.

Actually got an oops this time:

    Oops: general protection fault, probably for non-canonical address 0xfdd92c90843cf111: 0000 [#1] SMP
    CPU: 3 UID: 1000 PID: 323127 Comm: cargo Not tainted 6.16.0-rc2-lto-00024-g9afe652958c3 #1 PREEMPT 
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    RIP: 0010:queued_spin_lock_slowpath+0x12a/0x1d0
    Code: c8 c1 e8 10 66 87 47 02 66 85 c0 74 48 0f b7 c0 49 c7 c0 f8 ff ff ff 89 c6 c1 ee 02 83 e0 03 49 8b b4 f0 00 21 67 83 c1 e0 04 <48> 89 94 30 00 f1 4a 84 83 7a 08 00 75 10 0f 1f 84 00 00 00 00 00
    RSP: 0018:ffffc9002c953d20 EFLAGS: 00010256
    RAX: 0000000000000000 RBX: ffff88814e78be40 RCX: 0000000000100000
    RDX: ffff88901fce5100 RSI: fdd92c90fff20011 RDI: ffff8881c2ae9384
    RBP: 000000000000002b R08: fffffffffffffff8 R09: 00000000002ab900
    R10: 000000000000b823 R11: 0000000000000c00 R12: ffff88814e78be40
    R13: ffffc9002c953d48 R14: ffffc9002c953d48 R15: ffff8881c2ae9384
    FS:  00007f086efb6600(0000) GS:ffff88909b836000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000055ced9c42650 CR3: 000000034b88e000 CR4: 0000000000750ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     futex_unqueue+0x2e/0x110
     __futex_wait+0xc5/0x130
     ? __futex_wake_mark+0xc0/0xc0
     futex_wait+0xee/0x180
     ? hrtimer_setup_sleeper_on_stack+0xe0/0xe0
     do_futex+0x86/0x120
     __se_sys_futex+0x16d/0x1e0
     ? __x64_sys_write+0xba/0xc0
     do_syscall_64+0x47/0x170
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7f086e918779
    Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
    RSP: 002b:00007ffc5815f678 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: ffffffffffffffda RBX: 00007f086e918760 RCX: 00007f086e918779
    RDX: 000000000000002b RSI: 0000000000000089 RDI: 00005636f9fb60d0
    RBP: 00007ffc5815f6d0 R08: 0000000000000000 R09: 00007ffcffffffff
    R10: 00007ffc5815f690 R11: 0000000000000246 R12: 000000001dcd6401
    R13: 00007f086e833fd0 R14: 00005636f9fb60d0 R15: 000000000000002b
     </TASK>
    ---[ end trace 0000000000000000 ]---
    RIP: 0010:queued_spin_lock_slowpath+0x12a/0x1d0
    Code: c8 c1 e8 10 66 87 47 02 66 85 c0 74 48 0f b7 c0 49 c7 c0 f8 ff ff ff 89 c6 c1 ee 02 83 e0 03 49 8b b4 f0 00 21 67 83 c1 e0 04 <48> 89 94 30 00 f1 4a 84 83 7a 08 00 75 10 0f 1f 84 00 00 00 00 00
    RSP: 0018:ffffc9002c953d20 EFLAGS: 00010256
    RAX: 0000000000000000 RBX: ffff88814e78be40 RCX: 0000000000100000
    RDX: ffff88901fce5100 RSI: fdd92c90fff20011 RDI: ffff8881c2ae9384
    RBP: 000000000000002b R08: fffffffffffffff8 R09: 00000000002ab900
    R10: 000000000000b823 R11: 0000000000000c00 R12: ffff88814e78be40
    R13: ffffc9002c953d48 R14: ffffc9002c953d48 R15: ffff8881c2ae9384
    FS:  00007f086efb6600(0000) GS:ffff88909b836000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000055ced9c42650 CR3: 000000034b88e000 CR4: 0000000000750ef0
    PKRU: 55555554
    Kernel panic - not syncing: Fatal exception
    Kernel Offset: disabled
    ---[ end Kernel panic - not syncing: Fatal exception ]---

This is a giant Yocto build, but the comm is always cargo, so hopefully
I can run those bits in isolation and hit it more quickly.

> Please check if you can reproduce it and if so if it also happens
> without lto.
> I have no idea why one spinlock_t remains locked. It is either locked or
> some stray memory.
> Oh. Lockdep adds quite some overhead but it should complain that a
> spinlock_t is still locked while returning to userland.

I'll report back when I've tried :)

I'll also try some of the mm debug configs.

Thanks,
Calvin
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-17 09:11:06 [-0700], Calvin Owens wrote:
> Actually got an oops this time:
> 
>     Oops: general protection fault, probably for non-canonical address 0xfdd92c90843cf111: 0000 [#1] SMP
>     CPU: 3 UID: 1000 PID: 323127 Comm: cargo Not tainted 6.16.0-rc2-lto-00024-g9afe652958c3 #1 PREEMPT 
>     Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
>     RIP: 0010:queued_spin_lock_slowpath+0x12a/0x1d0
…
>     Call Trace:
>      <TASK>
>      futex_unqueue+0x2e/0x110
>      __futex_wait+0xc5/0x130
>      futex_wait+0xee/0x180
>      do_futex+0x86/0x120
>      __se_sys_futex+0x16d/0x1e0
>      do_syscall_64+0x47/0x170
>      entry_SYSCALL_64_after_hwframe+0x4b/0x53
>     RIP: 0033:0x7f086e918779

The lock_ptr is pointing to invalid memory. It explodes within
queued_spin_lock_slowpath() which looks like decode_tail() returned a
wrong pointer/ offset.

futex_queue() adds a local futex_q to the list and its lock_ptr points
to the hb lock. Then we do schedule() and after the wakeup the lock_ptr
is NULL after a successful wake.  Otherwise it still points to the
futex_hash_bucket::lock.

Since futex_unqueue() attempts to acquire the lock, then there was no
wakeup but a timeout or a signal that ended the wait. The lock_ptr can
change during resize.
During the resize futex_rehash_private() moves the futex_q members from
the old queue to the new one. The lock is accessed within RCU and the
lock_ptr value is compared against the old value after locking. That
means it is accessed either before the rehash moved it the new hash
bucket or afterwards.
I don't see how this pointer can become invalid. RCU protects against
cleanup and the pointer compare ensures that it is the "current"
pointer.
I've been looking at clang's assembly of futex_unqueue() and it looks
correct. And futex_rehash_private() iterates over all slots.

> This is a giant Yocto build, but the comm is always cargo, so hopefully
> I can run those bits in isolation and hit it more quickly.

If it still explodes without LTO, would you mind trying gcc?

> Thanks,
> Calvin

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Wednesday 06/18 at 18:03 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-17 09:11:06 [-0700], Calvin Owens wrote:
> > Actually got an oops this time:
> > 
> >     Oops: general protection fault, probably for non-canonical address 0xfdd92c90843cf111: 0000 [#1] SMP
> >     CPU: 3 UID: 1000 PID: 323127 Comm: cargo Not tainted 6.16.0-rc2-lto-00024-g9afe652958c3 #1 PREEMPT 
> >     Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
> >     RIP: 0010:queued_spin_lock_slowpath+0x12a/0x1d0
> …
> >     Call Trace:
> >      <TASK>
> >      futex_unqueue+0x2e/0x110
> >      __futex_wait+0xc5/0x130
> >      futex_wait+0xee/0x180
> >      do_futex+0x86/0x120
> >      __se_sys_futex+0x16d/0x1e0
> >      do_syscall_64+0x47/0x170
> >      entry_SYSCALL_64_after_hwframe+0x4b/0x53
> >     RIP: 0033:0x7f086e918779
> 
> The lock_ptr is pointing to invalid memory. It explodes within
> queued_spin_lock_slowpath() which looks like decode_tail() returned a
> wrong pointer/ offset.
> 
> futex_queue() adds a local futex_q to the list and its lock_ptr points
> to the hb lock. Then we do schedule() and after the wakeup the lock_ptr
> is NULL after a successful wake.  Otherwise it still points to the
> futex_hash_bucket::lock.
> 
> Since futex_unqueue() attempts to acquire the lock, then there was no
> wakeup but a timeout or a signal that ended the wait. The lock_ptr can
> change during resize.
> During the resize futex_rehash_private() moves the futex_q members from
> the old queue to the new one. The lock is accessed within RCU and the
> lock_ptr value is compared against the old value after locking. That
> means it is accessed either before the rehash moved it the new hash
> bucket or afterwards.
> I don't see how this pointer can become invalid. RCU protects against
> cleanup and the pointer compare ensures that it is the "current"
> pointer.
> I've been looking at clang's assembly of futex_unqueue() and it looks
> correct. And futex_rehash_private() iterates over all slots.

Didn't get much out of lockdep unfortunately.

It notices the corruption in the spinlock:

    BUG: spinlock bad magic on CPU#2, cargo/4129172
     lock: 0xffff8881410ecdc8, .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1
    CPU: 2 UID: 1000 PID: 4129172 Comm: cargo Not tainted 6.16.0-rc2-nolto-lockdep-00047-g52da431bf03b #1 PREEMPT
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    Call Trace:
     <TASK>
     dump_stack_lvl+0x5a/0x80
     do_raw_spin_lock+0x6a/0xd0
     futex_wait_setup+0x8e/0x200
     __futex_wait+0x63/0x120
     ? __futex_wake_mark+0x40/0x40
     futex_wait+0x5b/0xd0
     ? hrtimer_dummy_timeout+0x10/0x10
     do_futex+0x86/0x120
     __se_sys_futex+0x10d/0x180
     ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
     do_syscall_64+0x6a/0x1070
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7ff7e7ffb779
    Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
    RSP: 002b:00007fff29bee078 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: ffffffffffffffda RBX: 00007ff7e7ffb760 RCX: 00007ff7e7ffb779
    RDX: 00000000000000b6 RSI: 0000000000000089 RDI: 000055a5e2b9c1a0
    RBP: 00007fff29bee0d0 R08: 0000000000000000 R09: 00007fffffffffff
    R10: 00007fff29bee090 R11: 0000000000000246 R12: 000000001dcd6401
    R13: 00007ff7e7f16fd0 R14: 000055a5e2b9c1a0 R15: 00000000000000b6
     </TASK>

That was followed by this WARN:

    ------------[ cut here ]------------
    rcuref - imbalanced put()
    WARNING: CPU: 2 PID: 4129172 at lib/rcuref.c:266 rcuref_put_slowpath+0x55/0x70
    CPU: 2 UID: 1000 PID: 4129172 Comm: cargo Not tainted 6.16.0-rc2-nolto-lockdep-00047-g52da431bf03b #1 PREEMPT
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    RIP: 0010:rcuref_put_slowpath+0x55/0x70
    Code: 00 00 00 c0 73 2a 85 f6 79 06 c7 07 00 00 00 a0 31 c0 c3 53 48 89 fb 48 c7 c7 da 7f 32 83 c6 05 7f 9c 35 02 01 e8 1b 83 9f ff <0f> 0b 48 89 df 5b 31 c0 c7 07 00 00 00 e0 c3 cc cc cc cc cc cc cc
    RSP: 0018:ffffc90026e7fca8 EFLAGS: 00010282
    RAX: 0000000000000019 RBX: ffff8881410ec000 RCX: 0000000000000027
    RDX: 00000000ffff7fff RSI: 0000000000000002 RDI: ffff88901fc9c008
    RBP: 0000000000000000 R08: 0000000000007fff R09: ffffffff83676870
    R10: 0000000000017ffd R11: 00000000ffff7fff R12: 00000000000000b7
    R13: 000055a5e2b9c1a0 R14: ffff8881410ecdc0 R15: 0000000000000001
    FS:  00007ff7e875c600(0000) GS:ffff88909b96a000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00007fd4b8001028 CR3: 0000000fd7d31000 CR4: 0000000000750ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     futex_private_hash_put+0xa7/0xc0
     futex_wait_setup+0x1c0/0x200
     __futex_wait+0x63/0x120
     ? __futex_wake_mark+0x40/0x40
     futex_wait+0x5b/0xd0
     ? hrtimer_dummy_timeout+0x10/0x10
     do_futex+0x86/0x120
     __se_sys_futex+0x10d/0x180
     ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
     do_syscall_64+0x6a/0x1070
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7ff7e7ffb779
    Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
    RSP: 002b:00007fff29bee078 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: ffffffffffffffda RBX: 00007ff7e7ffb760 RCX: 00007ff7e7ffb779
    RDX: 00000000000000b6 RSI: 0000000000000089 RDI: 000055a5e2b9c1a0
    RBP: 00007fff29bee0d0 R08: 0000000000000000 R09: 00007fffffffffff
    R10: 00007fff29bee090 R11: 0000000000000246 R12: 000000001dcd6401
    R13: 00007ff7e7f16fd0 R14: 000055a5e2b9c1a0 R15: 00000000000000b6
     </TASK>
    irq event stamp: 59385407
    hardirqs last  enabled at (59385407): [<ffffffff8274264c>] _raw_spin_unlock_irqrestore+0x2c/0x50
    hardirqs last disabled at (59385406): [<ffffffff8274250d>] _raw_spin_lock_irqsave+0x1d/0x60
    softirqs last  enabled at (59341786): [<ffffffff8133cc1e>] __irq_exit_rcu+0x4e/0xd0
    softirqs last disabled at (59341781): [<ffffffff8133cc1e>] __irq_exit_rcu+0x4e/0xd0
    ---[ end trace 0000000000000000 ]---

The oops after that is from a different task this time, but it just
looks like slab corruption:

    BUG: unable to handle page fault for address: 0000000000001300
    #PF: supervisor read access in kernel mode
    #PF: error_code(0x0000) - not-present page
    PGD 0 P4D 0
    Oops: Oops: 0000 [#1] SMP
    CPU: 4 UID: 1000 PID: 4170542 Comm: zstd Tainted: G        W           6.16.0-rc2-nolto-lockdep-00047-g52da431bf03b #1 PREEMPT
    Tainted: [W]=WARN
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    RIP: 0010:__kvmalloc_node_noprof+0x1a2/0x4a0
    Code: 0f 84 a3 01 00 00 41 83 f8 ff 74 10 48 8b 03 48 c1 e8 3f 41 39 c0 0f 85 8d 01 00 00 41 8b 46 28 49 8b 36 48 8d 4d 20 48 89 ea <4a> 8b 1c 20 4c 89 e0 65 48 0f c7 0e 74 4e eb 9f 41 83 f8 ff 75 b4
    RSP: 0018:ffffc90036a87c00 EFLAGS: 00010246
    RAX: 0000000000001000 RBX: ffffea0005043a00 RCX: 0000000000054764
    RDX: 0000000000054744 RSI: ffffffff84347c80 RDI: 0000000000000080
    RBP: 0000000000054744 R08: 00000000ffffffff R09: 0000000000000000
    R10: ffffffff8140972d R11: 0000000000000000 R12: 0000000000000300
    R13: 00000000004029c0 R14: ffff888100044800 R15: 0000000000001040
    FS:  00007fca63240740(0000) GS:ffff88909b9ea000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000000001300 CR3: 00000004fcac3000 CR4: 0000000000750ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     ? futex_hash_allocate+0x17f/0x400
     futex_hash_allocate+0x17f/0x400
     ? futex_hash_allocate+0x4d/0x400
     ? futex_hash_allocate_default+0x2b/0x1e0
     ? futex_hash_allocate_default+0x2b/0x1e0
     ? copy_process+0x35e/0x12a0
     ? futex_hash_allocate_default+0x2b/0x1e0
     ? copy_process+0x35e/0x12a0
     copy_process+0xcf3/0x12a0
     ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
     kernel_clone+0x7f/0x310
     ? copy_clone_args_from_user+0x93/0x1e0
     ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
     __se_sys_clone3+0xbb/0xc0
     ? _copy_to_user+0x1f/0x60
     ? __se_sys_rt_sigprocmask+0xf2/0x120
     ? trace_hardirqs_off+0x40/0xb0
     do_syscall_64+0x6a/0x1070
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7fca6335f7a9
    Code: 90 b8 01 00 00 00 b9 01 00 00 00 eb ec 0f 1f 40 00 b8 ea ff ff ff 48 85 ff 74 28 48 85 d2 74 23 49 89 c8 b8 b3 01 00 00 0f 05 <48> 85 c0 7c 14 74 01 c3 31 ed 4c 89 c7 ff d2 48 89 c7 b8 3c 00 00
    RSP: 002b:00007ffcfe17fe78 EFLAGS: 00000202 ORIG_RAX: 00000000000001b3
    RAX: ffffffffffffffda RBX: 00007fca632e18e0 RCX: 00007fca6335f7a9
    RDX: 00007fca632e18e0 RSI: 0000000000000058 RDI: 00007ffcfe17fed0
    RBP: 00007fca60f666c0 R08: 00007fca60f666c0 R09: 00007ffcfe17ffc7
    R10: 0000000000000008 R11: 0000000000000202 R12: ffffffffffffff88
    R13: 0000000000000002 R14: 00007ffcfe17fed0 R15: 00007fca60766000
     </TASK>
    CR2: 0000000000001300
    ---[ end trace 0000000000000000 ]---
    RIP: 0010:__kvmalloc_node_noprof+0x1a2/0x4a0
    Code: 0f 84 a3 01 00 00 41 83 f8 ff 74 10 48 8b 03 48 c1 e8 3f 41 39 c0 0f 85 8d 01 00 00 41 8b 46 28 49 8b 36 48 8d 4d 20 48 89 ea <4a> 8b 1c 20 4c 89 e0 65 48 0f c7 0e 74 4e eb 9f 41 83 f8 ff 75 b4
    RSP: 0018:ffffc90036a87c00 EFLAGS: 00010246
    RAX: 0000000000001000 RBX: ffffea0005043a00 RCX: 0000000000054764
    RDX: 0000000000054744 RSI: ffffffff84347c80 RDI: 0000000000000080
    RBP: 0000000000054744 R08: 00000000ffffffff R09: 0000000000000000
    R10: ffffffff8140972d R11: 0000000000000000 R12: 0000000000000300
    R13: 00000000004029c0 R14: ffff888100044800 R15: 0000000000001040
    FS:  00007fca63240740(0000) GS:ffff88909b9ea000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000000001300 CR3: 00000004fcac3000 CR4: 0000000000750ef0
    PKRU: 55555554
    Kernel panic - not syncing: Fatal exception
    Kernel Offset: disabled
    ---[ end Kernel panic - not syncing: Fatal exception ]---

No lock/rcu splats at all.

> > This is a giant Yocto build, but the comm is always cargo, so hopefully
> > I can run those bits in isolation and hit it more quickly.
> 
> If it still explodes without LTO, would you mind trying gcc?

Will do.

Haven't had much luck isolating what triggers it, but if I run two copies
of these large build jobs in a loop, it reliably triggers in 6-8 hours.

Just to be clear, I can only trigger this on the one machine. I ran it
through memtest86+ yesterday and it passed, FWIW, but I'm a little
suspicious of the hardware right now too. I double checked that
everything in the BIOS related to power/perf is at factory settings.

Note that READ_ONLY_THP_FOR_FS and NO_PAGE_MAPCOUNT are both off.

> > Thanks,
> > Calvin
> 
> Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-18 09:49:18 [-0700], Calvin Owens wrote:
> Didn't get much out of lockdep unfortunately.
> 
> It notices the corruption in the spinlock:
> 
>     BUG: spinlock bad magic on CPU#2, cargo/4129172
>      lock: 0xffff8881410ecdc8, .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1

Yes. Which is what I assumed while I suggested this. But it complains
about bad magic. It says the magic is 0xdead4ead but this is
SPINLOCK_MAGIC. I was expecting any value but this one.

> That was followed by this WARN:
> 
>     ------------[ cut here ]------------
>     rcuref - imbalanced put()
>     WARNING: CPU: 2 PID: 4129172 at lib/rcuref.c:266 rcuref_put_slowpath+0x55/0x70

This is "reasonable". If the lock is broken, the remaining memory is
probably garbage anyway. It complains there that the reference put due
to invalid counter.

…
> The oops after that is from a different task this time, but it just
> looks like slab corruption:
> 
…

The previous complained an invalid free from within the exec.

> No lock/rcu splats at all.
It exploded before that could happen.

> > If it still explodes without LTO, would you mind trying gcc?
> 
> Will do.

Thank you.

> Haven't had much luck isolating what triggers it, but if I run two copies
> of these large build jobs in a loop, it reliably triggers in 6-8 hours.
> 
> Just to be clear, I can only trigger this on the one machine. I ran it
> through memtest86+ yesterday and it passed, FWIW, but I'm a little
> suspicious of the hardware right now too. I double checked that
> everything in the BIOS related to power/perf is at factory settings.

But then it is kind of odd that it happens only with the futex code.

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
( Dropping linux-tip-commits from Cc )

On Wednesday 06/18 at 19:09 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-18 09:49:18 [-0700], Calvin Owens wrote:
> > Didn't get much out of lockdep unfortunately.
> > 
> > It notices the corruption in the spinlock:
> > 
> >     BUG: spinlock bad magic on CPU#2, cargo/4129172
> >      lock: 0xffff8881410ecdc8, .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1
> 
> Yes. Which is what I assumed while I suggested this. But it complains
> about bad magic. It says the magic is 0xdead4ead but this is
> SPINLOCK_MAGIC. I was expecting any value but this one.
> 
> > That was followed by this WARN:
> > 
> >     ------------[ cut here ]------------
> >     rcuref - imbalanced put()
> >     WARNING: CPU: 2 PID: 4129172 at lib/rcuref.c:266 rcuref_put_slowpath+0x55/0x70
> 
> This is "reasonable". If the lock is broken, the remaining memory is
> probably garbage anyway. It complains there that the reference put due
> to invalid counter.
> 
> …
> > The oops after that is from a different task this time, but it just
> > looks like slab corruption:
> > 
> …
> 
> The previous complained an invalid free from within the exec.
> 
> > No lock/rcu splats at all.
> It exploded before that could happen.
> 
> > > If it still explodes without LTO, would you mind trying gcc?
> > 
> > Will do.
> 
> Thank you.
> 
> > Haven't had much luck isolating what triggers it, but if I run two copies
> > of these large build jobs in a loop, it reliably triggers in 6-8 hours.
> > 
> > Just to be clear, I can only trigger this on the one machine. I ran it
> > through memtest86+ yesterday and it passed, FWIW, but I'm a little
> > suspicious of the hardware right now too. I double checked that
> > everything in the BIOS related to power/perf is at factory settings.
> 
> But then it is kind of odd that it happens only with the futex code.

I think the missing ingredient was PREEMPT: the 2nd machine has been
trying for over a day, but I rebuilt its kernel with PREEMPT_FULL this
morning (still llvm), and it just hit a similar oops.

    Oops: general protection fault, probably for non-canonical address 0x74656d2f74696750: 0000 [#1] SMP
    CPU: 10 UID: 1000 PID: 542469 Comm: cargo Not tainted 6.16.0-rc2-00045-g4663747812d1 #1 PREEMPT 
    Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
    RIP: 0010:futex_hash+0x23/0x90
    Code: 1f 84 00 00 00 00 00 41 57 41 56 53 48 89 fb e8 b3 04 fe ff 48 89 df 31 f6 e8 79 00 00 00 48 8b 78 18 49 89 c6 48 85 ff 74 55 <80> 7f 21 00 75 4f f0 83 07 01 79 49 e8 fc 17 37 00 84 c0 75 40 e8
    RSP: 0018:ffffc9002e46fcd8 EFLAGS: 00010202
    RAX: ffff888a68e25c40 RBX: ffffc9002e46fda0 RCX: 0000000036616534
    RDX: 00000000ffffffff RSI: 0000000910180c00 RDI: 74656d2f7469672f
    RBP: 00000000000000b0 R08: 000000000318dd0d R09: 000000002e117cb0
    R10: 00000000318dd0d0 R11: 000000000000001b R12: 0000000000000000
    R13: 000055e79b431170 R14: ffff888a68e25c40 R15: ffff8881ea0ae900
    FS:  00007f1b6037b580(0000) GS:ffff8898a528b000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000555830170098 CR3: 0000000d73e93000 CR4: 0000000000350ef0
    Call Trace:
     <TASK>
     futex_wait_setup+0x7e/0x1d0
     __futex_wait+0x63/0x120
     ? __futex_wake_mark+0x40/0x40
     futex_wait+0x5b/0xd0
     ? hrtimer_dummy_timeout+0x10/0x10
     do_futex+0x86/0x120
     __x64_sys_futex+0x10a/0x180
     do_syscall_64+0x48/0x4f0
     entry_SYSCALL_64_after_hwframe+0x4b/0x53

I also enabled DEBUG_PREEMPT, but that didn't print any additional info.

I'm testing a GCC kernel on both machines now.

Thanks,
Calvin

> Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Wednesday 06/18 at 13:56 -0700, Calvin Owens wrote:
> ( Dropping linux-tip-commits from Cc )
>
> On Wednesday 06/18 at 19:09 +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-06-18 09:49:18 [-0700], Calvin Owens wrote:
> > > Didn't get much out of lockdep unfortunately.
> > >
> > > It notices the corruption in the spinlock:
> > >
> > >     BUG: spinlock bad magic on CPU#2, cargo/4129172
> > >      lock: 0xffff8881410ecdc8, .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1
> >
> > Yes. Which is what I assumed while I suggested this. But it complains
> > about bad magic. It says the magic is 0xdead4ead but this is
> > SPINLOCK_MAGIC. I was expecting any value but this one.
> >
> > > That was followed by this WARN:
> > >
> > >     ------------[ cut here ]------------
> > >     rcuref - imbalanced put()
> > >     WARNING: CPU: 2 PID: 4129172 at lib/rcuref.c:266 rcuref_put_slowpath+0x55/0x70
> >
> > This is "reasonable". If the lock is broken, the remaining memory is
> > probably garbage anyway. It complains there that the reference put due
> > to invalid counter.
> >
> > …
> > > The oops after that is from a different task this time, but it just
> > > looks like slab corruption:
> > >
> > …
> >
> > The previous complained an invalid free from within the exec.
> >
> > > No lock/rcu splats at all.
> > It exploded before that could happen.
> >
> > > > If it still explodes without LTO, would you mind trying gcc?
> > >
> > > Will do.
> >
> > Thank you.
> >
> > > Haven't had much luck isolating what triggers it, but if I run two copies
> > > of these large build jobs in a loop, it reliably triggers in 6-8 hours.
> > >
> > > Just to be clear, I can only trigger this on the one machine. I ran it
> > > through memtest86+ yesterday and it passed, FWIW, but I'm a little
> > > suspicious of the hardware right now too. I double checked that
> > > everything in the BIOS related to power/perf is at factory settings.
> >
> > But then it is kind of odd that it happens only with the futex code.
>
> I think the missing ingredient was PREEMPT: the 2nd machine has been
> trying for over a day, but I rebuilt its kernel with PREEMPT_FULL this
> morning (still llvm), and it just hit a similar oops.
>
>     Oops: general protection fault, probably for non-canonical address 0x74656d2f74696750: 0000 [#1] SMP
>     CPU: 10 UID: 1000 PID: 542469 Comm: cargo Not tainted 6.16.0-rc2-00045-g4663747812d1 #1 PREEMPT
>     Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
>     RIP: 0010:futex_hash+0x23/0x90
>     Code: 1f 84 00 00 00 00 00 41 57 41 56 53 48 89 fb e8 b3 04 fe ff 48 89 df 31 f6 e8 79 00 00 00 48 8b 78 18 49 89 c6 48 85 ff 74 55 <80> 7f 21 00 75 4f f0 83 07 01 79 49 e8 fc 17 37 00 84 c0 75 40 e8
>     RSP: 0018:ffffc9002e46fcd8 EFLAGS: 00010202
>     RAX: ffff888a68e25c40 RBX: ffffc9002e46fda0 RCX: 0000000036616534
>     RDX: 00000000ffffffff RSI: 0000000910180c00 RDI: 74656d2f7469672f
>     RBP: 00000000000000b0 R08: 000000000318dd0d R09: 000000002e117cb0
>     R10: 00000000318dd0d0 R11: 000000000000001b R12: 0000000000000000
>     R13: 000055e79b431170 R14: ffff888a68e25c40 R15: ffff8881ea0ae900
>     FS:  00007f1b6037b580(0000) GS:ffff8898a528b000(0000) knlGS:0000000000000000
>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>     CR2: 0000555830170098 CR3: 0000000d73e93000 CR4: 0000000000350ef0
>     Call Trace:
>      <TASK>
>      futex_wait_setup+0x7e/0x1d0
>      __futex_wait+0x63/0x120
>      ? __futex_wake_mark+0x40/0x40
>      futex_wait+0x5b/0xd0
>      ? hrtimer_dummy_timeout+0x10/0x10
>      do_futex+0x86/0x120
>      __x64_sys_futex+0x10a/0x180
>      do_syscall_64+0x48/0x4f0
>      entry_SYSCALL_64_after_hwframe+0x4b/0x53
>
> I also enabled DEBUG_PREEMPT, but that didn't print any additional info.
>
> I'm testing a GCC kernel on both machines now.

Machine #2 oopsed with the GCC kernel after just over an hour:

    BUG: unable to handle page fault for address: ffff88a91eac4458
    #PF: supervisor read access in kernel mode
    #PF: error_code(0x0000) - not-present page
    PGD 4401067 P4D 4401067 PUD 0
    Oops: Oops: 0000 [#1] SMP
    CPU: 4 UID: 1000 PID: 881756 Comm: cargo Not tainted 6.16.0-rc2-gcc-00045-g4663747812d1 #1 PREEMPT
    Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
    RIP: 0010:futex_hash+0x16/0x90
    Code: 4d 85 e4 74 99 4c 89 e7 e8 07 51 80 00 eb 8f 0f 1f 44 00 00 41 54 55 48 89 fd 53 e8 14 f2 fd ff 48 89 ef 31 f6 e8 da f6 ff ff <48> 8b 78 18 48 89 c3 48 85 ff 74 0c 80 7f 21 00 75 06 f0 83 07 01
    RSP: 0018:ffffc9002973fcf8 EFLAGS: 00010282
    RAX: ffff88a91eac4440 RBX: ffff888d5a170000 RCX: 00000000add26115
    RDX: 0000001c49080440 RSI: 00000000236034e8 RDI: 00000000f1a67530
    RBP: ffffc9002973fdb8 R08: 00000000eb13f1af R09: ffffffff829c0fc0
    R10: 0000000000000246 R11: 0000000000000000 R12: ffff888d5a1700f0
    R13: ffffc9002973fdb8 R14: ffffc9002973fd70 R15: 0000000000000002
    FS:  00007f64614ba9c0(0000) GS:ffff888cccceb000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff88a91eac4458 CR3: 000000015e508000 CR4: 0000000000350ef0
    Call Trace:
     <TASK>
     futex_wait_setup+0x51/0x1b0
     __futex_wait+0xc0/0x120
     ? __futex_wake_mark+0x50/0x50
     futex_wait+0x55/0xe0
     ? hrtimer_setup_sleeper_on_stack+0x30/0x30
     do_futex+0x91/0x120
     __x64_sys_futex+0xfc/0x1d0
     do_syscall_64+0x44/0x1130
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7f64615bd74d
    Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab c6 0b 00 f7 d8 64 89 01 48
    RSP: 002b:00007ffea50a6cc8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: ffffffffffffffda RBX: 00007f64615bd730 RCX: 00007f64615bd74d
    RDX: 0000000000000080 RSI: 0000000000000089 RDI: 000055bb7e399d90
    RBP: 00007ffea50a6d20 R08: 0000000000000000 R09: 00007ffeffffffff
    R10: 00007ffea50a6ce0 R11: 0000000000000246 R12: 000000001dcd6401
    R13: 00007f64614e3710 R14: 000055bb7e399d90 R15: 0000000000000080
     </TASK>
    CR2: ffff88a91eac4458
    ---[ end trace 0000000000000000 ]---

Two CPUs oopsed at once with that same stack, the config and vmlinux are
uploaded in the git (https://github.com/jcalvinowens/lkml-debug-616).

I tried reproducing with DEBUG_PAGEALLOC, but the bug doesn't happen
with it turned on.

> Thanks,
> Calvin
>
> > Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Wednesday 06/18 at 15:47 -0700, Calvin Owens wrote:
> On Wednesday 06/18 at 13:56 -0700, Calvin Owens wrote:
> > ( Dropping linux-tip-commits from Cc )
> >
> > On Wednesday 06/18 at 19:09 +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-06-18 09:49:18 [-0700], Calvin Owens wrote:
> > > > Didn't get much out of lockdep unfortunately.
> > > >
> > > > It notices the corruption in the spinlock:
> > > >
> > > >     BUG: spinlock bad magic on CPU#2, cargo/4129172
> > > >      lock: 0xffff8881410ecdc8, .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1
> > >
> > > Yes. Which is what I assumed while I suggested this. But it complains
> > > about bad magic. It says the magic is 0xdead4ead but this is
> > > SPINLOCK_MAGIC. I was expecting any value but this one.
> > >
> > > > That was followed by this WARN:
> > > >
> > > >     ------------[ cut here ]------------
> > > >     rcuref - imbalanced put()
> > > >     WARNING: CPU: 2 PID: 4129172 at lib/rcuref.c:266 rcuref_put_slowpath+0x55/0x70
> > >
> > > This is "reasonable". If the lock is broken, the remaining memory is
> > > probably garbage anyway. It complains there that the reference put due
> > > to invalid counter.
> > >
> > > …
> > > > The oops after that is from a different task this time, but it just
> > > > looks like slab corruption:
> > > >
> > > …
> > >
> > > The previous complained an invalid free from within the exec.
> > >
> > > > No lock/rcu splats at all.
> > > It exploded before that could happen.
> > >
> > > > > If it still explodes without LTO, would you mind trying gcc?
> > > >
> > > > Will do.
> > >
> > > Thank you.
> > >
> > > > Haven't had much luck isolating what triggers it, but if I run two copies
> > > > of these large build jobs in a loop, it reliably triggers in 6-8 hours.
> > > >
> > > > Just to be clear, I can only trigger this on the one machine. I ran it
> > > > through memtest86+ yesterday and it passed, FWIW, but I'm a little
> > > > suspicious of the hardware right now too. I double checked that
> > > > everything in the BIOS related to power/perf is at factory settings.
> > >
> > > But then it is kind of odd that it happens only with the futex code.
> >
> > I think the missing ingredient was PREEMPT: the 2nd machine has been
> > trying for over a day, but I rebuilt its kernel with PREEMPT_FULL this
> > morning (still llvm), and it just hit a similar oops.
> >
> >     Oops: general protection fault, probably for non-canonical address 0x74656d2f74696750: 0000 [#1] SMP
> >     CPU: 10 UID: 1000 PID: 542469 Comm: cargo Not tainted 6.16.0-rc2-00045-g4663747812d1 #1 PREEMPT
> >     Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
> >     RIP: 0010:futex_hash+0x23/0x90
> >     Code: 1f 84 00 00 00 00 00 41 57 41 56 53 48 89 fb e8 b3 04 fe ff 48 89 df 31 f6 e8 79 00 00 00 48 8b 78 18 49 89 c6 48 85 ff 74 55 <80> 7f 21 00 75 4f f0 83 07 01 79 49 e8 fc 17 37 00 84 c0 75 40 e8
> >     RSP: 0018:ffffc9002e46fcd8 EFLAGS: 00010202
> >     RAX: ffff888a68e25c40 RBX: ffffc9002e46fda0 RCX: 0000000036616534
> >     RDX: 00000000ffffffff RSI: 0000000910180c00 RDI: 74656d2f7469672f
> >     RBP: 00000000000000b0 R08: 000000000318dd0d R09: 000000002e117cb0
> >     R10: 00000000318dd0d0 R11: 000000000000001b R12: 0000000000000000
> >     R13: 000055e79b431170 R14: ffff888a68e25c40 R15: ffff8881ea0ae900
> >     FS:  00007f1b6037b580(0000) GS:ffff8898a528b000(0000) knlGS:0000000000000000
> >     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >     CR2: 0000555830170098 CR3: 0000000d73e93000 CR4: 0000000000350ef0
> >     Call Trace:
> >      <TASK>
> >      futex_wait_setup+0x7e/0x1d0
> >      __futex_wait+0x63/0x120
> >      ? __futex_wake_mark+0x40/0x40
> >      futex_wait+0x5b/0xd0
> >      ? hrtimer_dummy_timeout+0x10/0x10
> >      do_futex+0x86/0x120
> >      __x64_sys_futex+0x10a/0x180
> >      do_syscall_64+0x48/0x4f0
> >      entry_SYSCALL_64_after_hwframe+0x4b/0x53
> >
> > I also enabled DEBUG_PREEMPT, but that didn't print any additional info.
> >
> > I'm testing a GCC kernel on both machines now.
> 
> Machine #2 oopsed with the GCC kernel after just over an hour:
> 
>     BUG: unable to handle page fault for address: ffff88a91eac4458
>     #PF: supervisor read access in kernel mode
>     #PF: error_code(0x0000) - not-present page
>     PGD 4401067 P4D 4401067 PUD 0
>     Oops: Oops: 0000 [#1] SMP
>     CPU: 4 UID: 1000 PID: 881756 Comm: cargo Not tainted 6.16.0-rc2-gcc-00045-g4663747812d1 #1 PREEMPT
>     Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
>     RIP: 0010:futex_hash+0x16/0x90
>     Code: 4d 85 e4 74 99 4c 89 e7 e8 07 51 80 00 eb 8f 0f 1f 44 00 00 41 54 55 48 89 fd 53 e8 14 f2 fd ff 48 89 ef 31 f6 e8 da f6 ff ff <48> 8b 78 18 48 89 c3 48 85 ff 74 0c 80 7f 21 00 75 06 f0 83 07 01
>     RSP: 0018:ffffc9002973fcf8 EFLAGS: 00010282
>     RAX: ffff88a91eac4440 RBX: ffff888d5a170000 RCX: 00000000add26115
>     RDX: 0000001c49080440 RSI: 00000000236034e8 RDI: 00000000f1a67530
>     RBP: ffffc9002973fdb8 R08: 00000000eb13f1af R09: ffffffff829c0fc0
>     R10: 0000000000000246 R11: 0000000000000000 R12: ffff888d5a1700f0
>     R13: ffffc9002973fdb8 R14: ffffc9002973fd70 R15: 0000000000000002
>     FS:  00007f64614ba9c0(0000) GS:ffff888cccceb000(0000) knlGS:0000000000000000
>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>     CR2: ffff88a91eac4458 CR3: 000000015e508000 CR4: 0000000000350ef0
>     Call Trace:
>      <TASK>
>      futex_wait_setup+0x51/0x1b0
>      __futex_wait+0xc0/0x120
>      ? __futex_wake_mark+0x50/0x50
>      futex_wait+0x55/0xe0
>      ? hrtimer_setup_sleeper_on_stack+0x30/0x30
>      do_futex+0x91/0x120
>      __x64_sys_futex+0xfc/0x1d0
>      do_syscall_64+0x44/0x1130
>      entry_SYSCALL_64_after_hwframe+0x4b/0x53
>     RIP: 0033:0x7f64615bd74d
>     Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab c6 0b 00 f7 d8 64 89 01 48
>     RSP: 002b:00007ffea50a6cc8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
>     RAX: ffffffffffffffda RBX: 00007f64615bd730 RCX: 00007f64615bd74d
>     RDX: 0000000000000080 RSI: 0000000000000089 RDI: 000055bb7e399d90
>     RBP: 00007ffea50a6d20 R08: 0000000000000000 R09: 00007ffeffffffff
>     R10: 00007ffea50a6ce0 R11: 0000000000000246 R12: 000000001dcd6401
>     R13: 00007f64614e3710 R14: 000055bb7e399d90 R15: 0000000000000080
>      </TASK>
>     CR2: ffff88a91eac4458
>     ---[ end trace 0000000000000000 ]---
> 
> Two CPUs oopsed at once with that same stack, the config and vmlinux are
> uploaded in the git (https://github.com/jcalvinowens/lkml-debug-616).
> 
> I tried reproducing with DEBUG_PAGEALLOC, but the bug doesn't happen
> with it turned on.

I've been rotating through debug options one at a time, I've reproduced
the oops with the following which yielded no additional console output:

    * DEBUG_VM
    * PAGE_POISONING (and page_poison=1)
    * DEBUG_ATOMIC_SLEEP
    * DEBUG_PREEMPT

(No poison patterns showed up at all in the oops traces either.)

I am not able to reproduce the oops at all with these options:

    * DEBUG_PAGEALLOC_ENABLE_DEFAULT
    * SLUB_DEBUG_ON

I'm also experimenting with stress-ng as a reproducer, no luck so far.

A third machine with an older Skylake CPU died overnight, but nothing
was logged over netconsole. Luckily it actually has a serial header on
the motherboard, so that's wired up and it's running again, maybe it
dies in a different way that might be a better clue...

> > Thanks,
> > Calvin
> >
> > > Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-19 14:07:30 [-0700], Calvin Owens wrote:
> > Machine #2 oopsed with the GCC kernel after just over an hour:
> > 
> >     BUG: unable to handle page fault for address: ffff88a91eac4458
> >     RIP: 0010:futex_hash+0x16/0x90
…
> >     Call Trace:
> >      <TASK>
> >      futex_wait_setup+0x51/0x1b0
…

The futex_hash_bucket pointer has an invalid ->priv pointer.
This could be use-after-free or double-free. I've been looking through
your config and you don't have CONFIG_SLAB_FREELIST_* set. I don't
remember which one but one of the two has a "primitiv" double free
detection. 

…
> I am not able to reproduce the oops at all with these options:
> 
>     * DEBUG_PAGEALLOC_ENABLE_DEFAULT
>     * SLUB_DEBUG_ON

SLUB_DEBUG_ON is something that would "reliably" notice double free.
If you drop SLUB_DEBUG_ON (but keep SLUB_DEBUG) then you can boot with
slab_debug=f keeping only the consistency checks. The "poison" checks
would be excluded for instance. That allocation is kvzalloc() but it
should be small on your machine to avoid vmalloc() and use only
kmalloc().

> I'm also experimenting with stress-ng as a reproducer, no luck so far.

Not sure what you are using there. I think cargo does:
- lock/ unlock in a threads
- create new thread which triggers auto-resize
- auto-resize gets delayed due to lock/ unlock in other threads (the
  reference is held)

And now something happens leading to what we see.
_Maybe_ the cargo application terminates/ execs before the new struct is
assigned in an unexpected way.
The regular hash bucket has reference counting so it should raise
warnings if it goes wrong. I haven't seen those.

> A third machine with an older Skylake CPU died overnight, but nothing
> was logged over netconsole. Luckily it actually has a serial header on
> the motherboard, so that's wired up and it's running again, maybe it
> dies in a different way that might be a better clue...

So far I *think* that cargo does something that I don't expect and this
leads to a memory double-free. The SLUB_DEBUG_ON hopefully delays the
process long enough that the double free does not trigger.

I think I'm going to look for a random rust packet that is using cargo
for building (unless you have a recommendation) and look what it is
doing. It was always cargo after all. Maybe this brings some light.
 
> > > Thanks,
> > > Calvin

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Friday 06/20 at 12:31 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-19 14:07:30 [-0700], Calvin Owens wrote:
> > > Machine #2 oopsed with the GCC kernel after just over an hour:
> > > 
> > >     BUG: unable to handle page fault for address: ffff88a91eac4458
> > >     RIP: 0010:futex_hash+0x16/0x90
> …
> > >     Call Trace:
> > >      <TASK>
> > >      futex_wait_setup+0x51/0x1b0
> …
> 
> The futex_hash_bucket pointer has an invalid ->priv pointer.
> This could be use-after-free or double-free. I've been looking through
> your config and you don't have CONFIG_SLAB_FREELIST_* set. I don't
> remember which one but one of the two has a "primitiv" double free
> detection. 
> 
> …
> > I am not able to reproduce the oops at all with these options:
> > 
> >     * DEBUG_PAGEALLOC_ENABLE_DEFAULT
> >     * SLUB_DEBUG_ON
> 
> SLUB_DEBUG_ON is something that would "reliably" notice double free.
> If you drop SLUB_DEBUG_ON (but keep SLUB_DEBUG) then you can boot with
> slab_debug=f keeping only the consistency checks. The "poison" checks
> would be excluded for instance. That allocation is kvzalloc() but it
> should be small on your machine to avoid vmalloc() and use only
> kmalloc().

I'll try slab_debug=f next.

> > I'm also experimenting with stress-ng as a reproducer, no luck so far.
> 
> Not sure what you are using there. I think cargo does:
> - lock/ unlock in a threads
> - create new thread which triggers auto-resize
> - auto-resize gets delayed due to lock/ unlock in other threads (the
>   reference is held)

I've tried various combinations of --io, --fork, --exec, --futex, --cpu,
--vm, and --forkheavy. It's not mixing the operations in threads as I
understand it, so I guess it won't ever do anything like what you're
describing no matter what stressors I run?

I did get this message once, something I haven't seen before:

    [33024.247423] [    T281] sched: DL replenish lagged too much

...but maybe that's my fault for overloading it so much.

> And now something happens leading to what we see.
> _Maybe_ the cargo application terminates/ execs before the new struct is
> assigned in an unexpected way.
> The regular hash bucket has reference counting so it should raise
> warnings if it goes wrong. I haven't seen those.
> 
> > A third machine with an older Skylake CPU died overnight, but nothing
> > was logged over netconsole. Luckily it actually has a serial header on
> > the motherboard, so that's wired up and it's running again, maybe it
> > dies in a different way that might be a better clue...
> 
> So far I *think* that cargo does something that I don't expect and this
> leads to a memory double-free. The SLUB_DEBUG_ON hopefully delays the
> process long enough that the double free does not trigger.
> 
> I think I'm going to look for a random rust packet that is using cargo
> for building (unless you have a recommendation) and look what it is
> doing. It was always cargo after all. Maybe this brings some light.

The list of things in my big build that use cargo is pretty short:

    === Dependendency Snapshot ===
    Dep    =mc:house:cargo-native.do_install
    Package=mc:house:cargo-native.do_populate_sysroot
    RDep   =mc:house:cargo-c-native.do_prepare_recipe_sysroot
            mc:house:cargo-native.do_create_spdx
            mc:house:cbindgen-native.do_prepare_recipe_sysroot
            mc:house:librsvg-native.do_prepare_recipe_sysroot
            mc:house:librsvg.do_prepare_recipe_sysroot
            mc:house:libstd-rs.do_prepare_recipe_sysroot
            mc:house:python3-maturin-native.do_prepare_recipe_sysroot
            mc:house:python3-maturin-native.do_populate_sysroot
            mc:house:python3-rpds-py.do_prepare_recipe_sysroot
            mc:house:python3-setuptools-rust-native.do_prepare_recipe_sysroot

I've tried building each of those targets alone (and all of them
together) in a loop, but that hasn't triggered anything. I guess that
other concurrent builds are necessary to trigger whatever this is.

I tried using stress-ng --vm and --cpu together to "load up" the machine
while running the isolated targets, but that hasn't worked either.

If you want to run *exactly* what I am, clone this unholy mess:

    https://github.com/jcalvinowens/meta-house

...setup for yocto and install kas as described here:

    https://docs.yoctoproject.org/ref-manual/system-requirements.html#ubuntu-and-debian
    https://github.com/jcalvinowens/meta-house/blob/6f6a9c643169fc37ba809f7230261d0e5255b6d7/README.md#kas

...and run (for the 32-thread machine):

    BB_NUMBER_THREADS="48" PARALLEL_MAKE="-j 36" kas build kas/walnascar.yaml -- -k

Fair warning, it needs a *lot* of RAM at the high concurrency, I have
96GB with 128GB of swap to spill into. It needs ~500GB of disk space if
it runs to completion and downloads ~15GB of tarballs when it starts.

Annoyingly it won't work if the system compiler is gcc-15 right now (the
verison of glib it has won't build, haven't had a chance to fix it yet).

> > > > Thanks,
> > > > Calvin
> 
> Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Friday 06/20 at 11:56 -0700, Calvin Owens wrote:
> On Friday 06/20 at 12:31 +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-06-19 14:07:30 [-0700], Calvin Owens wrote:
> > > > Machine #2 oopsed with the GCC kernel after just over an hour:
> > > > 
> > > >     BUG: unable to handle page fault for address: ffff88a91eac4458
> > > >     RIP: 0010:futex_hash+0x16/0x90
> > …
> > > >     Call Trace:
> > > >      <TASK>
> > > >      futex_wait_setup+0x51/0x1b0
> > …
> > 
> > The futex_hash_bucket pointer has an invalid ->priv pointer.
> > This could be use-after-free or double-free. I've been looking through
> > your config and you don't have CONFIG_SLAB_FREELIST_* set. I don't
> > remember which one but one of the two has a "primitiv" double free
> > detection. 
> > 
> > …
> > > I am not able to reproduce the oops at all with these options:
> > > 
> > >     * DEBUG_PAGEALLOC_ENABLE_DEFAULT
> > >     * SLUB_DEBUG_ON
> > 
> > SLUB_DEBUG_ON is something that would "reliably" notice double free.
> > If you drop SLUB_DEBUG_ON (but keep SLUB_DEBUG) then you can boot with
> > slab_debug=f keeping only the consistency checks. The "poison" checks
> > would be excluded for instance. That allocation is kvzalloc() but it
> > should be small on your machine to avoid vmalloc() and use only
> > kmalloc().
> 
> I'll try slab_debug=f next.

I just hit the oops with SLUB_DEBUG and slab_debug=f, but nothing new
was logged.

> > > I'm also experimenting with stress-ng as a reproducer, no luck so far.
> > 
> > Not sure what you are using there. I think cargo does:
> > - lock/ unlock in a threads
> > - create new thread which triggers auto-resize
> > - auto-resize gets delayed due to lock/ unlock in other threads (the
> >   reference is held)
> 
> I've tried various combinations of --io, --fork, --exec, --futex, --cpu,
> --vm, and --forkheavy. It's not mixing the operations in threads as I
> understand it, so I guess it won't ever do anything like what you're
> describing no matter what stressors I run?
> 
> I did get this message once, something I haven't seen before:
> 
>     [33024.247423] [    T281] sched: DL replenish lagged too much
> 
> ...but maybe that's my fault for overloading it so much.
> 
> > And now something happens leading to what we see.
> > _Maybe_ the cargo application terminates/ execs before the new struct is
> > assigned in an unexpected way.
> > The regular hash bucket has reference counting so it should raise
> > warnings if it goes wrong. I haven't seen those.
> > 
> > > A third machine with an older Skylake CPU died overnight, but nothing
> > > was logged over netconsole. Luckily it actually has a serial header on
> > > the motherboard, so that's wired up and it's running again, maybe it
> > > dies in a different way that might be a better clue...
> > 
> > So far I *think* that cargo does something that I don't expect and this
> > leads to a memory double-free. The SLUB_DEBUG_ON hopefully delays the
> > process long enough that the double free does not trigger.
> > 
> > I think I'm going to look for a random rust packet that is using cargo
> > for building (unless you have a recommendation) and look what it is
> > doing. It was always cargo after all. Maybe this brings some light.
> 
> The list of things in my big build that use cargo is pretty short:
> 
>     === Dependendency Snapshot ===
>     Dep    =mc:house:cargo-native.do_install
>     Package=mc:house:cargo-native.do_populate_sysroot
>     RDep   =mc:house:cargo-c-native.do_prepare_recipe_sysroot
>             mc:house:cargo-native.do_create_spdx
>             mc:house:cbindgen-native.do_prepare_recipe_sysroot
>             mc:house:librsvg-native.do_prepare_recipe_sysroot
>             mc:house:librsvg.do_prepare_recipe_sysroot
>             mc:house:libstd-rs.do_prepare_recipe_sysroot
>             mc:house:python3-maturin-native.do_prepare_recipe_sysroot
>             mc:house:python3-maturin-native.do_populate_sysroot
>             mc:house:python3-rpds-py.do_prepare_recipe_sysroot
>             mc:house:python3-setuptools-rust-native.do_prepare_recipe_sysroot
> 
> I've tried building each of those targets alone (and all of them
> together) in a loop, but that hasn't triggered anything. I guess that
> other concurrent builds are necessary to trigger whatever this is.
> 
> I tried using stress-ng --vm and --cpu together to "load up" the machine
> while running the isolated targets, but that hasn't worked either.
> 
> If you want to run *exactly* what I am, clone this unholy mess:
> 
>     https://github.com/jcalvinowens/meta-house
> 
> ...setup for yocto and install kas as described here:
> 
>     https://docs.yoctoproject.org/ref-manual/system-requirements.html#ubuntu-and-debian
>     https://github.com/jcalvinowens/meta-house/blob/6f6a9c643169fc37ba809f7230261d0e5255b6d7/README.md#kas
> 
> ...and run (for the 32-thread machine):
> 
>     BB_NUMBER_THREADS="48" PARALLEL_MAKE="-j 36" kas build kas/walnascar.yaml -- -k
> 
> Fair warning, it needs a *lot* of RAM at the high concurrency, I have
> 96GB with 128GB of swap to spill into. It needs ~500GB of disk space if
> it runs to completion and downloads ~15GB of tarballs when it starts.
> 
> Annoyingly it won't work if the system compiler is gcc-15 right now (the
> verison of glib it has won't build, haven't had a chance to fix it yet).
> 
> > > > > Thanks,
> > > > > Calvin
> > 
> > Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Friday 06/20 at 18:02 -0700, Calvin Owens wrote:
> On Friday 06/20 at 11:56 -0700, Calvin Owens wrote:
> > On Friday 06/20 at 12:31 +0200, Sebastian Andrzej Siewior wrote:
> > > On 2025-06-19 14:07:30 [-0700], Calvin Owens wrote:
> > > > > Machine #2 oopsed with the GCC kernel after just over an hour:
> > > > > 
> > > > >     BUG: unable to handle page fault for address: ffff88a91eac4458
> > > > >     RIP: 0010:futex_hash+0x16/0x90
> > > …
> > > > >     Call Trace:
> > > > >      <TASK>
> > > > >      futex_wait_setup+0x51/0x1b0
> > > …
> > > 
> > > The futex_hash_bucket pointer has an invalid ->priv pointer.
> > > This could be use-after-free or double-free. I've been looking through
> > > your config and you don't have CONFIG_SLAB_FREELIST_* set. I don't
> > > remember which one but one of the two has a "primitiv" double free
> > > detection. 
> > > 
> > > …
> > > > I am not able to reproduce the oops at all with these options:
> > > > 
> > > >     * DEBUG_PAGEALLOC_ENABLE_DEFAULT
> > > >     * SLUB_DEBUG_ON
> > > 
> > > SLUB_DEBUG_ON is something that would "reliably" notice double free.
> > > If you drop SLUB_DEBUG_ON (but keep SLUB_DEBUG) then you can boot with
> > > slab_debug=f keeping only the consistency checks. The "poison" checks
> > > would be excluded for instance. That allocation is kvzalloc() but it
> > > should be small on your machine to avoid vmalloc() and use only
> > > kmalloc().
> > 
> > I'll try slab_debug=f next.
> 
> I just hit the oops with SLUB_DEBUG and slab_debug=f, but nothing new
> was logged.

I went back to the original GCC config, and set up yocto to log what it
was doing over /dev/kmsg so maybe we can isolate the trigger.

I got a novel oops this time:

    BUG: kernel NULL pointer dereference, address: 0000000000000000
    #PF: supervisor read access in kernel mode
    #PF: error_code(0x0000) - not-present page
    PGD 0 P4D 0 
    Oops: Oops: 0000 [#1] SMP
    CPU: 6 UID: 0 PID: 12 Comm: kworker/u128:0 Not tainted 6.16.0-rc2-gcc-00269-g11313e2f7812 #1 PREEMPT 
    Hardware name: Gigabyte Technology Co., Ltd. A620I AX/A620I AX, BIOS F3 07/10/2023
    Workqueue: netns cleanup_net
    RIP: 0010:default_device_exit_batch+0xd0/0x2f0
    Code: 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 0f 1f 44 00 00 <49> 8b 94 24 40 01 00 00 4c 89 e5 49 8d 84 24 40 01 00 00 48 39 04
    RSP: 0018:ffffc900001c7d58 EFLAGS: 00010202
    RAX: ffff888f1bacc140 RBX: ffffc900001c7e18 RCX: 0000000000000002
    RDX: ffff888165232930 RSI: 0000000000000000 RDI: ffffffff82a00820
    RBP: ffff888f1bacc000 R08: 0000036dae5dbcdb R09: ffff8881038c5300
    R10: 000000000000036e R11: 0000000000000001 R12: fffffffffffffec0
    R13: dead000000000122 R14: dead000000000100 R15: ffffc900001c7dd0
    FS:  0000000000000000(0000) GS:ffff888cccd6b000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000000000000 CR3: 0000000a414f4000 CR4: 0000000000350ef0
    Call Trace:
     <TASK>
     ops_undo_list+0xd9/0x1e0
     cleanup_net+0x1b2/0x2c0
     process_one_work+0x148/0x240
     worker_thread+0x2d7/0x410
     ? rescuer_thread+0x500/0x500
     kthread+0xd5/0x1e0
     ? kthread_queue_delayed_work+0x70/0x70
     ret_from_fork+0xa0/0xe0
     ? kthread_queue_delayed_work+0x70/0x70
     ? kthread_queue_delayed_work+0x70/0x70
     ret_from_fork_asm+0x11/0x20
     </TASK>
    CR2: 0000000000000000
    ---[ end trace 0000000000000000 ]---
    2025-06-20 23:47:28 - INFO     - ##teamcity[message text='recipe libaio-0.3.113-r0: task do_populate_sysroot: Succeeded' status='NORMAL']
    2025-06-20 23:47:28 - ERROR    - ##teamcity[message text='recipe libaio-0.3.113-r0: task do_populate_sysroot: Succeeded' status='NORMAL']
    RIP: 0010:default_device_exit_batch+0xd0/0x2f0
    Code: 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 66 0f 1f 44 00 00 <49> 8b 94 24 40 01 00 00 4c 89 e5 49 8d 84 24 40 01 00 00 48 39 04
    RSP: 0018:ffffc900001c7d58 EFLAGS: 00010202
    RAX: ffff888f1bacc140 RBX: ffffc900001c7e18 RCX: 0000000000000002
    RDX: ffff888165232930 RSI: 0000000000000000 RDI: ffffffff82a00820
    RBP: ffff888f1bacc000 R08: 0000036dae5dbcdb R09: ffff8881038c5300
    R10: 000000000000036e R11: 0000000000000001 R12: fffffffffffffec0
    R13: dead000000000122 R14: dead000000000100 R15: ffffc900001c7dd0
    FS:  0000000000000000(0000) GS:ffff888cccd6b000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000000000000 CR3: 000000000361a000 CR4: 0000000000350ef0
    Kernel panic - not syncing: Fatal exception
    Kernel Offset: disabled
    ---[ end Kernel panic - not syncing: Fatal exception ]---

Based on subtracting the set of things that had completed do_compile from
the set of things that started, it was building:

    clang-native, duktape, linux-upstream, nodejs-native, and zstd

...when it oopsed. The whole 5MB log is in "new-different-oops.txt".

> > > > I'm also experimenting with stress-ng as a reproducer, no luck so far.
> > > 
> > > Not sure what you are using there. I think cargo does:
> > > - lock/ unlock in a threads
> > > - create new thread which triggers auto-resize
> > > - auto-resize gets delayed due to lock/ unlock in other threads (the
> > >   reference is held)
> > 
> > I've tried various combinations of --io, --fork, --exec, --futex, --cpu,
> > --vm, and --forkheavy. It's not mixing the operations in threads as I
> > understand it, so I guess it won't ever do anything like what you're
> > describing no matter what stressors I run?
> > 
> > I did get this message once, something I haven't seen before:
> > 
> >     [33024.247423] [    T281] sched: DL replenish lagged too much
> > 
> > ...but maybe that's my fault for overloading it so much.
> > 
> > > And now something happens leading to what we see.
> > > _Maybe_ the cargo application terminates/ execs before the new struct is
> > > assigned in an unexpected way.
> > > The regular hash bucket has reference counting so it should raise
> > > warnings if it goes wrong. I haven't seen those.
> > > 
> > > > A third machine with an older Skylake CPU died overnight, but nothing
> > > > was logged over netconsole. Luckily it actually has a serial header on
> > > > the motherboard, so that's wired up and it's running again, maybe it
> > > > dies in a different way that might be a better clue...
> > > 
> > > So far I *think* that cargo does something that I don't expect and this
> > > leads to a memory double-free. The SLUB_DEBUG_ON hopefully delays the
> > > process long enough that the double free does not trigger.
> > > 
> > > I think I'm going to look for a random rust packet that is using cargo
> > > for building (unless you have a recommendation) and look what it is
> > > doing. It was always cargo after all. Maybe this brings some light.
> > 
> > The list of things in my big build that use cargo is pretty short:
> > 
> >     === Dependendency Snapshot ===
> >     Dep    =mc:house:cargo-native.do_install
> >     Package=mc:house:cargo-native.do_populate_sysroot
> >     RDep   =mc:house:cargo-c-native.do_prepare_recipe_sysroot
> >             mc:house:cargo-native.do_create_spdx
> >             mc:house:cbindgen-native.do_prepare_recipe_sysroot
> >             mc:house:librsvg-native.do_prepare_recipe_sysroot
> >             mc:house:librsvg.do_prepare_recipe_sysroot
> >             mc:house:libstd-rs.do_prepare_recipe_sysroot
> >             mc:house:python3-maturin-native.do_prepare_recipe_sysroot
> >             mc:house:python3-maturin-native.do_populate_sysroot
> >             mc:house:python3-rpds-py.do_prepare_recipe_sysroot
> >             mc:house:python3-setuptools-rust-native.do_prepare_recipe_sysroot
> > 
> > I've tried building each of those targets alone (and all of them
> > together) in a loop, but that hasn't triggered anything. I guess that
> > other concurrent builds are necessary to trigger whatever this is.
> > 
> > I tried using stress-ng --vm and --cpu together to "load up" the machine
> > while running the isolated targets, but that hasn't worked either.
> > 
> > If you want to run *exactly* what I am, clone this unholy mess:
> > 
> >     https://github.com/jcalvinowens/meta-house
> > 
> > ...setup for yocto and install kas as described here:
> > 
> >     https://docs.yoctoproject.org/ref-manual/system-requirements.html#ubuntu-and-debian
> >     https://github.com/jcalvinowens/meta-house/blob/6f6a9c643169fc37ba809f7230261d0e5255b6d7/README.md#kas
> > 
> > ...and run (for the 32-thread machine):
> > 
> >     BB_NUMBER_THREADS="48" PARALLEL_MAKE="-j 36" kas build kas/walnascar.yaml -- -k
> > 
> > Fair warning, it needs a *lot* of RAM at the high concurrency, I have
> > 96GB with 128GB of swap to spill into. It needs ~500GB of disk space if
> > it runs to completion and downloads ~15GB of tarballs when it starts.
> > 
> > Annoyingly it won't work if the system compiler is gcc-15 right now (the
> > verison of glib it has won't build, haven't had a chance to fix it yet).
> > 
> > > > > > Thanks,
> > > > > > Calvin
> > > 
> > > Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-21 00:24:14 [-0700], Calvin Owens wrote:
> 
> I went back to the original GCC config, and set up yocto to log what it
> was doing over /dev/kmsg so maybe we can isolate the trigger.
> 
> I got a novel oops this time:

I think I got it:

Could you please try this:

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 005b040c4791b..b37193653e6b5 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm);
 static inline void futex_mm_init(struct mm_struct *mm)
 {
 	RCU_INIT_POINTER(mm->futex_phash, NULL);
+	mm->futex_phash_new = NULL;
 	mutex_init(&mm->futex_hash_lock);
 }
 

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 5 months, 4 weeks ago
On Saturday 06/21 at 23:01 +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-21 00:24:14 [-0700], Calvin Owens wrote:
> > 
> > I went back to the original GCC config, and set up yocto to log what it
> > was doing over /dev/kmsg so maybe we can isolate the trigger.
> > 
> > I got a novel oops this time:
> 
> I think I got it:
> 
> Could you please try this:

That did it!

Tested-By: Calvin Owens <calvin@wbinvd.org>

This was a fun little diversion, thanks :)

> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 005b040c4791b..b37193653e6b5 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm);
>  static inline void futex_mm_init(struct mm_struct *mm)
>  {
>  	RCU_INIT_POINTER(mm->futex_phash, NULL);
> +	mm->futex_phash_new = NULL;
>  	mutex_init(&mm->futex_hash_lock);
>  }
>  
> 
> Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Calvin Owens 6 months ago
On Tuesday 06/17 at 09:11 -0700, Calvin Owens wrote:
> On Tuesday 06/17 at 11:50 +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-06-17 02:23:08 [-0700], Calvin Owens wrote:
> > > Ugh, I'm sorry, I was in too much of a hurry this morning... cargo is
> > > obviously not calling PR_FUTEX_HASH which is new in 6.16 :/
> > No worries.
> >
> > > > This is with LTO enabled.
> > >
> > > Full lto with llvm-20.1.7.
> > >
> > …
> > > Nothing showed up in the logs but the RCU stalls on CPU16, always in
> > > queued_spin_lock_slowpath().
> > >
> > > I'll run the build it was doing when it happened in a loop overnight and
> > > see if I can trigger it again.
>
> Actually got an oops this time:
>
> <snip>
>
> This is a giant Yocto build, but the comm is always cargo, so hopefully
> I can run those bits in isolation and hit it more quickly.
>
> > Please check if you can reproduce it and if so if it also happens
> > without lto.

It takes longer with LTO disabled, but I'm still seeing some crashes.

First this WARN:

    ------------[ cut here ]------------
    WARNING: CPU: 2 PID: 1866190 at mm/slub.c:4753 free_large_kmalloc+0xa5/0xc0
    CPU: 2 UID: 1000 PID: 1866190 Comm: python3 Not tainted 6.16.0-rc2-nolto-00024-g9afe652958c3 #1 PREEMPT 
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    RIP: 0010:free_large_kmalloc+0xa5/0xc0
    Code: 02 00 00 74 01 fb 83 7b 30 ff 74 07 c7 43 30 ff ff ff ff f0 ff 4b 34 75 08 48 89 df e8 84 dd f9 ff 48 83 c4 08 5b 41 5e 5d c3 <0f> 0b 48 89 df 48 c7 c6 46 92 f5 82 48 83 c4 08 5b 41 5e 5d e9 42
    RSP: 0018:ffffc90024d67ce8 EFLAGS: 00010206
    RAX: 00000000ff000000 RBX: ffffea00051d5700 RCX: ffffea00042f2208
    RDX: 0000000000053a55 RSI: ffff88814755c000 RDI: ffffea00051d5700
    RBP: 0000000000000000 R08: fffffffffffdfce5 R09: ffffffff83d52928
    R10: ffffea00047ae080 R11: 0000000000000003 R12: ffff8882cae5cd00
    R13: ffff88819bb19c08 R14: ffff88819bb194c0 R15: ffff8883a24df900
    FS:  0000000000000000(0000) GS:ffff88909bf54000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000055842ea1e3f0 CR3: 0000000d82b9d000 CR4: 0000000000750ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     futex_hash_free+0x10/0x40
     __mmput+0xb4/0xd0
     exec_mmap+0x1e2/0x210
     begin_new_exec+0x491/0x6c0
     load_elf_binary+0x25d/0x1050
     ? load_misc_binary+0x19a/0x2d0
     bprm_execve+0x1d5/0x370
     do_execveat_common+0x29e/0x300
     __x64_sys_execve+0x33/0x40
     do_syscall_64+0x48/0xfb0
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7fd8ec8e7dd7
    Code: Unable to access opcode bytes at 0x7fd8ec8e7dad.
    RSP: 002b:00007fd8adff9e88 EFLAGS: 00000206 ORIG_RAX: 000000000000003b
    RAX: ffffffffffffffda RBX: 00007fd8adffb6c0 RCX: 00007fd8ec8e7dd7
    RDX: 000055842ed3ce60 RSI: 00007fd8eaea3870 RDI: 00007fd8eae87940
    RBP: 00007fd8adff9e90 R08: 00000000ffffffff R09: 0000000000000000
    R10: 0000000000000008 R11: 0000000000000206 R12: 00007fd8ed12da28
    R13: 00007fd8eae87940 R14: 00007fd8eaea3870 R15: 0000000000000001
     </TASK>
    ---[ end trace 0000000000000000 ]---
    page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x13d1507b pfn:0x14755c
    flags: 0x2000000000000000(node=0|zone=1)
    raw: 2000000000000000 ffffea00042f2208 ffff88901fd66b00 0000000000000000
    raw: 0000000013d1507b 0000000000000000 00000000ffffffff 0000000000000000
    page dumped because: Not a kmalloc allocation
    page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x13d1507b pfn:0x14755c
    flags: 0x2000000000000000(node=0|zone=1)
    raw: 2000000000000000 ffffea00042f2208 ffff88901fd66b00 0000000000000000
    raw: 0000000013d1507b 0000000000000000 00000000ffffffff 0000000000000000
    page dumped because: Not a kmalloc allocation

...and then it oopsed (same stack as my last mail) about twenty minutes
later when I hit Ctrl+C to stop the build:

    BUG: unable to handle page fault for address: 00000008849281a9
    #PF: supervisor write access in kernel mode
    #PF: error_code(0x0002) - not-present page
    PGD 0 P4D 0 
    Oops: Oops: 0002 [#1] SMP
    CPU: 13 UID: 1000 PID: 1864338 Comm: python3 Tainted: G        W           6.16.0-rc2-nolto-00024-g9afe652958c3 #1 PREEMPT 
    Tainted: [W]=WARN
    Hardware name: ASRock B850 Pro-A/B850 Pro-A, BIOS 3.11 11/12/2024
    RIP: 0010:queued_spin_lock_slowpath+0x112/0x1a0
    Code: c8 c1 e8 10 66 87 47 02 66 85 c0 74 40 0f b7 c0 49 c7 c0 f8 ff ff ff 89 c6 c1 ee 02 83 e0 03 49 8b b4 f0 40 8b 06 83 c1 e0 04 <48> 89 94 30 00 12 d5 83 83 7a 08 00 75 08 f3 90 83 7a 08 00 74 f8
    RSP: 0018:ffffc9002b35fd20 EFLAGS: 00010212
    RAX: 0000000000000020 RBX: ffffc9002b35fd50 RCX: 0000000000380000
    RDX: ffff88901fde5200 RSI: 0000000900bd6f89 RDI: ffff88814755d204
    RBP: 0000000000000000 R08: fffffffffffffff8 R09: 00000000002ab900
    R10: 0000000000000065 R11: 0000000000001000 R12: ffff88906c343e40
    R13: ffffc9002b35fd50 R14: ffff88814755d204 R15: 00007fd8eb6feac0
    FS:  00007fd8eb6ff6c0(0000) GS:ffff88909c094000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00000008849281a9 CR3: 0000001fcf611000 CR4: 0000000000750ef0
    PKRU: 55555554
    Call Trace:
     <TASK>
     futex_unqueue+0x21/0x90
     __futex_wait+0xb7/0x120
     ? __futex_wake_mark+0x40/0x40
     futex_wait+0x5b/0xd0
     do_futex+0x86/0x120
     __se_sys_futex+0x10d/0x180
     do_syscall_64+0x48/0xfb0
     entry_SYSCALL_64_after_hwframe+0x4b/0x53
    RIP: 0033:0x7fd8ec8a49ee
    Code: 08 0f 85 f5 4b ff ff 49 89 fb 48 89 f0 48 89 d7 48 89 ce 4c 89 c2 4d 89 ca 4c 8b 44 24 08 4c 8b 4c 24 10 4c 89 5c 24 08 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 80 00 00 00 00 48 83 ec 08
    RSP: 002b:00007fd8eb6fe9b8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: ffffffffffffffda RBX: 00007fd8eb6ff6c0 RCX: 00007fd8ec8a49ee
    RDX: 0000000000000000 RSI: 0000000000000189 RDI: 00007fd8eb6feac0
    RBP: 0000000000000000 R08: 0000000000000000 R09: 00000000ffffffff
    R10: 0000000000000000 R11: 0000000000000246 R12: 00007fd8eb6fea00
    R13: 0000000000001de0 R14: 00007fd8ececa240 R15: 00000000000000ef
     </TASK>
    CR2: 00000008849281a9
    ---[ end trace 0000000000000000 ]---

I enabled lockdep and I've got it running again.

I set up a little git repo with a copy of all the traces so far, and the
kconfigs I'm running:

    https://github.com/jcalvinowens/lkml-debug-616

...and I pushed the actual vmlinux binaries here:

    https://github.com/jcalvinowens/lkml-debug-616/releases/tag/20250617

There were some block warnings on another machine running the same
workload, but of course they aren't necessarily related.

> > I have no idea why one spinlock_t remains locked. It is either locked or
> > some stray memory.
> > Oh. Lockdep adds quite some overhead but it should complain that a
> > spinlock_t is still locked while returning to userland.
> 
> I'll report back when I've tried :)
> 
> I'll also try some of the mm debug configs.
> 
> Thanks,
> Calvin
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months ago
On 2025-06-17 19:15:37 [-0700], Calvin Owens wrote:
> It takes longer with LTO disabled, but I'm still seeing some crashes.
> 
> First this WARN:
> 
>     ------------[ cut here ]------------
>     WARNING: CPU: 2 PID: 1866190 at mm/slub.c:4753 free_large_kmalloc+0xa5/0xc0
>     CPU: 2 UID: 1000 PID: 1866190 Comm: python3 Not tainted 6.16.0-rc2-nolto-00024-g9afe652958c3 #1 PREEMPT 
…
>     RIP: 0010:free_large_kmalloc+0xa5/0xc0
…
>     Call Trace:
>      <TASK>
>      futex_hash_free+0x10/0x40
This points me to kernel/futex/core.c:1535, which is futex_phash_new.
Thanks for the provided vmlinux.
This is odd. The assignment happens only under &mm->futex_hash_lock and
it a bad pointer. The kvmalloc() pointer is stored there and only
remains there if a rehash did not happen before the task ended.

>      __mmput+0xb4/0xd0
>      exec_mmap+0x1e2/0x210
>      begin_new_exec+0x491/0x6c0
>      load_elf_binary+0x25d/0x1050
…
> ...and then it oopsed (same stack as my last mail) about twenty minutes
> later when I hit Ctrl+C to stop the build:
> 
…
> I enabled lockdep and I've got it running again.
> 
> I set up a little git repo with a copy of all the traces so far, and the
> kconfigs I'm running:
> 
>     https://github.com/jcalvinowens/lkml-debug-616
> 
> ...and I pushed the actual vmlinux binaries here:
> 
>     https://github.com/jcalvinowens/lkml-debug-616/releases/tag/20250617
> 
> There were some block warnings on another machine running the same
> workload, but of course they aren't necessarily related.

I have no explanation so far.

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months, 1 week ago
On 2025-06-11 14:39:16 [-0000], tip-bot2 for Sebastian Andrzej Siewior wrote:
> The following commit has been merged into the locking/urgent branch of tip:
> 
> Commit-ID:     703b5f31aee5bda47868c09a3522a78823c1bb77
> Gitweb:        https://git.kernel.org/tip/703b5f31aee5bda47868c09a3522a78823c1bb77
> Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> AuthorDate:    Mon, 02 Jun 2025 13:00:27 +02:00
> Committer:     Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Wed, 11 Jun 2025 16:26:44 +02:00
> 
> futex: Allow to resize the private local hash
> 
> Once the global hash is requested there is no way back to switch back to
> the per-task private hash. This is checked at the begin of the function.
> 
> It is possible that two threads simultaneously request the global hash
> and both pass the initial check and block later on the
> mm::futex_hash_lock. In this case the first thread performs the switch
> to the global hash. The second thread will also attempt to switch to the
> global hash and while doing so, accessing the nonexisting slot 1 of the
> struct futex_private_hash.
> This has been reported by Yi Lai.
> 
> Verify under mm_struct::futex_phash that the global hash is not in use.

Could you please replace it with
	https://lore.kernel.org/all/20250610104400.1077266-5-bigeasy@linutronix.de/

It also looks like the subject from commit bd54df5ea7cad ("futex: Allow
to resize the private local hash")

Sebastian
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Peter Zijlstra 6 months, 1 week ago
On Wed, Jun 11, 2025 at 04:43:02PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-11 14:39:16 [-0000], tip-bot2 for Sebastian Andrzej Siewior wrote:
> > The following commit has been merged into the locking/urgent branch of tip:
> > 
> > Commit-ID:     703b5f31aee5bda47868c09a3522a78823c1bb77
> > Gitweb:        https://git.kernel.org/tip/703b5f31aee5bda47868c09a3522a78823c1bb77
> > Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > AuthorDate:    Mon, 02 Jun 2025 13:00:27 +02:00
> > Committer:     Peter Zijlstra <peterz@infradead.org>
> > CommitterDate: Wed, 11 Jun 2025 16:26:44 +02:00
> > 
> > futex: Allow to resize the private local hash
> > 
> > Once the global hash is requested there is no way back to switch back to
> > the per-task private hash. This is checked at the begin of the function.
> > 
> > It is possible that two threads simultaneously request the global hash
> > and both pass the initial check and block later on the
> > mm::futex_hash_lock. In this case the first thread performs the switch
> > to the global hash. The second thread will also attempt to switch to the
> > global hash and while doing so, accessing the nonexisting slot 1 of the
> > struct futex_private_hash.
> > This has been reported by Yi Lai.
> > 
> > Verify under mm_struct::futex_phash that the global hash is not in use.
> 
> Could you please replace it with
> 	https://lore.kernel.org/all/20250610104400.1077266-5-bigeasy@linutronix.de/
> 
> It also looks like the subject from commit bd54df5ea7cad ("futex: Allow
> to resize the private local hash")

Now done so, unless I messed up again :/
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Peter Zijlstra 6 months, 1 week ago
On Wed, Jun 11, 2025 at 05:11:13PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 11, 2025 at 04:43:02PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-06-11 14:39:16 [-0000], tip-bot2 for Sebastian Andrzej Siewior wrote:
> > > The following commit has been merged into the locking/urgent branch of tip:
> > > 
> > > Commit-ID:     703b5f31aee5bda47868c09a3522a78823c1bb77
> > > Gitweb:        https://git.kernel.org/tip/703b5f31aee5bda47868c09a3522a78823c1bb77
> > > Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > > AuthorDate:    Mon, 02 Jun 2025 13:00:27 +02:00
> > > Committer:     Peter Zijlstra <peterz@infradead.org>
> > > CommitterDate: Wed, 11 Jun 2025 16:26:44 +02:00
> > > 
> > > futex: Allow to resize the private local hash
> > > 
> > > Once the global hash is requested there is no way back to switch back to
> > > the per-task private hash. This is checked at the begin of the function.
> > > 
> > > It is possible that two threads simultaneously request the global hash
> > > and both pass the initial check and block later on the
> > > mm::futex_hash_lock. In this case the first thread performs the switch
> > > to the global hash. The second thread will also attempt to switch to the
> > > global hash and while doing so, accessing the nonexisting slot 1 of the
> > > struct futex_private_hash.
> > > This has been reported by Yi Lai.
> > > 
> > > Verify under mm_struct::futex_phash that the global hash is not in use.
> > 
> > Could you please replace it with
> > 	https://lore.kernel.org/all/20250610104400.1077266-5-bigeasy@linutronix.de/
> > 
> > It also looks like the subject from commit bd54df5ea7cad ("futex: Allow
> > to resize the private local hash")
> 
> Now done so, unless I messed up again :/

ARGH, let me try that again :-(
Re: [tip: locking/urgent] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 6 months, 1 week ago
On 2025-06-11 17:20:19 [+0200], Peter Zijlstra wrote:
> ARGH, let me try that again :-(

That last commit 69a14d146f3b87819f3fb73ed5d1de3e1fa680c1 looks great.
Thank you.

Sebastian
[tip: locking/urgent] futex: Allow to resize the private local hash
Posted by tip-bot2 for Sebastian Andrzej Siewior 6 months, 1 week ago
The following commit has been merged into the locking/urgent branch of tip:

Commit-ID:     cdd0f803c1f9b69785f5ff865864cfea11081c91
Gitweb:        https://git.kernel.org/tip/cdd0f803c1f9b69785f5ff865864cfea11081c91
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Mon, 02 Jun 2025 13:00:27 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 05 Jun 2025 14:37:59 +02:00

futex: Allow to resize the private local hash

On 2025-06-01 15:39:47 [+0800], Lai, Yi wrote:
> Hi Sebastian Andrzej Siewior,
Hi Yi,
> Greetings!
>
> I used Syzkaller and found that there is KASAN: null-ptr-deref Read in __futex_pivot_hash in linux-next next-20250527.
>
> After bisection and the first bad commit is:
> "
> bd54df5ea7ca futex: Allow to resize the private local hash
> "

Thank you for the report. Next time please trim your report. There is no
need to put your report in the middle of the patch.

The following fixes it:

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250602110027.wfqbHgzb@linutronix.de
---
 kernel/futex/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b652d2f..33b3643 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1629,6 +1629,16 @@ again:
 		mm->futex_phash_new = NULL;
 
 		if (fph) {
+			if (cur && !cur->hash_mask) {
+				/*
+				 * If two threads simultaneously request the global
+				 * hash then the first one performs the switch,
+				 * the second one returns here.
+				 */
+				free = fph;
+				mm->futex_phash_new = new;
+				return -EBUSY;
+			}
 			if (cur && !new) {
 				/*
 				 * If we have an existing hash, but do not yet have
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by André Almeida 7 months, 2 weeks ago
Em 16/04/2025 13:29, Sebastian Andrzej Siewior escreveu:
> The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
> replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS
> operation can now be invoked at runtime and resize an already existing
> internal private futex_hash_bucket to another size.
> 
> The reallocation is based on an idea by Thomas Gleixner: The initial
> allocation of struct futex_private_hash sets the reference count
> to one. Every user acquires a reference on the local hash before using
> it and drops it after it enqueued itself on the hash bucket. There is no
> reference held while the task is scheduled out while waiting for the
> wake up.
> The resize process allocates a new struct futex_private_hash and drops
> the initial reference. Synchronized with mm_struct::futex_hash_lock it
> is checked if the reference counter for the currently used
> mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
> on the current private hash are requeued on the new private hash and the
> new private hash is set to mm_struct::futex_phash. Otherwise the newly
> allocated private hash is saved as mm_struct::futex_phash_new and the
> rehashing and reassigning is delayed to the futex_hash() caller once the
> reference counter is marked DEAD.
> The replacement is not performed at rcuref_put() time because certain
> callers, such as futex_wait_queue(), drop their reference after changing
> the task state. This change will be destroyed once the futex_hash_lock
> is acquired.
> 
> The user can change the number slots with PR_FUTEX_HASH_SET_SLOTS
> multiple times. An increase and decrease is allowed and request blocks
> until the assignment is done.
> 
> The private hash allocated at thread creation is changed from 16 to
>    16 <= 4 * number_of_threads <= global_hash_size
> where number_of_threads can not exceed the number of online CPUs. Should
> the user PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled.
> 
> [peterz: reorganize the code to avoid state tracking and simplify new
> object handling, block the user until changes are in effect, allow
> increase and decrease of the hash].
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>   include/linux/futex.h    |   3 +-
>   include/linux/mm_types.h |   4 +-
>   kernel/futex/core.c      | 290 ++++++++++++++++++++++++++++++++++++---
>   kernel/futex/requeue.c   |   5 +
>   4 files changed, 281 insertions(+), 21 deletions(-)
> 

[...]

>   static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> @@ -1273,16 +1442,23 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
>   	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
>   		return -EINVAL;
>   
> -	if (mm->futex_phash)
> -		return -EALREADY;
> -
> -	if (!thread_group_empty(current))
> -		return -EINVAL;
> +	/*
> +	 * Once we've disabled the global hash there is no way back.
> +	 */
> +	scoped_guard(rcu) {
> +		fph = rcu_dereference(mm->futex_phash);
> +		if (fph && !fph->hash_mask) {
> +			if (custom)
> +				return -EBUSY;
> +			return 0;
> +		}
> +	}
>   
>   	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
>   	if (!fph)
>   		return -ENOMEM;
>   
> +	rcuref_init(&fph->users, 1);
>   	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
>   	fph->custom = custom;
>   	fph->mm = mm;
> @@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
>   	for (i = 0; i < hash_slots; i++)
>   		futex_hash_bucket_init(&fph->queues[i], fph);
>   
> -	mm->futex_phash = fph;

If (hash_slots == 0), do we still need to do all of this work bellow? I 
thought that using the global hash would allow to skip this.

> +	if (custom) {
> +		/*
> +		 * Only let prctl() wait / retry; don't unduly delay clone().
> +		 */
> +again:
> +		wait_var_event(mm, futex_pivot_pending(mm));
> +	}
> +
> +	scoped_guard(mutex, &mm->futex_hash_lock) {
> +		struct futex_private_hash *free __free(kvfree) = NULL;
> +		struct futex_private_hash *cur, *new;
> +
> +		cur = rcu_dereference_protected(mm->futex_phash,
> +						lockdep_is_held(&mm->futex_hash_lock));
> +		new = mm->futex_phash_new;
> +		mm->futex_phash_new = NULL;
> +
> +		if (fph) {
> +			if (cur && !new) {
> +				/*
> +				 * If we have an existing hash, but do not yet have
> +				 * allocated a replacement hash, drop the initial
> +				 * reference on the existing hash.
> +				 */
> +				futex_private_hash_put(cur);
> +			}
> +
> +			if (new) {
> +				/*
> +				 * Two updates raced; throw out the lesser one.
> +				 */
> +				if (futex_hash_less(new, fph)) {
> +					free = new;
> +					new = fph;
> +				} else {
> +					free = fph;
> +				}
> +			} else {
> +				new = fph;
> +			}
> +			fph = NULL;
> +		}
> +
> +		if (new) {
> +			/*
> +			 * Will set mm->futex_phash_new on failure;
> +			 * futex_private_hash_get() will try again.
> +			 */
> +			if (!__futex_pivot_hash(mm, new) && custom)
> +				goto again;

Is it safe to use a goto inside a scoped_guard(){}?
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by Sebastian Andrzej Siewior 7 months ago
On 2025-05-08 17:32:24 [-0300], André Almeida wrote:
> > @@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> >   	for (i = 0; i < hash_slots; i++)
> >   		futex_hash_bucket_init(&fph->queues[i], fph);
> > -	mm->futex_phash = fph;
> 
> If (hash_slots == 0), do we still need to do all of this work bellow? I
> thought that using the global hash would allow to skip this.

Not sure what you mean by below. We need to create a smaller struct
futex_private_hash and initialize it. We also need to move all current
futex waiters, which might be on the private hash that is going away,
over to the global hash. So yes, all this is needed.

> > +	if (custom) {
> > +		/*
> > +		 * Only let prctl() wait / retry; don't unduly delay clone().
> > +		 */
> > +again:
> > +		wait_var_event(mm, futex_pivot_pending(mm));
> > +	}
> > +
> > +	scoped_guard(mutex, &mm->futex_hash_lock) {
> > +		struct futex_private_hash *free __free(kvfree) = NULL;
> > +		struct futex_private_hash *cur, *new;
> > +
> > +		cur = rcu_dereference_protected(mm->futex_phash,
> > +						lockdep_is_held(&mm->futex_hash_lock));
> > +		new = mm->futex_phash_new;
> > +		mm->futex_phash_new = NULL;
> > +
> > +		if (fph) {
> > +			if (cur && !new) {
> > +				/*
> > +				 * If we have an existing hash, but do not yet have
> > +				 * allocated a replacement hash, drop the initial
> > +				 * reference on the existing hash.
> > +				 */
> > +				futex_private_hash_put(cur);
> > +			}
> > +
> > +			if (new) {
> > +				/*
> > +				 * Two updates raced; throw out the lesser one.
> > +				 */
> > +				if (futex_hash_less(new, fph)) {
> > +					free = new;
> > +					new = fph;
> > +				} else {
> > +					free = fph;
> > +				}
> > +			} else {
> > +				new = fph;
> > +			}
> > +			fph = NULL;
> > +		}
> > +
> > +		if (new) {
> > +			/*
> > +			 * Will set mm->futex_phash_new on failure;
> > +			 * futex_private_hash_get() will try again.
> > +			 */
> > +			if (!__futex_pivot_hash(mm, new) && custom)
> > +				goto again;
> 
> Is it safe to use a goto inside a scoped_guard(){}?

We jump outside of the scoped_guard() and while testing I've been
looking at the assembly and gcc did the right thing. So I would say why
not. The alternative would be to do manual lock/unlock and think about
the unlock just before the goto statement so this looks "easier".

Sebastian
Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
Posted by André Almeida 7 months ago
Em 16/05/2025 07:49, Sebastian Andrzej Siewior escreveu:
> On 2025-05-08 17:32:24 [-0300], André Almeida wrote:
>>> +			if (!__futex_pivot_hash(mm, new) && custom)
>>> +				goto again;
>>
>> Is it safe to use a goto inside a scoped_guard(){}?
> 
> We jump outside of the scoped_guard() and while testing I've been
> looking at the assembly and gcc did the right thing. So I would say why
> not. The alternative would be to do manual lock/unlock and think about
> the unlock just before the goto statement so this looks "easier".
> 

Ok, thanks for conforming it! I wasn't sure about the goto but now it's 
clear to me.
[PATCH] futex: Fix futex_mm_init() build failure on older compilers, remove rcu_assign_pointer()
Posted by Ingo Molnar 7 months, 1 week ago

* Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 1d3f7555825ec..40bc778b2bb45 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm);
>  
>  static inline void futex_mm_init(struct mm_struct *mm)
>  {
> -	mm->futex_phash =  NULL;
> +	rcu_assign_pointer(mm->futex_phash, NULL);
> +	mutex_init(&mm->futex_hash_lock);
>  }

This breaks the build on older compilers - I tried gcc-9, x86-64 
defconfig:

  CC      io_uring/futex.o
 In file included from ./arch/x86/include/generated/asm/rwonce.h:1,
                 from ./include/linux/compiler.h:390,
                 from ./include/linux/array_size.h:5,
                 from ./include/linux/kernel.h:16,
                 from io_uring/futex.c:2:
 ./include/linux/futex.h: In function 'futex_mm_init':
 ./include/linux/rcupdate.h:555:36: error: dereferencing pointer to incomplete type 'struct futex_private_hash'
  555 | #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
      |                                    ^~~~
 ./include/asm-generic/rwonce.h:55:33: note: in definition of macro '__WRITE_ONCE'
   55 |  *(volatile typeof(x) *)&(x) = (val);    \
      |                                 ^~~
 ./arch/x86/include/asm/barrier.h:63:2: note: in expansion of macro 'WRITE_ONCE'
   63 |  WRITE_ONCE(*p, v);      \
      |  ^~~~~~~~~~
 ./include/asm-generic/barrier.h:172:55: note: in expansion of macro '__smp_store_release'
  172 | #define smp_store_release(p, v) do { kcsan_release(); __smp_store_release(p, v); } while (0)
      |                                                       ^~~~~~~~~~~~~~~~~~~
 ./include/linux/rcupdate.h:596:3: note: in expansion of macro 'smp_store_release'
  596 |   smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
      |   ^~~~~~~~~~~~~~~~~
 ./include/linux/rcupdate.h:596:25: note: in expansion of macro 'RCU_INITIALIZER'
  596 |   smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
      |                         ^~~~~~~~~~~~~~~
 ./include/linux/futex.h:91:2: note: in expansion of macro 'rcu_assign_pointer'
   91 |  rcu_assign_pointer(mm->futex_phash, NULL);
      |  ^~~~~~~~~~~~~~~~~~
 make[3]: *** [scripts/Makefile.build:203: io_uring/futex.o] Error 1
 make[2]: *** [scripts/Makefile.build:461: io_uring] Error 2
 make[1]: *** [/home/mingo/tip/Makefile:2004: .] Error 2
 make: *** [Makefile:248: __sub-make] Error 2

The problem appears to be that this variant of rcu_assign_pointer() 
wants to know the full type of 'struct futex_private_hash', which type 
is local to futex.c:

   kernel/futex/core.c:struct futex_private_hash {

So either we uninline futex_mm_init() and move it into futex/core.c, or 
we share the structure definition with kernel/fork.c. Both have 
disadvantages.

A third solution would be to just initialize mm->futex_phash with NULL 
like the patch below, it's not like this new MM's ->futex_phash can be 
observed externally until the task is inserted into the task list - 
which guarantees full store ordering.

This relaxation of this initialization might also give a tiny speedup 
on certain platforms.

But an Ack from PeterZ on that assumption would be nice.

Thanks,

	Ingo

=====================================>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

 include/linux/futex.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index eccc99751bd9..168ffd5996b4 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -88,7 +88,14 @@ void futex_hash_free(struct mm_struct *mm);
 
 static inline void futex_mm_init(struct mm_struct *mm)
 {
-	rcu_assign_pointer(mm->futex_phash, NULL);
+	/*
+	 * No need for rcu_assign_pointer() here, as we can rely on
+	 * tasklist_lock write-ordering in copy_process(), before
+	 * the task's MM becomes visible and the ->futex_phash
+	 * becomes externally observable:
+	 */
+	mm->futex_phash = NULL;
+
 	mutex_init(&mm->futex_hash_lock);
 }
[tip: locking/futex] futex: Relax the rcu_assign_pointer() assignment of mm->futex_phash in futex_mm_init()
Posted by tip-bot2 for Ingo Molnar 7 months, 1 week ago
The following commit has been merged into the locking/futex branch of tip:

Commit-ID:     094ac8cff7858bee5fa4554f6ea66c964f8e160e
Gitweb:        https://git.kernel.org/tip/094ac8cff7858bee5fa4554f6ea66c964f8e160e
Author:        Ingo Molnar <mingo@kernel.org>
AuthorDate:    Sat, 10 May 2025 10:45:28 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sun, 11 May 2025 10:02:12 +02:00

futex: Relax the rcu_assign_pointer() assignment of mm->futex_phash in futex_mm_init()

The following commit added an rcu_assign_pointer() assignment to
futex_mm_init() in <linux/futex.h>:

  bd54df5ea7ca ("futex: Allow to resize the private local hash")

Which breaks the build on older compilers (gcc-9, x86-64 defconfig):

   CC      io_uring/futex.o
   In file included from ./arch/x86/include/generated/asm/rwonce.h:1,
                    from ./include/linux/compiler.h:390,
                    from ./include/linux/array_size.h:5,
                    from ./include/linux/kernel.h:16,
                    from io_uring/futex.c:2:
   ./include/linux/futex.h: In function 'futex_mm_init':
   ./include/linux/rcupdate.h:555:36: error: dereferencing pointer to incomplete type 'struct futex_private_hash'

The problem is that this variant of rcu_assign_pointer() wants to
know the full type of 'struct futex_private_hash', which type
is local to futex.c:

   kernel/futex/core.c:struct futex_private_hash {

There are a couple of mechanical solutions for this bug:

  - we can uninline futex_mm_init() and move it into futex/core.c

  - or we can share the structure definition with kernel/fork.c.

But both of these solutions have disadvantages: the first one adds
runtime overhead, while the second one dis-encapsulates private
futex types.

A third solution, implemented by this patch, is to just initialize
mm->futex_phash with NULL like the patch below, it's not like this
new MM's ->futex_phash can be observed externally until the task
is inserted into the task list, which guarantees full store ordering.

The relaxation of this initialization might also give a tiny speedup
on certain platforms.

Fixes: bd54df5ea7ca ("futex: Allow to resize the private local hash")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: André Almeida <andrealmeid@igalia.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/aB8SI00EHBri23lB@gmail.com
---
 include/linux/futex.h |  9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index eccc997..168ffd5 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -88,7 +88,14 @@ void futex_hash_free(struct mm_struct *mm);
 
 static inline void futex_mm_init(struct mm_struct *mm)
 {
-	rcu_assign_pointer(mm->futex_phash, NULL);
+	/*
+	 * No need for rcu_assign_pointer() here, as we can rely on
+	 * tasklist_lock write-ordering in copy_process(), before
+	 * the task's MM becomes visible and the ->futex_phash
+	 * becomes externally observable:
+	 */
+	mm->futex_phash = NULL;
+
 	mutex_init(&mm->futex_hash_lock);
 }
 
[tip: locking/futex] futex: Allow to resize the private local hash
Posted by tip-bot2 for Sebastian Andrzej Siewior 7 months, 2 weeks ago
The following commit has been merged into the locking/futex branch of tip:

Commit-ID:     bd54df5ea7cadac520e346d5f0fe5d58e635b6ba
Gitweb:        https://git.kernel.org/tip/bd54df5ea7cadac520e346d5f0fe5d58e635b6ba
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Wed, 16 Apr 2025 18:29:14 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 03 May 2025 12:02:08 +02:00

futex: Allow to resize the private local hash

The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS
operation can now be invoked at runtime and resize an already existing
internal private futex_hash_bucket to another size.

The reallocation is based on an idea by Thomas Gleixner: The initial
allocation of struct futex_private_hash sets the reference count
to one. Every user acquires a reference on the local hash before using
it and drops it after it enqueued itself on the hash bucket. There is no
reference held while the task is scheduled out while waiting for the
wake up.
The resize process allocates a new struct futex_private_hash and drops
the initial reference. Synchronized with mm_struct::futex_hash_lock it
is checked if the reference counter for the currently used
mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
on the current private hash are requeued on the new private hash and the
new private hash is set to mm_struct::futex_phash. Otherwise the newly
allocated private hash is saved as mm_struct::futex_phash_new and the
rehashing and reassigning is delayed to the futex_hash() caller once the
reference counter is marked DEAD.
The replacement is not performed at rcuref_put() time because certain
callers, such as futex_wait_queue(), drop their reference after changing
the task state. This change will be destroyed once the futex_hash_lock
is acquired.

The user can change the number slots with PR_FUTEX_HASH_SET_SLOTS
multiple times. An increase and decrease is allowed and request blocks
until the assignment is done.

The private hash allocated at thread creation is changed from 16 to
  16 <= 4 * number_of_threads <= global_hash_size
where number_of_threads can not exceed the number of online CPUs. Should
the user PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled.

[peterz: reorganize the code to avoid state tracking and simplify new
object handling, block the user until changes are in effect, allow
increase and decrease of the hash].

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-15-bigeasy@linutronix.de
---
 include/linux/futex.h    |   3 +-
 include/linux/mm_types.h |   4 +-
 kernel/futex/core.c      | 290 +++++++++++++++++++++++++++++++++++---
 kernel/futex/requeue.c   |   5 +-
 4 files changed, 281 insertions(+), 21 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1d3f755..40bc778 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm);
 
 static inline void futex_mm_init(struct mm_struct *mm)
 {
-	mm->futex_phash =  NULL;
+	rcu_assign_pointer(mm->futex_phash, NULL);
+	mutex_init(&mm->futex_hash_lock);
 }
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a4b5661..32ba512 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1033,7 +1033,9 @@ struct mm_struct {
 		seqcount_t mm_lock_seq;
 #endif
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
-		struct futex_private_hash	*futex_phash;
+		struct mutex			futex_hash_lock;
+		struct futex_private_hash	__rcu *futex_phash;
+		struct futex_private_hash	*futex_phash_new;
 #endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 53b3a00..9e7dad5 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -40,6 +40,7 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
 #include <linux/prctl.h>
+#include <linux/rcuref.h>
 
 #include "futex.h"
 #include "../locking/rtmutex_common.h"
@@ -57,7 +58,9 @@ static struct {
 #define futex_hashmask (__futex_data.hashmask)
 
 struct futex_private_hash {
+	rcuref_t	users;
 	unsigned int	hash_mask;
+	struct rcu_head	rcu;
 	void		*mm;
 	bool		custom;
 	struct futex_hash_bucket queues[];
@@ -129,11 +132,14 @@ static inline bool futex_key_is_private(union futex_key *key)
 
 bool futex_private_hash_get(struct futex_private_hash *fph)
 {
-	return false;
+	return rcuref_get(&fph->users);
 }
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
+	/* Ignore return value, last put is verified via rcuref_is_dead() */
+	if (rcuref_put(&fph->users))
+		wake_up_var(fph->mm);
 }
 
 /**
@@ -143,8 +149,23 @@ void futex_private_hash_put(struct futex_private_hash *fph)
  * Obtain an additional reference for the already obtained hash bucket. The
  * caller must already own an reference.
  */
-void futex_hash_get(struct futex_hash_bucket *hb) { }
-void futex_hash_put(struct futex_hash_bucket *hb) { }
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+	struct futex_private_hash *fph = hb->priv;
+
+	if (!fph)
+		return;
+	WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+	struct futex_private_hash *fph = hb->priv;
+
+	if (!fph)
+		return;
+	futex_private_hash_put(fph);
+}
 
 static struct futex_hash_bucket *
 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
@@ -155,7 +176,7 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 		return NULL;
 
 	if (!fph)
-		fph = key->private.mm->futex_phash;
+		fph = rcu_dereference(key->private.mm->futex_phash);
 	if (!fph || !fph->hash_mask)
 		return NULL;
 
@@ -165,21 +186,119 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
 	return &fph->queues[hash & fph->hash_mask];
 }
 
+static void futex_rehash_private(struct futex_private_hash *old,
+				 struct futex_private_hash *new)
+{
+	struct futex_hash_bucket *hb_old, *hb_new;
+	unsigned int slots = old->hash_mask + 1;
+	unsigned int i;
+
+	for (i = 0; i < slots; i++) {
+		struct futex_q *this, *tmp;
+
+		hb_old = &old->queues[i];
+
+		spin_lock(&hb_old->lock);
+		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+			plist_del(&this->list, &hb_old->chain);
+			futex_hb_waiters_dec(hb_old);
+
+			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+			hb_new = __futex_hash(&this->key, new);
+			futex_hb_waiters_inc(hb_new);
+			/*
+			 * The new pointer isn't published yet but an already
+			 * moved user can be unqueued due to timeout or signal.
+			 */
+			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+			plist_add(&this->list, &hb_new->chain);
+			this->lock_ptr = &hb_new->lock;
+			spin_unlock(&hb_new->lock);
+		}
+		spin_unlock(&hb_old->lock);
+	}
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+			       struct futex_private_hash *new)
+{
+	struct futex_private_hash *fph;
+
+	WARN_ON_ONCE(mm->futex_phash_new);
+
+	fph = rcu_dereference_protected(mm->futex_phash,
+					lockdep_is_held(&mm->futex_hash_lock));
+	if (fph) {
+		if (!rcuref_is_dead(&fph->users)) {
+			mm->futex_phash_new = new;
+			return false;
+		}
+
+		futex_rehash_private(fph, new);
+	}
+	rcu_assign_pointer(mm->futex_phash, new);
+	kvfree_rcu(fph, rcu);
+	return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		struct futex_private_hash *fph;
+
+		fph = mm->futex_phash_new;
+		if (fph) {
+			mm->futex_phash_new = NULL;
+			__futex_pivot_hash(mm, fph);
+		}
+	}
+}
+
 struct futex_private_hash *futex_private_hash(void)
 {
 	struct mm_struct *mm = current->mm;
-	struct futex_private_hash *fph;
+	/*
+	 * Ideally we don't loop. If there is a replacement in progress
+	 * then a new private hash is already prepared and a reference can't be
+	 * obtained once the last user dropped it's.
+	 * In that case we block on mm_struct::futex_hash_lock and either have
+	 * to perform the replacement or wait while someone else is doing the
+	 * job. Eitherway, on the second iteration we acquire a reference on the
+	 * new private hash or loop again because a new replacement has been
+	 * requested.
+	 */
+again:
+	scoped_guard(rcu) {
+		struct futex_private_hash *fph;
 
-	fph = mm->futex_phash;
-	return fph;
+		fph = rcu_dereference(mm->futex_phash);
+		if (!fph)
+			return NULL;
+
+		if (rcuref_get(&fph->users))
+			return fph;
+	}
+	futex_pivot_hash(mm);
+	goto again;
 }
 
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
+	struct futex_private_hash *fph;
 	struct futex_hash_bucket *hb;
 
-	hb = __futex_hash(key, NULL);
-	return hb;
+again:
+	scoped_guard(rcu) {
+		hb = __futex_hash(key, NULL);
+		fph = hb->priv;
+
+		if (!fph || futex_private_hash_get(fph))
+			return hb;
+	}
+	futex_pivot_hash(key->private.mm);
+	goto again;
 }
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
@@ -664,6 +783,8 @@ int futex_unqueue(struct futex_q *q)
 	spinlock_t *lock_ptr;
 	int ret = 0;
 
+	/* RCU so lock_ptr is not going away during locking. */
+	guard(rcu)();
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
 	/*
@@ -1066,6 +1187,10 @@ static void exit_pi_state_list(struct task_struct *curr)
 	union futex_key key = FUTEX_KEY_INIT;
 
 	/*
+	 * The mutex mm_struct::futex_hash_lock might be acquired.
+	 */
+	might_sleep();
+	/*
 	 * Ensure the hash remains stable (no resize) during the while loop
 	 * below. The hb pointer is acquired under the pi_lock so we can't block
 	 * on the mutex.
@@ -1261,7 +1386,51 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 void futex_hash_free(struct mm_struct *mm)
 {
-	kvfree(mm->futex_phash);
+	struct futex_private_hash *fph;
+
+	kvfree(mm->futex_phash_new);
+	fph = rcu_dereference_raw(mm->futex_phash);
+	if (fph) {
+		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+		kvfree(fph);
+	}
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+	struct futex_private_hash *fph;
+
+	guard(rcu)();
+
+	if (!mm->futex_phash_new)
+		return true;
+
+	fph = rcu_dereference(mm->futex_phash);
+	return rcuref_is_dead(&fph->users);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+			    struct futex_private_hash *b)
+{
+	/* user provided always wins */
+	if (!a->custom && b->custom)
+		return true;
+	if (a->custom && !b->custom)
+		return false;
+
+	/* zero-sized hash wins */
+	if (!b->hash_mask)
+		return true;
+	if (!a->hash_mask)
+		return false;
+
+	/* keep the biggest */
+	if (a->hash_mask < b->hash_mask)
+		return true;
+	if (a->hash_mask > b->hash_mask)
+		return false;
+
+	return false; /* equal */
 }
 
 static int futex_hash_allocate(unsigned int hash_slots, bool custom)
@@ -1273,16 +1442,23 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
 	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
 		return -EINVAL;
 
-	if (mm->futex_phash)
-		return -EALREADY;
-
-	if (!thread_group_empty(current))
-		return -EINVAL;
+	/*
+	 * Once we've disabled the global hash there is no way back.
+	 */
+	scoped_guard(rcu) {
+		fph = rcu_dereference(mm->futex_phash);
+		if (fph && !fph->hash_mask) {
+			if (custom)
+				return -EBUSY;
+			return 0;
+		}
+	}
 
 	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!fph)
 		return -ENOMEM;
 
+	rcuref_init(&fph->users, 1);
 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
 	fph->custom = custom;
 	fph->mm = mm;
@@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
 	for (i = 0; i < hash_slots; i++)
 		futex_hash_bucket_init(&fph->queues[i], fph);
 
-	mm->futex_phash = fph;
+	if (custom) {
+		/*
+		 * Only let prctl() wait / retry; don't unduly delay clone().
+		 */
+again:
+		wait_var_event(mm, futex_pivot_pending(mm));
+	}
+
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		struct futex_private_hash *free __free(kvfree) = NULL;
+		struct futex_private_hash *cur, *new;
+
+		cur = rcu_dereference_protected(mm->futex_phash,
+						lockdep_is_held(&mm->futex_hash_lock));
+		new = mm->futex_phash_new;
+		mm->futex_phash_new = NULL;
+
+		if (fph) {
+			if (cur && !new) {
+				/*
+				 * If we have an existing hash, but do not yet have
+				 * allocated a replacement hash, drop the initial
+				 * reference on the existing hash.
+				 */
+				futex_private_hash_put(cur);
+			}
+
+			if (new) {
+				/*
+				 * Two updates raced; throw out the lesser one.
+				 */
+				if (futex_hash_less(new, fph)) {
+					free = new;
+					new = fph;
+				} else {
+					free = fph;
+				}
+			} else {
+				new = fph;
+			}
+			fph = NULL;
+		}
+
+		if (new) {
+			/*
+			 * Will set mm->futex_phash_new on failure;
+			 * futex_private_hash_get() will try again.
+			 */
+			if (!__futex_pivot_hash(mm, new) && custom)
+				goto again;
+		}
+	}
 	return 0;
 }
 
 int futex_hash_allocate_default(void)
 {
+	unsigned int threads, buckets, current_buckets = 0;
+	struct futex_private_hash *fph;
+
 	if (!current->mm)
 		return 0;
 
-	if (current->mm->futex_phash)
+	scoped_guard(rcu) {
+		threads = min_t(unsigned int,
+				get_nr_threads(current),
+				num_online_cpus());
+
+		fph = rcu_dereference(current->mm->futex_phash);
+		if (fph) {
+			if (fph->custom)
+				return 0;
+
+			current_buckets = fph->hash_mask + 1;
+		}
+	}
+
+	/*
+	 * The default allocation will remain within
+	 *   16 <= threads * 4 <= global hash size
+	 */
+	buckets = roundup_pow_of_two(4 * threads);
+	buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+	if (current_buckets >= buckets)
 		return 0;
 
-	return futex_hash_allocate(16, false);
+	return futex_hash_allocate(buckets, false);
 }
 
 static int futex_hash_get_slots(void)
 {
 	struct futex_private_hash *fph;
 
-	fph = current->mm->futex_phash;
+	guard(rcu)();
+	fph = rcu_dereference(current->mm->futex_phash);
 	if (fph && fph->hash_mask)
 		return fph->hash_mask + 1;
 	return 0;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index b0e64fd..c716a66 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 		futex_hb_waiters_inc(hb2);
 		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
+		/*
+		 * hb1 and hb2 belong to the same futex_hash_bucket_private
+		 * because if we managed get a reference on hb1 then it can't be
+		 * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+		 */
 	}
 	q->key = *key2;
 }