[RFC PATCH v3 6/9] futex: Allow re-allocation of the private hash bucket.

The new mm_struct::futex_hash_lock guards assignment and replacement of
the private futex_hash_bucket. The futex_hash_allocate() /
PR_FUTEX_HASH_SET_SLOTS operation can now be invoked at runtime to
resize the process-private futex_hash_bucket.
The idea is to use the recently introduced reference counting to keep a
valid hash bucket (HB) around. On resize/replacement the new HB is
assigned and all users currently queued on the old HB are woken so they
can requeue themselves on the new one.
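
For illustration, a minimal userspace sketch of triggering a resize at
runtime. The PR_FUTEX_HASH / PR_FUTEX_HASH_SET_SLOTS constants and the
prctl() argument layout are assumed from this series' interface and may
change before it is merged:

	#include <sys/prctl.h>
	#include <stdio.h>

	/* Assumed values; the real ones come from the uapi header
	 * added earlier in this series. */
	#ifndef PR_FUTEX_HASH
	# define PR_FUTEX_HASH			78
	# define PR_FUTEX_HASH_SET_SLOTS	1
	#endif

	int main(void)
	{
		/*
		 * Ask for a private hash with 16 slots. Requests below 2
		 * are rounded up, oversized requests are rejected (see
		 * futex_hash_allocate() below).
		 */
		if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0, 0) == -1)
			perror("PR_FUTEX_HASH_SET_SLOTS");
		return 0;
	}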

So far this has only been tested with FUTEX_LOCK_PI.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/futex.h           |  1 +
 include/linux/mm_types.h        |  1 +
 kernel/futex/core.c             | 64 ++++++++++++++++++++++++++++-----
 kernel/futex/futex.h            |  1 +
 kernel/futex/pi.c               | 25 +++++++++++++
 kernel/locking/rtmutex.c        | 26 ++++++++++++++
 kernel/locking/rtmutex_common.h |  2 ++
 7 files changed, 111 insertions(+), 9 deletions(-)
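
For reviewers, the lock/validate/retry pattern that futex_q_lock() and
futex_lock_pi() grow below, distilled into a simplified sketch (the
helper name futex_q_lock_stable() is hypothetical; the hb_waiters
accounting and error paths are omitted):

	/*
	 * Take a reference on the hash bucket, lock it and only proceed
	 * if it still belongs to the mm's current private hash. If the
	 * hash was replaced in the meantime, drop lock and reference
	 * and start over against the new hash.
	 */
	static struct futex_hash_bucket *futex_q_lock_stable(struct futex_q *q)
	{
		struct futex_hash_bucket *hb;

		for (;;) {
			hb = futex_hash(&q->key);	/* acquires a reference */
			spin_lock(&hb->lock);
			if (futex_check_hb_valid(hb))
				return hb;		/* hb is still current */
			spin_unlock(&hb->lock);
			futex_hash_put(hb);		/* resized: retry on new hash */
		}
	}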

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 359fc24eb37ff..838a5a6be0444 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -85,6 +85,7 @@ void futex_hash_free(struct mm_struct *mm);
 static inline void futex_mm_init(struct mm_struct *mm)
 {
 	rcu_assign_pointer(mm->futex_hash_bucket, NULL);
+	mutex_init(&mm->futex_hash_lock);
 }
 
 #else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 057ad1de59ca0..5bf86ea363780 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -899,6 +899,7 @@ struct mm_struct {
 		int mm_lock_seq;
 #endif
 
+		struct mutex				futex_hash_lock;
 		struct futex_hash_bucket_private	__rcu *futex_hash_bucket;
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cff5652a29917..70d4b1d93bbb8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -595,6 +595,7 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
+try_again:
 	hb = futex_hash(&q->key);
 
 	/*
@@ -610,7 +611,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
 	q->lock_ptr = &hb->lock;
 
 	spin_lock(&hb->lock);
-	return hb;
+	if (futex_check_hb_valid(hb))
+		return hb;
+
+	futex_hb_waiters_dec(hb);
+	spin_unlock(&hb->lock);
+	futex_hash_put(hb);
+	goto try_again;
 }
 
 void futex_q_unlock(struct futex_hash_bucket *hb)
@@ -1238,18 +1245,50 @@ void futex_hash_free(struct mm_struct *mm)
 	futex_hash_priv_put(hb_p);
 }
 
+static void futex_put_old_hb_p(struct futex_hash_bucket_private *hb_p)
+{
+	unsigned int slots = hb_p->hash_mask + 1;
+	struct futex_hash_bucket *hb;
+	DEFINE_WAKE_Q(wake_q);
+	unsigned int i;
+
+	for (i = 0; i < slots; i++) {
+		struct futex_q *this;
+
+		hb = &hb_p->queues[i];
+
+		spin_lock(&hb->lock);
+		plist_for_each_entry(this, &hb->chain, list)
+			wake_q_add(&wake_q, this->task);
+		spin_unlock(&hb->lock);
+	}
+	futex_hash_priv_put(hb_p);
+
+	wake_up_q(&wake_q);
+}
+
+bool futex_check_hb_valid(struct futex_hash_bucket *hb)
+{
+	struct futex_hash_bucket_private *hb_p_now;
+	struct futex_hash_bucket_private *hb_p;
+
+	if (hb->hb_slot == 0)
+		return true;
+	guard(rcu)();
+	hb_p_now = rcu_dereference(current->mm->futex_hash_bucket);
+	hb_p = container_of(hb, struct futex_hash_bucket_private,
+			    queues[hb->hb_slot - 1]);
+
+	return hb_p_now == hb_p;
+}
+
 static int futex_hash_allocate(unsigned int hash_slots)
 {
-	struct futex_hash_bucket_private *hb_p;
+	struct futex_hash_bucket_private *hb_p, *hb_p_old = NULL;
+	struct mm_struct *mm;
 	size_t alloc_size;
 	int i;
 
-	if (current->mm->futex_hash_bucket)
-		return -EALREADY;
-
-	if (!thread_group_leader(current))
-		return -EINVAL;
-
 	if (hash_slots < 2)
 		hash_slots = 2;
 	if (hash_slots > 131072)
@@ -1277,7 +1316,14 @@ static int futex_hash_allocate(unsigned int hash_slots)
 		hb_p->queues[i].hb_slot = i + 1;
 	}
 
-	rcu_assign_pointer(current->mm->futex_hash_bucket, hb_p);
+	mm = current->mm;
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		hb_p_old = rcu_dereference_check(mm->futex_hash_bucket,
+						 lockdep_is_held(&mm->futex_hash_lock));
+		rcu_assign_pointer(mm->futex_hash_bucket, hb_p);
+	}
+	if (hb_p_old)
+		futex_put_old_hb_p(hb_p_old);
 	return 0;
 }
 
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c6d59949766d2..b974d675730e4 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -204,6 +204,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 extern struct futex_hash_bucket *futex_hash(union futex_key *key);
 extern void futex_hash_put(struct futex_hash_bucket *hb);
 extern void futex_hash_get(struct futex_hash_bucket *hb);
+extern bool futex_check_hb_valid(struct futex_hash_bucket *hb);
 
 static inline struct futex_hash_bucket *futex_hb_from_futex_q(struct futex_q *q)
 {
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 399ac712f1fd6..1a0a9cd31f911 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -998,6 +998,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 	rt_mutex_pre_schedule();
 
 	rt_mutex_init_waiter(&rt_waiter);
+	rt_waiter.hb = hb;
 
 	/*
 	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
@@ -1066,6 +1067,23 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 	 */
 	rt_mutex_post_schedule();
 no_block:
+	if (!futex_check_hb_valid(hb)) {
+		/*
+		 * We might have taken the lock, we might not have. If the HB
+		 * changed under us it was all for nothing. Try again from scratch.
+		 */
+		futex_unqueue_pi(&q);
+		spin_unlock(q.lock_ptr);
+		futex_hash_put(hb);
+
+		if (to) {
+			hrtimer_cancel(&to->timer);
+			destroy_hrtimer_on_stack(&to->timer);
+		}
+		if (refill_pi_state_cache())
+			return -ENOMEM;
+		goto retry_private;
+	}
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
 	 * haven't already.
@@ -1226,6 +1244,12 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 		 * space.
 		 */
 		return ret;
+	} else {
+		if (!futex_check_hb_valid(hb)) {
+			spin_unlock(&hb->lock);
+			futex_hash_put(hb);
+			goto retry;
+		}
 	}
 
 	/*
@@ -1250,6 +1274,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 			return ret;
 		}
 	}
+	/* XXX if the HB changed but uval did not, we might need to check if there is a waiter pending */
 
 	/*
 	 * If uval has changed, let user space handle it.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ebebd0eec7f63..188a9b16412df 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -56,10 +56,29 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
 	return 0;
 }
 
+extern bool futex_check_hb_valid(struct futex_hash_bucket *hb);
+
+static inline bool __internal_retry_reason(struct rt_mutex_waiter *waiter)
+{
+	if (!IS_ENABLED(CONFIG_FUTEX))
+		return false;
+
+	if (!waiter->hb)
+		return false;
+	if (futex_check_hb_valid(waiter->hb))
+		return false;
+	return true;
+}
+
 #else
 # define build_ww_mutex()	(true)
 # define ww_container_of(rtm)	container_of(rtm, struct ww_mutex, base)
 # include "ww_mutex.h"
+
+static inline bool __internal_retry_reason(struct rt_mutex_waiter *waiter)
+{
+	return false;
+}
 #endif
 
 /*
@@ -1626,6 +1645,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 				break;
 		}
 
+		if (!build_ww_mutex()) {
+			if (__internal_retry_reason(waiter)) {
+				ret = -EAGAIN;
+				break;
+			}
+		}
+
 		if (waiter == rt_mutex_top_waiter(lock))
 			owner = rt_mutex_owner(lock);
 		else
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 1162e07cdaea1..fb26ad08f259a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -56,6 +56,7 @@ struct rt_mutex_waiter {
 	struct rt_mutex_base	*lock;
 	unsigned int		wake_state;
 	struct ww_acquire_ctx	*ww_ctx;
+	struct futex_hash_bucket *hb;
 };
 
 /**
@@ -215,6 +216,7 @@ static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 	RB_CLEAR_NODE(&waiter->tree.entry);
 	waiter->wake_state = TASK_NORMAL;
 	waiter->task = NULL;
+	waiter->hb = NULL;
 }
 
 static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
-- 
2.45.2