[PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock

Posted by John Stultz 1 year, 3 months ago
From: Peter Zijlstra <peterz@infradead.org>

In preparation for nesting mutex::wait_lock under rq::lock, we need to
remove wakeups from under it.
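
The pattern used throughout is the standard deferred-wakeup one: collect
the tasks to wake on an on-stack wake_q while holding wait_lock, then
flush the queue once the lock has been dropped. A minimal sketch
(example_unlock() and pick_next_waiter() are illustrative names only,
not part of this patch):

	static void example_unlock(struct example_lock *lock)
	{
		DEFINE_WAKE_Q(wake_q);	/* wake_q must live on the stack */

		raw_spin_lock(&lock->wait_lock);
		/* was: wake_up_process(waiter->task) under wait_lock */
		wake_q_add(&wake_q, pick_next_waiter(lock));
		preempt_disable();
		raw_spin_unlock(&lock->wait_lock);
		wake_up_q(&wake_q);	/* wakeups now happen lock-free */
		preempt_enable();
	}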

Cc: Joel Fernandes <joelaf@google.com>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Zimuzo Ezeozue <zezeozue@google.com>
Cc: Youssef Esmat <youssefesmat@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Metin Kaya <Metin.Kaya@arm.com>
Cc: Xuewen Yan <xuewen.yan94@gmail.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: kernel-team@android.com
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[Heavily changed after 55f036ca7e74 ("locking: WW mutex cleanup") and
08295b3b5bee ("locking: Implement an algorithm choice for Wound-Wait
mutexes")]
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
[jstultz: rebased to mainline, added extra wake_up_q & init
 to avoid hangs, similar to Connor's rework of this patch]
Signed-off-by: John Stultz <jstultz@google.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Metin Kaya <metin.kaya@arm.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Metin Kaya <metin.kaya@arm.com>
---
v5:
* Reverted to an earlier version of this patch, undoing the change
  that kept the wake_q in the ctx structure, as that broke the rule
  that the wake_q must always be on the stack (it's not safe for
  concurrent use).
v6:
* Made tweaks suggested by Waiman Long
v7:
* Fixups to pass wake_qs down for PREEMPT_RT logic
v10:
* Moved preempt_enable() down, closer to the unlock, as suggested
  by Valentin
* Added additional preempt_disable() coverage around the wake_q
  calls, as also noted by Valentin
---
 kernel/locking/mutex.c       | 17 +++++++++++++----
 kernel/locking/rtmutex.c     | 30 +++++++++++++++++++++---------
 kernel/locking/rwbase_rt.c   |  8 +++++++-
 kernel/locking/rwsem.c       |  4 ++--
 kernel/locking/spinlock_rt.c |  3 ++-
 kernel/locking/ww_mutex.h    | 29 ++++++++++++++++++-----------
 6 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cbae8c0b89ab..4269da1f3ef5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -575,6 +575,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		    struct lockdep_map *nest_lock, unsigned long ip,
 		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
 {
+	DEFINE_WAKE_Q(wake_q);
 	struct mutex_waiter waiter;
 	struct ww_mutex *ww;
 	int ret;
@@ -625,7 +626,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	 */
 	if (__mutex_trylock(lock)) {
 		if (ww_ctx)
-			__ww_mutex_check_waiters(lock, ww_ctx);
+			__ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
 
 		goto skip_wait;
 	}
@@ -645,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		 * Add in stamp order, waking up waiters that must kill
 		 * themselves.
 		 */
-		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q);
 		if (ret)
 			goto err_early_kill;
 	}
@@ -681,6 +682,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		}
 
 		raw_spin_unlock(&lock->wait_lock);
+		/* Make sure we do wakeups before calling schedule */
+		if (!wake_q_empty(&wake_q)) {
+			wake_up_q(&wake_q);
+			wake_q_init(&wake_q);
+		}
 		schedule_preempt_disabled();
 
 		first = __mutex_waiter_is_first(lock, &waiter);
@@ -714,7 +720,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		 */
 		if (!ww_ctx->is_wait_die &&
 		    !__mutex_waiter_is_first(lock, &waiter))
-			__ww_mutex_check_waiters(lock, ww_ctx);
+			__ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
 	}
 
 	__mutex_remove_waiter(lock, &waiter);
@@ -730,6 +736,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		ww_mutex_lock_acquired(ww, ww_ctx);
 
 	raw_spin_unlock(&lock->wait_lock);
+	wake_up_q(&wake_q);
 	preempt_enable();
 	return 0;
 
@@ -741,6 +748,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	raw_spin_unlock(&lock->wait_lock);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, ip);
+	wake_up_q(&wake_q);
 	preempt_enable();
 	return ret;
 }
@@ -951,9 +959,10 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 	if (owner & MUTEX_FLAG_HANDOFF)
 		__mutex_handoff(lock, next);
 
+	preempt_disable();
 	raw_spin_unlock(&lock->wait_lock);
-
 	wake_up_q(&wake_q);
+	preempt_enable();
 }
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..7a85d9bfa972 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -34,13 +34,15 @@
 
 static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
 					struct rt_mutex *lock,
-					struct ww_acquire_ctx *ww_ctx)
+					struct ww_acquire_ctx *ww_ctx,
+					struct wake_q_head *wake_q)
 {
 	return 0;
 }
 
 static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
-					    struct ww_acquire_ctx *ww_ctx)
+					    struct ww_acquire_ctx *ww_ctx,
+					    struct wake_q_head *wake_q)
 {
 }
 
@@ -1207,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
 	struct rt_mutex_waiter *top_waiter = waiter;
 	struct rt_mutex_base *next_lock;
 	int chain_walk = 0, res;
+	DEFINE_WAKE_Q(wake_q);
 
 	lockdep_assert_held(&lock->wait_lock);
 
@@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
 
 		/* Check whether the waiter should back out immediately */
 		rtm = container_of(lock, struct rt_mutex, rtmutex);
-		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+		preempt_disable();
+		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, &wake_q);
+		wake_up_q(&wake_q);
+		preempt_enable();
 		if (res) {
 			raw_spin_lock(&task->pi_lock);
 			rt_mutex_dequeue(lock, waiter);
@@ -1678,7 +1684,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 				       struct ww_acquire_ctx *ww_ctx,
 				       unsigned int state,
 				       enum rtmutex_chainwalk chwalk,
-				       struct rt_mutex_waiter *waiter)
+				       struct rt_mutex_waiter *waiter,
+				       struct wake_q_head *wake_q)
 {
 	struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
 	struct ww_mutex *ww = ww_container_of(rtm);
@@ -1689,7 +1696,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 	/* Try to acquire the lock again: */
 	if (try_to_take_rt_mutex(lock, current, NULL)) {
 		if (build_ww_mutex() && ww_ctx) {
-			__ww_mutex_check_waiters(rtm, ww_ctx);
+			__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
 			ww_mutex_lock_acquired(ww, ww_ctx);
 		}
 		return 0;
@@ -1707,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 		/* acquired the lock */
 		if (build_ww_mutex() && ww_ctx) {
 			if (!ww_ctx->is_wait_die)
-				__ww_mutex_check_waiters(rtm, ww_ctx);
+				__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
 			ww_mutex_lock_acquired(ww, ww_ctx);
 		}
 	} else {
@@ -1729,7 +1736,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 
 static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
 					     struct ww_acquire_ctx *ww_ctx,
-					     unsigned int state)
+					     unsigned int state,
+					     struct wake_q_head *wake_q)
 {
 	struct rt_mutex_waiter waiter;
 	int ret;
@@ -1738,7 +1746,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
 	waiter.ww_ctx = ww_ctx;
 
 	ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
-				  &waiter);
+				  &waiter, wake_q);
 
 	debug_rt_mutex_free_waiter(&waiter);
 	return ret;
@@ -1754,6 +1762,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 				     struct ww_acquire_ctx *ww_ctx,
 				     unsigned int state)
 {
+	DEFINE_WAKE_Q(wake_q);
 	unsigned long flags;
 	int ret;
 
@@ -1775,8 +1784,11 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	 * irqsave/restore variants.
 	 */
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
-	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
+	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
+	preempt_disable();
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	wake_up_q(&wake_q);
+	preempt_enable();
 	rt_mutex_post_schedule();
 
 	return ret;
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 34a59569db6b..9f4322c07486 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -69,6 +69,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 				      unsigned int state)
 {
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
+	DEFINE_WAKE_Q(wake_q);
 	int ret;
 
 	rwbase_pre_schedule();
@@ -110,7 +111,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	 * For rwlocks this returns 0 unconditionally, so the below
 	 * !ret conditionals are optimized out.
 	 */
-	ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+	ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q);
 
 	/*
 	 * On success the rtmutex is held, so there can't be a writer
@@ -121,7 +122,12 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	 */
 	if (!ret)
 		atomic_inc(&rwb->readers);
+
+	preempt_disable();
 	raw_spin_unlock_irq(&rtm->wait_lock);
+	wake_up_q(&wake_q);
+	preempt_enable();
+
 	if (!ret)
 		rwbase_rtmutex_unlock(rtm);
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c6d17aee4209..79ab7b8df5c1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,8 +1415,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
-#define rwbase_rtmutex_slowlock_locked(rtm, state)	\
-	__rt_mutex_slowlock_locked(rtm, NULL, state)
+#define rwbase_rtmutex_slowlock_locked(rtm, state, wq)	\
+	__rt_mutex_slowlock_locked(rtm, NULL, state, wq)
 
 #define rwbase_rtmutex_unlock(rtm)			\
 	__rt_mutex_unlock(rtm)
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 38e292454fcc..fb1810a14c9d 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -162,7 +162,8 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
 }
 
 static __always_inline int
-rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state,
+			       struct wake_q_head *wake_q)
 {
 	rtlock_slowlock_locked(rtm);
 	return 0;
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 3ad2cc4823e5..7189c6631d90 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -275,7 +275,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
  */
 static bool
 __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
-	       struct ww_acquire_ctx *ww_ctx)
+	       struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q)
 {
 	if (!ww_ctx->is_wait_die)
 		return false;
@@ -284,7 +284,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 #ifndef WW_RT
 		debug_mutex_wake_waiter(lock, waiter);
 #endif
-		wake_up_process(waiter->task);
+		wake_q_add(wake_q, waiter->task);
 	}
 
 	return true;
@@ -299,7 +299,8 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
  */
 static bool __ww_mutex_wound(struct MUTEX *lock,
 			     struct ww_acquire_ctx *ww_ctx,
-			     struct ww_acquire_ctx *hold_ctx)
+			     struct ww_acquire_ctx *hold_ctx,
+			     struct wake_q_head *wake_q)
 {
 	struct task_struct *owner = __ww_mutex_owner(lock);
 
@@ -331,7 +332,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 		 * wakeup pending to re-read the wounded state.
 		 */
 		if (owner != current)
-			wake_up_process(owner);
+			wake_q_add(wake_q, owner);
 
 		return true;
 	}
@@ -352,7 +353,8 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
  * The current task must not be on the wait list.
  */
 static void
-__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
+			 struct wake_q_head *wake_q)
 {
 	struct MUTEX_WAITER *cur;
 
@@ -364,8 +366,8 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
 		if (!cur->ww_ctx)
 			continue;
 
-		if (__ww_mutex_die(lock, cur, ww_ctx) ||
-		    __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+		if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) ||
+		    __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q))
 			break;
 	}
 }
@@ -377,6 +379,8 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
 static __always_inline void
 ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
+	DEFINE_WAKE_Q(wake_q);
+
 	ww_mutex_lock_acquired(lock, ctx);
 
 	/*
@@ -405,8 +409,10 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	 * die or wound us.
 	 */
 	lock_wait_lock(&lock->base);
-	__ww_mutex_check_waiters(&lock->base, ctx);
+	__ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
 	unlock_wait_lock(&lock->base);
+
+	wake_up_q(&wake_q);
 }
 
 static __always_inline int
@@ -488,7 +494,8 @@ __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 static inline int
 __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
 		      struct MUTEX *lock,
-		      struct ww_acquire_ctx *ww_ctx)
+		      struct ww_acquire_ctx *ww_ctx,
+		      struct wake_q_head *wake_q)
 {
 	struct MUTEX_WAITER *cur, *pos = NULL;
 	bool is_wait_die;
@@ -532,7 +539,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
 		pos = cur;
 
 		/* Wait-Die: ensure younger waiters die. */
-		__ww_mutex_die(lock, cur, ww_ctx);
+		__ww_mutex_die(lock, cur, ww_ctx, wake_q);
 	}
 
 	__ww_waiter_add(lock, waiter, pos);
@@ -550,7 +557,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
 		 * such that either we or the fastpath will wound @ww->ctx.
 		 */
 		smp_mb();
-		__ww_mutex_wound(lock, ww_ctx, ww->ctx);
+		__ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q);
 	}
 
 	return 0;
-- 
2.45.2.993.g49e7a77208-goog
Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock
Posted by Peter Zijlstra 1 year, 3 months ago
On Tue, Jul 09, 2024 at 01:31:44PM -0700, John Stultz wrote:
> @@ -405,8 +409,10 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
>  	 * die or wound us.
>  	 */
>  	lock_wait_lock(&lock->base);
> -	__ww_mutex_check_waiters(&lock->base, ctx);
> +	__ww_mutex_check_waiters(&lock->base, ctx, &wake_q);

  preempt_disable()

>  	unlock_wait_lock(&lock->base);
> +
> +	wake_up_q(&wake_q);

  preempt_enable();

>  }
>  
>  static __always_inline int
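
That is, assembling the suggestion, the tail of
ww_mutex_set_context_fastpath() would read roughly like this (a sketch,
not the final code):

	lock_wait_lock(&lock->base);
	__ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
	preempt_disable();
	unlock_wait_lock(&lock->base);
	wake_up_q(&wake_q);
	preempt_enable();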
Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock
Posted by John Stultz 1 year, 3 months ago
On Fri, Jul 12, 2024 at 7:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jul 09, 2024 at 01:31:44PM -0700, John Stultz wrote:
> > @@ -405,8 +409,10 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
> >        * die or wound us.
> >        */
> >       lock_wait_lock(&lock->base);
> > -     __ww_mutex_check_waiters(&lock->base, ctx);
> > +     __ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
>
>   preempt_disable()
>
> >       unlock_wait_lock(&lock->base);
> > +
> > +     wake_up_q(&wake_q);
>
>   preempt_enable();

Ah. Will fix for the next version. Thank you for the review and
pointing this out!
-john
Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock
Posted by K Prateek Nayak 1 year, 3 months ago
Hello John,

On 7/10/2024 2:01 AM, John Stultz wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> 
> In preparation for nesting mutex::wait_lock under rq::lock, we need to
> remove wakeups from under it.
> 
> [..snip..]
>
> @@ -681,6 +682,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
>   		}
>   
>   		raw_spin_unlock(&lock->wait_lock);
> +		/* Make sure we do wakeups before calling schedule */
> +		if (!wake_q_empty(&wake_q)) {

nit.

This check seems unnecessary (to my untrained eye). Any harm in
skipping it and simply doing a wake_up_q() followed by wake_q_init()
unconditionally?

> +			wake_up_q(&wake_q);
> +			wake_q_init(&wake_q);
> +		}
>   		schedule_preempt_disabled();
>   
> [..snip..]
>
> @@ -1207,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
>   	struct rt_mutex_waiter *top_waiter = waiter;
>   	struct rt_mutex_base *next_lock;
>   	int chain_walk = 0, res;
> +	DEFINE_WAKE_Q(wake_q);
>   
>   	lockdep_assert_held(&lock->wait_lock);
>   
> @@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
>   
>   		/* Check whether the waiter should back out immediately */
>   		rtm = container_of(lock, struct rt_mutex, rtmutex);
> -		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
> +		preempt_disable();
> +		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, &wake_q);
> +		wake_up_q(&wake_q);
> +		preempt_enable();

I'm trying to understand this: we enter task_blocks_on_rt_mutex() with
"wait_lock" held (I believe the lockdep_assert_held() in the previous
hunk checks for the same). I walked down the call chain (although
briefly) and could only spot "task->pi_lock" being locked and unlocked
before this call to "wake_up_q()", but the "wait_lock" seems to be held
throughout, only being unlocked and locked again for
"rt_mutex_adjust_prio_chain()" further down.

Did I miss something, or is disabling preemption for this specific hunk
enough to enable safe nesting?
--
Thanks and Regards,
Prateek

>   		if (res) {
>   			raw_spin_lock(&task->pi_lock);
>   			rt_mutex_dequeue(lock, waiter);
> [..snip..]
Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock
Posted by John Stultz 1 year, 3 months ago
On Wed, Jul 10, 2024 at 10:42 AM 'K Prateek Nayak' via kernel-team
<kernel-team@android.com> wrote:
> On 7/10/2024 2:01 AM, John Stultz wrote:
> > @@ -681,6 +682,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> >               }
> >
> >               raw_spin_unlock(&lock->wait_lock);
> > +             /* Make sure we do wakeups before calling schedule */
> > +             if (!wake_q_empty(&wake_q)) {
>
> nit.
>
> This check seems unnecessary (to my untrained eye). Any harm in
> skipping it and simply doing a wake_up_q() followed by wake_q_init()
> unconditionally?
>
> > +                     wake_up_q(&wake_q);
> > +                     wake_q_init(&wake_q);
> > +             }
> >               schedule_preempt_disabled();

Ah, thank you for the suggestion. I've reworked this in my tree!
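
(For reference, the reworked hunk presumably now reads something like
the sketch below; the exact code will appear in the next version:)

	raw_spin_unlock(&lock->wait_lock);
	/* Make sure we do wakeups before calling schedule */
	wake_up_q(&wake_q);
	wake_q_init(&wake_q);
	schedule_preempt_disabled();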

> > @@ -1207,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
> >       struct rt_mutex_waiter *top_waiter = waiter;
> >       struct rt_mutex_base *next_lock;
> >       int chain_walk = 0, res;
> > +     DEFINE_WAKE_Q(wake_q);
> >
> >       lockdep_assert_held(&lock->wait_lock);
> >
> > @@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
> >
> >               /* Check whether the waiter should back out immediately */
> >               rtm = container_of(lock, struct rt_mutex, rtmutex);
> > -             res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
> > +             preempt_disable();
> > +             res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, &wake_q);
> > +             wake_up_q(&wake_q);
> > +             preempt_enable();
>
> I'm trying to understand this: we enter task_blocks_on_rt_mutex() with
> "wait_lock" held (I believe the lockdep_assert_held() in the previous
> hunk checks for the same). I walked down the call chain (although
> briefly) and could only spot "task->pi_lock" being locked and unlocked
> before this call to "wake_up_q()", but the "wait_lock" seems to be held
> throughout, only being unlocked and locked again for
> "rt_mutex_adjust_prio_chain()" further down.
>
> Did I miss something, or is disabling preemption for this specific hunk
> enough to enable safe nesting?

Thank you for pointing this out! It looks like I need to pipe the
wake_q down through task_blocks_on_rt_mutex() and
rtlock_slowlock_locked(), and define one in rtlock_slowlock().
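
Roughly like this, assuming rtlock_slowlock() keeps its current shape
(just a sketch of the intended plumbing; the final code may differ):

	static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
	{
		DEFINE_WAKE_Q(wake_q);
		unsigned long flags;

		raw_spin_lock_irqsave(&lock->wait_lock, flags);
		rtlock_slowlock_locked(lock, &wake_q);
		preempt_disable();
		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
		wake_up_q(&wake_q);
		preempt_enable();
	}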

Really appreciate the review and feedback here!
-john
Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock
Posted by Peter Zijlstra 1 year, 3 months ago
On Wed, Jul 10, 2024 at 11:11:51PM +0530, K Prateek Nayak wrote:

> > @@ -681,6 +682,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> >   		}
> >   		raw_spin_unlock(&lock->wait_lock);
> > +		/* Make sure we do wakeups before calling schedule */
> > +		if (!wake_q_empty(&wake_q)) {
> 
> nit.
> 
> This check seems unnecessary (to my untrained eye). Any harm in
> skipping it and simply doing a wake_up_q() followed by wake_q_init()
> unconditionally?

Yeah, that doesn't really save anything.

> > +			wake_up_q(&wake_q);
> > +			wake_q_init(&wake_q);
> > +		}
> >   		schedule_preempt_disabled();
> >   		first = __mutex_waiter_is_first(lock, &waiter);

> > @@ -1207,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
> >   	struct rt_mutex_waiter *top_waiter = waiter;
> >   	struct rt_mutex_base *next_lock;
> >   	int chain_walk = 0, res;
> > +	DEFINE_WAKE_Q(wake_q);
> >   	lockdep_assert_held(&lock->wait_lock);
> > @@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
> >   		/* Check whether the waiter should back out immediately */
> >   		rtm = container_of(lock, struct rt_mutex, rtmutex);
> > -		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
> > +		preempt_disable();
> > +		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, &wake_q);
> > +		wake_up_q(&wake_q);
> > +		preempt_enable();
> 
> I'm trying to understand this: we enter task_blocks_on_rt_mutex() with
> "wait_lock" held (I believe the lockdep_assert_held() in the previous
> hunk checks for the same). I walked down the call chain (although
> briefly) and could only spot "task->pi_lock" being locked and unlocked
> before this call to "wake_up_q()", but the "wait_lock" seems to be held
> throughout, only being unlocked and locked again for
> "rt_mutex_adjust_prio_chain()" further down.
> 
> Did I miss something, or is disabling preemption for this specific hunk
> enough to enable safe nesting?

So the whole ww-mutex stuff got significantly reworked since this patch
was started, but I think you're right, this wake_up_q() needs to be
outside wait_lock, while currently it sits inside.

One thing that helps, though, is that I think that when
__ww_mutex_add_waiter() returns !0, the wake_q should be empty here.