[PATCH v4] sched_ext: Fix lock imbalance in dispatch_to_local_dsq()
Posted by Andrea Righi 1 year ago
While performing the rq locking dance in dispatch_to_local_dsq(), we may
trigger the following lock imbalance condition, in particular when
multiple tasks are rapidly changing CPU affinity (e.g., when running
`stress-ng --race-sched 0`):

[   13.413579] =====================================
[   13.413660] WARNING: bad unlock balance detected!
[   13.413729] 6.13.0-virtme #15 Not tainted
[   13.413792] -------------------------------------
[   13.413859] kworker/1:1/80 is trying to release lock (&rq->__lock) at:
[   13.413954] [<ffffffff873c6c48>] dispatch_to_local_dsq+0x108/0x1a0
[   13.414111] but there are no more locks to release!
[   13.414176]
[   13.414176] other info that might help us debug this:
[   13.414258] 1 lock held by kworker/1:1/80:
[   13.414318]  #0: ffff8b66feb41698 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x20/0x90
[   13.414612]
[   13.414612] stack backtrace:
[   13.415255] CPU: 1 UID: 0 PID: 80 Comm: kworker/1:1 Not tainted 6.13.0-virtme #15
[   13.415505] Workqueue:  0x0 (events)
[   13.415567] Sched_ext: dsp_local_on (enabled+all), task: runnable_at=-2ms
[   13.415570] Call Trace:
[   13.415700]  <TASK>
[   13.415744]  dump_stack_lvl+0x78/0xe0
[   13.415806]  ? dispatch_to_local_dsq+0x108/0x1a0
[   13.415884]  print_unlock_imbalance_bug+0x11b/0x130
[   13.415965]  ? dispatch_to_local_dsq+0x108/0x1a0
[   13.416226]  lock_release+0x231/0x2c0
[   13.416326]  _raw_spin_unlock+0x1b/0x40
[   13.416422]  dispatch_to_local_dsq+0x108/0x1a0
[   13.416554]  flush_dispatch_buf+0x199/0x1d0
[   13.416652]  balance_one+0x194/0x370
[   13.416751]  balance_scx+0x61/0x1e0
[   13.416848]  prev_balance+0x43/0xb0
[   13.416947]  __pick_next_task+0x6b/0x1b0
[   13.417052]  __schedule+0x20d/0x1740

This happens because dispatch_to_local_dsq() is racing with
dispatch_dequeue() and, when the latter wins, we incorrectly assume that
the task has been moved to dst_rq and end up releasing dst_rq's lock
while still holding src_rq's.

Fix by properly tracking the rq that is currently locked.
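
In sketch form, the lock dance becomes the following (condensed from the
diff below; the src_rq == dst_rq fast path, the global-DSQ fallback and
the idle wakeup are elided):

	struct rq *locked_rq = rq;

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/* we won the race: moving @p leaves @dst_rq locked */
		locked_rq = move_remote_task_to_local_dsq(p, enq_flags,
							  src_rq, dst_rq);
	}
	/* otherwise dequeue won and @src_rq is still the locked rq */

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}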

Fixes: 4d3ca89bdd31 ("sched_ext: Refactor consume_remote_task()")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 110 ++++++++++++++++++++++++++++-----------------
 1 file changed, 68 insertions(+), 42 deletions(-)

ChangeLog v3 -> v4:
 - small refactoring to fix an unused variable build warning on UP

ChangeLog v2 -> v3:
 - keep track of the currently locked rq in dispatch_to_local_dsq()

ChangeLog v1 -> v2:
 - more comments to clarify the race with dequeue
 - rebase to tip

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4493a3c419c9..a290052478ed 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2253,9 +2253,11 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  * @dst_rq: rq to move the task into, locked on return
  *
  * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ *
+ * Return the rq where @p has been moved.
  */
-static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
-					  struct rq *src_rq, struct rq *dst_rq)
+static struct rq *move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+						struct rq *src_rq, struct rq *dst_rq)
 {
 	lockdep_assert_rq_held(src_rq);
 
@@ -2277,6 +2279,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 	dst_rq->scx.extra_enq_flags = enq_flags;
 	activate_task(dst_rq, p, 0);
 	dst_rq->scx.extra_enq_flags = 0;
+
+	return dst_rq;
 }
 
 /*
@@ -2387,7 +2391,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
 	}
 }
 #else	/* CONFIG_SMP */
-static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
+static inline struct rq *
+move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+			      struct rq *src_rq, struct rq *dst_rq)
+{
+	WARN_ON_ONCE(1);
+	return src_rq;
+}
 static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
 #endif	/* CONFIG_SMP */
@@ -2538,38 +2548,12 @@ static bool consume_global_dsq(struct rq *rq)
 	return consume_dispatch_q(rq, global_dsqs[node]);
 }
 
-/**
- * dispatch_to_local_dsq - Dispatch a task to a local dsq
- * @rq: current rq which is locked
- * @dst_dsq: destination DSQ
- * @p: task to dispatch
- * @enq_flags: %SCX_ENQ_*
- *
- * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
- * DSQ. This function performs all the synchronization dancing needed because
- * local DSQs are protected with rq locks.
- *
- * The caller must have exclusive ownership of @p (e.g. through
- * %SCX_OPSS_DISPATCHING).
- */
-static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
-				  struct task_struct *p, u64 enq_flags)
+#ifdef CONFIG_SMP
+static void move_to_remote_dsq(struct rq *rq, struct rq *src_rq, struct rq *dst_rq,
+			       struct task_struct *p, u64 enq_flags)
 {
-	struct rq *src_rq = task_rq(p);
-	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+	struct rq *locked_rq = rq;
 
-	/*
-	 * We're synchronized against dequeue through DISPATCHING. As @p can't
-	 * be dequeued, its task_rq and cpus_allowed are stable too.
-	 *
-	 * If dispatching to @rq that @p is already on, no lock dancing needed.
-	 */
-	if (rq == src_rq && rq == dst_rq) {
-		dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
-		return;
-	}
-
-#ifdef CONFIG_SMP
 	if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 		dispatch_enqueue(find_global_dsq(p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
@@ -2593,12 +2577,16 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
 
 	/* switch to @src_rq lock */
-	if (rq != src_rq) {
-		raw_spin_rq_unlock(rq);
+	if (locked_rq != src_rq) {
+		raw_spin_rq_unlock(locked_rq);
+		locked_rq = src_rq;
 		raw_spin_rq_lock(src_rq);
 	}
 
-	/* task_rq couldn't have changed if we're still the holding cpu */
+	/*
+	 * If p->scx.holding_cpu still matches the current CPU, task_rq(p)
+	 * has not changed and we can safely move the task to @dst_rq.
+	 */
 	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
 	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
 		/*
@@ -2610,8 +2598,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 			p->scx.holding_cpu = -1;
 			dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
 		} else {
-			move_remote_task_to_local_dsq(p, enq_flags,
-						      src_rq, dst_rq);
+			locked_rq = move_remote_task_to_local_dsq(p, enq_flags,
+								  src_rq, dst_rq);
 		}
 
 		/* if the destination CPU is idle, wake it up */
@@ -2620,13 +2608,51 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 	/* switch back to @rq lock */
-	if (rq != dst_rq) {
-		raw_spin_rq_unlock(dst_rq);
+	if (locked_rq != rq) {
+		raw_spin_rq_unlock(locked_rq);
 		raw_spin_rq_lock(rq);
 	}
-#else	/* CONFIG_SMP */
-	BUG();	/* control can not reach here on UP */
+}
+#else /* CONFIG_SMP */
+static void move_to_remote_dsq(struct rq *rq, struct rq *src_rq, struct rq *dst_rq,
+			       struct task_struct *p, u64 enq_flags)
+{
+	BUG();	/* this should never be called on UP */
+}
 #endif	/* CONFIG_SMP */
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
+ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
+ * DSQ. This function performs all the synchronization dancing needed because
+ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ */
+static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+				  struct task_struct *p, u64 enq_flags)
+{
+	struct rq *src_rq = task_rq(p);
+	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+
+	/*
+	 * We're synchronized against dequeue through DISPATCHING. As @p can't
+	 * be dequeued, its task_rq and cpus_allowed are stable too.
+	 *
+	 * If dispatching to @rq that @p is already on, no lock dancing needed.
+	 */
+	if (rq == src_rq && rq == dst_rq) {
+		dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+		return;
+	}
+
+	move_to_remote_dsq(rq, src_rq, dst_rq, p, enq_flags);
 }
 
 /**
-- 
2.48.1
Re: [PATCH v4] sched_ext: Fix lock imbalance in dispatch_to_local_dsq()
Posted by Tejun Heo 1 year ago
Hello,

On Sat, Jan 25, 2025 at 10:16:57AM +0100, Andrea Righi wrote:
...
> @@ -2253,9 +2253,11 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
>   * @dst_rq: rq to move the task into, locked on return
>   *
>   * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
> + *
> + * Return the rq where @p has been moved.
>   */
> -static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> -					  struct rq *src_rq, struct rq *dst_rq)
> +static struct rq *move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> +						struct rq *src_rq, struct rq *dst_rq)
>  {
>  	lockdep_assert_rq_held(src_rq);
>  
> @@ -2277,6 +2279,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
>  	dst_rq->scx.extra_enq_flags = enq_flags;
>  	activate_task(dst_rq, p, 0);
>  	dst_rq->scx.extra_enq_flags = 0;
> +
> +	return dst_rq;

The returned dst_rq always matches the input param, right? Let's please not
do this. The return value can mislead users into assuming that the returned
value may differ from the input. e.g. kobject_get() does a similar identity
return for convenience, and that led people to assume that kobject_get()
does zero-ref testing before inc'ing, which led to a group of bugs. Just
follow up the call statement with an explicit assignment if necessary.
Nothing meaningful is achieved by merging that into the call statement.
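
Concretely, something along these lines (illustrative only, reusing this
patch's names):

	/* misleading: reads as if the callee may pick a different rq */
	locked_rq = move_remote_task_to_local_dsq(p, enq_flags,
						  src_rq, dst_rq);

	/* clearer: the identity is explicit at the call site */
	move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq);
	locked_rq = dst_rq;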

> +/**
> + * dispatch_to_local_dsq - Dispatch a task to a local dsq
> + * @rq: current rq which is locked
> + * @dst_dsq: destination DSQ
> + * @p: task to dispatch
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
> + * DSQ. This function performs all the synchronization dancing needed because
> + * local DSQs are protected with rq locks.
> + *
> + * The caller must have exclusive ownership of @p (e.g. through
> + * %SCX_OPSS_DISPATCHING).
> + */
> +static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
> +				  struct task_struct *p, u64 enq_flags)
> +{
> +	struct rq *src_rq = task_rq(p);
> +	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
> +
> +	/*
> +	 * We're synchronized against dequeue through DISPATCHING. As @p can't
> +	 * be dequeued, its task_rq and cpus_allowed are stable too.
> +	 *
> +	 * If dispatching to @rq that @p is already on, no lock dancing needed.
> +	 */
> +	if (rq == src_rq && rq == dst_rq) {
> +		dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		return;
> +	}
> +
> +	move_to_remote_dsq(rq, src_rq, dst_rq, p, enq_flags);

I'm not sure the refactoring is adding much, but even if it does:

- All the changes the patch makes and why should be explained in the
  description.

- The refactoring is obscuring the actual fix. If refactoring is desirable,
  please put it in a separate patch.

Thanks.

-- 
tejun
Re: [PATCH v4] sched_ext: Fix lock imbalance in dispatch_to_local_dsq()
Posted by Andrea Righi 1 year ago
On Mon, Jan 27, 2025 at 08:59:12AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Sat, Jan 25, 2025 at 10:16:57AM +0100, Andrea Righi wrote:
> ...
> > @@ -2253,9 +2253,11 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> >   * @dst_rq: rq to move the task into, locked on return
> >   *
> >   * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
> > + *
> > + * Return the rq where @p has been moved.
> >   */
> > -static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> > -					  struct rq *src_rq, struct rq *dst_rq)
> > +static struct rq *move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> > +						struct rq *src_rq, struct rq *dst_rq)
> >  {
> >  	lockdep_assert_rq_held(src_rq);
> >  
> > @@ -2277,6 +2279,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
> >  	dst_rq->scx.extra_enq_flags = enq_flags;
> >  	activate_task(dst_rq, p, 0);
> >  	dst_rq->scx.extra_enq_flags = 0;
> > +
> > +	return dst_rq;
> 
> The returned dst_rq always matches the input param, right? Let's please not
> do this. The return value can mislead users into assuming that the returned
> value may differ from the input. e.g. kobject_get() does a similar identity
> return for convenience, and that led people to assume that kobject_get()
> does zero-ref testing before inc'ing, which led to a group of bugs. Just
> follow up the call statement with an explicit assignment if necessary.
> Nothing meaningful is achieved by merging that into the call statement.
> 
> > +/**
> > + * dispatch_to_local_dsq - Dispatch a task to a local dsq
> > + * @rq: current rq which is locked
> > + * @dst_dsq: destination DSQ
> > + * @p: task to dispatch
> > + * @enq_flags: %SCX_ENQ_*
> > + *
> > + * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
> > + * DSQ. This function performs all the synchronization dancing needed because
> > + * local DSQs are protected with rq locks.
> > + *
> > + * The caller must have exclusive ownership of @p (e.g. through
> > + * %SCX_OPSS_DISPATCHING).
> > + */
> > +static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
> > +				  struct task_struct *p, u64 enq_flags)
> > +{
> > +	struct rq *src_rq = task_rq(p);
> > +	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
> > +
> > +	/*
> > +	 * We're synchronized against dequeue through DISPATCHING. As @p can't
> > +	 * be dequeued, its task_rq and cpus_allowed are stable too.
> > +	 *
> > +	 * If dispatching to @rq that @p is already on, no lock dancing needed.
> > +	 */
> > +	if (rq == src_rq && rq == dst_rq) {
> > +		dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> > +		return;
> > +	}
> > +
> > +	move_to_remote_dsq(rq, src_rq, dst_rq, p, enq_flags);
> 
> I'm not sure the refactoring is adding much, but even if it does:
> 
> - All the changes the patch makes and why should be explained in the
>   description.
> 
> - The refactoring is obscuring the actual fix. If refactoring is desirable,
>   please put it in a separate patch.

Ok, makes sense, I'll send a new patch without the refactoring.

Thanks,
-Andrea