kernel/sched/ext.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-)
sched_ext_dead() calls scx_exit_task() while holding the rq lock, which
invokes ops.exit_task(). If the BPF program calls helpers that acquire
non-raw locks (e.g., bpf_task_storage_delete()), this can trigger the
following BUG:
=============================
[ BUG: Invalid wait context ]
7.0.0-rc1-virtme #1 Not tainted
-----------------------------
(udev-worker)/115 is trying to lock:
ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
other info that might help us debug this:
context-{5:5}
3 locks held by (udev-worker)/115:
#0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
#1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
#2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
stack backtrace:
...
Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
Call Trace:
<TASK>
__lock_acquire+0xf86/0x1de0
lock_acquire+0xcf/0x310
_raw_spin_lock_irqsave+0x39/0x60
spin_lock_irqsave_ssp_contention+0x54/0x90
srcu_gp_start_if_needed+0x2a7/0x490
bpf_selem_unlink+0x24b/0x590
bpf_task_storage_delete+0x3a/0x90
bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
bpf__sched_ext_ops_exit_task+0x4b/0xa7
Fix this by extending scx_exit_task() to take optional rq and rq_flags
pointers. When they are provided, temporarily drop the rq lock before
invoking ops.exit_task() and re-acquire it afterwards. When they are
NULL, call ops.exit_task() with the rq lock held as before.
After dropping the rq lock around ops.exit_task(), interrupts are
enabled, so an interrupt can potentially run and call
enqueue_task_scx(), which uses SCX_KF_ENQUEUE; scx_kf_allow() would
treat this as invalid nesting because the interrupted context still has
SCX_KF_REST set (from ops.exit_task()). This nesting should be
legitimate when the inner call is from an interrupt handler, so skip the
nesting check when in_interrupt() is true.
Fixes: 7900aa699c34 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()")
Cc: stable@vger.kernel.org # v6.19+
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
kernel/sched/ext.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1594987d637b0..37415713b7c0b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -275,9 +275,10 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
static __always_inline void scx_kf_allow(u32 mask)
{
/* nesting is allowed only in increasing scx_kf_mask order */
- WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
- "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
- current->scx.kf_mask, mask);
+ if (!in_interrupt())
+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
+ current->scx.kf_mask, mask);
current->scx.kf_mask |= mask;
barrier();
}
@@ -2968,7 +2969,8 @@ static void scx_disable_task(struct task_struct *p)
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p, struct rq **rq,
+ struct rq_flags *rf)
{
struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
@@ -2993,9 +2995,17 @@ static void scx_exit_task(struct task_struct *p)
return;
}
- if (SCX_HAS_OP(sch, exit_task))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
- p, &args);
+ if (SCX_HAS_OP(sch, exit_task)) {
+ if (rq && rf) {
+ task_rq_unlock(*rq, p, rf);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, NULL, p, &args);
+ *rq = task_rq_lock(p, rf);
+ } else {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
+ }
+ }
+
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3068,7 +3078,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_exit_task(p);
+ scx_exit_task(p, &rq, &rf);
task_rq_unlock(rq, p, &rf);
}
@@ -3127,7 +3137,7 @@ void sched_ext_dead(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_exit_task(p);
+ scx_exit_task(p, &rq, &rf);
task_rq_unlock(rq, p, &rf);
}
}
@@ -4359,7 +4369,7 @@ static void scx_disable_workfn(struct kthread_work *work)
p->sched_class = new_class;
}
- scx_exit_task(p);
+ scx_exit_task(p, NULL, NULL);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
--
2.53.0
Hello,
On Thu, Mar 12, 2026 at 12:14:41AM +0100, Andrea Righi wrote:
> sched_ext_dead() calls scx_exit_task() while holding the rq lock, which
> invokes ops.exit_task(). If the BPF program calls helpers that acquire
> non-raw locks (e.g., bpf_task_storage_delete()), this can trigger the
> following BUG:
>
> =============================
> [ BUG: Invalid wait context ]
> 7.0.0-rc1-virtme #1 Not tainted
> -----------------------------
> (udev-worker)/115 is trying to lock:
> ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
> other info that might help us debug this:
> context-{5:5}
> 3 locks held by (udev-worker)/115:
> #0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
> #1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> #2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
> stack backtrace:
> ...
> Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
> Call Trace:
> <TASK>
> __lock_acquire+0xf86/0x1de0
> lock_acquire+0xcf/0x310
> _raw_spin_lock_irqsave+0x39/0x60
> spin_lock_irqsave_ssp_contention+0x54/0x90
> srcu_gp_start_if_needed+0x2a7/0x490
> bpf_selem_unlink+0x24b/0x590
> bpf_task_storage_delete+0x3a/0x90
> bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
> bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> bpf__sched_ext_ops_exit_task+0x4b/0xa7
I think the better way to handle this is making sure bpf operations that we
may need are safe while holding rq lock. After all, we need to be able to
use them while holding rq lock. It doesn't make a lot of sense for exit to
disallow that.
Thanks.
--
tejun
© 2016 - 2026 Red Hat, Inc.