[PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()

Posted by Peter Zijlstra 4 months, 4 weeks ago
In order to fix the whole SCHED_EXT balance/pick mess, and avoid
further complicating all this, make the regular:

  p->pi_lock
    rq->lock
      dsq->lock

order work. Notably, while sched_class::pick_task() is called with
rq->lock held and pick_task_scx() takes dsq->lock, and while the
normal sched_change pattern goes through dequeue/enqueue and thus takes
dsq->lock, various other things like task_call_func() /
sched_setaffinity() do not necessarily do so.

Therefore, add a per-task spinlock pointer that can be set to
reference the shared runqueue lock where appropriate and teach
__task_rq_lock() to take this lock along with rq->lock.

This ensures all 'normal' scheduling operations serialize against the
shared lock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    2 +-
 kernel/sched/core.c   |   27 ++++++++++++++++++++++-----
 kernel/sched/sched.h  |   10 ++++++----
 kernel/sched/stats.h  |    2 +-
 4 files changed, 30 insertions(+), 11 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,8 +1225,8 @@ struct task_struct {
 	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
 	spinlock_t			alloc_lock;
 
-	/* Protection of the PI data structures: */
 	raw_spinlock_t			pi_lock;
+	raw_spinlock_t			*srq_lock;
 
 	struct wake_q_node		wake_q;
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock)
 {
+	raw_spinlock_t *slock;
 	struct rq *rq;
 
 	lockdep_assert_held(&p->pi_lock);
 
 	for (;;) {
 		rq = task_rq(p);
+		slock = p->srq_lock;
 		raw_spin_rq_lock(rq);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+		if (slock)
+			raw_spin_lock(slock);
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
+			   (!slock || p->srq_lock == slock))) {
 			rq_pin_lock(rq, rf);
 			return rq;
 		}
+		if (slock)
+			raw_spin_unlock(slock);
 		raw_spin_rq_unlock(rq);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -728,12 +735,16 @@ struct rq *task_rq_lock(struct task_stru
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
+	raw_spinlock_t *slock;
 	struct rq *rq;
 
 	for (;;) {
 		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
 		rq = task_rq(p);
+		slock = p->srq_lock;
 		raw_spin_rq_lock(rq);
+		if (slock)
+			raw_spin_lock(slock);
 		/*
 		 *	move_queued_task()		task_rq_lock()
 		 *
@@ -751,10 +762,14 @@ struct rq *task_rq_lock(struct task_stru
 		 * dependency headed by '[L] rq = task_rq()' and the acquire
 		 * will pair with the WMB to ensure we then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
+			   (!slock || p->srq_lock == slock))) {
 			rq_pin_lock(rq, rf);
 			return rq;
 		}
+
+		if (slock)
+			raw_spin_unlock(slock);
 		raw_spin_rq_unlock(rq);
 		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 
@@ -2617,7 +2632,8 @@ static int migration_cpu_stop(void *data
 		 */
 		WARN_ON_ONCE(!pending->stop_pending);
 		preempt_disable();
-		task_rq_unlock(rq, p, &rf);
+		rq_unlock(rq, &rf);
+		raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
 		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
 				    &pending->arg, &pending->stop_work);
 		preempt_enable();
@@ -2626,7 +2642,8 @@ static int migration_cpu_stop(void *data
 out:
 	if (pending)
 		pending->stop_pending = false;
-	task_rq_unlock(rq, p, &rf);
+	rq_unlock(rq, &rf);
+	raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
 
 	if (complete)
 		complete_all(&pending->done);
@@ -3743,7 +3760,7 @@ static int ttwu_runnable(struct task_str
 		ttwu_do_wakeup(p);
 		ret = 1;
 	}
-	__task_rq_unlock(rq, &rf);
+	__task_rq_unlock(rq, p, &rf);
 
 	return ret;
 }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1800,10 +1800,13 @@ struct rq *task_rq_lock(struct task_stru
 	__acquires(p->pi_lock)
 	__acquires(rq->lock);
 
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	__releases(rq->lock)
 {
 	rq_unpin_lock(rq, rf);
+	if (p->srq_lock)
+		raw_spin_unlock(p->srq_lock);
 	raw_spin_rq_unlock(rq);
 }
 
@@ -1812,8 +1815,7 @@ task_rq_unlock(struct rq *rq, struct tas
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
-	rq_unpin_lock(rq, rf);
-	raw_spin_rq_unlock(rq);
+	__task_rq_unlock(rq, p, rf);
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
@@ -1824,7 +1826,7 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct
 
 DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
 		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
-		    __task_rq_unlock(_T->rq, &_T->rf),
+		    __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
 		    struct rq *rq; struct rq_flags rf)
 
 static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(stru
 
 		rq = __task_rq_lock(p, &rf);
 		psi_task_change(p, p->psi_flags, 0);
-		__task_rq_unlock(rq, &rf);
+		__task_rq_unlock(rq, p, &rf);
 	}
 }
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Tejun Heo 4 months, 4 weeks ago
Hello,

On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
>  struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
>  	__acquires(rq->lock)
>  {
> +	raw_spinlock_t *slock;
>  	struct rq *rq;
>  
>  	lockdep_assert_held(&p->pi_lock);
>  
>  	for (;;) {
>  		rq = task_rq(p);
> +		slock = p->srq_lock;
>  		raw_spin_rq_lock(rq);
> -		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> +		if (slock)
> +			raw_spin_lock(slock);
> +		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> +			   (!slock || p->srq_lock == slock))) {
>  			rq_pin_lock(rq, rf);
>  			return rq;
>  		}

With the !slock condition, the following scenario is possible:

  __task_rq_lock()
     slock = p->srq_lock; /* NULL */
                                                dispatch_enqueue()
                                                  p->srq_lock = &dsq->lock;
                                                enqueue finishes
     raw_spin_rq_lock(rq);
     rq is the same, $slock is NULL, return
  do something assuming p is locked down        p gets dispatched to another rq

I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
the task is queued on, and if we can exclude other sched operations that
way, we don't have to hold source rq lock when moving the task to another rq
for execution, right?

In the last patch, it's set on dispatch_enqueue() and cleared when the task
leaves the DSQ. Let's consider a simple scenario where a task gets enqueued,
gets put on a non-local DSQ and then dispatched to a local DSQ. Assuming
everything works out and we don't have to lock the source rq for migration,
we'd be depending on task_rq_lock() reliably hitting p->srq_lock to avoid
races, but I'm not sure how this would work. Let's say p is currently
associated with CPU1 on a non-local DSQ w/ p->srq_lock set to its source
DSQ.

  pick_task_ext() on CPU0               task property change on CPU1
    locks the DSQ
    picks p      
    task_unlink_from_dsq()              task_rq_lock();
      p->srq_lock = NULL;                 lock rq on CPU1
    p is moved to local DSQ               sees p->srq_lock == NULL
                                          return
  p starts running
  anything can happen
                                        proceed with property change

What am I missing?

Thanks.

-- 
tejun
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Peter Zijlstra 4 months, 3 weeks ago
On Thu, Sep 11, 2025 at 02:19:57PM -1000, Tejun Heo wrote:
> Hello,
> 
> On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> > @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
> >  struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> >  	__acquires(rq->lock)
> >  {
> > +	raw_spinlock_t *slock;
> >  	struct rq *rq;
> >  
> >  	lockdep_assert_held(&p->pi_lock);
> >  
> >  	for (;;) {
> >  		rq = task_rq(p);
> > +		slock = p->srq_lock;
> >  		raw_spin_rq_lock(rq);
> > -		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> > +		if (slock)
> > +			raw_spin_lock(slock);
> > +		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> > +			   (!slock || p->srq_lock == slock))) {
> >  			rq_pin_lock(rq, rf);
> >  			return rq;
> >  		}

Yeah, I think that needs to change a little. Perhaps something like:

	slock2 = p->srq_lock;
	if (... && (!slock2 || slock2 == slock))

> With the !slock condition, the following scenario is possible:
> 
>   __task_rq_lock()
>      slock = p->srq_lock; /* NULL */
>                                                 dispatch_enqueue()
>                                                   p->srq_lock = &dsq->lock;
>                                                 enqueue finishes
>      raw_spin_rq_lock(rq);
>      rq is the same, $slock is NULL, return
>   do something assuming p is locked down        p gets dispatched to another rq
> 
> I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
> is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
> the task is queued on, and if we can exclude other sched operations that
> way, we don't have to hold source rq lock when moving the task to another rq
> for execution, right?

Indeed. If !p->srq_lock then task_rq(p)->lock must be sufficient.

So for enqueue, which sets p->srq_lock, this must be done while holding
task_rq(p)->lock.

So the above example should be serialized on task_rq(p)->lock, since
__task_rq_lock() holds it, enqueue cannot happen. Conversely, if enqueue
holds task_rq(p)->lock, then __task_rq_lock() will have to wait for
that, and then observe the newly set p->srq_lock and cycle to take that.
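
Roughly, on the enqueue side (simplified sketch only; the real
dispatch_enqueue() obviously does a lot more, the point is just the lock
context in which p->srq_lock gets set):

	static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p)
	{
		/* enqueue runs with task_rq(p)->lock held */
		lockdep_assert_rq_held(task_rq(p));

		raw_spin_lock(&dsq->lock);
		/* ... link @p on @dsq ... */
		p->srq_lock = &dsq->lock;	/* now visible to __task_rq_lock() */
		raw_spin_unlock(&dsq->lock);
	}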

> In the last patch, it's set on dispatch_enqueue() and cleared when the task
> leaves the DSQ. Let's consider a simple scenario where a task gets enqueued,
> gets put on a non-local DSQ and then dispatched to a local DSQ. Assuming
> everything works out and we don't have to lock the source rq for migration,
> we'd be depending on task_rq_lock() reliably hitting p->srq_lock to avoid
> races, but I'm not sure how this would work. Let's say p is currently
> associated with CPU1 on a non-local DSQ w/ p->srq_lock set to its source
> DSQ.
> 
>   pick_task_ext() on CPU0               task property change on CPU1
>     locks the DSQ
>     picks p      
>     task_unlink_from_dsq()              task_rq_lock();
>       p->srq_lock = NULL;                 lock rq on CPU1
>     p is moved to local DSQ               sees p->srq_lock == NULL
>                                           return
>   p starts running
>   anything can happen
>                                         proceed with property change

Hmm, the thinking was that if !p->srq_lock then task_rq(p)->lock should
be sufficient.

We must do set_task_cpu(0) before task_unlink_from_dsq() (and I got this
order wrong in yesterday's email).

  pick_task_ext() on CPU0		
    lock DSQ
    pick p
    set_task_cpu(0)			task_rq_lock()
    task_unlink_from_dsq()		  if !p->srq_lock, then task_rq(p) == 0
      p->srq_lock = NULL;
    p is moved to local DSQ

Perhaps the p->srq_lock store should be store-release, so that the cpu
store is before.

Then if we observe p->srq_lock, we'll serialize against DSQ and all is
well, if we observe !p->srq_lock then we must also observe task_rq(p) ==
0 and then we'll serialize on rq->lock.
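
Something like this on the pick side (sketch; whether the matching load
in __task_rq_lock() wants an explicit acquire or the rq->lock acquire we
already do is sufficient needs a closer look):

	/* pick_task_scx() path, dsq->lock held -- sketch */
	set_task_cpu(p, cpu_of(rq));
	/* in task_unlink_from_dsq(): order the cpu store before the NULL */
	smp_store_release(&p->srq_lock, NULL);

	/* __task_rq_lock() side -- sketch */
	slock = smp_load_acquire(&p->srq_lock);
	/* if this reads the NULL from above, the new task_rq(p) must be visible too */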


Now let me see if there isn't an ABA issue here, consider:

pre: task_cpu(p) != 2, p->srq_lock = NULL

  CPU0				CPU1				CPU2

  __task_rq_lock()		enqueue_task_scx() 		pick_task_scx()

				rq = task_rq(p);
				LOCK rq->lock
  rq = task_rq(p)
  LOCK rq->lock
    .. waits
				LOCK dsq->lock
				enqueue on dsq
				p->srq_lock = &dsq->lock	
				UNLOCK dsq->lock
								LOCK dsq->lock
								pick p
				UNLOCK rq->lock
								set_task_cpu(2)
								task_unlink_from_dsq()
								  p->srq_lock = NULL;
								UNLOCK dsq->lock
    .. resumes
 
At this point our CPU0's __task_rq_lock():

  - if it observes p->srq_lock, it will cycle taking that, only to then
    find out p->srq_lock is no longer set, but then it must also see
    task_rq() has changed, so the next cycle will block on CPU2's
    rq->lock.

  - if it observes !p->srq_lock, then it cannot be the initial NULL,
    since the initial task_rq(p)->lock ordering prohibits this. So it
    must be the second NULL, which then also mandates we see the CPU
    change and we'll cycle to take CPU2's rq->lock.

That is, I _think_ we're okay :-)
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Tejun Heo 4 months, 3 weeks ago
Hello,

On Fri, Sep 12, 2025 at 01:54:59PM +0200, Peter Zijlstra wrote:
> > With the !slock condition, the following scenario is possible:
> > 
> >   __task_rq_lock()
> >      slock = p->srq_lock; /* NULL */
> >                                                 dispatch_enqueue()
> >                                                   p->srq_lock = &dsq->lock;
> >                                                 enqueue finishes
> >      raw_spin_rq_lock(rq);
> >      rq is the same, $slock is NULL, return
> >   do something assuming p is locked down        p gets dispatched to another rq
> > 
> > I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
> > is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
> > the task is queued on, and if we can exclude other sched operations that
> > way, we don't have to hold source rq lock when moving the task to another rq
> > for execution, right?
> 
> Indeed. If !p->srq_lock then task_rq(p)->lock must be sufficient.
> 
> So for enqueue, which sets p->srq_lock, this must be done while holding
> task_rq(p)->lock.
> 
> So the above example should be serialized on task_rq(p)->lock, since
> __task_rq_lock() holds it, enqueue cannot happen. Conversely, if enqueue
> holds task_rq(p)->lock, then __task_rq_lock() will have to wait for
> that, and then observe the newly set p->srq_lock and cycle to take that.

For that to work, [__]task_rq_lock() would have to read p->srq_lock while
holding the rq lock. Simple reordering, but yeah it'd help to have a
setter/getter for explicit locking rules.
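
Something like the below, perhaps (names made up, just to pin down those
rules):

	/* made-up helpers, only to make the locking rules explicit */
	static inline void task_set_srq_lock(struct task_struct *p, raw_spinlock_t *lock)
	{
		if (lock)			/* set: under task_rq(p)->lock */
			lockdep_assert_rq_held(task_rq(p));
		else if (p->srq_lock)		/* clear: under the old srq lock */
			lockdep_assert_held(p->srq_lock);
		smp_store_release(&p->srq_lock, lock);
	}

	static inline raw_spinlock_t *task_get_srq_lock(struct task_struct *p)
	{
		/* [__]task_rq_lock() should (re)read this after taking rq->lock */
		return smp_load_acquire(&p->srq_lock);
	}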

...
> We must do set_task_cpu(0) before task_unlink_from_dsq() (and I got this
> order wrong in yesterday's email).
> 
>   pick_task_ext() on CPU0		
>     lock DSQ
>     pick p
>     set_task_cpu(0)			task_rq_lock()
>     task_unlink_from_dsq()		  if !p->srq_lock, then task_rq(p) == 0
>       p->srq_lock = NULL;
>     p is moved to local DSQ
> 
> Perhaps the p->srq_lock store should be store-release, so that the cpu
> store is before.
> 
> Then if we observe p->srq_lock, we'll serialize against DSQ and all is
> well, if we observe !p->srq_lock then we must also observe task_rq(p) ==
> 0 and then we'll serialize on rq->lock.

I see, so the interlocking is between task_rq(p) and p->srq_lock - either
one sees the updated CPU or non-NULL srq_lock. As long as the one that
clears ->srq_lock has both the destination rq and DSQ locked, task_rq_lock()
either ends up waiting on ->srq_lock or sees updated CPU and has to loop
over and wait on the destination rq.

> Now let me see if there isn't an ABA issue here, consider:
> 
> pre: task_cpu(p) != 2, p->srq_lock = NULL
> 
>   CPU0				CPU1				CPU2
> 
>   __task_rq_lock()		enqueue_task_scx() 		pick_task_scx()
> 
> 				rq = task_rq(p);
> 				LOCK rq->lock
>   rq = task_rq(p)
>   LOCK rq->lock
>     .. waits
> 				LOCK dsq->lock
> 				enqueue on dsq
> 				p->srq_lock = &dsq->lock	
> 				UNLOCK dsq->lock
> 								LOCK dsq->lock
> 								pick p
> 				UNLOCK rq->lock
> 								set_task_cpu(2)
> 								task_unlink_from_dsq()
> 								  p->srq_lock = NULL;
> 								UNLOCK dsq->lock
>     .. resumes
>  
> At this point our CPU0's __task_rq_lock():
> 
>   - if it observes p->srq_lock, it will cycle taking that, only to then
>     find out p->srq_lock is no longer set, but then it must also see
>     task_rq() has changed, so the next cycle will block on CPU2's
>     rq->lock.
> 
>   - if it observes !p->srq_lock, then it cannot be the initial NULL,
>     since the initial task_rq(p)->lock ordering prohibits this. So it
>     must be the second NULL, which then also mandates we see the CPU
>     change and we'll cycle to take CPU2's rq->lock.
> 
> That is, I _think_ we're okay :-)

It *seems* that way to me. There are two other scenarios tho.

- A task can move from a non-local DSQ to another non-local DSQ at any time
  while queued. As this doesn't cause rq migration, we can probably just
  overwrite p->srq_lock to the new one. Need to think about it a bit more.

- A task can be queued on a BPF data structure and thus may not be on any
  DSQ. I think this can be handled by adding a raw_spinlock to task_struct
  and treating the task as if it's on its own DSQ by pointing to that one,
  and grabbing that lock when transferring that task from BPF side.

So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
doing something simpler and just add a per-task lock which nests inside rq
lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
sched classes that want to migrate tasks without grabbing the source rq
lock? That way, we don't need to do the lock pointer dancing while achieving
about the same result. From sched_ext's POV, grabbing that per-task lock is
likely going to be cheaper than doing the rq lock switching, so it's way
simpler and nothing gets worse.

Thanks.

-- 
tejun
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Peter Zijlstra 4 months, 3 weeks ago
On Fri, Sep 12, 2025 at 07:56:21AM -1000, Tejun Heo wrote:

> It *seems* that way to me. There are two other scenarios tho.
> 
> - A task can move from a non-local DSQ to another non-local DSQ at any time
>   while queued. As this doesn't cause rq migration, we can probably just
>   overwrite p->srq_lock to the new one. Need to think about it a bit more.

It can use task_on_rq_migrating(), exactly like 'normal' rq-to-rq
migration:

	LOCK src_dsq->lock
	p->on_rq = TASK_ON_RQ_MIGRATING;
	task_unlink_from_dsq();
	UNLOCK src_dsq->lock

	LOCK dst_dsq->lock
	dispatch_enqueue()
	p->on_rq = TASK_ON_RQ_QUEUED;
	UNLOCK dst_dsq->lock

Same reasoning as for the pick_task_scx() migration, if it observes
!p->srq_lock, then it must observe MIGRATING and we'll spin-wait until
QUEUED. At which point we'll see the new srq_lock.

> - A task can be queued on a BPF data structure and thus may not be on any
>   DSQ. I think this can be handled by adding a raw_spinlock to task_struct
>   and treating the task as if it's on its own DSQ by pointing to that one,
>   and grabbing that lock when transferring that task from BPF side.

Hmm, and BPF data structures cannot have a lock associated with them?
I'm thinking they must, something is serializing all that.

> So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
> doing something simpler and just add a per-task lock which nests inside rq
> lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
> sched classes that want to migrate tasks without grabbing the source rq
> lock? That way, we don't need to do the lock pointer dancing while achieving
> about the same result. From sched_ext's POV, grabbing that per-task lock is
> likely going to be cheaper than doing the rq lock switching, so it's way
> simpler and nothing gets worse.

I *really* don't like that. Fundamentally a runqueue is 'rich' data
structure. It has a container (list, tree, whatever) but also a pile of
statistics (time, vtime, counts, load-sums, averages). Adding/removing a
task from a runqueue needs all that serialized. A per-task lock simply
cannot do this.

If you've hidden this lock inside BPF such that C cannot access it, then
your abstraction needs fixing. Surely it is possible to have a C DSQ to
mirror whatever the BPF thing does. Add a few helpers for BPF to
create/destroy DSQs (IDs) and a callback to map a task to a DSQ. Then
the C part can use the DSQ lock, and hold it while calling into whatever
BPF.

Additionally, it can sanity check the BPF thing, tasks cannot go
'missing' without C knowing wtf they went -- which is that bypass
problem, no?
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Tejun Heo 4 months, 3 weeks ago
Hello,

On Mon, Sep 15, 2025 at 10:38:15AM +0200, Peter Zijlstra wrote:
> On Fri, Sep 12, 2025 at 07:56:21AM -1000, Tejun Heo wrote:
> > It *seems* that way to me. There are two other scenarios tho.
> > 
> > - A task can move from a non-local DSQ to another non-local DSQ at any time
> >   while queued. As this doesn't cause rq migration, we can probably just
> >   overwrite p->srq_lock to the new one. Need to think about it a bit more.
> 
> It can use task_on_rq_migrating(), exactly like 'normal' rq-to-rq
> migration:
> 
> 	LOCK src_dsq->lock
> 	p->on_rq = TASK_ON_RQ_MIGRATING;
> 	task_unlink_from_dsq();
> 	UNLOCK src_dsq->lock
> 
> 	LOCK dst_dsq->lock
> 	dispatch_enqueue()
> 	p->on_rq = TASK_ON_RQ_QUEUED;
> 	UNLOCK dst_dsq->lock
> 
> Same reasoning as for the pick_task_scx() migration, if it observes
> !p->srq_lock, then it must observe MIGRATING and we'll spin-wait until
> QUEUED. At which point we'll see the new srq_lock.

I see.

> > - A task can be queued on a BPF data structure and thus may not be on any
> >   DSQ. I think this can be handled by adding a raw_spinlock to task_struct
> >   and treating the task as if it's on its own DSQ by pointing to that one,
> >   and grabbing that lock when transferring that task from BPF side.
> 
> Hmm, and BPF data structures cannot have a lock associated with them?
> I'm thinking they must, something is serializing all that.
> 
> > So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
> > doing something simpler and just add a per-task lock which nests inside rq
> > lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
> > sched classes that want to migrate tasks without grabbing the source rq
> > lock? That way, we don't need to the lock pointer dancing while achieving
> > about the same result. From sched_ext's POV, grabbing that per-task lock is
> > likely going to be cheaper than doing the rq lock switching, so it's way
> > simpler and nothing gets worse.
> 
> I *really* don't like that. Fundamentally a runqueue is 'rich' data
> structure. It has a container (list, tree, whatever) but also a pile of
> statistics (time, vtime, counts, load-sums, averages). Adding/removing a
> task from a runqueue needs all that serialized. A per-task lock simply
> cannot do this.
> 
> If you've hidden this lock inside BPF such that C cannot access it, then
> your abstraction needs fixing. Surely it is possible to have a C DSQ to
> mirror whatever the BPF thing does. Add a few helpers for BPF to
> create/destroy DSQs (IDs) and a callback to map a task to a DSQ. Then
> the C part can use the DSQ lock, and hold it while calling into whatever
> BPF.

Most current schedulers (except for scx_qmap which is there just to demo how
to use BPF side queueing) use DSQs to handle tasks the way you're
describing. However, BPF arena is becoming more accessible and gaining wider
usage, paired with purely BPF side synchronization constructs (spinlock or
some lockless data structure).

Long term, I think maintaining flexibility is of higher importance for
sched_ext than e.g. small performance improvements or even design or
implementation aesthetics. The primary purpose is enabling trying out new,
sometimes wild, things after all. As such, I don't think it'd be a good idea
to put strict restrictions on how the BPF side operates unless it affects
the ability to recover the system from a malfunctioning BPF scheduler, of
course.

> Additionally, it can sanity check the BPF thing, tasks cannot go
> 'missing' without C knowing wtf they went -- which is that bypass
> problem, no?

They are orthogonal. Even if all tasks are on DSQs, the scheduler may still
fail to dispatch some DSQs for too long, mess up the ordering inside, cause
excessive bouncing across them, and what not. So, the kernel side still
needs to be able to detect and contain failures. The only difference between
a task being on a DSQ or BPF side is that there needs to be extra per-task
state tracking to ensure that tasks only transit states in an orderly
fashion (i.e. don't dispatch the same task twice).

Thanks.

-- 
tejun
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Tejun Heo 4 months, 3 weeks ago
Hello, again.

On Tue, Sep 16, 2025 at 12:29:57PM -1000, Tejun Heo wrote:
...
> Long term, I think maintaining flexibility is of higher importance for
> sched_ext than e.g. small performance improvements or even design or
> implementation aesthetics. The primary purpose is enabling trying out new,
> sometimes wild, things after all. As such, I don't think it'd be a good idea
> to put strict restrictions on how the BPF side operates unless it affects
> the ability to recover the system from a malfunctioning BPF scheduler, of
> course.

Thinking a bit more about it, I wonder whether the status quo is actually an
okay balance. All in-kernel sched classes follow the per-CPU rich-rq design, which
meshes well with the current locking scheme, for obvious reasons.

sched_ext is an oddball in that it may want to hot-migrate tasks at the last
minute because who knows what the BPF side wants to do. However, this just
boils down to having to always call balance() before any pick_task()
attempts (including the DL server case). Yeah, it's a niggle, especially as
there needs to be a secondary hook to handle losing the race between
balance() and pick_task(), but it's pretty contained conceptually and not a
lot of code.

Thanks.

-- 
tejun
Re: [PATCH 12/14] sched: Add shared runqueue locking to __task_rq_lock()
Posted by Peter Zijlstra 4 months, 3 weeks ago
On Fri, Sep 12, 2025 at 01:54:59PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 11, 2025 at 02:19:57PM -1000, Tejun Heo wrote:
> > Hello,
> > 
> > On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> > > @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
> > >  struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> > >  	__acquires(rq->lock)
> > >  {
> > > +	raw_spinlock_t *slock;
> > >  	struct rq *rq;
> > >  
> > >  	lockdep_assert_held(&p->pi_lock);
> > >  
> > >  	for (;;) {
> > >  		rq = task_rq(p);
> > > +		slock = p->srq_lock;
> > >  		raw_spin_rq_lock(rq);
> > > -		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> > > +		if (slock)
> > > +			raw_spin_lock(slock);
> > > +		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> > > +			   (!slock || p->srq_lock == slock))) {
> > >  			rq_pin_lock(rq, rf);
> > >  			return rq;
> > >  		}
> 
> Yeah, I think that needs to change a little. Perhaps something like:
> 
> 	slock2 = p->srq_lock;
> 	if (... && (!slock2 || slock2 == slock))

I'm being stupid, all that wants is: && (p->srq_lock == slock). If there
is a mismatch, unlock and retry.
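
Which would make the retry loop look something like this (untested sketch):

	for (;;) {
		rq = task_rq(p);
		slock = p->srq_lock;
		raw_spin_rq_lock(rq);
		if (slock)
			raw_spin_lock(slock);
		/* re-read p->srq_lock now that we hold rq->lock (and slock) */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
			   p->srq_lock == slock)) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		if (slock)
			raw_spin_unlock(slock);
		raw_spin_rq_unlock(rq);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}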