In order to fix the whole SCHED_EXT balance/pick mess, and avoid
further complicating all this, make the regular:

  p->pi_lock
    rq->lock
      dsq->lock

order work. Notably, sched_class::pick_task() is called with rq->lock
held and pick_task_scx() takes dsq->lock, and the normal sched_change
pattern goes through dequeue/enqueue and thus also takes dsq->lock;
however, various other things like task_call_func() /
sched_setaffinity() do not necessarily do so.

Therefore, add a per-task spinlock pointer that can be set to reference
the shared runqueue lock where appropriate, and teach __task_rq_lock()
to take this lock along with rq->lock.

This ensures all 'normal' scheduling operations serialize against the
shared lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 2 +-
kernel/sched/core.c | 27 ++++++++++++++++++++++-----
kernel/sched/sched.h | 10 ++++++----
kernel/sched/stats.h | 2 +-
4 files changed, 30 insertions(+), 11 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,8 +1225,8 @@ struct task_struct {
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock;
- /* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
+ raw_spinlock_t *srq_lock;
struct wake_q_node wake_q;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock)
{
+ raw_spinlock_t *slock;
struct rq *rq;
lockdep_assert_held(&p->pi_lock);
for (;;) {
rq = task_rq(p);
+ slock = p->srq_lock;
raw_spin_rq_lock(rq);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ if (slock)
+ raw_spin_lock(slock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
+ (!slock || p->srq_lock == slock))) {
rq_pin_lock(rq, rf);
return rq;
}
+ if (slock)
+ raw_spin_unlock(slock);
raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
@@ -728,12 +735,16 @@ struct rq *task_rq_lock(struct task_stru
__acquires(p->pi_lock)
__acquires(rq->lock)
{
+ raw_spinlock_t *slock;
struct rq *rq;
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
+ slock = p->srq_lock;
raw_spin_rq_lock(rq);
+ if (slock)
+ raw_spin_lock(slock);
/*
* move_queued_task() task_rq_lock()
*
@@ -751,10 +762,14 @@ struct rq *task_rq_lock(struct task_stru
* dependency headed by '[L] rq = task_rq()' and the acquire
* will pair with the WMB to ensure we then also see migrating.
*/
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
+ (!slock || p->srq_lock == slock))) {
rq_pin_lock(rq, rf);
return rq;
}
+
+ if (slock)
+ raw_spin_unlock(slock);
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
@@ -2617,7 +2632,8 @@ static int migration_cpu_stop(void *data
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2626,7 +2642,8 @@ static int migration_cpu_stop(void *data
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -3743,7 +3760,7 @@ static int ttwu_runnable(struct task_str
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1800,10 +1800,13 @@ struct rq *task_rq_lock(struct task_stru
__acquires(p->pi_lock)
__acquires(rq->lock);
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
+ if (p->srq_lock)
+ raw_spin_unlock(p->srq_lock);
raw_spin_rq_unlock(rq);
}
@@ -1812,8 +1815,7 @@ task_rq_unlock(struct rq *rq, struct tas
__releases(rq->lock)
__releases(p->pi_lock)
{
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ __task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1824,7 +1826,7 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct
DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
_T->rq = __task_rq_lock(_T->lock, &_T->rf),
- __task_rq_unlock(_T->rq, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(stru
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
}
Hello,
On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
> struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> __acquires(rq->lock)
> {
> + raw_spinlock_t *slock;
> struct rq *rq;
>
> lockdep_assert_held(&p->pi_lock);
>
> for (;;) {
> rq = task_rq(p);
> + slock = p->srq_lock;
> raw_spin_rq_lock(rq);
> - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> + if (slock)
> + raw_spin_lock(slock);
> + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> + (!slock || p->srq_lock == slock))) {
> rq_pin_lock(rq, rf);
> return rq;
> }
With the !slock condition, the following scenario is possible:
  __task_rq_lock()
    slock = p->srq_lock; /* NULL */
                                            dispatch_enqueue()
                                              p->srq_lock = &dsq->lock;
                                              enqueue finishes
    raw_spin_rq_lock(rq);
    rq is the same, $slock is NULL, return
    do something assuming p is locked down  p gets dispatched to another rq
I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
the task is queued on, and if we can exclude other sched operations that
way, we don't have to hold source rq lock when moving the task to another rq
for execution, right?
In the last patch, it's set on dispatch_enqueue() and cleared when the task
leaves the DSQ. Let's consider a simple scenario where a task gets enqueued,
gets put on a non-local DSQ and then dispatched to a local DSQ. Assuming
everything works out and we don't have to lock the source rq for migration,
we'd be depending on task_rq_lock() reliably hitting p->srq_lock to avoid
races, but I'm not sure how this would work. Let's say p is currently
associated with CPU1 on a non-local DSQ w/ p->srq_lock set to its source
DSQ.
  pick_task_ext() on CPU0                 task property change on CPU1
    locks the DSQ
    picks p
    task_unlink_from_dsq()                task_rq_lock();
    p->srq_lock = NULL;                   lock rq on CPU1
    p is moved to local DSQ               sees p->srq_lock == NULL
                                          return
    p starts running
    anything can happen
                                          proceed with property change
What am I missing?
Thanks.
--
tejun
On Thu, Sep 11, 2025 at 02:19:57PM -1000, Tejun Heo wrote:
> Hello,
>
> On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> > @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
> > struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> > __acquires(rq->lock)
> > {
> > + raw_spinlock_t *slock;
> > struct rq *rq;
> >
> > lockdep_assert_held(&p->pi_lock);
> >
> > for (;;) {
> > rq = task_rq(p);
> > + slock = p->srq_lock;
> > raw_spin_rq_lock(rq);
> > - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> > + if (slock)
> > + raw_spin_lock(slock);
> > + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> > + (!slock || p->srq_lock == slock))) {
> > rq_pin_lock(rq, rf);
> > return rq;
> > }
Yeah, I think that needs to change a little. Perhaps something like:
slock2 = p->srq_lock;
if (... && (!slock2 || slock2 == slock))
> With the !slock condition, the following scenario is possible:
>
> __task_rq_lock()
> slock = p->srq_lock; /* NULL */
> dispatch_enqueue()
> p->srq_lock = &dsq->lock;
> enqueue finishes
> raw_spin_rq_lock(rq);
> rq is the same, $slock is NULL, return
> do something assuming p is locked down p gets dispatched to another rq
>
> I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
> is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
> the task is queued on, and if we can exclude other sched operations that
> way, we don't have to hold source rq lock when moving the task to another rq
> for execution, right?
Indeed. If !p->srq_lock then task_rq(p)->lock must be sufficient.
So for enqueue, which sets p->srq_lock, this must be done while holding
task_rq(p)->lock.
So the above example should be serialized on task_rq(p)->lock, since
__task_rq_lock() holds it, enqueue cannot happen. Conversely, if enqueue
holds task_rq(p)->lock, then __task_rq_lock() will have to wait for
that, and then observe the newly set p->srq_lock and cycle to take that.
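As a minimal sketch of that rule (the helper name and the elided linking
step are made up, not the actual scx enqueue path), the setter only runs
with task_rq(p)->lock held:

    /*
     * Illustrative sketch only: p->srq_lock is published while
     * task_rq(p)->lock is held, so a concurrent __task_rq_lock() either
     * still spins on rq->lock here, or acquires it afterwards and then
     * observes the new pointer.
     */
    static void dsq_enqueue_sketch(struct rq *rq, struct task_struct *p,
                                   struct scx_dispatch_q *dsq)
    {
            lockdep_assert_rq_held(rq);     /* rq == task_rq(p) */

            raw_spin_lock(&dsq->lock);
            /* ... link p into the DSQ ... */
            WRITE_ONCE(p->srq_lock, &dsq->lock);
            raw_spin_unlock(&dsq->lock);
    }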
> In the last patch, it's set on dispatch_enqueue() and cleared when the task
> leaves the DSQ. Let's consider a simple scenario where a task gets enqueued,
> gets put on a non-local DSQ and then dispatched to a local DSQ. Assuming
> everything works out and we don't have to lock the source rq for migration,
> we'd be depending on task_rq_lock() reliably hitting p->srq_lock to avoid
> races, but I'm not sure how this would work. Let's say p is currently
> associated with CPU1 on a non-local DSQ w/ p->srq_lock set to its source
> DSQ.
>
>   pick_task_ext() on CPU0                 task property change on CPU1
>     locks the DSQ
>     picks p
>     task_unlink_from_dsq()                task_rq_lock();
>     p->srq_lock = NULL;                   lock rq on CPU1
>     p is moved to local DSQ               sees p->srq_lock == NULL
>                                           return
>     p starts running
>     anything can happen
>                                           proceed with property change
Hmm, the thinking was that if !p->srq_lock then task_rq(p)->lock should
be sufficient.
We must do set_task_cpu(0) before task_unlink_from_dsq() (and I got this
order wrong in yesterday's email).
  pick_task_ext() on CPU0
    lock DSQ
    pick p
    set_task_cpu(0)                     task_rq_lock()
    task_unlink_from_dsq()                if !p->srq_lock, then task_rq(p) == 0
    p->srq_lock = NULL;
    p is moved to local DSQ
Perhaps the p->srq_lock store should be store-release, so that the cpu
store is before.
Then if we observe p->srq_lock, we'll serialize against DSQ and all is
well, if we observe !p->srq_lock then we must also observe task_rq(p) ==
0 and then we'll serialize on rq->lock.
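As a sketch of that pick path (pick_first_on_dsq() is a made-up
placeholder; only the store ordering matters here), the release store
keeps the CPU change visible before the NULL:

    /* Illustrative fragment of the pick-side ordering described above. */
    raw_spin_lock(&dsq->lock);
    p = pick_first_on_dsq(dsq);
    set_task_cpu(p, cpu_of(rq));            /* rq == this CPU's rq */
    task_unlink_from_dsq(p, dsq);
    smp_store_release(&p->srq_lock, NULL);  /* NULL ordered after the cpu store */
    raw_spin_unlock(&dsq->lock);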
Now let me see if there isn't an ABA issue here, consider:
pre: task_cpu(p) != 2, p->srq_lock = NULL
  CPU0                      CPU1                        CPU2

  __task_rq_lock()          enqueue_task_scx()          pick_task_scx()

                            rq = task_rq(p);
                            LOCK rq->lock
  rq = task_rq(p)
  LOCK rq->lock
  .. waits
                            LOCK dsq->lock
                            enqueue on dsq
                            p->srq_lock = &dsq->lock
                            UNLOCK dsq->lock
                                                        LOCK dsq->lock
                                                        pick p
                            UNLOCK rq->lock
                                                        set_task_cpu(2)
                                                        task_unlink_from_dsq()
                                                        p->srq_lock = NULL;
                                                        UNLOCK dsq->lock
  .. resumes
At this point our CPU0's __task_rq_lock():
- if it observes p->srq_lock, it will cycle taking that, only to then
  find out p->srq_lock is no longer set, but then it must also see
  task_rq() has changed, so the next cycle will block on CPU2's
  rq->lock.

- if it observes !p->srq_lock, then it cannot be the initial NULL,
  since the initial task_rq(p)->lock ordering prohibits this. So it
  must be the second NULL, which then also mandates we see the CPU
  change and we'll cycle to take CPU2's rq->lock.
That is, I _think_ we're okay :-)
Hello,

On Fri, Sep 12, 2025 at 01:54:59PM +0200, Peter Zijlstra wrote:
> > With the !slock condition, the following scenario is possible:
> >
> >   __task_rq_lock()
> >     slock = p->srq_lock; /* NULL */
> >                                             dispatch_enqueue()
> >                                               p->srq_lock = &dsq->lock;
> >                                               enqueue finishes
> >     raw_spin_rq_lock(rq);
> >     rq is the same, $slock is NULL, return
> >     do something assuming p is locked down  p gets dispatched to another rq
> >
> > I'm unclear on when p->srq_lock would be safe to set and clear, so the goal
> > is that whoever does [__]task_rq_lock() ends up waiting on the dsq lock that
> > the task is queued on, and if we can exclude other sched operations that
> > way, we don't have to hold source rq lock when moving the task to another rq
> > for execution, right?
>
> Indeed. If !p->srq_lock then task_rq(p)->lock must be sufficient.
>
> So for enqueue, which sets p->srq_lock, this must be done while holding
> task_rq(p)->lock.
>
> So the above example should be serialized on task_rq(p)->lock, since
> __task_rq_lock() holds it, enqueue cannot happen. Conversely, if enqueue
> holds task_rq(p)->lock, then __task_rq_lock() will have to wait for
> that, and then observe the newly set p->srq_lock and cycle to take that.

For that to work, [__]task_rq_lock() would have to read p->srq_lock while
holding rq_lock. Simple reordering, but yeah it'd help to have
setter/getter for explicit locking rules.

...

> We must do set_task_cpu(0) before task_unlink_from_dsq() (and I got this
> order wrong in yesterday's email).
>
>   pick_task_ext() on CPU0
>     lock DSQ
>     pick p
>     set_task_cpu(0)                     task_rq_lock()
>     task_unlink_from_dsq()                if !p->srq_lock, then task_rq(p) == 0
>     p->srq_lock = NULL;
>     p is moved to local DSQ
>
> Perhaps the p->srq_lock store should be store-release, so that the cpu
> store is before.
>
> Then if we observe p->srq_lock, we'll serialize against DSQ and all is
> well, if we observe !p->srq_lock then we must also observe task_rq(p) ==
> 0 and then we'll serialize on rq->lock.

I see, so the interlocking is between task_rq(p) and p->srq_lock - either
one sees the updated CPU or non-NULL srq_lock. As long as the one that
clears ->srq_lock has both the destination rq and DSQ locked,
task_rq_lock() either ends up waiting on ->srq_lock or sees the updated
CPU and has to loop over and wait on the destination rq.

> Now let me see if there isn't an ABA issue here, consider:
>
>   pre: task_cpu(p) != 2, p->srq_lock = NULL
>
>   CPU0                      CPU1                        CPU2
>
>   __task_rq_lock()          enqueue_task_scx()          pick_task_scx()
>
>                             rq = task_rq(p);
>                             LOCK rq->lock
>   rq = task_rq(p)
>   LOCK rq->lock
>   .. waits
>                             LOCK dsq->lock
>                             enqueue on dsq
>                             p->srq_lock = &dsq->lock
>                             UNLOCK dsq->lock
>                                                         LOCK dsq->lock
>                                                         pick p
>                             UNLOCK rq->lock
>                                                         set_task_cpu(2)
>                                                         task_unlink_from_dsq()
>                                                         p->srq_lock = NULL;
>                                                         UNLOCK dsq->lock
>   .. resumes
>
> At this point our CPU0's __task_rq_lock():
>
> - if it observes p->srq_lock, it will cycle taking that, only to then
>   find out p->srq_lock is no longer set, but then it must also see
>   task_rq() has changed, so the next cycle will block on CPU2's
>   rq->lock.
>
> - if it observes !p->srq_lock, then it cannot be the initial NULL,
>   since the initial task_rq(p)->lock ordering prohibits this. So it
>   must be the second NULL, which then also mandates we see the CPU
>   change and we'll cycle to take CPU2's rq->lock.
>
> That is, I _think_ we're okay :-)

It *seems* that way to me. There are two other scenarios tho.

- A task can move from a non-local DSQ to another non-local DSQ at any time
  while queued. As this doesn't cause rq migration, we can probably just
  overwrite p->srq_lock to the new one. Need to think about it a bit more.

- A task can be queued on a BPF data structure and thus may not be on any
  DSQ. I think this can be handled by adding a raw_spinlock to task_struct
  and treating the task as if it's on its own DSQ by pointing to that one,
  and grabbing that lock when transferring that task from BPF side.

So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
doing something simpler and just add a per-task lock which nests inside rq
lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
sched classes that want to migrate tasks without grabbing the source rq
lock? That way, we don't need to do the lock pointer dancing while achieving
about the same result. From sched_ext's POV, grabbing that per-task lock is
likely going to be cheaper than doing the rq lock switching, so it's way
simpler and nothing gets worse.

Thanks.

--
tejun
On Fri, Sep 12, 2025 at 07:56:21AM -1000, Tejun Heo wrote:
> It *seems* that way to me. There are two other scenarios tho.
>
> - A task can move from a non-local DSQ to another non-local DSQ at any time
>   while queued. As this doesn't cause rq migration, we can probably just
>   overwrite p->srq_lock to the new one. Need to think about it a bit more.

It can use task_on_rq_migrating(), exactly like 'normal' rq-to-rq
migration:

        LOCK src_dsq->lock
        p->on_rq = TASK_ON_RQ_MIGRATING;
        task_unlink_from_dsq();
        UNLOCK src_dsq->lock

        LOCK dst_dsq->lock
        dispatch_enqueue()
        p->on_rq = TASK_ON_RQ_QUEUED;
        UNLOCK dst_dsq->lock

Same reasoning as for the pick_task_scx() migration, if it observes
!p->srq_lock, then it must observe MIGRATING and we'll spin-wait until
QUEUED. At which point we'll see the new srq_lock.

> - A task can be queued on a BPF data structure and thus may not be on any
>   DSQ. I think this can be handled by adding a raw_spinlock to task_struct
>   and treating the task as if it's on its own DSQ by pointing to that one,
>   and grabbing that lock when transferring that task from BPF side.

Hmm, and BPF data structures cannot have a lock associated with them?
I'm thinking they must, something is serializing all that.

> So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
> doing something simpler and just add a per-task lock which nests inside rq
> lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
> sched classes that want to migrate tasks without grabbing the source rq
> lock? That way, we don't need to do the lock pointer dancing while achieving
> about the same result. From sched_ext's POV, grabbing that per-task lock is
> likely going to be cheaper than doing the rq lock switching, so it's way
> simpler and nothing gets worse.

I *really* don't like that. Fundamentally a runqueue is a 'rich' data
structure. It has a container (list, tree, whatever) but also a pile of
statistics (time, vtime, counts, load-sums, averages). Adding/removing a
task from a runqueue needs all that serialized. A per-task lock simply
cannot do this.

If you've hidden this lock inside BPF such that C cannot access it, then
your abstraction needs fixing. Surely it is possible to have a C DSQ to
mirror whatever the BPF thing does. Add a few helpers for BPF to
create/destroy DSQs (IDs) and a callback to map a task to a DSQ. Then
the C part can use the DSQ lock, and hold it while calling into whatever
BPF.

Additionally, it can sanity check the BPF thing, tasks cannot go
'missing' without C knowing wtf they went -- which is that bypass
problem, no?
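For illustration, a C-ish sketch of that DSQ-to-DSQ handshake (helper
signatures are simplified, and it assumes, as in the last patch, that the
unlink/enqueue helpers clear and set p->srq_lock), paired with the waiter
loop __task_rq_lock() already has:

    /*
     * Sketch only: move p from src to dst under the MIGRATING handshake,
     * without holding either rq lock.
     */
    static void dsq_to_dsq_move_sketch(struct task_struct *p,
                                       struct scx_dispatch_q *src,
                                       struct scx_dispatch_q *dst)
    {
            raw_spin_lock(&src->lock);
            WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
            task_unlink_from_dsq(p, src);           /* clears p->srq_lock */
            raw_spin_unlock(&src->lock);

            raw_spin_lock(&dst->lock);
            dispatch_enqueue(dst, p);               /* sets p->srq_lock = &dst->lock */
            WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
            raw_spin_unlock(&dst->lock);
    }

    /* the waiter in __task_rq_lock() spins until QUEUED is visible again: */
    while (unlikely(task_on_rq_migrating(p)))
            cpu_relax();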
Hello,

On Mon, Sep 15, 2025 at 10:38:15AM +0200, Peter Zijlstra wrote:
> On Fri, Sep 12, 2025 at 07:56:21AM -1000, Tejun Heo wrote:
> > It *seems* that way to me. There are two other scenarios tho.
> >
> > - A task can move from a non-local DSQ to another non-local DSQ at any time
> >   while queued. As this doesn't cause rq migration, we can probably just
> >   overwrite p->srq_lock to the new one. Need to think about it a bit more.
>
> It can use task_on_rq_migrating(), exactly like 'normal' rq-to-rq
> migration:
>
>         LOCK src_dsq->lock
>         p->on_rq = TASK_ON_RQ_MIGRATING;
>         task_unlink_from_dsq();
>         UNLOCK src_dsq->lock
>
>         LOCK dst_dsq->lock
>         dispatch_enqueue()
>         p->on_rq = TASK_ON_RQ_QUEUED;
>         UNLOCK dst_dsq->lock
>
> Same reasoning as for the pick_task_scx() migration, if it observes
> !p->srq_lock, then it must observe MIGRATING and we'll spin-wait until
> QUEUED. At which point we'll see the new srq_lock.

I see.

> > - A task can be queued on a BPF data structure and thus may not be on any
> >   DSQ. I think this can be handled by adding a raw_spinlock to task_struct
> >   and treating the task as if it's on its own DSQ by pointing to that one,
> >   and grabbing that lock when transferring that task from BPF side.
>
> Hmm, and BPF data structures cannot have a lock associated with them?
> I'm thinking they must, something is serializing all that.
>
> > So, it *seems* solvable but I'm afraid it's becoming too subtle. How about
> > doing something simpler and just add a per-task lock which nests inside rq
> > lock which is always grabbed by [__]task_rq_lock() and optionally grabbed by
> > sched classes that want to migrate tasks without grabbing the source rq
> > lock? That way, we don't need to do the lock pointer dancing while achieving
> > about the same result. From sched_ext's POV, grabbing that per-task lock is
> > likely going to be cheaper than doing the rq lock switching, so it's way
> > simpler and nothing gets worse.
>
> I *really* don't like that. Fundamentally a runqueue is a 'rich' data
> structure. It has a container (list, tree, whatever) but also a pile of
> statistics (time, vtime, counts, load-sums, averages). Adding/removing a
> task from a runqueue needs all that serialized. A per-task lock simply
> cannot do this.
>
> If you've hidden this lock inside BPF such that C cannot access it, then
> your abstraction needs fixing. Surely it is possible to have a C DSQ to
> mirror whatever the BPF thing does. Add a few helpers for BPF to
> create/destroy DSQs (IDs) and a callback to map a task to a DSQ. Then
> the C part can use the DSQ lock, and hold it while calling into whatever
> BPF.

Most current schedulers (except for scx_qmap which is there just to demo
how to use BPF side queueing) use DSQs to handle tasks the way you're
describing. However, BPF arena is becoming more accessible and gaining
wider usage, paired with purely BPF side synchronization constructs
(spinlock or some lockless data structure).

Long term, I think maintaining flexibility is of higher importance for
sched_ext than e.g. small performance improvements or even design or
implementation aesthetics. The primary purpose is enabling trying out new,
sometimes wild, things after all. As such, I don't think it'd be a good idea
to put strict restrictions on how the BPF side operates unless it affects
the ability to recover the system from a malfunctioning BPF scheduler, of
course.

> Additionally, it can sanity check the BPF thing, tasks cannot go
> 'missing' without C knowing wtf they went -- which is that bypass
> problem, no?

They are orthogonal. Even if all tasks are on DSQs, the scheduler may still
fail to dispatch some DSQs for too long, mess up the ordering inside, cause
excessive bouncing across them, and what not. So, the kernel side still
needs to be able to detect and contain failures. The only difference
between a task being on a DSQ or the BPF side is that there needs to be
extra per-task state tracking to ensure that tasks only transit states in
an orderly fashion (ie. don't dispatch the same task twice).

Thanks.

--
tejun
Hello, again.

On Tue, Sep 16, 2025 at 12:29:57PM -1000, Tejun Heo wrote:
...
> Long term, I think maintaining flexibility is of higher importance for
> sched_ext than e.g. small performance improvements or even design or
> implementation aesthetics. The primary purpose is enabling trying out new,
> sometimes wild, things after all. As such, I don't think it'd be a good idea
> to put strict restrictions on how the BPF side operates unless it affects
> the ability to recover the system from a malfunctioning BPF scheduler, of
> course.

Thinking a bit more about it, I wonder whether the status quo is actually an
okay balance. All in-kernel sched classes are per-CPU rich rq designs, which
meshes well with the current locking scheme, for obvious reasons. sched_ext
is an oddball in that it may want to hot-migrate tasks at the last minute
because who knows what the BPF side wants to do. However, this just boils
down to having to always call balance() before any pick_task() attempts
(including the DL server case). Yeah, it's a niggle, especially as there
needs to be a secondary hook to handle losing the race between balance() and
pick_task(), but it's pretty contained conceptually and not a lot of code.

Thanks.

--
tejun
On Fri, Sep 12, 2025 at 01:54:59PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 11, 2025 at 02:19:57PM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Wed, Sep 10, 2025 at 05:44:21PM +0200, Peter Zijlstra wrote:
> > > @@ -703,17 +703,24 @@ void double_rq_lock(struct rq *rq1, stru
> > > struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
> > > __acquires(rq->lock)
> > > {
> > > + raw_spinlock_t *slock;
> > > struct rq *rq;
> > >
> > > lockdep_assert_held(&p->pi_lock);
> > >
> > > for (;;) {
> > > rq = task_rq(p);
> > > + slock = p->srq_lock;
> > > raw_spin_rq_lock(rq);
> > > - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
> > > + if (slock)
> > > + raw_spin_lock(slock);
> > > + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
> > > + (!slock || p->srq_lock == slock))) {
> > > rq_pin_lock(rq, rf);
> > > return rq;
> > > }
>
> Yeah, I think that needs to change a little. Perhaps something like:
>
> slock2 = p->srq_lock;
> if (... && (!slock2 || slock2 == slock))
I'm being stupid, all that wants is: && (p->srq_lock == slock). If there
is a mis-match, unlock and re-try.
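Folding that in, the retry loop would look roughly like the sketch below;
the READ_ONCE() is an assumption, and Tejun's suggested alternative is to
(re)read p->srq_lock under rq->lock instead:

    struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
            __acquires(rq->lock)
    {
            raw_spinlock_t *slock;
            struct rq *rq;

            lockdep_assert_held(&p->pi_lock);

            for (;;) {
                    rq = task_rq(p);
                    slock = READ_ONCE(p->srq_lock);
                    raw_spin_rq_lock(rq);
                    if (slock)
                            raw_spin_lock(slock);
                    /* re-check everything, including the srq_lock pointer */
                    if (likely(rq == task_rq(p) && !task_on_rq_migrating(p) &&
                               p->srq_lock == slock)) {
                            rq_pin_lock(rq, rf);
                            return rq;
                    }
                    if (slock)
                            raw_spin_unlock(slock);
                    raw_spin_rq_unlock(rq);

                    while (unlikely(task_on_rq_migrating(p)))
                            cpu_relax();
            }
    }

Either way, a mismatch simply drops both locks and goes around the loop
again.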