Provide a LOCKED queue flag, indicating that the {en,de}queue()
operation is in task_rq_lock() context.
Note: the sched_change in scx_bypass() is the only one that does not
use task_rq_lock(). If that were fixed, we could have sched_change
imply LOCKED.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 31 +++++++++++++++++++++++++------
kernel/sched/sched.h | 7 +++++++
kernel/sched/syscalls.c | 4 ++--
3 files changed, 34 insertions(+), 8 deletions(-)
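For orientation, a minimal sketch of the caller pattern the new flag
describes (not part of the patch; it mirrors the sched_setnuma() hunk
below): the caller holds the full task_rq_lock() context, i.e. p->pi_lock
plus rq->lock, and passes *_LOCKED so sched_change_begin()/end() can
assert that context under lockdep.

	guard(task_rq_lock)(p);
	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
		/* update p's scheduling state under the full lock context */
	}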
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2716,7 +2716,7 @@ void set_cpus_allowed_common(struct task
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK;
+ u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
scoped_guard (sched_change, p, flags) {
p->sched_class->set_cpus_allowed(p, ctx);
@@ -3749,7 +3749,7 @@ static int ttwu_runnable(struct task_str
if (task_on_rq_queued(p)) {
update_rq_clock(rq);
if (p->se.sched_delayed)
- enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED | ENQUEUE_LOCKED);
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
@@ -4816,7 +4816,7 @@ void wake_up_new_task(struct task_struct
update_rq_clock(rq);
post_init_entity_util_avg(p);
- activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
+ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL | ENQUEUE_LOCKED);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, wake_flags);
if (p->sched_class->task_woken) {
@@ -7310,7 +7310,7 @@ void rt_mutex_post_schedule(void)
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queue_flag =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
struct rq *rq;
@@ -8056,7 +8056,7 @@ int migrate_task_to(struct task_struct *
void sched_setnuma(struct task_struct *p, int nid)
{
guard(task_rq_lock)(p);
- scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED)
p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -9160,7 +9160,7 @@ static void sched_change_group(struct ta
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
unsigned int queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
bool resched = false;
struct rq *rq;
@@ -10841,6 +10841,13 @@ struct sched_change_ctx *sched_change_be
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_PROVE_LOCKING
+ if (flags & DEQUEUE_LOCKED) {
+ lockdep_assert_held(&p->pi_lock);
+ if (p->srq_lock)
+ lockdep_assert_held(p->srq_lock);
+ }
+#endif
if (flags & DEQUEUE_CLASS) {
if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
@@ -10862,6 +10869,9 @@ struct sched_change_ctx *sched_change_be
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current(rq, p),
+#ifdef CONFIG_PROVE_LOCKING
+ .srq_lock = p->srq_lock,
+#endif
};
if (!(flags & DEQUEUE_CLASS)) {
@@ -10888,6 +10898,15 @@ void sched_change_end(struct sched_chang
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_PROVE_LOCKING
+ if (ctx->flags & ENQUEUE_LOCKED) {
+ lockdep_assert_held(&p->pi_lock);
+ if (p->srq_lock)
+ lockdep_assert_held(p->srq_lock);
+ if (ctx->srq_lock && ctx->srq_lock != p->srq_lock)
+ lockdep_assert_not_held(ctx->srq_lock);
+ }
+#endif
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2340,6 +2340,8 @@ extern const u32 sched_prio_to_wmult[40
* CLASS - going to update p->sched_class; makes sched_change call the
* various switch methods.
*
+ * LOCKED - task_rq_lock() context, implies p->srq_lock taken when set.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -2355,6 +2357,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
+#define DEQUEUE_LOCKED 0x0080 /* Matches ENQUEUE_LOCKED */
#define DEQUEUE_SPECIAL 0x00010000
#define DEQUEUE_THROTTLE 0x00020000
@@ -2367,6 +2370,7 @@ extern const u32 sched_prio_to_wmult[40
#define ENQUEUE_MIGRATING 0x0010
#define ENQUEUE_DELAYED 0x0020
#define ENQUEUE_CLASS 0x0040
+#define ENQUEUE_LOCKED 0x0080
#define ENQUEUE_HEAD 0x00010000
#define ENQUEUE_REPLENISH 0x00020000
@@ -3963,6 +3967,9 @@ extern void balance_callbacks(struct rq
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+#ifdef CONFIG_PROVE_LOCKING
+ raw_spinlock_t *srq_lock;
+#endif
int flags;
bool queued;
bool running;
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -89,7 +89,7 @@ void set_user_nice(struct task_struct *p
return;
}
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED) {
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
@@ -503,7 +503,7 @@ int __sched_setscheduler(struct task_str
struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
struct rq *rq;
bool cpuset_locked = false;
Hello, Peter.
On Wed, Sep 10, 2025 at 05:44:22PM +0200, Peter Zijlstra wrote:
> Provide a LOCKED queue flag, indicating that the {en,de}queue()
> operation is in task_rq_lock() context.
>
> Note: the sched_change in scx_bypass() is the only one that does not
> use task_rq_lock(). If that were fixed, we could have sched_change
> imply LOCKED.
I don't see any harm in doing task_rq_lock() in the scx_bypass() loop.
Please feel free to switch that for simplicity.
Thanks.
--
tejun
On Wed, Sep 10, 2025 at 04:01:55PM -1000, Tejun Heo wrote:
> Hello, Peter.
>
> On Wed, Sep 10, 2025 at 05:44:22PM +0200, Peter Zijlstra wrote:
> > Provide a LOCKED queue flag, indicating that the {en,de}queue()
> > operation is in task_rq_lock() context.
> >
> > Note: the sched_change in scx_bypass() is the only one that does not
> > use task_rq_lock(). If that were fixed, we could have sched_change
> > imply LOCKED.
>
> I don't see any harm in doing task_rq_lock() in the scx_bypass() loop.
> Please feel free to switch that for simplicity.
I didn't immediately see how to do that. Doesn't that
list_for_each_entry_safe_reverse() rely on rq->lock to retain integrity?
Moreover, since the goal is to allow:
__schedule()
  lock(rq->lock);
  next = pick_task() := pick_task_scx()
    lock(dsq->lock);
    p = some_dsq_task(dsq);
    task_unlink_from_dsq(p, dsq);
    set_task_cpu(p, cpu_of(rq));
    move_task_to_local_dsq(p, ...);
    return p;
without dropping rq->lock, by relying on dsq->lock to serialize things,
I don't see how we can retain the runnable list at all.
And at this point, I'm not sure I understand ext well enough to know
what this bypass stuff does at all, let alone suggest means to
re-architect this.
Hello,

On Thu, Sep 11, 2025 at 11:42:40AM +0200, Peter Zijlstra wrote:
...
> I didn't immediately see how to do that. Doesn't that
> list_for_each_entry_safe_reverse() rely on rq->lock to retain integrity?

Ah, sorry, I was thinking it was iterating scx_tasks list. Yes, as
implemented, it needs to hold rq lock throughout.

> Moreover, since the goal is to allow:
>
>   __schedule()
>     lock(rq->lock);
>     next = pick_task() := pick_task_scx()
>       lock(dsq->lock);
>       p = some_dsq_task(dsq);
>       task_unlink_from_dsq(p, dsq);
>       set_task_cpu(p, cpu_of(rq));
>       move_task_to_local_dsq(p, ...);
>       return p;
>
> without dropping rq->lock, by relying on dsq->lock to serialize things,
> I don't see how we can retain the runnable list at all.
>
> And at this point, I'm not sure I understand ext well enough to know
> what this bypass stuff does at all, let alone suggest means to
> re-architect this.

Bypass mode is enabled when the kernel side can't trust the BPF scheduling
anymore and wants to fall back to dumb FIFO scheduling to guarantee forward
progress (e.g. so that we can switch back to fair).

It comes down to flipping scx_rq_bypassing() on, which makes scheduling
paths bypass most BPF parts and fall back to FIFO behavior, and then making
sure every thread is on FIFO behavior. The latter part is what the loop is
doing. It scans all currently runnable tasks and dequeues and re-enqueues
them. As scx_rq_bypass() is true at this point, if a task were queued on the
BPF side, the cycling takes it out of the BPF side and puts it on the
fallback FIFO queue.

If we want to get rid of the locking requirement:

- Walk scx_tasks list which is iterated with a cursor and allows dropping
  locks while iterating. However, on some hardware, there are cases where
  CPUs are extremely slowed down from BPF scheduler making bad decisions and
  causing a lot of sync cacheline pingponging across e.g. NUMA nodes. As
  scx_bypass() is what's supposed to extricate the system from this state,
  walking all tasks while going through each's locking probably isn't going
  to be great.

- We can update ->runnable_list iteration to allow dropping rq lock e.g.
  with a cursor based iteration. Maybe some code can be shared with
  scx_tasks iteration. Cycling through locks still isn't going to be great
  but here it's likely a lot fewer of them at least.

Neither option is great. Leave it as-is for now?

Thanks.

--
tejun
On Thu, Sep 11, 2025 at 10:40:06AM -1000, Tejun Heo wrote:
> Hello,
>
> On Thu, Sep 11, 2025 at 11:42:40AM +0200, Peter Zijlstra wrote:
> ...
> > I didn't immediately see how to do that. Doesn't that
> > list_for_each_entry_safe_reverse() rely on rq->lock to retain integrity?
>
> Ah, sorry, I was thinking it was iterating scx_tasks list. Yes, as
> implemented, it needs to hold rq lock throughout.
>
> > Moreover, since the goal is to allow:
> >
> > __schedule()
> >   lock(rq->lock);
> >   next = pick_task() := pick_task_scx()
> >     lock(dsq->lock);
> >     p = some_dsq_task(dsq);
> >     task_unlink_from_dsq(p, dsq);
> >     set_task_cpu(p, cpu_of(rq));
> >     move_task_to_local_dsq(p, ...);
> >     return p;
> >
> > without dropping rq->lock, by relying on dsq->lock to serialize things,
> > I don't see how we can retain the runnable list at all.
> >
> > And at this point, I'm not sure I understand ext well enough to know
> > what this bypass stuff does at all, let alone suggest means to
> > re-architect this.
>
> Bypass mode is enabled when the kernel side can't trust the BPF scheduling
> anymore and wants to fall back to dumb FIFO scheduling to guarantee forward
> progress (e.g. so that we can switch back to fair).
>
> It comes down to flipping scx_rq_bypassing() on, which makes scheduling
> paths bypass most BPF parts and fall back to FIFO behavior, and then making
> sure every thread is on FIFO behavior. The latter part is what the loop is
> doing. It scans all currently runnable tasks and dequeues and re-enqueues
> them. As scx_rq_bypass() is true at this point, if a task were queued on the
> BPF side, the cycling takes it out of the BPF side and puts it on the
> fallback FIFO queue.
>
> If we want to get rid of the locking requirement:
>
> - Walk scx_tasks list which is iterated with a cursor and allows dropping
> locks while iterating. However, on some hardware, there are cases where
> CPUs are extremely slowed down from BPF scheduler making bad decisions and
> causing a lot of sync cacheline pingponging across e.g. NUMA nodes. As
> scx_bypass() is what's supposed to extricate the system from this state,
> walking all tasks while going through each's locking probably isn't going
> to be great.
>
> - We can update ->runnable_list iteration to allow dropping rq lock e.g.
> with a cursor based iteration. Maybe some code can be shared with
> scx_tasks iteration. Cycling through locks still isn't going to be great
> but here it's likely a lot fewer of them at least.
>
> Neither option is great. Leave it as-is for now?
Ah, but I think we *have* to change it :/ The thing is that with the new
pick you can change 'rq' without holding the source rq->lock. So we
can't maintain this list.
Could something like so work?
	scoped_guard (rcu) for_each_process_thread(g, p) {
		if (p->flags & PF_EXITING || p->sched_class != ext_sched_class)
			continue;

		guard(task_rq_lock)(p);
		scoped_guard (sched_change, p) {
			/* no-op */
		}
	}
Hello,
On Fri, Sep 12, 2025 at 04:19:04PM +0200, Peter Zijlstra wrote:
...
> Ah, but I think we *have* to change it :/ The thing is that with the new
> pick you can change 'rq' without holding the source rq->lock. So we
> can't maintain this list.
>
> Could something like so work?
>
> 	scoped_guard (rcu) for_each_process_thread(g, p) {
> 		if (p->flags & PF_EXITING || p->sched_class != ext_sched_class)
> 			continue;
>
> 		guard(task_rq_lock)(p);
> 		scoped_guard (sched_change, p) {
> 			/* no-op */
> 		}
> 	}
Yeah, or I can make scx_tasks iteration smarter so that it can skip through
the list for tasks which aren't runnable. As long as it doesn't do lock ops
on every task, it should be fine. I think this is solvable one way or
another. Let's continue in the other subthread.
Thanks.
--
tejun
On Fri, Sep 12, 2025 at 06:32:32AM -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Sep 12, 2025 at 04:19:04PM +0200, Peter Zijlstra wrote:
> ...
> > Ah, but I think we *have* to change it :/ The thing is that with the new
> > pick you can change 'rq' without holding the source rq->lock. So we
> > can't maintain this list.
> >
> > Could something like so work?
> >
> > 	scoped_guard (rcu) for_each_process_thread(g, p) {
> > 		if (p->flags & PF_EXITING || p->sched_class != ext_sched_class)
> > 			continue;
> >
> > 		guard(task_rq_lock)(p);
> > 		scoped_guard (sched_change, p) {
> > 			/* no-op */
> > 		}
> > 	}
>
> Yeah, or I can make scx_tasks iteration smarter so that it can skip through
> the list for tasks which aren't runnable. As long as it doesn't do lock ops
> on every task, it should be fine. I think this is solvable one way or
> another. Let's continue in the other subthread.
Well, either this or scx_tasks iterator will result in lock ops for
every task, this is unavoidable if we want the normal p->pi_lock,
rq->lock (dsq->lock) taken for every sched_change caller.
I have the below which I would like to include in the series such that I
can clean up all that DEQUEUE_LOCKED stuff a bit, this being the only
sched_change that's 'weird'.
Added 'bonus' is of course one less user of the runnable_list.
(also, I have to note, for_each_cpu with preemption disabled is asking
for trouble, the enormous core count machines are no longer super
esoteric)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4817,6 +4817,7 @@ static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
+ struct task_struct *g, *p;
struct scx_sched *sch;
unsigned long flags;
int cpu;
@@ -4849,16 +4850,16 @@ static void scx_bypass(bool bypass)
* queued tasks are re-queued according to the new scx_rq_bypassing()
* state. As an optimization, walk each rq's runnable_list instead of
* the scx_tasks list.
- *
- * This function can't trust the scheduler and thus can't use
- * cpus_read_lock(). Walk all possible CPUs instead of online.
+ */
+
+ /*
+ * XXX online_mask is stable due to !preempt (per bypass_lock)
+ * so could this be for_each_online_cpu() ?
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct task_struct *p, *n;
raw_spin_rq_lock(rq);
-
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
rq->scx.flags |= SCX_RQ_BYPASSING;
@@ -4866,36 +4867,33 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
rq->scx.flags &= ~SCX_RQ_BYPASSING;
}
+ raw_spin_rq_unlock(rq);
+ }
+
+ /* implicit RCU section due to bypass_lock */
+ for_each_process_thread(g, p) {
+ unsigned int state;
- /*
- * We need to guarantee that no tasks are on the BPF scheduler
- * while bypassing. Either we see enabled or the enable path
- * sees scx_rq_bypassing() before moving tasks to SCX.
- */
- if (!scx_enabled()) {
- raw_spin_rq_unlock(rq);
+ guard(raw_spinlock)(&p->pi_lock);
+ if (p->flags & PF_EXITING || p->sched_class != &ext_sched_class)
+ continue;
+
+ state = READ_ONCE(p->__state);
+ if (state != TASK_RUNNING && state != TASK_WAKING)
continue;
- }
- /*
- * The use of list_for_each_entry_safe_reverse() is required
- * because each task is going to be removed from and added back
- * to the runnable_list during iteration. Because they're added
- * to the tail of the list, safe reverse iteration can still
- * visit all nodes.
- */
- list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
- scx.runnable_node) {
- /* cycling deq/enq is enough, see the function comment */
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
- /* nothing */ ;
- }
+ guard(__task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
}
+ }
- /* resched to restore ticks and idle state */
- if (cpu_online(cpu) || cpu == smp_processor_id())
- resched_curr(rq);
+ /* implicit !preempt section due to bypass_lock */
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ raw_spin_rq_lock(rq);
+ resched_curr(cpu_rq(cpu));
raw_spin_rq_unlock(rq);
}
Hello,
On Thu, Sep 25, 2025 at 03:10:25PM +0200, Peter Zijlstra wrote:
...
> Well, either this or scx_tasks iterator will result in lock ops for
> every task, this is unavoidable if we want the normal p->pi_lock,
> rq->lock (dsq->lock) taken for every sched_change caller.
>
> I have the below which I would like to include in the series such that I
> can clean up all that DEQUEUE_LOCKED stuff a bit, this being the only
> sched_change that's 'weird'.
>
> Added 'bonus' is of course one less user of the runnable_list.
>
> (also, I have to note, for_each_cpu with preemption disabled is asking
> for trouble, the enormous core count machines are no longer super
> esoteric)
Oh yeah, we can break up every N CPUs. There's no cross-CPU atomicity
requirement.
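A rough sketch of that batching idea (hypothetical; it assumes the
rq-flag flipping no longer runs inside the IRQs-off bypass_lock section,
and the batch size is illustrative):

	int cpu, batch = 0;

	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);

		raw_spin_rq_lock(rq);
		if (bypass)
			rq->scx.flags |= SCX_RQ_BYPASSING;
		else
			rq->scx.flags &= ~SCX_RQ_BYPASSING;
		raw_spin_rq_unlock(rq);

		/* drop out of atomic context every 64 CPUs */
		if (++batch % 64 == 0)
			cond_resched();
	}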
> + /*
> + * XXX online_mask is stable due to !preempt (per bypass_lock)
> + * so could this be for_each_online_cpu() ?
> */
CPUs can go on and offline while CPUs are being bypassed. We can handle that
in hotplug ops but I'm not sure the complexity is justified in this case.
> for_each_possible_cpu(cpu) {
> struct rq *rq = cpu_rq(cpu);
> - struct task_struct *p, *n;
>
> raw_spin_rq_lock(rq);
> -
> if (bypass) {
> WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
> rq->scx.flags |= SCX_RQ_BYPASSING;
> @@ -4866,36 +4867,33 @@ static void scx_bypass(bool bypass)
> WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
> rq->scx.flags &= ~SCX_RQ_BYPASSING;
I may be relying on BYPASSING being set to mean that all tasks have been
cycled. Will check. We may need an extra state to note that bypass
switching is complete.
Hmm... the switching is not synchronized against scheduling operations
anymore - ie. we can end up mixing regular op and bypassed operation for the
same scheduling event (e.g. enqueue vs. task state transitions), which can
lead to subtle state inconsistencies on the BPF scheduler side. Either the
bypassing state should become per-task, which likely has system
recoverability issues under lock storm conditions, or maybe we can just
shift it to the scheduling path - e.g. decide whether to bypass or not at
the beginning of enqueue path and then stick to it until the whole operation
is finished.
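A sketch of that last idea (hypothetical, not ext.c code): sample the
bypass state once at the top of the operation and key everything off the
snapshot, so a concurrent scx_bypass() flip cannot split one scheduling
event across the two modes:

	/* hypothetical enqueue-path sketch */
	bool bypassing = scx_rq_bypassing(rq);	/* decide once, up front */

	if (bypassing) {
		/* queue on the fallback FIFO DSQ */
	} else {
		/* hand the task to the BPF scheduler's enqueue path */
	}
	/* ... the rest of the operation keys off 'bypassing', never re-reads it */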
> }
> + raw_spin_rq_unlock(rq);
> + }
> +
> + /* implicit RCU section due to bypass_lock */
> + for_each_process_thread(g, p) {
I don't think this is safe. p->tasks is unlinked from __unhash_process() but
tasks can schedule between being unhashed and the final preempt_disable() in
do_exit() and thus the above iteration can miss tasks which may currently be
runnable.
> + unsigned int state;
>
> + guard(raw_spinlock)(&p->pi_lock);
> + if (p->flags & PF_EXITING || p->sched_class != &ext_sched_class)
> + continue;
> +
> + state = READ_ONCE(p->__state);
> + if (state != TASK_RUNNING && state != TASK_WAKING)
> continue;
>
> + guard(__task_rq_lock)(p);
> + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
> + /* nothing */ ;
> }
> + }
This is significantly more expensive. On large systems, the number of
threads can easily reach six digits. Iterating all of them while doing
locking ops on each of them might become problematic depending on what the
rest of the system is doing (unfortunately, it's not too difficult to cause
meltdowns on some NUMA systems with cross-node traffic). I don't think
p->tasks iterations can be broken up either.
I'm sure there's a solution for all these. Maybe once bypass is set and the
per-task iteration can be broken up, this is no longer a problem, or maybe
there's some other way to maintain the runnable list in a way that's decoupled
from rq lock. The interlocking requirement is relaxed on the removal side.
There must be a way to visit all runnable tasks but visiting some tasks
spuriously is not a problem, so there's some leeway too.
As with everything, this part is a bit tricky and will need a non-trivial
amount of testing to verify that it can recover the system from BPF
scheduler-induced death spirals (e.g. migrating tasks too frequently across
NUMA boundaries on some systems). The change guard cleanups make sense
regardless of how the rest develops. Would it make sense to land them first?
Once we know what to do with the core scheduling locking, I'm sure we can
find a way to make this work accordingly.
Thanks.
--
tejun
On Thu, Sep 25, 2025 at 05:40:27AM -1000, Tejun Heo wrote:
> Hello,
>
> On Thu, Sep 25, 2025 at 03:10:25PM +0200, Peter Zijlstra wrote:
> ...
> > Well, either this or scx_tasks iterator will result in lock ops for
> > every task, this is unavoidable if we want the normal p->pi_lock,
> > rq->lock (dsq->lock) taken for every sched_change caller.
> >
> > I have the below which I would like to include in the series such that I
> > can clean up all that DEQUEUE_LOCKED stuff a bit, this being the only
> > sched_change that's 'weird'.
> >
> > Added 'bonus' is of course one less user of the runnable_list.
> >
> > (also, I have to note, for_each_cpu with preemption disabled is asking
> > for trouble, the enormous core count machines are no longer super
> > esoteric)
>
> Oh yeah, we can break up every N CPUs. There's no cross-CPU atomicity
> requirement.
Right.
> > + /*
> > + * XXX online_mask is stable due to !preempt (per bypass_lock)
> > + * so could this be for_each_online_cpu() ?
> > */
>
> CPUs can go on and offline while CPUs are being bypassed. We can handle that
> in hotplug ops but I'm not sure the complexity is justified in this case.
Well, not in the current code, since the CPU running this has IRQs and
preemption disabled (per bypass_lock) and thus stop_machine, as used in
hotplug can't make progress.
That is; disabling preemption serializes against hotplug. This is
something that the scheduler relies on in quite a few places.
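Roughly the pattern being relied on (illustrative, not from this patch):

	preempt_disable();
	/*
	 * CPU offlining goes through stop_machine(), which cannot make
	 * progress while this CPU has preemption disabled, so a CPU seen
	 * online here cannot be torn down until preempt_enable().
	 */
	for_each_online_cpu(cpu) {
		/* per-CPU work; 'cpu' cannot go offline under us */
	}
	preempt_enable();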
> > for_each_possible_cpu(cpu) {
> > struct rq *rq = cpu_rq(cpu);
> > - struct task_struct *p, *n;
> >
> > raw_spin_rq_lock(rq);
> > -
> > if (bypass) {
> > WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
> > rq->scx.flags |= SCX_RQ_BYPASSING;
> > @@ -4866,36 +4867,33 @@ static void scx_bypass(bool bypass)
> > WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
> > rq->scx.flags &= ~SCX_RQ_BYPASSING;
>
> I may be relying on BYPASSING being set to mean that all tasks have been
> cycled. Will check. We may need an extra state to note that bypass
> switching is complete.
> Hmm... the switching is not synchronized against scheduling operations
> anymore - ie. we can end up mixing regular op and bypassed operation for the
> same scheduling event (e.g. enqueue vs. task state transitions), which can
> lead to subtle state inconsistencies on the BPF scheduler side. Either the
> bypassing state should become per-task, which likely has system
> recoverability issues under lock storm conditions, or maybe we can just
> shift it to the scheduling path - e.g. decide whether to bypass or not at
> the beginning of enqueue path and then stick to it until the whole operation
> is finished.
Makes sense.
> > }
> > + raw_spin_rq_unlock(rq);
> > + }
> > +
> > + /* implicit RCU section due to bypass_lock */
> > + for_each_process_thread(g, p) {
>
> I don't think this is safe. p->tasks is unlinked from __unhash_process() but
> tasks can schedule between being unhashed and the final preempt_disable() in
> do_exit() and thus the above iteration can miss tasks which may currently be
> runnable.
Bah, you're quite right :/
> > + unsigned int state;
> >
> > + guard(raw_spinlock)(&p->pi_lock);
> > + if (p->flags & PF_EXITING || p->sched_class != &ext_sched_class)
> > + continue;
> > +
> > + state = READ_ONCE(p->__state);
> > + if (state != TASK_RUNNING && state != TASK_WAKING)
> > continue;
> >
> > + guard(__task_rq_lock)(p);
> > + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
> > + /* nothing */ ;
> > }
> > + }
>
> This is significantly more expensive. On large systems, the number of
> threads can easily reach six digits. Iterating all of them while doing
> locking ops on each of them might become problematic depending on what the
> rest of the system is doing (unfortunately, it's not too difficult to cause
> meltdowns on some NUMA systems with cross-node traffic). I don't think
> p->tasks iterations can be broken up either.
I thought I had understood that bypass isn't something that happens
when the system is happy. As long as it completes at some point, all this
should be fine, right?
I mean, yeah, it'll take a while, but meh.
Also, we could run the thing at fair or FIFO-1 or something, to be
outside of ext itself. Possibly we can freeze all the ext tasks on
return to user to limit the amount of noise they generate.
> The change guard cleanups make sense
> regardless of how the rest develops. Would it make sense to land them first?
> Once we know what to do with the core scheduling locking, I'm sure we can
> find a way to make this work accordingly.
Yeah, definitely. Thing is, if we can get all sched_change users to be
the same, that all cleans up better.
But if cleaning this up gets to be too vexing, we can postpone that.
Hello,
On Thu, Sep 25, 2025 at 05:53:23PM +0200, Peter Zijlstra wrote:
> > CPUs can go on and offline while CPUs are being bypassed. We can handle that
> > in hotplug ops but I'm not sure the complexity is justified in this case.
>
> Well, not in the current code, since the CPU running this has IRQs and
> preemption disabled (per bypass_lock) and thus stop_machine, as used in
> hotplug can't make progress.
>
> That is; disabling preemption serializes against hotplug. This is
> something that the scheduler relies on in quite a few places.
Oh, I meant something like:
CPU X goes down
scx_bypass(true);
stuff happening in bypass mode.
tasks are scheduling, sleeping and CPU X comes up
everything.
scx_bypass(false);
When CPU X comes up, it should come up in bypass mode, which can easily be
done in online callback, but it's just a bit simpler to keep them always in
sync.
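Something like this hypothetical sketch of the online-callback variant,
where scx_bypass_active() stands in for however the global bypass state
would be tracked:

	/* hypothetical: run for each rq as it comes online under sched_ext */
	static void scx_rq_online_sketch(struct rq *rq)
	{
		lockdep_assert_rq_held(rq);

		/* inherit the system-wide bypass state on the way up */
		if (scx_bypass_active())	/* hypothetical helper */
			rq->scx.flags |= SCX_RQ_BYPASSING;
		else
			rq->scx.flags &= ~SCX_RQ_BYPASSING;
	}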
> > This is significantly more expensive. On large systems, the number of
> > threads can easily reach six digits. Iterating all of them while doing
> > locking ops on each of them might become problematic depending on what the
> > rest of the system is doing (unfortunately, it's not too difficult to cause
> > meltdowns on some NUMA systems with cross-node traffic). I don't think
> > p->tasks iterations can be broken up either.
>
> I thought I had understood that bypass isn't something that happens
> when the system is happy. As long as it completes at some point, all this
> should be fine, right?
>
> I mean, yeah, it'll take a while, but meh.
>
> Also, we could run the thing at fair or FIFO-1 or something, to be
> outside of ext itself. Possibly we can freeze all the ext tasks on
> return to user to limit the amount of noise they generate.
One problem scenario that we saw with Sapphire Rapids multi-socket machines
is that when there are a lot of cross-socket locking operations (same locks
getting hammered on from two sockets), forward progress slows down to the
point where hard lockup triggers really easily. We saw two problems in such
scenarios - the total throughput of locking operations was low and the
distribution of successes across CPUs was pretty skewed. Combining the two
factors, the slowest CPU on Sapphire Rapids ran about two orders of
magnitude slower than a similarly sized AMD machine doing the same thing.
The benchmark became a part of stress-ng, the --flipflop.
Anyways, what this comes down to is that on some machines, scx_bypass(true)
has to be pretty careful to avoid these hard lockup scenarios as that's
what's expected to recover the system when such situations develop.
> > The change guard cleanups make sense
> > regardless of how the rest develops. Would it make sense to land them first?
> > Once we know what to do with the core scheduling locking, I'm sure we can
> > find a way to make this work accordingly.
>
> Yeah, definitely. Thing is, if we can get all sched_change users to be
> the same, that all cleans up better.
>
> But if cleaning this up gets to be too vexing, we can postpone that.
Yeah, I think it's just going to be a bit more involved and it'd be easier
if we don't make it block other stuff.
Thanks.
--
tejun
Hello,

On Fri, Sep 12, 2025 at 06:32:32AM -1000, Tejun Heo wrote:
> Yeah, or I can make scx_tasks iteration smarter so that it can skip through
> the list for tasks which aren't runnable. As long as it doesn't do lock ops
> on every task, it should be fine. I think this is solvable one way or
> another. Let's continue in the other subthread.

Thought more about it. There's another use case for this runnable list,
which is the watchdog. As in the migration synchronization, I think the
right thing to do here is just adding a nested lock. That doesn't add any
overhead or complications to other sched classes and from sched_ext POV
given how expensive migrations can be, if we make that a bit cheaper (and I
believe we will with changes being discussed), added up, the outcome would
likely be lower overhead.

Thanks.

--
tejun
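A very rough sketch of what such a nested lock could look like
(hypothetical; 'runnable_lock' does not exist in ext.c and this is only
one possible reading of the proposal):

	/* hypothetical lock nested inside rq->lock, protecting the list */
	static void runnable_list_add(struct rq *rq, struct task_struct *p)
	{
		guard(raw_spinlock)(&rq->scx.runnable_lock);
		list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
	}

	static void runnable_list_del(struct rq *rq, struct task_struct *p)
	{
		/* removal side could take only the nested lock, not rq->lock */
		guard(raw_spinlock)(&rq->scx.runnable_lock);
		list_del_init(&p->scx.runnable_node);
	}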
On Sat, Sep 13, 2025 at 12:32:27PM -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Sep 12, 2025 at 06:32:32AM -1000, Tejun Heo wrote:
> > Yeah, or I can make scx_tasks iteration smarter so that it can skip through
> > the list for tasks which aren't runnable. As long as it doesn't do lock ops
> > on every task, it should be fine. I think this is solvable one way or
> > another. Let's continue in the other subthread.
>
> Thought more about it. There's another use case for this runnable list,
> which is the watchdog. As in the migration synchronization, I think the
> right thing to do here is just adding a nested lock. That doesn't add any
> overhead or complications to other sched classes and from sched_ext POV
> given how expensive migrations can be, if we make that a bit cheaper (and I
> believe we will with changes being discussed), added up, the outcome would
> likely be lower overhead.

I really don't see how you could possibly retain that runnable_list.

pick_next_task() must be able to migrate a task from a shared runqueue to a
local runqueue. It must do this without taking a random other per-cpu
runqueue. Therefore, a task on a DSQ must have no 'local' state. This very
much means the runnable_list cannot be per cpu.

No per-task lock is going to help with that. The watchdog will have to go
iterate DSQs or something like that.