When a BPF scheduler is being disabled, scx_root can be set to NULL
while tasks are still associated with the sched_ext class. If a task is
subject to an affinity change, priority adjustment, or policy switch
during this window, sched_class operations will dereference a NULL
scx_root pointer, triggering a BUG like the following:
BUG: kernel NULL pointer dereference, address: 00000000000001c0
...
RIP: 0010:set_cpus_allowed_scx+0x1a/0xa0
...
Call Trace:
__set_cpus_allowed_ptr_locked+0x142/0x1c0
__sched_setaffinity+0x72/0x100
sched_setaffinity+0x281/0x360
Similarly, tasks can be in various states, depending on the timing of
concurrent operations. This causes spurious WARN_ON_ONCE() triggers in
scx_disable_task() and invalid state transitions when tasks are switched
to or from the sched_ext class:
WARNING: kernel/sched/ext.c:3118 at scx_disable_task+0x7c/0x180
...
Call Trace:
sched_change_begin+0xf2/0x270
__sched_setscheduler+0x346/0xc70
Fix by:
- Adding NULL checks at the beginning of sched_class operations
(set_cpus_allowed_scx, reweight_task_scx, switching_to_scx) to skip
BPF scheduler notifications when scx_root is NULL.
- Making the state assertion in scx_disable_task() conditional, so that
  it only warns during normal operation, and adding an early return if
  the task is not in the SCX_TASK_ENABLED state to make the function
  idempotent.
- Checking the task state in switched_from_scx() before calling
  scx_disable_task(), to avoid invoking it on tasks in a transitional
  state.
Fixes: d310fb4009689 ("sched_ext: Clean up scx_root usages")
Cc: stable@vger.kernel.org # v6.16+
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
kernel/sched/ext.c | 42 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 40 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index afe28c04d5aa7..aae5c5141cf1e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2619,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 
 	set_cpus_allowed_common(p, ac);
 
+	if (unlikely(!sch))
+		return;
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -2920,7 +2923,18 @@ static void scx_disable_task(struct task_struct *p)
 	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
-	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+
+	/*
+	 * During disabling, tasks can be in various states due to
+	 * concurrent operations, only warn about unexpected state during
+	 * normal operation.
+	 */
+	if (likely(scx_enable_state() != SCX_DISABLING))
+		WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+
+	/* If task is not enabled, skip disable */
+	if (scx_get_task_state(p) != SCX_TASK_ENABLED)
+		return;
 
 	if (SCX_HAS_OP(sch, disable))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
@@ -3063,6 +3077,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_rq_held(task_rq(p));
 
+	if (unlikely(!sch))
+		return;
+
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3077,6 +3094,21 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 {
 	struct scx_sched *sch = scx_root;
 
+	/*
+	 * We may race with a concurrent disable, skip enabling if scx_root
+	 * is NULL or the task is in a transitional state.
+	 */
+	if (unlikely(!sch || scx_enable_state() == SCX_DISABLING))
+		return;
+
+	/*
+	 * Task might not be properly initialized if it's being switched to
+	 * SCX after scx_init_task_enabled was set. Initialize to READY state
+	 * first if needed.
+	 */
+	if (scx_get_task_state(p) == SCX_TASK_NONE)
+		scx_set_task_state(p, SCX_TASK_READY);
+
 	scx_enable_task(p);
 
 	/*
@@ -3090,7 +3122,13 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
-	scx_disable_task(p);
+	/*
+	 * Only disable if the task is actually enabled. During scheduler
+	 * disabling, tasks might already be in READY state if they've been
+	 * disabled by concurrent operations.
+	 */
+	if (scx_get_task_state(p) == SCX_TASK_ENABLED)
+		scx_disable_task(p);
 }
 
 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
--
2.52.0
Hello,

On Mon, Feb 02, 2026 at 04:13:41PM +0100, Andrea Righi wrote:
> @@ -2619,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
>  
>  	set_cpus_allowed_common(p, ac);
>  
> +	if (unlikely(!sch))
> +		return;
> +

I don't quite understand how this would happen. set_cpus_allowed_scx() is
called from do_set_cpus_allowed() with task_rq locked. ie. the task *has* to
be on sched_ext for it to be called. It's straightforward task rq lock
synchronization, so there's no race window.

Combined with the failures in switching_to_scx() and switched_from_scx(), I
wonder whether what's actually broken is more something like the disable
path missing some tasks?

Thanks.

--
tejun
On Mon, Feb 02, 2026 at 07:10:02AM -1000, Tejun Heo wrote:
> Hello,
>
> On Mon, Feb 02, 2026 at 04:13:41PM +0100, Andrea Righi wrote:
> > @@ -2619,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
> >  
> >  	set_cpus_allowed_common(p, ac);
> >  
> > +	if (unlikely(!sch))
> > +		return;
> > +
>
> I don't quite understand how this would happen. set_cpus_allowed_scx() is
> called from do_set_cpus_allowed() with task_rq locked. ie. the task *has* to
> be on sched_ext for it to be called. It's straightforward task rq lock
> synchronization, so there's no race window.
>
> Combined with the failures in switching_to_scx() and switched_from_scx(), I
> wonder whether what's actually broken is more something like the disable
> path missing some tasks?
>
> Thanks.
>
> --
> tejun
I'm able to reproduce the NULL pointer dereference in set_cpus_allowed_scx()
quite easily running `stress-ng --race-sched 0` with an scx scheduler that
is intentionally starving tasks, triggering a stall => disable.
I think this is what's happening:
CPU0                                       CPU1
----                                       ----
__sched_setscheduler()
  task_rq_lock(p)

  next_class = __setscheduler_class()
  // next_class is ext_sched_class
                                           scx_disable_workfn()
                                             scx_set_enable_state(SCX_DISABLING)

                                             scx_task_iter_start()
                                             while ((p = next())) {
                                               ...
                                               p->sched_class = fair_sched_class
                                               ...
                                             }
                                             scx_task_iter_stop()

                                             synchronize_rcu()
                                             RCU_INIT_POINTER(scx_root, NULL)

  scoped_guard(sched_change, ...) {
    p->sched_class = next_class;
    // next_class is still ext_sched_class,
    // overwriting fair_sched_class!
  }
  // Guard ends, calls sched_change_end()
  // switching_to_scx() called
  // scx_root == NULL => returns early

  task_rq_unlock(p)

sched_setaffinity(p)
  set_cpus_allowed_scx()
    sch = scx_root; // scx_root == NULL => BUG!
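
To make the interleaving above concrete, here's a small compilable
userspace sketch of the same pattern (purely illustrative; all names are
simplified stand-ins, not the kernel's): the setscheduler path snapshots
the target class while scx is still alive, the disable path then moves
the task to fair and clears the root, and applying the stale snapshot
resurrects a class whose root is gone:

#include <stdio.h>

struct sched_class { const char *name; };

static const struct sched_class ext_sched_class  = { "ext"  };
static const struct sched_class fair_sched_class = { "fair" };

/* stand-ins for scx_root and p->sched_class */
static const struct sched_class *fake_scx_root = &ext_sched_class;
static const struct sched_class *task_class    = &ext_sched_class;

int main(void)
{
	/* CPU0: __sched_setscheduler() snapshots the class under task_rq lock */
	const struct sched_class *next_class = task_class;	/* ext */

	/* CPU1: scx_disable_workfn() migrates the task off scx ... */
	task_class = &fair_sched_class;
	/* ... and then drops the root */
	fake_scx_root = NULL;

	/* CPU0: applies the stale snapshot, resurrecting the ext class */
	task_class = next_class;

	/* later sched_setaffinity() -> set_cpus_allowed_scx(): the task
	 * is back on ext but the root is NULL -- in the kernel this is
	 * the NULL pointer dereference */
	if (task_class == &ext_sched_class)
		printf("on ext with root=%p -> would oops\n",
		       (void *)fake_scx_root);
	return 0;
}

The kernel version is obviously racier, but the
stale-snapshot-applied-after-teardown shape is the same.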
-Andrea
On Mon, Feb 02, 2026 at 07:54:50PM +0100, Andrea Righi wrote:
> I'm able to reproduce the NULL pointer dereference in set_cpus_allowed_scx()
> quite easily running `stress-ng --race-sched 0` with an scx scheduler that
> is intentionally starving tasks, triggering a stall => disable.
>
> I think this is what's happening:
>
> CPU0                                       CPU1
> ----                                       ----
> __sched_setscheduler()
>   task_rq_lock(p)
>
>   next_class = __setscheduler_class()
>   // next_class is ext_sched_class
>                                            scx_disable_workfn()
>                                              scx_set_enable_state(SCX_DISABLING)
>
>                                              scx_task_iter_start()
>                                              while ((p = next())) {
>                                                ...
>                                                p->sched_class = fair_sched_class
>                                                ...
>                                              }
>                                              scx_task_iter_stop()
>
>                                              synchronize_rcu()
>                                              RCU_INIT_POINTER(scx_root, NULL)
>
>   scoped_guard(sched_change, ...) {
>     p->sched_class = next_class;
>     // next_class is still ext_sched_class,
>     // overwriting fair_sched_class!
>   }
>   // Guard ends, calls sched_change_end()
>   // switching_to_scx() called
>   // scx_root == NULL => returns early
>
>   task_rq_unlock(p)
>
> sched_setaffinity(p)
>   set_cpus_allowed_scx()
>     sch = scx_root; // scx_root == NULL => BUG!
Does the following patch fix the issue?
Thanks.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 136b01950a62..1fc2b358a175 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4234,7 +4234,13 @@ static void scx_disable_workfn(struct kthread_work *work)
 	 * Here, every runnable task is guaranteed to make forward progress and
 	 * we can safely use blocking synchronization constructs. Actually
 	 * disable ops.
+	 *
+	 * Wait for all CPUs to observe %SCX_DISABLING. Otherwise,
+	 * task_should_scx() can see %SCX_ENABLED and __sched_setscheduler() put
+	 * a task into sched_ext while we're migrating tasks out below.
 	 */
+	synchronize_rcu();
+
 	mutex_lock(&scx_enable_mutex);
 
 	static_branch_disable(&__scx_switched_all);
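
The guarantee being leaned on here: once synchronize_rcu() returns, every
read-side critical section that began before the state change has
finished, so nothing can still be acting on %SCX_ENABLED. A minimal
userspace sketch of the same idea using liburcu (an illustration only,
assuming the stock liburcu API, not the kernel code):

#include <urcu.h>	/* userspace RCU; link with -lurcu */

enum scx_state { SCX_ENABLED, SCX_DISABLING };

/* a real program would use uatomic/CMM_* accessors for this */
static enum scx_state state = SCX_ENABLED;

static void reader(void)
{
	rcu_read_lock();
	if (state == SCX_ENABLED) {
		/* act on the old state; any such section that started
		 * before the writer's synchronize_rcu() call is
		 * guaranteed to finish before that call returns */
	}
	rcu_read_unlock();
}

static void writer(void)
{
	state = SCX_DISABLING;
	synchronize_rcu();	/* wait out all pre-existing readers */
	/* from here on, no reader can still be acting on SCX_ENABLED */
}

int main(void)
{
	rcu_register_thread();	/* required before read-side sections */
	writer();
	reader();
	rcu_unregister_thread();
	return 0;
}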
On Mon, Feb 02, 2026 at 10:52:04AM -1000, Tejun Heo wrote:
> On Mon, Feb 02, 2026 at 07:54:50PM +0100, Andrea Righi wrote:
> > I'm able to reproduce the NULL pointer dereference in set_cpus_allowed_scx()
> > quite easily running `stress-ng --race-sched 0` with an scx scheduler that
> > is intentionally starving tasks, triggering a stall => disable.
> >
> > I think this is what's happening:
> >
> > CPU0                                       CPU1
> > ----                                       ----
> > __sched_setscheduler()
> >   task_rq_lock(p)
> >
> >   next_class = __setscheduler_class()
> >   // next_class is ext_sched_class
> >                                            scx_disable_workfn()
> >                                              scx_set_enable_state(SCX_DISABLING)
> >
> >                                              scx_task_iter_start()
> >                                              while ((p = next())) {
> >                                                ...
> >                                                p->sched_class = fair_sched_class
> >                                                ...
> >                                              }
> >                                              scx_task_iter_stop()
> >
> >                                              synchronize_rcu()
> >                                              RCU_INIT_POINTER(scx_root, NULL)
> >
> >   scoped_guard(sched_change, ...) {
> >     p->sched_class = next_class;
> >     // next_class is still ext_sched_class,
> >     // overwriting fair_sched_class!
> >   }
> >   // Guard ends, calls sched_change_end()
> >   // switching_to_scx() called
> >   // scx_root == NULL => returns early
> >
> >   task_rq_unlock(p)
> >
> > sched_setaffinity(p)
> >   set_cpus_allowed_scx()
> >     sch = scx_root; // scx_root == NULL => BUG!
>
> Does the following patch fix the issue?
Nope, I can still trigger this (with the same modified scx_simple +
stress-ng --race-sched 0):
[ 15.899233] sched_ext: BPF scheduler "simple" disabled (runtime error)
[ 15.899447] sched_ext: simple: SCX_DSQ_LOCAL[_ON] target CPU 10 not allowed for stress-ng-race-[726]
[ 15.899586] scx_exit+0x50/0x70
[ 15.899655] task_can_run_on_remote_rq+0x8c/0x180
[ 15.899735] dispatch_to_local_dsq+0x61/0x1f0
[ 15.899900] flush_dispatch_buf+0x15e/0x190
[ 15.899994] pick_task_scx+0x2b2/0x890
[ 15.900058] __schedule+0x683/0x1250
[ 15.900135] schedule_idle+0x22/0x40
[ 15.900263] cpu_startup_entry+0x29/0x30
[ 15.900330] start_secondary+0xf8/0x100
[ 15.900394] common_startup_64+0x13e/0x148
[ 15.900539] BUG: kernel NULL pointer dereference, address: 00000000000001c0
[ 15.900660] #PF: supervisor read access in kernel mode
[ 15.900724] #PF: error_code(0x0000) - not-present page
[ 15.900787] PGD 0 P4D 0
[ 15.900822] Oops: Oops: 0000 [#1] SMP NOPTI
[ 15.900872] CPU: 9 UID: 1000 PID: 350 Comm: stress-ng-race- Not tainted 6.19.0-rc8-virtme #43 PREEMPT(voluntary)
[ 15.900992] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 15.901068] RIP: 0010:set_cpus_allowed_scx+0x1a/0xa0
[ 15.901148] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 8b 2d 37 39 6e 02 53 48 89 fb e8 16 9b fe ff <48> 8b 85 c0 01 00 00 f6 c4 10 74 50 65 48 8b 05 ba c9 4c 02 8b b0
[ 15.901378] RSP: 0018:ffffd432c0e27df8 EFLAGS: 00010086
[ 15.901442] RAX: ffff8cbc827db0d0 RBX: ffff8cbc86870000 RCX: ffff8cbc827db280
[ 15.901537] RDX: ffff8cbc86870000 RSI: ffffd432c0e27eb8 RDI: 0000000000000200
[ 15.901624] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[ 15.901713] R10: 0000000000000001 R11: 0000000000000001 R12: ffffd432c0e27eb8
[ 15.901807] R13: ffffd432c0e27e50 R14: ffff8cbcba218500 R15: 0000000000000000
[ 15.901900] FS: 00007f398e11eb00(0000) GS:ffff8cbd23723000(0000) knlGS:0000000000000000
[ 15.901998] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 15.902072] CR2: 00000000000001c0 CR3: 0000000103f4b000 CR4: 0000000000750ef0
[ 15.902195] PKRU: 55555554
[ 15.902232] Call Trace:
[ 15.902268] <TASK>
[ 15.902302] __set_cpus_allowed_ptr_locked+0x142/0x1c0
[ 15.902368] __set_cpus_allowed_ptr+0x64/0xa0
[ 15.902435] __sched_setaffinity+0x72/0x100
[ 15.902489] sched_setaffinity+0x281/0x360
[ 15.902543] __x64_sys_sched_setaffinity+0x50/0x80
[ 15.902608] do_syscall_64+0xbd/0xf80
[ 15.902660] entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> Thanks.
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 136b01950a62..1fc2b358a175 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -4234,7 +4234,13 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	 * Here, every runnable task is guaranteed to make forward progress and
>  	 * we can safely use blocking synchronization constructs. Actually
>  	 * disable ops.
> +	 *
> +	 * Wait for all CPUs to observe %SCX_DISABLING. Otherwise,
> +	 * task_should_scx() can see %SCX_ENABLED and __sched_setscheduler() put
> +	 * a task into sched_ext while we're migrating tasks out below.
>  	 */
> +	synchronize_rcu();
> +
>  	mutex_lock(&scx_enable_mutex);
>  
>  	static_branch_disable(&__scx_switched_all);
-Andrea
On Mon, Feb 02, 2026 at 11:50:05PM +0100, Andrea Righi wrote:
> On Mon, Feb 02, 2026 at 10:52:04AM -1000, Tejun Heo wrote:
> > On Mon, Feb 02, 2026 at 07:54:50PM +0100, Andrea Righi wrote:
> > > I'm able to reproduce the NULL pointer dereference in set_cpus_allowed_scx()
> > > quite easily running `stress-ng --race-sched 0` with an scx scheduler that
> > > is intentionally starving tasks, triggering a stall => disable.
> > >
> > > I think this is what's happening:
> > >
> > > CPU0                                       CPU1
> > > ----                                       ----
> > > __sched_setscheduler()
> > >   task_rq_lock(p)
> > >
> > >   next_class = __setscheduler_class()
> > >   // next_class is ext_sched_class
> > >                                            scx_disable_workfn()
> > >                                              scx_set_enable_state(SCX_DISABLING)
> > >
> > >                                              scx_task_iter_start()
> > >                                              while ((p = next())) {
> > >                                                ...
> > >                                                p->sched_class = fair_sched_class
> > >                                                ...
> > >                                              }
> > >                                              scx_task_iter_stop()
> > >
> > >                                              synchronize_rcu()
> > >                                              RCU_INIT_POINTER(scx_root, NULL)
> > >
> > >   scoped_guard(sched_change, ...) {
> > >     p->sched_class = next_class;
> > >     // next_class is still ext_sched_class,
> > >     // overwriting fair_sched_class!
> > >   }
> > >   // Guard ends, calls sched_change_end()
> > >   // switching_to_scx() called
> > >   // scx_root == NULL => returns early
> > >
> > >   task_rq_unlock(p)
> > >
> > > sched_setaffinity(p)
> > >   set_cpus_allowed_scx()
> > >     sch = scx_root; // scx_root == NULL => BUG!
> >
> > Does the following patch fix the issue?
>
> Nope, I can still trigger this (with the same modified scx_simple +
> stress-ng --race-sched 0):
A quick reproducer:
https://github.com/sched-ext/scx/tree/scx-bug
$ make
$ vng -vr -- "stress-ng --race-sched 0 & ./build/scheds/c/scx_bug"
...
[ 3.375119] BUG: kernel NULL pointer dereference, address: 00000000000001c0
[ 3.375836] RIP: 0010:set_cpus_allowed_scx+0x1a/0xa0
It happens almost immediately.
-Andrea
On Tue, Feb 03, 2026 at 03:01:30PM +0100, Andrea Righi wrote:
> > Nope, I can still trigger this (with the same modified scx_simple +
> > stress-ng --race-sched 0):
>
> A quick reproducer:
> https://github.com/sched-ext/scx/tree/scx-bug
>
> $ make
> $ vng -vr -- "stress-ng --race-sched 0 & ./build/scheds/c/scx_bug"
> ...
> [    3.375119] BUG: kernel NULL pointer dereference, address: 00000000000001c0
> [    3.375836] RIP: 0010:set_cpus_allowed_scx+0x1a/0xa0
>
> It happens almost immediately.

This should fix it:

  http://lkml.kernel.org/r/fc034891cef55029c16122f4279e4057@kernel.org

Thanks.

--
tejun