kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)
When loading the eBPF scheduler, the tasks in the scx_tasks list are
traversed and __setscheduler_class() is invoked to obtain the new
sched_class. However, this also incorrectly sets the per-CPU migration
task's sched_class to rt_sched_class, and even after unload the per-CPU
migration task's sched_class remains rt_sched_class.
The log for this issue is as follows:
./scx_rustland --stats 1
[ 199.245639][ T630] sched_ext: "rustland" does not implement cgroup cpu.weight
[ 199.269213][ T630] sched_ext: BPF scheduler "rustland" enabled
04:25:09 [INFO] RustLand scheduler attached
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
sched_ext: BPF scheduler "rustland" disabled (unregistered from user space)
EXIT: unregistered from user space
04:25:21 [INFO] Unregister RustLand scheduler
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
This commit therefore introduces a new scx_setscheduler_class() helper
that adds a check for stop_sched_class, replacing the direct calls to
__setscheduler_class().
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
---
kernel/sched/ext.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b40d35964cd4..9447fada0050 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -248,6 +248,14 @@ static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
+}
+
/*
* scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
* ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -4241,8 +4249,7 @@ static void scx_disable_workfn(struct kthread_work *work)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
update_rq_clock(task_rq(p));
@@ -5045,8 +5052,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
if (scx_get_task_state(p) != SCX_TASK_READY)
continue;
--
2.17.1
Hi Zqiang,
On Mon, Dec 01, 2025 at 07:25:40PM +0800, Zqiang wrote:
> When loading the ebpf scheduler, the tasks in the scx_tasks list will
> be traversed and invoke __setscheduler_class() to get new sched_class.
> however, this would also incorrectly set the per-cpu migration
> task's->sched_class to rt_sched_class, even after unload, the per-cpu
> migration task's->sched_class remains sched_rt_class.
>
> The log for this issue is as follows:
>
> ./scx_rustland --stats 1
> [ 199.245639][ T630] sched_ext: "rustland" does not implement cgroup cpu.weight
> [ 199.269213][ T630] sched_ext: BPF scheduler "rustland" enabled
> 04:25:09 [INFO] RustLand scheduler attached
>
> bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
> { printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
> Attaching 1 probe...
> migration/0:24->rt_sched_class+0x0/0xe0
> migration/1:27->rt_sched_class+0x0/0xe0
> migration/2:33->rt_sched_class+0x0/0xe0
> migration/3:39->rt_sched_class+0x0/0xe0
> migration/4:45->rt_sched_class+0x0/0xe0
> migration/5:52->rt_sched_class+0x0/0xe0
> migration/6:58->rt_sched_class+0x0/0xe0
> migration/7:64->rt_sched_class+0x0/0xe0
>
> sched_ext: BPF scheduler "rustland" disabled (unregistered from user space)
> EXIT: unregistered from user space
> 04:25:21 [INFO] Unregister RustLand scheduler
>
> bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
> { printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
> Attaching 1 probe...
> migration/0:24->rt_sched_class+0x0/0xe0
> migration/1:27->rt_sched_class+0x0/0xe0
> migration/2:33->rt_sched_class+0x0/0xe0
> migration/3:39->rt_sched_class+0x0/0xe0
> migration/4:45->rt_sched_class+0x0/0xe0
> migration/5:52->rt_sched_class+0x0/0xe0
> migration/6:58->rt_sched_class+0x0/0xe0
> migration/7:64->rt_sched_class+0x0/0xe0
>
> This commit therefore generate a new scx_setscheduler_class() and
> add check for stop_sched_class to replace __setscheduler_class().
>
> Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Good catch! It looks like we have had this since the beginning...
Maybe we should add:
Fixes: f0e1a0643a59b ("sched_ext: Implement BPF extensible scheduler class")
In any case, the fix looks correct to me.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Thanks,
-Andrea
> ---
> kernel/sched/ext.c | 14 ++++++++++----
> 1 file changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index b40d35964cd4..9447fada0050 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -248,6 +248,14 @@ static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
> return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
> }
>
> +static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
> +{
> + if (p->sched_class == &stop_sched_class)
> + return &stop_sched_class;
> +
> + return __setscheduler_class(p->policy, p->prio);
> +}
> +
> /*
> * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
> * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
> @@ -4241,8 +4249,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> while ((p = scx_task_iter_next_locked(&sti))) {
> unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
> const struct sched_class *old_class = p->sched_class;
> - const struct sched_class *new_class =
> - __setscheduler_class(p->policy, p->prio);
> + const struct sched_class *new_class = scx_setscheduler_class(p);
>
> update_rq_clock(task_rq(p));
>
> @@ -5045,8 +5052,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> while ((p = scx_task_iter_next_locked(&sti))) {
> unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
> const struct sched_class *old_class = p->sched_class;
> - const struct sched_class *new_class =
> - __setscheduler_class(p->policy, p->prio);
> + const struct sched_class *new_class = scx_setscheduler_class(p);
>
> if (scx_get_task_state(p) != SCX_TASK_READY)
> continue;
> --
> 2.17.1
>
© 2016 - 2025 Red Hat, Inc.