Some BPF schedulers need to react when a task migrates between CPUs,
for example to update a per-CPU map. If SCX_OPS_TRACK_MIGRATION is
set, ops.runnable() and ops.quiescent() are called regardless of
whether the task is migrating; by default they are skipped for
migrations.
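
As an illustration only (not part of this patch): a scheduler keeping a
per-CPU count of runnable tasks, migrations included, could look roughly
like the sketch below. It assumes the usual BPF scheduler conventions from
scx/common.bpf.h (BPF_STRUCT_OPS, SCX_OPS_DEFINE, the scx_bpf_task_cpu()
kfunc); the map and op names (nr_runnable, track_*) and MAX_CPUS are made
up.

/* Hypothetical sketch, not part of this patch. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define MAX_CPUS 512	/* assumed upper bound, adjust for the machine */

/* one runnable-task counter per CPU, indexed by the task's CPU */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, MAX_CPUS);
	__type(key, u32);
	__type(value, u64);
} nr_runnable SEC(".maps");

static void adjust_cpu_count(struct task_struct *p, s64 delta)
{
	u32 cpu = scx_bpf_task_cpu(p);
	u64 *cnt = bpf_map_lookup_elem(&nr_runnable, &cpu);

	if (cnt)
		__sync_fetch_and_add(cnt, delta);
}

void BPF_STRUCT_OPS(track_runnable, struct task_struct *p, u64 enq_flags)
{
	/*
	 * With SCX_OPS_TRACK_MIGRATION this also fires when @p is enqueued
	 * on its destination CPU as part of a migration.
	 */
	adjust_cpu_count(p, 1);
}

void BPF_STRUCT_OPS(track_quiescent, struct task_struct *p, u64 deq_flags)
{
	/* ... and this when @p is dequeued from the source CPU. */
	adjust_cpu_count(p, -1);
}

SCX_OPS_DEFINE(track_ops,
	       .runnable	= (void *)track_runnable,
	       .quiescent	= (void *)track_quiescent,
	       .flags		= SCX_OPS_TRACK_MIGRATION,
	       .name		= "track_migration");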
Signed-off-by: Henry Huang <henry.hj@antgroup.com>
---
kernel/sched/ext.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b498d86..42c5251 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -161,6 +161,12 @@ enum scx_ops_flags {
SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
+ * If set, the runnable/quiescent ops are called regardless of whether
+ * the task is migrating between CPUs.
+ */
+ SCX_OPS_TRACK_MIGRATION = 1LLU << 7,
+
+ /*
* CPU cgroup support flags
*/
SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
@@ -172,6 +178,7 @@ enum scx_ops_flags {
SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_TRACK_MIGRATION |
SCX_OPS_HAS_CGROUP_WEIGHT,
/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
@@ -2390,7 +2397,11 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ if (task_on_rq_migrating(p))
+ enq_flags |= ENQUEUE_MIGRATING;
+
+ if (SCX_HAS_OP(sch, runnable) &&
+ ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & ENQUEUE_MIGRATING)))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2463,6 +2474,9 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
return true;
}
+ if (task_on_rq_migrating(p))
+ deq_flags |= DEQUEUE_MIGRATING;
+
ops_dequeue(rq, p, deq_flags);
/*
@@ -2482,7 +2496,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ if (SCX_HAS_OP(sch, quiescent) &&
+ ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & DEQUEUE_MIGRATING)))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
@@ -5495,6 +5510,11 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return -EINVAL;
}
+ if ((ops->flags & SCX_OPS_TRACK_MIGRATION) && (!ops->runnable || !ops->quiescent)) {
+ scx_error(sch, "SCX_OPS_TRACK_MIGRATION requires ops.runnable() and ops.quiescent() to be implemented");
+ return -EINVAL;
+ }
+
if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
--
Henry
Hi Henry,

On Mon, Jun 23, 2025 at 12:32:20PM +0800, Henry Huang wrote:
> Some BPF schedulers need to react when a task migrates between CPUs,
> for example to update a per-CPU map. If SCX_OPS_TRACK_MIGRATION is
> set, ops.runnable() and ops.quiescent() are called regardless of
> whether the task is migrating; by default they are skipped for
> migrations.
>
> Signed-off-by: Henry Huang <henry.hj@antgroup.com>

Looks good. One last thing, since we're exposing ENQUEUE_MIGRATING and
DEQUEUE_MIGRATING to BPF, can you introduce some scx enums, similar to
what we're doing with DEQUEUE_SLEEP? Something like the following.

With that you can add my:

Reviewed-by: Andrea Righi <arighi@nvidia.com>

Thanks,
-Andrea

kernel/sched/ext.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2f0ab232a2786..bf04cde71b34a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -918,8 +918,11 @@ enum scx_enq_flags {
};
enum scx_deq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_MIGRATING = ENQUEUE_MIGRATING,
/* expose select DEQUEUE_* flags as enums */
SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+ SCX_DEQ_MIGRATING = DEQUEUE_MIGRATING,
/* high 32bits are SCX specific */
@@ -2385,10 +2388,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
add_nr_running(rq, 1);
if (task_on_rq_migrating(p))
- enq_flags |= ENQUEUE_MIGRATING;
+ enq_flags |= SCX_ENQ_MIGRATING;
if (SCX_HAS_OP(sch, runnable) &&
- ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & ENQUEUE_MIGRATING)))
+ ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & SCX_ENQ_MIGRATING)))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2471,7 +2474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
}
if (task_on_rq_migrating(p))
- deq_flags |= DEQUEUE_MIGRATING;
+ deq_flags |= SCX_DEQ_MIGRATING;
ops_dequeue(rq, p, deq_flags);
@@ -2493,7 +2496,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
}
if (SCX_HAS_OP(sch, quiescent) &&
- ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & DEQUEUE_MIGRATING)))
+ ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & SCX_DEQ_MIGRATING)))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
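
For illustration, with the SCX_ENQ_MIGRATING/SCX_DEQ_MIGRATING enums above
a scheduler can also tell ordinary wakeups apart from migrations inside the
callbacks. A hypothetical fragment, building on the earlier sketch (the
counter names are made up, and it assumes the enums are visible to BPF via
vmlinux.h):

u64 nr_wakeups, nr_migrations_in;	/* made-up global counters */

void BPF_STRUCT_OPS(track_runnable, struct task_struct *p, u64 enq_flags)
{
	if (enq_flags & SCX_ENQ_MIGRATING) {
		/* @p becomes runnable here only because it is migrating in */
		__sync_fetch_and_add(&nr_migrations_in, 1);
	} else {
		/* an ordinary wakeup, the only case seen before this change */
		__sync_fetch_and_add(&nr_wakeups, 1);
	}
}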