sched_ext currently suffers starvation due to RT. The same workload when
converted to EXT can get zero runtime if RT is 100% running, causing EXT
processes to stall. Fix it by adding a DL server for EXT.
A kselftest is also provided later to verify:
./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
ok 1 PASS: CFS task got more than 4.00% of runtime
Cc: Luigi De Matteis <ldematteis123@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/core.c | 3 ++
kernel/sched/deadline.c | 2 +-
kernel/sched/ext.c | 62 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 2 ++
4 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d856d8dcb94..a1ee241a63ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8650,6 +8650,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 562e1e7196b1..c61752c2e052 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1570,7 +1570,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* The fair server (sole dl_server) does not account for real-time
* workload because it is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se == &rq->fair_server || dl_se == &rq->ext_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f00bb75ad539..34c95100fbe5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1909,6 +1909,9 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ if (dl_server_active(&rq->ext_server))
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -2396,6 +2399,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ if (rq->scx.nr_running == 1) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+ /* Start dl_server if this is the first task being enqueued */
+ dl_server_start(&rq->ext_server);
+ }
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2495,6 +2507,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);
dispatch_dequeue(rq, p);
+
+ /* Stop the server if this was the last task */
+ if (rq->scx.nr_running == 0)
+ dl_server_stop(&rq->ext_server);
+
return true;
}
@@ -4050,6 +4067,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_disable_task(p);
+
+ /*
+ * After class switch, if the DL server is still active, restart it so
+ * that DL timers will be queued, in case SCX switched to higher class.
+ */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -7310,8 +7336,8 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
* schedutil cpufreq governor chooses the target frequency.
*
- * The actual performance level chosen, CPU grouping, and the overhead and
- * latency of the operations are dependent on the hardware and cpufreq driver in
+ * The actual performance level chosen, CPU grouping, and the overhead and latency
+ * of the operations are dependent on the hardware and cpufreq driver in
* use. Consult hardware and cpufreq documentation for more information. The
* current performance level can be monitored using scx_bpf_cpuperf_cur().
*/
@@ -7603,6 +7629,38 @@ BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
+/*
+ * Check if ext scheduler has tasks ready to run.
+ */
+static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->scx.nr_running;
+}
+
+/*
+ * Select the next task to run from the ext scheduling class.
+ */
+static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
+ void *flags)
+{
+ struct rq_flags *rf = flags;
+
+ balance_scx(dl_se->rq, dl_se->rq->curr, rf);
+ return pick_task_scx(dl_se->rq, rf);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task);
+}
+
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1ac2fb398982..576b69cee6b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -389,6 +389,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p,
struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1137,6 +1138,7 @@ struct rq {
#endif
struct sched_dl_entity fair_server;
+ struct sched_dl_entity ext_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
--
2.43.0
On Fri, Jun 20, 2025 at 04:32:23PM -0400, Joel Fernandes wrote: > +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se, > + void *flags) > +{ > + struct rq_flags *rf = flags; > + > + balance_scx(dl_se->rq, dl_se->rq->curr, rf); > + return pick_task_scx(dl_se->rq, rf); > +} ... > +void ext_server_init(struct rq *rq) > +{ > + struct sched_dl_entity *dl_se = &rq->ext_server; > + > + init_dl_entity(dl_se); > + > + dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task); > +} Needing to pass in @rf to ext_server_pick_task() makes sense as SCX always needs to balance first. However, I still don't understand why that necessitates adding @rf to sched_class->pick_task(). The existing assumption is that ->pick_task() cannot release and regrab the rq lock and that's why ->balance() exists in the first place. Breaking this can lead to other misbehaviors - e.g. if a higher priority class ->pick_task() releases the rq lock, a lower one may end up losing a task that its ->balance() saw. This can lead to lower priority class's ->pick_task() being called without its ->balance() being called which can lead to stalls. One thing that confuses me is that all that the patchset needs to do seems to be adding @rf to dl_server_pick_f and that seems fine to me. Why is it necessary to add @rf to sched_class->pick_task()? Thanks. -- tejun
On Mon, Jun 23, 2025 at 12:11:07PM -1000, Tejun Heo wrote: > On Fri, Jun 20, 2025 at 04:32:23PM -0400, Joel Fernandes wrote: > > +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se, > > + void *flags) > > +{ > > + struct rq_flags *rf = flags; > > + > > + balance_scx(dl_se->rq, dl_se->rq->curr, rf); > > + return pick_task_scx(dl_se->rq, rf); > > +} > ... > > +void ext_server_init(struct rq *rq) > > +{ > > + struct sched_dl_entity *dl_se = &rq->ext_server; > > + > > + init_dl_entity(dl_se); > > + > > + dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task); > > +} > > Needing to pass in @rf to ext_server_pick_task() makes sense as SCX always > needs to balance first. However, I still don't understand why that > necessitates adding @rf to sched_class->pick_task(). The existing assumption > is that ->pick_task() cannot release and regrab the rq lock and that's why > ->balance() exists in the first place. Breaking this can lead to other > misbehaviors - e.g. if a higher priority class ->pick_task() releases the rq > lock, a lower one may end up losing a task that its ->balance() saw. This > can lead to lower priority class's ->pick_task() being called without its > ->balance() being called which can lead to stalls. > > One thing that confuses me is that all that the patchset needs to do seems > to be adding @rf to dl_server_pick_f and that seems fine to me. Why is it > necessary to add @rf to sched_class->pick_task()? Because ext_server_pick_task is called via DL's pick_task? In deadline.c, pick_task_dl -> _pick_task_dl -> ext_server_pick_task This changes the signature of the pick_task_dl function, which in turn changes the signature of class->pick_task. How about I pass NULL to pick_task_scx() from ext_server_pick_task(), and also annotate all functions where rf is unused, by naming the argument as rf_unused (except for DL), would that make it more clear that the rq lock should not be arbitrarily dropped just because rf was passed? And perhaps sprinkling some more code comments. thanks, - Joel
Hello, Joel. On Mon, Jun 30, 2025 at 11:12:52AM -0400, Joel Fernandes wrote: ... > > One thing that confuses me is that all that the patchset needs to do seems > > to be adding @rf to dl_server_pick_f and that seems fine to me. Why is it > > necessary to add @rf to sched_class->pick_task()? > > Because ext_server_pick_task is called via DL's pick_task? > > In deadline.c, pick_task_dl -> _pick_task_dl -> ext_server_pick_task Ah, right, sorry about being so dense. > This changes the signature of the pick_task_dl function, which in turn > changes the signature of class->pick_task. > > How about I pass NULL to pick_task_scx() from ext_server_pick_task(), and > also annotate all functions where rf is unused, by naming the argument as > rf_unused (except for DL), would that make it more clear that the rq lock > should not be arbitrarily dropped just because rf was passed? And perhaps > sprinkling some more code comments. I think what bothers me is that this erases the distinction between ->balance() and ->pick_task(). However, I'm not sure the distinction means anything anymore especially given the removal of !CONFIG_SMP paths. Looking at the balance callsites, I think we can just fold each ->balance() into the head of the corresponding ->pick_task(). Peter, what do you think? Thanks. -- tejun
Hi Tejun, On 6/30/2025 11:38 AM, Tejun Heo wrote: >> This changes the signature of the pick_task_dl function, which in turn >> changes the signature of class->pick_task. >> >> How about I pass NULL to pick_task_scx() from ext_server_pick_task(), and >> also annotate all functions where rf is unused, by naming the argument as >> rf_unused (except for DL), would that make it more clear that the rq lock >> should not be arbitrarily dropped just because rf was passed? And perhaps >> sprinkling some more code comments. > > I think what bothers me is that this erases the distinction between > ->balance() and ->pick_task(). However, I'm not sure the distinction means > anything anymore especially given the removal of !CONFIG_SMP paths. Looking > at the balance callsites, I think we can just fold each ->balance() into the > head of the corresponding ->pick_task(). Peter, what do you think? I think one added complexity with that would be, the ->balance() would be have to be embedded into the other sister ->pick_next_task() as well? And core scheduling paths add complexity there, I'd vouch for keeping it separate but curious what Peter has to say as well. thanks, - Joel
© 2016 - 2025 Red Hat, Inc.