sched_ext tasks can currently be starved by RT tasks: the same workload
that runs fine under CFS can get zero runtime once converted to EXT if RT
tasks keep the CPU 100% busy, causing EXT processes to stall. Fix it by
adding a DL server for the EXT class, mirroring the existing fair server.
Only one of the EXT and CFS DL servers is active at a time; their
bandwidth is added and removed accordingly as sched_ext programs are
loaded and unloaded.
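For illustration, the hand-off amounts to something like the following
sketch; the helper name and the fixed 5% (50ms/1s) parameters are
illustrative only and not the exact code in this series (locking and
error handling omitted):

/*
 * Illustrative sketch only: swap dl_server bandwidth between the CFS and
 * EXT servers as a sched_ext scheduler comes and goes. The helper name
 * and the fixed 5% parameters are assumptions, not code in this series.
 */
static void scx_swap_dl_servers(bool scx_active)
{
	u64 runtime = 50ULL * NSEC_PER_MSEC;	/* assumed 5% default */
	u64 period = 1000ULL * NSEC_PER_MSEC;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct sched_dl_entity *on, *off;

		on = scx_active ? &rq->ext_server : &rq->fair_server;
		off = scx_active ? &rq->fair_server : &rq->ext_server;

		/* Caller is assumed to hold the relevant rq/DL locks. */
		dl_server_apply_params(off, 0, period, false);	/* drop bandwidth */
		dl_server_apply_params(on, runtime, period, false); /* grant it */
	}
}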
A kselftest is also added later in the series to verify the fix:
./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
ok 1 PASS: CFS task got more than 4.00% of runtime
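The test's approach is roughly the following; this is an illustrative
userspace sketch of the idea (not the actual kselftest source): pin a
SCHED_FIFO hog and a normal-policy task to the same CPU and verify the
normal task still receives a minimum share of runtime. With a sched_ext
scheduler loaded, the normal-policy task runs in the EXT class.

/* Illustrative sketch of the rt_stall idea, not the actual kselftest. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define WINDOW_SEC 5
#define MIN_PCT    4.0	/* just below the assumed 5% dl_server bandwidth */

static void pin_cpu0(void)
{
	cpu_set_t s;

	CPU_ZERO(&s);
	CPU_SET(0, &s);
	sched_setaffinity(0, sizeof(s), &s);
}

static void spin_wall(int secs)
{
	struct timespec t0, t;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	do
		clock_gettime(CLOCK_MONOTONIC, &t);
	while (t.tv_sec - t0.tv_sec < secs);
}

static double ts_delta(const struct timespec *a, const struct timespec *b)
{
	return (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec) / 1e9;
}

int main(void)
{
	struct timespec w0, w1, c0, c1;
	double pct;
	pid_t rt = fork();

	if (!rt) {		/* child: SCHED_FIFO hog on CPU 0 */
		struct sched_param sp = { .sched_priority = 50 };

		pin_cpu0();
		if (sched_setscheduler(0, SCHED_FIFO, &sp))
			exit(1);	/* needs root */
		spin_wall(WINDOW_SEC + 2);
		exit(0);
	}

	pin_cpu0();		/* parent: normal-policy spinner, same CPU */
	sleep(1);		/* let the RT hog saturate the CPU */

	clock_gettime(CLOCK_MONOTONIC, &w0);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c0);
	spin_wall(WINDOW_SEC);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c1);
	clock_gettime(CLOCK_MONOTONIC, &w1);

	/* CPU time consumed as a percentage of elapsed wall time */
	pct = 100.0 * ts_delta(&c0, &c1) / ts_delta(&w0, &w1);

	kill(rt, SIGKILL);
	waitpid(rt, NULL, 0);

	printf("%s: task got %.2f%% of runtime\n",
	       pct > MIN_PCT ? "PASS" : "FAIL", pct);
	return pct > MIN_PCT ? 0 : 1;
}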
Cc: Luigi De Matteis <ldematteis123@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/core.c | 3 ++
kernel/sched/deadline.c | 2 +-
kernel/sched/ext.c | 62 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 2 ++
4 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 19b393b0b096..17e7cab0ddf5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8645,6 +8645,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3181384881b8..b703cbf627c1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1570,7 +1570,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
-	 * The fair server (sole dl_server) does not account for real-time
-	 * workload because it is running fair work.
+	 * The fair and ext servers do not account for real-time workload
+	 * because they run non-RT (fair or ext) work.
	 */
-	if (dl_se == &rq->fair_server)
+	if (dl_se == &rq->fair_server || dl_se == &rq->ext_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d765379cd94c..52f98c3944ed 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1830,6 +1830,9 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ if (dl_server_active(&rq->ext_server))
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -2308,6 +2311,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ if (rq->scx.nr_running == 1) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+ /* Start dl_server if this is the first task being enqueued */
+ dl_server_start(&rq->ext_server);
+ }
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2403,6 +2415,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);
dispatch_dequeue(rq, p);
+
+ /* Stop the server if this was the last task */
+ if (rq->scx.nr_running == 0)
+ dl_server_stop(&rq->ext_server);
+
return true;
}
@@ -3894,6 +3911,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_ops_disable_task(p);
+
+	/*
+	 * After a class switch, if the DL server is still active, restart it so
+	 * its DL timers are requeued in case the task moved to a higher class.
+	 */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
@@ -7106,8 +7132,8 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
* schedutil cpufreq governor chooses the target frequency.
*
- * The actual performance level chosen, CPU grouping, and the overhead and
- * latency of the operations are dependent on the hardware and cpufreq driver in
+ * The actual performance level chosen, CPU grouping, and the overhead and latency
+ * of the operations are dependent on the hardware and cpufreq driver in
* use. Consult hardware and cpufreq documentation for more information. The
* current performance level can be monitored using scx_bpf_cpuperf_cur().
*/
@@ -7385,6 +7411,38 @@ BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
+/*
+ * Check if ext scheduler has tasks ready to run.
+ */
+static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->scx.nr_running;
+}
+
+/*
+ * Select the next task to run from the ext scheduling class.
+ */
+static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
+ void *flags)
+{
+ struct rq_flags *rf = flags;
+
+ balance_scx(dl_se->rq, dl_se->rq->curr, rf);
+ return pick_task_scx(dl_se->rq, rf);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task);
+}
+
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 467e39205ebf..d206421b1146 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -389,6 +389,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p,
struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1137,6 +1138,7 @@ struct rq {
#endif
struct sched_dl_entity fair_server;
+ struct sched_dl_entity ext_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
--
2.43.0
On Mon, Jun 02, 2025 at 02:00:59PM -0400, Joel Fernandes wrote:
...
> @@ -2308,6 +2311,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
> if (enq_flags & SCX_ENQ_WAKEUP)
> touch_core_sched(rq, p);
>
> + if (rq->scx.nr_running == 1) {
> + /* Account for idle runtime */
> + if (!rq->nr_running)
> + dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
> +
> + /* Start dl_server if this is the first task being enqueued */
> + dl_server_start(&rq->ext_server);
> + }
The following patch from Peter isn't upstream yet but SCX probably should do
something similar. Otherwise, the start/stop overhead can become pretty
expensive:
https://lore.kernel.org/all/20250520094538.086709102@infradead.org/
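Roughly, the idea there would translate to something like the below for
the ext server; this is a sketch based on that series, the dl_server_idle
field comes from Peter's patch and the exact placement is a guess:

static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
						void *flags)
{
	struct rq *rq = dl_se->rq;
	struct rq_flags *rf = flags;

	if (!rq->scx.nr_running) {
		/*
		 * Instead of dl_server_stop() in dequeue_task_scx(), stop
		 * lazily only after a full idle period, so short sleep/wake
		 * cycles don't pay the start/stop cost.
		 */
		if (dl_se->dl_server_idle)
			dl_server_stop(dl_se);
		else
			dl_se->dl_server_idle = 1;
		return NULL;
	}

	dl_se->dl_server_idle = 0;	/* busy again, keep the server armed */
	balance_scx(rq, rq->curr, rf);
	return pick_task_scx(rq, rf);
}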
Another thing worth considering: while an rq->nr_running based test would
work in a lot of cases, it won't work in all cases for SCX, as the BPF
scheduler may choose not to dispatch to a particular CPU even if a task
is currently associated with it.
For example, a soft partitioning scheduler might change partition CPU
allocations after enqueue() is complete and a task may end up associated
with a CPU that's no longer in its partition and when dispatch() is called
from the CPU, the BPF scheduler may not consume that task. This can become a
problem for the dl server based forward progress guarantee as that task is
enabling the dl server only on the rq that it's currently associated with.
This shouldn't be too common, and the proposed patch puts us back in the
same state as the original RT bandwidth control, so there's no need to
hold this series for this issue. However, I think the right solution
would be adding an optional SCX operation so that the BPF scheduler can
decide which CPUs should be running the dl server, something like the
sketch below.
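For example (the op name and the wiring through ext_server_has_tasks()
are just illustrative, not a proposal for the exact interface):

/*
 * Illustrative only: a hypothetical optional member of struct
 * sched_ext_ops letting the BPF scheduler veto the dl server per CPU:
 *
 *	bool (*dl_server_allowed)(s32 cpu);
 */
static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
{
	struct rq *rq = dl_se->rq;

	/* e.g. a soft partitioning scheduler excludes foreign CPUs here */
	if (SCX_HAS_OP(dl_server_allowed) &&
	    !SCX_CALL_OP_RET(SCX_KF_ANY, dl_server_allowed, cpu_of(rq)))
		return false;

	return !!rq->scx.nr_running;
}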
Thanks.
--
tejun
On 6/2/2025 8:23 PM, Tejun Heo wrote:
> On Mon, Jun 02, 2025 at 02:00:59PM -0400, Joel Fernandes wrote:
> ...
>> @@ -2308,6 +2311,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>> if (enq_flags & SCX_ENQ_WAKEUP)
>> touch_core_sched(rq, p);
>>
>> + if (rq->scx.nr_running == 1) {
>> + /* Account for idle runtime */
>> + if (!rq->nr_running)
>> + dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
>> +
>> + /* Start dl_server if this is the first task being enqueued */
>> + dl_server_start(&rq->ext_server);
>> + }
> The following patch from Peter isn't upstream yet but SCX probably should do
> something similar. Otherwise, the start/stop overhead can become pretty
> expensive:
>
> https://lore.kernel.org/all/20250520094538.086709102@infradead.org/
Right. If it is OK with you, we can do that after this patchset is
merged; that way we can also use the 'dl_server_idle' addition to
'sched_dl_entity' from that patch.
thanks,
- Joel
On Thu, Jun 12, 2025 at 12:54:44PM -0400, Joel Fernandes wrote:
> > https://lore.kernel.org/all/20250520094538.086709102@infradead.org/
>
> Right. If it is OK with you, we can do that after this patchset is
> merged; that way we can also use the 'dl_server_idle' addition to
> 'sched_dl_entity' from that patch.

Oh yeah, any ordering is fine by me. Peter, how should we route these
patches?

Thanks.

--
tejun