sched_ext currently suffers starvation due to RT. The same workload when
converted to EXT can get zero runtime if RT is 100% running, causing EXT
processes to stall. Fix it by adding a DL server for EXT.
A kselftest is also provided in a later patch to verify the fix:
./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
ok 1 PASS: CFS task got more than 4.00% of runtime
Cc: Luigi De Matteis <ldematteis123@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/core.c | 3 ++
kernel/sched/deadline.c | 2 +-
kernel/sched/ext.c | 62 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 2 ++
4 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d19e4b7a0020..09bff60c22d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8617,6 +8617,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef592751417f..bcb66d9692ae 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1571,7 +1571,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* The fair server (sole dl_server) does not account for real-time
* workload because it is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se == &rq->fair_server || dl_se == &rq->ext_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 636b08977d19..553d3e6087fe 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1677,6 +1677,9 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ if (dl_server_active(&rq->ext_server))
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -2147,6 +2150,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ if (rq->scx.nr_running == 1) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+ /* Start dl_server if this is the first task being enqueued */
+ dl_server_start(&rq->ext_server);
+ }
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2238,6 +2250,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);
dispatch_dequeue(rq, p);
+
+ /* Stop the server if this was the last task */
+ if (rq->scx.nr_running == 0)
+ dl_server_stop(&rq->ext_server);
+
return true;
}
@@ -4207,6 +4224,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_ops_disable_task(p);
+
+ /*
+ * After class switch, if the DL server is still active, restart it so
+ * that DL timers will be queued, in case SCX switched to higher class.
+ */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
@@ -7440,8 +7466,8 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
* schedutil cpufreq governor chooses the target frequency.
*
- * The actual performance level chosen, CPU grouping, and the overhead and
- * latency of the operations are dependent on the hardware and cpufreq driver in
+ * The actual performance level chosen, CPU grouping, and the overhead and latency
+ * of the operations are dependent on the hardware and cpufreq driver in
* use. Consult hardware and cpufreq documentation for more information. The
* current performance level can be monitored using scx_bpf_cpuperf_cur().
*/
@@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_KFUNCS_END(scx_kfunc_ids_any)
+/*
+ * Check if ext scheduler has tasks ready to run.
+ */
+static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->scx.nr_running;
+}
+
+/*
+ * Select the next task to run from the ext scheduling class.
+ */
+static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
+ void *flags)
+{
+ struct rq_flags *rf = flags;
+
+ balance_scx(dl_se->rq, dl_se->rq->curr, rf);
+ return pick_task_scx(dl_se->rq, rf);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task);
+}
+
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3d1201b8f3d..8421eb56c50b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,6 +395,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p,
struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1141,6 +1142,7 @@ struct rq {
#endif
struct sched_dl_entity fair_server;
+ struct sched_dl_entity ext_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
--
2.43.0
Hi Joel,
On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
...
> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
> BTF_ID_FLAGS(func, scx_bpf_now)
> BTF_KFUNCS_END(scx_kfunc_ids_any)
>
> +/*
> + * Check if ext scheduler has tasks ready to run.
> + */
> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
> +{
> + return !!dl_se->rq->scx.nr_running;
> +}
> +
> +/*
> + * Select the next task to run from the ext scheduling class.
> + */
> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
> + void *flags)
> +{
> + struct rq_flags *rf = flags;
> +
It'd be nice to add a comment here to clarify that we need to call
balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
and consume tasks that may be pending in the BPF scheduler's DSQs,
otherwise pick_task_scx() may not find any scx task to run, reducing the
effectiveness of the dl_server.
> + balance_scx(dl_se->rq, dl_se->rq->curr, rf);
> + return pick_task_scx(dl_se->rq, rf);
> +}
Thanks,
-Andrea
On 3/15/2025 1:56 PM, Andrea Righi wrote:
> Hi Joel,
>
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> ...
>> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
>> BTF_ID_FLAGS(func, scx_bpf_now)
>> BTF_KFUNCS_END(scx_kfunc_ids_any)
>>
>> +/*
>> + * Check if ext scheduler has tasks ready to run.
>> + */
>> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
>> +{
>> + return !!dl_se->rq->scx.nr_running;
>> +}
>> +
>> +/*
>> + * Select the next task to run from the ext scheduling class.
>> + */
>> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
>> + void *flags)
>> +{
>> + struct rq_flags *rf = flags;
>> +
>
> It'd be nice to add a comment here to clarify that we need to call
> balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
> and consume tasks that may be pending in the BPF scheduler's DSQs,
> otherwise pick_task_scx() may not find any scx task to run, reducing the
> effectiveness of the dl_server.
Thanks for pointing this out, I will add rationale for the balance as you mentioned.
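For example, the rationale could be captured roughly like this (a sketch of
the comment wording, not the final version):

static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
						void *flags)
{
	struct rq_flags *rf = flags;

	/*
	 * Invoke balance_scx() before pick_task_scx() so that ops.dispatch()
	 * is triggered and tasks pending in the BPF scheduler's DSQs are
	 * consumed. Otherwise pick_task_scx() may find no scx task to run,
	 * reducing the effectiveness of the dl_server.
	 */
	balance_scx(dl_se->rq, dl_se->rq->curr, rf);
	return pick_task_scx(dl_se->rq, rf);
}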
- Joel
On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> sched_ext currently suffers starvation due to RT. The same workload when
> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> processes to stall. Fix it by adding a DL server for EXT.

This needs a lot more words on why you need a second server. Because I
don't think you do.
On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
>> sched_ext currently suffers starvation due to RT. The same workload when
>> converted to EXT can get zero runtime if RT is 100% running, causing EXT
>> processes to stall. Fix it by adding a DL server for EXT.
>
> This needs a lot more words on why you need a second server. Because I
> don't think you do.

Sure, I will add more words to the change log to explain rationale. When you
say "I don't think you do", do you mean that both FAIR and EXT could be
served by the same server?

If so, that will not handle the case where the system has both FAIR and EXT
tasks in the mix (EXT has a partial mode where certain tasks can be made EXT
with certain others left as FAIR) and FAIR runs 100% and starves EXT. We
still need bandwidth allocated to EXT in such a situation. So we do need an
EXT server.

Or did you mean something else?

thanks,

- Joel
On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
>
> On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> > On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> >> sched_ext currently suffers starvation due to RT. The same workload when
> >> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> >> processes to stall. Fix it by adding a DL server for EXT.
> >
> > This needs a lot more words on why you need a second server. Because I
> > don't think you do.
>
> Sure, I will add more words to the change log to explain rationale. When you
> say "I don't think you do", do you mean that both FAIR and EXT could be
> served by the same server?

Yeah, because now you get two deadline entities both having a reservation on
bandwidth. One of which is not going to be used -- this is not nice.

> If so, that will not handle the case where the system has both
> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.

Well, you did not mention that issue, you only babbled about RT.

I did point out that issue with ext, and TJ said this mixed mode wasn't
really meant to be used or somesuch. So if that's changed, this needs a
separate discussion.

Also; I gotta ask, why is nvidia looking at ext?
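To make the bandwidth concern concrete (assuming each dl_server keeps the
fair server's default reservation of 50 ms every 1 s, i.e. 5%; the actual
defaults may differ):

	fair_server: 50 ms / 1000 ms = 5%
	ext_server:  50 ms / 1000 ms = 5%
	total reserved for dl_servers: 10%

With both servers registered unconditionally, 10% of deadline bandwidth is
set aside in admission control even though, outside of mixed mode, only one
of the two classes can have runnable work at a time.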
Hello, Peter,

I replied to other parts of your email in another thread so I am just
replying to this part:

On 3/17/2025 11:31 AM, Peter Zijlstra wrote:
>
> Also; I gotta ask, why is nvidia looking at ext?

There are some complex CPU topologies which perform poorly with the existing
FAIR scheduler as reported by people (I have not seen the data though so
there's that). There are also workloads where it is beneficial to schedule
on cores which have the data in their cache and are submitting work to
GPGPU, which makes the GPGPU operations faster.

thanks,

- Joel
Hello,

On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > If so, that will not handle the case where the system has both
> > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
>
> Well, you did not mention that issue, you only babbled about RT.
>
> I did point out that issue with ext, and TJ said this mixed mode wasn't
> really meant to be used or somesuch.

It's true that most of the current use cases don't use mixed mode. That
said, some folks are interested in it and if we can prevent starvation from
fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
Would it be possible to toggle the reservations depending on the ext's
operation mode?

Thanks.

--
tejun
On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
> Hello,
>
> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> > On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > > If so, that will not handle the case where the system has both
> > > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
> >
> > Well, you did not mention that issue, you only babbled about RT.
> >
> > I did point out that issue with ext, and TJ said this mixed mode wasn't
> > really meant to be used or somesuch.
>
> It's true that most of the current use cases don't use mixed mode. That
> said, some folks are interested in it and if we can prevent starvation from
> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
> Would it be possible to toggle the reservations depending on the ext's
> operation mode?

Yeah, that should be doable.
Hello, Peter, Tejun,

On 3/17/2025 6:06 PM, Peter Zijlstra wrote:
> On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
>> Hello,
>>
>> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
>>> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
>>>> If so, that will not handle the case where the system has both
>>>> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
>>>> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
>>>
>>> Well, you did not mention that issue, you only babbled about RT.

You are right, I will add more details about this to the change log.

>>>
>>> I did point out that issue with ext, and TJ said this mixed mode wasn't
>>> really meant to be used or somesuch.
>>
>> It's true that most of the current use cases don't use mixed mode. That
>> said, some folks are interested in it and if we can prevent starvation from
>> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
>> Would it be possible to toggle the reservations depending on the ext's
>> operation mode?
>
> Yeah, that should be doable.

Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
independent of FAIR. And in normal mode, we boost both FAIR+EXT, because
well - nothing would be running as fair anyway.

But what is the point of doing that, if we boost EXT independent of FAIR
anyway? We need that code _anyway_ due to mixed mode so it would not
simplify anything.

Or did Tejun mean something else about "toggle the reservations"?

thanks,

- Joel
Hello,

On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
...
> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> independent of FAIR. And in normal mode, we boost both FAIR+EXT, because
> well - nothing would be running as fair anyway.
>
> But what is the point of doing that, if we boost EXT independent of FAIR
> anyway? We need that code _anyway_ due to mixed mode so it would not
> simplify anything.
>
> Or did Tejun mean something else about "toggle the reservations"?

My understanding is that if we have both FAIR's and EXT's DL servers
reserving execution time all the time, we'd be reserving execution time for
something which can't be active, so the only change necessary I think is
just retracting FAIR's or EXT's reservation when we know they are not active
(ie. if EXT is not loaded or EXT is loaded in full-sys mode).

Thanks.

--
tejun
On 3/17/2025 11:16 PM, Tejun Heo wrote:
> Hello,
>
> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> ...
>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>> independent of FAIR. And in normal mode, we boost both FAIR+EXT, because
>> well - nothing would be running as fair anyway.
>>
>> But what is the point of doing that, if we boost EXT independent of FAIR
>> anyway? We need that code _anyway_ due to mixed mode so it would not
>> simplify anything.
>>
>> Or did Tejun mean something else about "toggle the reservations"?
>
> My understanding is that if we have both FAIR's and EXT's DL servers
> reserving execution time all the time, we'd be reserving execution time for
> something which can't be active, so the only change necessary I think is
> just retracting FAIR's or EXT's reservation when we know they are not
> active (ie. if EXT is not loaded or EXT is loaded in full-sys mode).

Ah, I see what you mean. We already have a 'toggle' like that though,
because if FAIR or EXT is not running (due to whatever reason), we would
have already called 'dl_server_stop()' or would never have called
'dl_server_start()'.

On the other hand, even in full-sys mode, we need the EXT server to boost it
above RT if EXT is running, so we need its server initialized and ready to
go.

Let me know if I missed anything though, thanks,

- Joel
On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
> On 3/17/2025 11:16 PM, Tejun Heo wrote:
> > On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> > ...
> >> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> >> independent of FAIR. And in normal mode, we boost both FAIR+EXT, because
> >> well - nothing would be running as fair anyway.
> >>
> >> But what is the point of doing that, if we boost EXT independent of FAIR
> >> anyway? We need that code _anyway_ due to mixed mode so it would not
> >> simplify anything.
> >>
> >> Or did Tejun mean something else about "toggle the reservations"?
> >
> > My understanding is that if we have both FAIR's and EXT's DL servers
> > reserving execution time all the time, we'd be reserving execution time
> > for something which can't be active, so the only change necessary I think
> > is just retracting FAIR's or EXT's reservation when we know they are not
> > active (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
>
> Ah, I see what you mean. We already have a 'toggle' like that though,
> because if FAIR or EXT is not running (due to whatever reason), we would
> have already called 'dl_server_stop()' or would never have called
> 'dl_server_start()'.
>
> On the other hand, even in full-sys mode, we need the EXT server to boost it
> above RT if EXT is running, so we need its server initialized and ready to
> go.
>
> Let me know if I missed anything though, thanks,

I'm not very familiar with DL but it looks like a stopped DL server would
still be reserving bandwidth, which limits what other actual DL users would
be able to reserve without causing overflow. It looks like EXT's activation
modes should be calling into dl_bw_manage() so that FAIR's and EXT's
reservations can be retracted when not in use.

Thanks.

--
tejun
On 3/17/2025 11:48 PM, Tejun Heo wrote:
> On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
>> On 3/17/2025 11:16 PM, Tejun Heo wrote:
>>> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
>>> ...
>>>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>>>> independent of FAIR. And in normal mode, we boost both FAIR+EXT, because
>>>> well - nothing would be running as fair anyway.
>>>>
>>>> But what is the point of doing that, if we boost EXT independent of FAIR
>>>> anyway? We need that code _anyway_ due to mixed mode so it would not
>>>> simplify anything.
>>>>
>>>> Or did Tejun mean something else about "toggle the reservations"?
>>>
>>> My understanding is that if we have both FAIR's and EXT's DL servers
>>> reserving execution time all the time, we'd be reserving execution time
>>> for something which can't be active, so the only change necessary I think
>>> is just retracting FAIR's or EXT's reservation when we know they are not
>>> active (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
>>
>> Ah, I see what you mean. We already have a 'toggle' like that though,
>> because if FAIR or EXT is not running (due to whatever reason), we would
>> have already called 'dl_server_stop()' or would never have called
>> 'dl_server_start()'.
>>
>> On the other hand, even in full-sys mode, we need the EXT server to boost
>> it above RT if EXT is running, so we need its server initialized and ready
>> to go.
>>
>> Let me know if I missed anything though, thanks,
>
> I'm not very familiar with DL but it looks like a stopped DL server would
> still be reserving bandwidth, which limits what other actual DL users would
> be able to reserve without causing overflow. It looks like EXT's activation
> modes should be calling into dl_bw_manage() so that FAIR's and EXT's
> reservations can be retracted when not in use.

Ah, you raise a good point. Sorry, you were on to something and that makes
sense to me. Let me see how to wire it up. Basically, when we switch to
full mode from, say, partial, we could/should remove the bandwidth
reservation of the servers.

I think I confused the concept of "server not running" with "server
reserving bandwidth". My bad!

thanks,

- Joel
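A rough sketch of the direction discussed above, assuming that
dl_server_apply_params() (declared in the sched.h hunk of this patch) can
retract a server's reservation by applying a zero runtime, similar to how
the fair server's debugfs knob works; the helper name scx_update_server_bw()
and the 50 ms / 1 s parameters are illustrative only:

/*
 * Sketch only, not part of this patch: drop or re-grant the ext server's
 * bandwidth when sched_ext switches between full and partial mode, so that
 * a reservation is not held for a class that cannot have runnable tasks.
 * Assumes the caller holds the rq lock, as the fair server's debugfs
 * handler does.
 */
static void scx_update_server_bw(struct rq *rq, bool ext_active)
{
	/* Illustrative parameters mirroring the fair server: 50 ms per 1 s */
	u64 runtime = ext_active ? 50ULL * NSEC_PER_MSEC : 0;
	u64 period = 1000ULL * NSEC_PER_MSEC;

	/* A runtime of 0 removes the reservation from DL admission control */
	dl_server_apply_params(&rq->ext_server, runtime, period, false);
}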