Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default
bandwidth control") caused a significant dip in his favourite
benchmark of the day. Simply disabling dl_server cured things.
His workload hammers the 0->1, 1->0 transitions, and the
dl_server_{start,stop}() overhead kills it -- fairly obviously a bad
idea in hind sight and all that.
Change things around to only disable the dl_server when there has not
been a fair task around for a whole period. Since the default period
is 1 second, this ensures the benchmark never trips this, overhead
gone.
Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server")
Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 1 +
kernel/sched/deadline.c | 31 +++++++++++++++++++++++++++----
2 files changed, 28 insertions(+), 4 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -702,6 +702,7 @@ struct sched_dl_entity {
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
+ unsigned int dl_server_idle : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1215,6 +1215,8 @@ static void __push_dl_task(struct rq *rq
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+static bool dl_server_stopped(struct sched_dl_entity *dl_se);
+
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1234,6 +1236,7 @@ static enum hrtimer_restart dl_server_ti
if (!dl_se->server_has_tasks(dl_se)) {
replenish_dl_entity(dl_se);
+ dl_server_stopped(dl_se);
return HRTIMER_NORESTART;
}
@@ -1639,8 +1642,10 @@ void dl_server_update_idle_time(struct r
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_runtime) {
+ dl_se->dl_server_idle = 0;
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+ }
}
void dl_server_start(struct sched_dl_entity *dl_se)
@@ -1663,7 +1668,7 @@ void dl_server_start(struct sched_dl_ent
setup_new_dl_entity(dl_se);
}
- if (!dl_se->dl_runtime)
+ if (!dl_se->dl_runtime || dl_se->dl_server_active)
return;
dl_se->dl_server_active = 1;
@@ -1672,7 +1677,7 @@ void dl_server_start(struct sched_dl_ent
resched_curr(dl_se->rq);
}
-void dl_server_stop(struct sched_dl_entity *dl_se)
+static void __dl_server_stop(struct sched_dl_entity *dl_se)
{
if (!dl_se->dl_runtime)
return;
@@ -1684,6 +1689,24 @@ void dl_server_stop(struct sched_dl_enti
dl_se->dl_server_active = 0;
}
+static bool dl_server_stopped(struct sched_dl_entity *dl_se)
+{
+ if (!dl_se->dl_server_active)
+ return false;
+
+ if (dl_se->dl_server_idle) {
+ __dl_server_stop(dl_se);
+ return true;
+ }
+
+ dl_se->dl_server_idle = 1;
+ return false;
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+}
+
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
@@ -2435,7 +2458,7 @@ static struct task_struct *__pick_task_d
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (dl_server_active(dl_se)) {
+ if (!dl_server_stopped(dl_se)) {
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
}
Hi,
On 20/05/25 11:45, Peter Zijlstra wrote:
> Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default
> bandwidth control") caused a significant dip in his favourite
> benchmark of the day. Simply disabling dl_server cured things.
>
> His workload hammers the 0->1, 1->0 transitions, and the
> dl_server_{start,stop}() overhead kills it -- fairly obviously a bad
> idea in hind sight and all that.
>
> Change things around to only disable the dl_server when there has not
> been a fair task around for a whole period. Since the default period
> is 1 second, this ensures the benchmark never trips this, overhead
> gone.
>
> Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server")
> Reported-by: Chris Mason <clm@meta.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/sched.h | 1 +
> kernel/sched/deadline.c | 31 +++++++++++++++++++++++++++----
> 2 files changed, 28 insertions(+), 4 deletions(-)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -702,6 +702,7 @@ struct sched_dl_entity {
> unsigned int dl_defer : 1;
> unsigned int dl_defer_armed : 1;
> unsigned int dl_defer_running : 1;
> + unsigned int dl_server_idle : 1;
>
> /*
> * Bandwidth enforcement timer. Each -deadline task has its
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1215,6 +1215,8 @@ static void __push_dl_task(struct rq *rq
> /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
> static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
>
> +static bool dl_server_stopped(struct sched_dl_entity *dl_se);
> +
> static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
> {
> struct rq *rq = rq_of_dl_se(dl_se);
> @@ -1234,6 +1236,7 @@ static enum hrtimer_restart dl_server_ti
>
> if (!dl_se->server_has_tasks(dl_se)) {
> replenish_dl_entity(dl_se);
> + dl_server_stopped(dl_se);
> return HRTIMER_NORESTART;
> }
>
> @@ -1639,8 +1642,10 @@ void dl_server_update_idle_time(struct r
> void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
> {
> /* 0 runtime = fair server disabled */
> - if (dl_se->dl_runtime)
> + if (dl_se->dl_runtime) {
> + dl_se->dl_server_idle = 0;
> update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
> + }
> }
>
> void dl_server_start(struct sched_dl_entity *dl_se)
> @@ -1663,7 +1668,7 @@ void dl_server_start(struct sched_dl_ent
> setup_new_dl_entity(dl_se);
> }
>
> - if (!dl_se->dl_runtime)
> + if (!dl_se->dl_runtime || dl_se->dl_server_active)
> return;
>
> dl_se->dl_server_active = 1;
> @@ -1672,7 +1677,7 @@ void dl_server_start(struct sched_dl_ent
> resched_curr(dl_se->rq);
> }
>
> -void dl_server_stop(struct sched_dl_entity *dl_se)
> +static void __dl_server_stop(struct sched_dl_entity *dl_se)
> {
> if (!dl_se->dl_runtime)
> return;
> @@ -1684,6 +1689,24 @@ void dl_server_stop(struct sched_dl_enti
> dl_se->dl_server_active = 0;
> }
>
> +static bool dl_server_stopped(struct sched_dl_entity *dl_se)
> +{
> + if (!dl_se->dl_server_active)
> + return false;
> +
> + if (dl_se->dl_server_idle) {
> + __dl_server_stop(dl_se);
> + return true;
> + }
> +
> + dl_se->dl_server_idle = 1;
> + return false;
> +}
> +
> +void dl_server_stop(struct sched_dl_entity *dl_se)
> +{
> +}
What if we explicitly set the server to idle (instead of ignoring the
stop) where this gets called in dequeue_entities()? Also, don't we need
to actually stop the server if we are changing its parameters from
sched_fair_server_write()?
Thanks,
Juri
On Tue, Jun 03, 2025 at 06:03:12PM +0200, Juri Lelli wrote:
> Hi,
>
> > @@ -1684,6 +1689,24 @@ void dl_server_stop(struct sched_dl_enti
> > dl_se->dl_server_active = 0;
> > }
> >
> > +static bool dl_server_stopped(struct sched_dl_entity *dl_se)
> > +{
> > + if (!dl_se->dl_server_active)
> > + return false;
> > +
> > + if (dl_se->dl_server_idle) {
> > + __dl_server_stop(dl_se);
> > + return true;
> > + }
> > +
> > + dl_se->dl_server_idle = 1;
> > + return false;
> > +}
> > +
> > +void dl_server_stop(struct sched_dl_entity *dl_se)
> > +{
> > +}
>
> What if we explicitly set the server to idle (instead of ignoring the
> stop) where this gets called in dequeue_entities()?
That would break thing; we want to detect if it was ever !idle in the
period.
> Also, don't we need to actually stop the server if we are changing its
> parameters from sched_fair_server_write()?
Quite - let me just remove the offending callsites them.
Would this explain this massive regression 0day reported here? Seems
weird.
Anyway, let me go update the patch.
© 2016 - 2025 Red Hat, Inc.