From: Peter Zijlstra <peterz@infradead.org>
Use deadline servers to service fair tasks.
This patch adds a fair_server deadline entity which acts as a container
for fair entities and can be used to fix starvation when higher priority
(wrt fair) tasks are monopolizing CPU(s).
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
kernel/sched/core.c | 24 ++++++++++++++++--------
kernel/sched/deadline.c | 23 +++++++++++++++++++++++
kernel/sched/fair.c | 25 +++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 68 insertions(+), 8 deletions(-)
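For readers new to the dl_server infrastructure, here is a minimal userspace-style sketch of the pattern the hunks below wire up (toy names such as struct server, fair_has_tasks and fair_pick are illustrative only, not the kernel's): a budgeted entity that runs no work of its own and instead asks the fair class, via two callbacks, whether it has tasks and which one to run next, mirroring the dl_server_init()/fair_server_pick() shape in this patch.

/*
 * Illustrative sketch only, not kernel code: the server holds a runqueue
 * pointer plus two callbacks, just like dl_server_init() below takes
 * has_tasks/pick function pointers for the fair class.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct task { const char *name; };

/* callback types, shaped like dl_server_has_tasks_f / dl_server_pick_f */
typedef bool (*has_tasks_f)(void *rq);
typedef struct task *(*pick_f)(void *rq);

struct server {
	void *rq;               /* the runqueue this server drains      */
	has_tasks_f has_tasks;  /* "is there fair work queued?"         */
	pick_f pick;            /* "hand me the next fair task"         */
};

/* toy fair runqueue holding a single task */
struct fair_rq { struct task *t; };

static bool fair_has_tasks(void *rq)
{
	return ((struct fair_rq *)rq)->t != NULL;
}

static struct task *fair_pick(void *rq)
{
	return ((struct fair_rq *)rq)->t;
}

int main(void)
{
	struct task t = { "some_fair_task" };
	struct fair_rq cfs = { &t };
	/* analogous to fair_server_init(): wire the callbacks up once */
	struct server srv = { &cfs, fair_has_tasks, fair_pick };

	if (srv.has_tasks(srv.rq))
		printf("server picks: %s\n", srv.pick(srv.rq)->name);
	return 0;
}

Because the server itself is scheduled as a deadline entity, replenishing its budget guarantees the fair runqueue gets on the CPU even while higher-priority (RT/DL) tasks would otherwise monopolize it, which is exactly the starvation case the commit message describes.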
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7019a40457a6..04e2270487b7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6007,6 +6007,14 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
#endif
put_prev_task(rq, prev);
+
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
}
/*
@@ -6037,6 +6045,13 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = pick_next_task_idle(rq);
}
+ /*
+ * This is a normal CFS pick, but the previous could be a DL pick.
+ * Clear it as previous is no longer picked.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
/*
* This is the fast path; it cannot be a DL server pick;
* therefore even if @p == @prev, ->dl_server must be NULL.
@@ -6050,14 +6065,6 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
restart:
put_prev_task_balance(rq, prev, rf);
- /*
- * We've updated @prev and no longer need the server link, clear it.
- * Must be done before ->pick_next_task() because that can (re)set
- * ->dl_server.
- */
- if (prev->dl_server)
- prev->dl_server = NULL;
-
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
@@ -10051,6 +10058,7 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+ fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a04a436af8cc..db5dc5c09106 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1382,6 +1382,13 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
resched_curr(rq);
}
+ /*
+ * The fair server (sole dl_server) does not account for real-time
+ * workload because it is running fair work.
+ */
+ if (dl_se == &rq->fair_server)
+ return;
+
/*
* Because -- for now -- we share the rt bandwidth, we need to
* account our runtime there too, otherwise actual rt tasks
@@ -1415,15 +1422,31 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
void dl_server_start(struct sched_dl_entity *dl_se)
{
+ struct rq *rq = dl_se->rq;
+
if (!dl_server(dl_se)) {
+ /* Disabled */
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
+ dl_se->dl_period = 1000 * NSEC_PER_MSEC;
+
dl_se->dl_server = 1;
setup_new_dl_entity(dl_se);
}
+
+ if (!dl_se->dl_runtime)
+ return;
+
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
+ resched_curr(dl_se->rq);
}
void dl_server_stop(struct sched_dl_entity *dl_se)
{
+ if (!dl_se->dl_runtime)
+ return;
+
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6..304697a80e9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6722,6 +6722,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
+ if (!rq->cfs.h_nr_running)
+ dl_server_start(&rq->fair_server);
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
@@ -6866,6 +6869,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;
dequeue_throttle:
+ if (!rq->cfs.h_nr_running)
+ dl_server_stop(&rq->fair_server);
+
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -8538,6 +8544,25 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
return pick_next_task_fair(rq, NULL, NULL);
}
+static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->cfs.nr_running;
+}
+
+static struct task_struct *fair_server_pick(struct sched_dl_entity *dl_se)
+{
+ return pick_next_task_fair(dl_se->rq, NULL, NULL);
+}
+
+void fair_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->fair_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
+}
+
/*
* Account for a descheduled task:
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2242679239e..205e56929e15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -340,6 +340,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick);
+extern void fair_server_init(struct rq *rq);
+
#ifdef CONFIG_CGROUP_SCHED
struct cfs_rq;
@@ -1016,6 +1018,8 @@ struct rq {
struct rt_rq rt;
struct dl_rq dl;
+ struct sched_dl_entity fair_server;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
struct list_head leaf_cfs_rq_list;
--
2.44.0
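As a rough guide to the parameters initialized in dl_server_start() above, assuming the usual SCHED_DEADLINE accounting where an entity with runtime Q per period P is granted Q/P of a CPU: with the default runtime of 0 the server is effectively off, and any non-zero runtime against the 1000 ms period reserves that fraction for fair tasks. The 50 ms value in the sketch below is only a hypothetical example, not something this patch configures.

/* Standalone illustration of the runtime/period arithmetic. */
#include <stdio.h>

int main(void)
{
	const double runtime_ms = 50.0;   /* hypothetical configured runtime */
	const double period_ms  = 1000.0; /* default period set in the patch */

	/* a deadline server with runtime Q per period P reserves Q/P of a CPU */
	printf("fair server would reserve %.1f%% of a CPU\n",
	       100.0 * runtime_ms / period_ms);
	return 0;
}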
On Fri, Apr 05, 2024 at 07:28:00PM +0200, Daniel Bristot de Oliveira wrote:
> From: Peter Zijlstra <peterz@infradead.org>
>
> Use deadline servers to service fair tasks.
>
> This patch adds a fair_server deadline entity which acts as a container
> for fair entities and can be used to fix starvation when higher priority
> (wrt fair) tasks are monopolizing CPU(s).
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> ---
> kernel/sched/core.c | 24 ++++++++++++++++--------
> kernel/sched/deadline.c | 23 +++++++++++++++++++++++
> kernel/sched/fair.c | 25 +++++++++++++++++++++++++
> kernel/sched/sched.h | 4 ++++
> 4 files changed, 68 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7019a40457a6..04e2270487b7 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6007,6 +6007,14 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
> #endif
>
> put_prev_task(rq, prev);
> +
> + /*
> + * We've updated @prev and no longer need the server link, clear it.
> + * Must be done before ->pick_next_task() because that can (re)set
> + * ->dl_server.
> + */
> + if (prev->dl_server)
> + prev->dl_server = NULL;
> }
>
> /*
> @@ -6037,6 +6045,13 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> p = pick_next_task_idle(rq);
> }
>
> + /*
> + * This is a normal CFS pick, but the previous could be a DL pick.
> + * Clear it as previous is no longer picked.
> + */
> + if (prev->dl_server)
> + prev->dl_server = NULL;
> +
> /*
> * This is the fast path; it cannot be a DL server pick;
> * therefore even if @p == @prev, ->dl_server must be NULL.
> @@ -6050,14 +6065,6 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> restart:
> put_prev_task_balance(rq, prev, rf);
>
> - /*
> - * We've updated @prev and no longer need the server link, clear it.
> - * Must be done before ->pick_next_task() because that can (re)set
> - * ->dl_server.
> - */
> - if (prev->dl_server)
> - prev->dl_server = NULL;
> -
> for_each_class(class) {
> p = class->pick_next_task(rq);
> if (p)
This bit seems like a fix for 63ba8422f876 ("sched/deadline: Introduce
deadline servers"), should it be a separate patch?
On 4/10/24 19:24, Peter Zijlstra wrote:
> On Fri, Apr 05, 2024 at 07:28:00PM +0200, Daniel Bristot de Oliveira wrote:
>> From: Peter Zijlstra <peterz@infradead.org>
>>
>> Use deadline servers to service fair tasks.
>>
>> This patch adds a fair_server deadline entity which acts as a container
>> for fair entities and can be used to fix starvation when higher priority
>> (wrt fair) tasks are monopolizing CPU(s).
>>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
>> ---
>> kernel/sched/core.c | 24 ++++++++++++++++--------
>> kernel/sched/deadline.c | 23 +++++++++++++++++++++++
>> kernel/sched/fair.c | 25 +++++++++++++++++++++++++
>> kernel/sched/sched.h | 4 ++++
>> 4 files changed, 68 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 7019a40457a6..04e2270487b7 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6007,6 +6007,14 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
>> #endif
>>
>> put_prev_task(rq, prev);
>> +
>> + /*
>> + * We've updated @prev and no longer need the server link, clear it.
>> + * Must be done before ->pick_next_task() because that can (re)set
>> + * ->dl_server.
>> + */
>> + if (prev->dl_server)
>> + prev->dl_server = NULL;
>> }
>>
>> /*
>> @@ -6037,6 +6045,13 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>> p = pick_next_task_idle(rq);
>> }
>>
>> + /*
>> + * This is a normal CFS pick, but the previous could be a DL pick.
>> + * Clear it as previous is no longer picked.
>> + */
>> + if (prev->dl_server)
>> + prev->dl_server = NULL;
>> +
>> /*
>> * This is the fast path; it cannot be a DL server pick;
>> * therefore even if @p == @prev, ->dl_server must be NULL.
>> @@ -6050,14 +6065,6 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>> restart:
>> put_prev_task_balance(rq, prev, rf);
>>
>> - /*
>> - * We've updated @prev and no longer need the server link, clear it.
>> - * Must be done before ->pick_next_task() because that can (re)set
>> - * ->dl_server.
>> - */
>> - if (prev->dl_server)
>> - prev->dl_server = NULL;
>> -
>> for_each_class(class) {
>> p = class->pick_next_task(rq);
>> if (p)
>
> This bit seems like a fix for 63ba8422f876 ("sched/deadline: Introduce
> deadline servers"), should it be a separate patch?
Those changes were actually reported as separate patches in a pre-review of this series, so I
think you can already pick them up from there as fixes and add the Fixes: tag?
https://lore.kernel.org/lkml/20240313012451.1693807-2-joel@joelfernandes.org/
https://lore.kernel.org/lkml/20240313012451.1693807-3-joel@joelfernandes.org/
Also add my Reviewed-by...
-- Daniel